Goal: crawl a given Baidu Tieba forum, with a configurable number of list pages to fetch, and store every thread and its replies in a MySQL database.
With threadNum and maxURL set appropriately, the crawler can max out the network connection~
Database schema:
#
# Structure for table "tieba"
#
CREATE TABLE `tieba` (
  `Id` int(11) NOT NULL AUTO_INCREMENT,
  `url` varchar(255) DEFAULT NULL,
  `name` varchar(255) DEFAULT NULL,
  `context` text,
  PRIMARY KEY (`Id`)
) ENGINE=MyISAM AUTO_INCREMENT=958 DEFAULT CHARSET=utf8;

#
# Structure for table "url"
#
CREATE TABLE `url` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `url` varchar(50) NOT NULL,
  `title` varchar(255) DEFAULT NULL,
  `author` varchar(255) DEFAULT NULL,
  `time` varchar(32) DEFAULT NULL,
  `status` tinyint(3) NOT NULL DEFAULT '0',
  PRIMARY KEY (`id`)
) ENGINE=MyISAM AUTO_INCREMENT=51 DEFAULT CHARSET=utf8;
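The status column of the url table drives the work queue: 0 means a thread URL is still waiting to be crawled, 1 means a worker thread has claimed it, 2 means its posts were stored successfully, and -1 marks a failed download. For example, SELECT url FROM url WHERE status = -1; lists the threads that could not be fetched.

Crawler code: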
# coding:utf-8
import MySQLdb
import re
import urllib2
import threading
import time

# shared connection and cursor; threads serialize access through the global lock
db = MySQLdb.connect("10.255.46.101", "py", "py", "py")
cursor = db.cursor()
def downloadURL(bar, maxpage=1):
    # scrape maxpage list pages of forum `bar` and queue every thread URL
    print "start download URL"
    cursor.execute("truncate table url")    # start from a clean slate
    cursor.execute("truncate table tieba")
    # one match per thread: (href, title, author, creation time)
    p = r'<a href="(.*?)" title="(.*?)" target="_blank" class="j_th_tit ">(?:[\s\S]*?)title="主题作者: (.*?)"(?:[\s\S]*?)title="创建时间">(.*?)</span>'
    for page in range(maxpage):
        url = "http://tieba.baidu.com/f?kw=" + bar + "&ie=utf-8&pn=" + str(page * 50)
        print url
        html = urllib2.urlopen(url).read()
        print "page:" + str(page)
        for turl, title, author, ttime in re.findall(p, html):
            # parameterized insert, so quotes in titles cannot break the SQL
            cursor.execute("insert into url (title,url,author,time) values (%s,%s,%s,%s)",
                           (title, 'http://tieba.baidu.com' + turl, author, ttime))
def getURL(limit):
    # claim up to `limit` unvisited URLs (status 0 -> 1) and return them
    if cursor.execute("select url from url where status = 0 limit " + str(limit)):
        urls = []
        for row in cursor.fetchall():
            # mark as claimed so other workers do not pick up the same URL
            cursor.execute("update url set status = 1 where url = %s", (row[0],))
            urls.append(row[0])
        return urls
    else:
        return None
def download_tie():
    # worker: repeatedly claim a batch of thread URLs and store their posts
    # one match per reply: (author name, post body)
    p = r'<a (?:[\s\S]*?)class="p_author_name j_user_card" (?:[\s\S]*?)>(.*?)</a>(?:[\s\S]*?) class="d_post_content j_d_post_content clearfix">(.*?)</div>'
    while True:
        with lock:  # the cursor is shared, so serialize all database access
            url_list = getURL(maxURL)
        if not url_list:
            break
        print url_list
        for url in url_list:
            print "download:" + url,
            try:
                html = urllib2.urlopen(url).read()
                with lock:
                    for name, context in re.findall(p, html):
                        # parameterized insert; no manual quote escaping needed
                        cursor.execute("insert into tieba (url,name,context) values (%s,%s,%s)",
                                       (url, name, context))
                    cursor.execute("update url set status = 2 where url = %s", (url,))
                print "√"
            except Exception, msg:
                with lock:
                    cursor.execute("update url set status = -1 where url = %s", (url,))
                print "x:" + str(msg)
def start():
    threads = []
    for i in range(threadNum):
        t = threading.Thread(target=download_tie)
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
# number of worker threads
threadNum = 100
# URLs claimed per batch, to avoid hitting the database too often
maxURL = 10
lock = threading.Lock()

oldtime = time.time()
# queue the thread URLs from the first 3 list pages
downloadURL("python", 3)
start()
print "OK! spend time:" + str(time.time() - oldtime)
cursor.close()
db.close()
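After a run, a quick way to see how the queue fared is to count URLs by status. A minimal sketch (Python 2, reusing the connection parameters from the script; the status labels are just this example's display names):

# coding:utf-8
# Progress check: how many thread URLs ended up in each status.
import MySQLdb

db = MySQLdb.connect("10.255.46.101", "py", "py", "py")
cursor = db.cursor()
labels = {0: "pending", 1: "claimed", 2: "done", -1: "failed"}
cursor.execute("select status, count(*) from url group by status")
for status, count in cursor.fetchall():
    print "%s: %d" % (labels.get(status, str(status)), count)
cursor.close()
db.close()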