Python多线程爬虫实例——按分类批量下载小说

下载链接:http://pan.baidu.com/s/1jIBixLk 密码:qylv

一小时采集了 6.67GB 的小说,数量太多,只上传了其中 3000 多部。

QQ截图20170323160619

QQ截图20170323115052

 

一个简单的爬虫,没什么好说的,直接贴代码吧。

# coding:utf-8
import urllib2
import re
import MySQLdb
import urllib
import threading
from _mysql import Error
import os
from macpath import dirname
import socket
import time

# Global timeout for every socket opened afterwards, so that a stalled
# page fetch or file download raises instead of hanging a worker forever.
socket.setdefaulttimeout(10.0)

# Number of threads started for the book-download phase.
threadnum = 10
# Number of rows fetched from the database per claim query.
urlcache = 10
# Neither os.makedirs() nor urllib.urlretrieve() expands "~"; without
# expanduser() the files would end up in a literal "~" directory below the
# current working directory instead of the user's home directory.
dirlinux = os.path.expanduser("~/novel/")
dirwindows = "D:/novel/"
# True: store under dirlinux. False: store under dirwindows, with paths
# re-encoded from UTF-8 to GBK by the code that builds them.
isLinux = True

# Re-entrant locks taken by the worker functions around use of the shared
# connection and cursor below.
lock = threading.RLock()
lock1 = threading.RLock()
# Positional arguments are host, user, password, database.
db = MySQLdb.connect("10.255.46.101", "py", "py", "novel")
# One cursor shared by the whole script, including all worker threads.
cursor = db.cursor()


#Get menu
def downURL(url): 
    html = urllib2.urlopen(url).read()
    p = r'<li ><a href="(.*?)" title="">(.*?)</a></li>'
    p = p.replace('\n', '\r\n')
    print url 
    for url, name in re.findall(p, html):
        sql="insert into url(url, name) values ('%s', '%s')" % (url, name)
        try:
            cursor.execute(sql)
            db.commit()
        except Error,e:
            print e

# TODO  download next page


def geturl():
    rs = []
    sql = 'select id,url from url where p = 0 limit 10'
    try:
        cursor.execute(sql)
    except:
        print 'error:geturl'
    for r in cursor.fetchall():
        url = r[1]
        sql = 'update url set p = 1 where url = "%s"' % (url)
        try:
            cursor.execute(sql)
            db.commit()
        except:
            print 'error:geturl'
        rs.append(r)
    return rs

def progress():
    """Return the current local time as the string built by time.ctime()."""
    stamp = time.ctime()
    return stamp


def downBookUrl():
    # thread = threading.current_thread()
    # print thread.getName()
    # print threading.activeCount()
    thread = threading.current_thread()
    print thread.getName(), u':开始'
    while True:
        if lock.acquire():
            rs = geturl()
        lock.release()
        if not rs:
            thread = threading.current_thread()
            print thread.getName(), u':完成'
            break
        #
        for r in rs:
            url = r[1]
            try:
                html = urllib2.urlopen('http://www.qisuu.com'+url).read()
                thread = threading.current_thread()
                print thread.getName()+':http://www.qisuu.com' + url+' y '
                sql = 'update url set p = 2 where url = "%s"' % (url)
                if lock.acquire():
                    cursor.execute(sql)
                lock.release()
                p = 'target="_blank">(.*?)</a></em><a href="(.*?)">(.*?)</a>'
                for zname, url, bname in re.findall(p, html):
                    html = urllib2.urlopen('http://www.qisuu.com' + url).read()
                    p = r'<a class="downButton" href=\'(.*?)\'.*?Txt格式下载</a>'
                    url = re.findall(p, html)[0]
                    sql = 'insert into book (zname, url, bname, cid) values ("%s","%s","%s",%d)'\
                          % (zname, url, bname, r[0])
                    if lock.acquire():
                        try:
                            cursor.execute(sql)
                            db.commit()
                        except:
                            pass
                    lock.release()
            except:
                thread = threading.current_thread()
                print thread.getName() + ':http://www.qisuu.com' + url + ' n '
                sql = 'update url set p = -1 where url = "%s"' % (url)
                if lock.acquire():
                    cursor.execute(sql)
                    db.commit()
                lock.release()

def createDir():
    sql = 'select name from url'
    cursor.execute(sql)

    for name in cursor.fetchall():
        if isLinux:
            dirname = dirlinux+name[0]
        else:
            dirname = (dirwindows+name[0]).decode('utf-8').encode('gbk')
        try:
            os.makedirs(dirname)
            print name[0],u'DIR create success'
        except Exception,e:
            print name[0],e

def getBook_url():
    rs = []
    sql = 'select book.url,book.bname,book.zname,url.name from book,url where cid = url.id and book.p = 0 limit '+str(urlcache)
    if lock.acquire():
        cursor.execute(sql)
        for r in cursor.fetchall():
            rs.append(r)
            sql = 'update book set p=1 where url = "%s"' % (r[0])
            try:
                cursor.execute(sql)
                db.commit()
            except:
                print 'error:mysql'
    return rs
    lock.release()


def downBook(): 
    thread = threading.current_thread()
    print thread.getName(), u':开始'    
    while True:  
        rs = getBook_url()
        if not rs:
            thread = threading.current_thread()
            print thread.getName(),u'完成'
            break
        for r in rs:
            url = r[0]
            bname = r[1]
            zname = r[2]
            menu = r[3]
            if isLinux:
                booknamedir = dirlinux+menu+"/"+bname+"——"+zname+'.txt'
            else:
                booknamedir = dirwindows+menu+"/"+bname+"——"+zname+'.txt'
                booknamedir = booknamedir.decode('utf-8').encode('gbk')
            try:
                urllib.urlretrieve(url, booknamedir)
                sql = 'update book set p=2 where url = "%s"' % (url)
                if lock1.acquire():
                    cursor.execute(sql)
                    db.commit()
                    print progress(),menu," / ",bname,u'成功'
                lock1.release()
            except:
                sql = 'update book set p=-1 where url = "%s"' % (url)
                if lock1.acquire():
                    try:
                        cursor.execute(sql)
                        db.commit()
                    except:
                        print "updata book:%s status -1 failed"%bname
                lock1.release()
                print progress(),menu," / ",bname,u'失败'

cursor.execute('truncate table book')
cursor.execute('truncate table url')
db.commit()



for x in xrange(1,11):
    url = r'http://www.qisuu.com/soft/sort0%s/' % (x)
    print url
    downURL(url)

#downBookUrl
threads = [] 
for x in xrange(7):
    t = threading.Thread(target=downBookUrl)
    threads.append(t)
for x in xrange(7):
    threads[x].start()
for x in xrange(7):
    threads[x].join()
#create direction
createDir()
threads = []
for x in xrange(threadnum):
    t = threading.Thread(target=downBook)
    threads.append(t)
for x in xrange(threadnum):
    threads[x].start()
for x in xrange(threadnum):
    threads[x].join()

db.close()

发表评论

[/0o0] [..^v^..] [0_0] [T.T] [=3-❤] [❤.❤] [^v^] [-.0] [!- -] [=x=] [→_→] [><] 更多 »
昵称

抢沙发~