下载链接:http://pan.baidu.com/s/1jIBixLk 密码:qylv
一小时采集了6.67GB的小说,太多了,只上传了3000多部。
简单爬虫,没什么好说的,直接贴代码吧:
# coding:utf-8
import urllib2
import re
import MySQLdb
import urllib
import threading
from _mysql import Error
import os
from macpath import dirname
import socket
import time
# Every socket opened afterwards (urllib/urllib2 included) times out after 10 s.
socket.setdefaulttimeout(10.0)
# Number of worker threads started for downBook.
threadnum = 10
# Number of book rows claimed per getBook_url() call.
urlcache = 10
# Output roots. "~" is expanded here because os.makedirs and urlretrieve do not
# do it themselves; without this the files land in a directory literally
# named "~" under the current working directory.
dirlinux = os.path.expanduser("~/novel/")
dirwindows = "D:/novel/"
# Selects dirlinux (True) or dirwindows plus GBK-encoded paths (False).
isLinux = True
# lock is used by downBookUrl and getBook_url; lock1 is used by downBook.
lock = threading.RLock()
lock1 = threading.RLock()
# NOTE(review): host and credentials are hard-coded; consider moving them to
# configuration before sharing this script.
# A single connection and cursor are shared by all threads.
db = MySQLdb.connect("10.255.46.101", "py", "py", "novel")
cursor = db.cursor()
# Get menu
def downURL(url):
    """Fetch ``url`` and store every menu link found on it in the ``url`` table.

    Each ``<li ><a href="..." title="">name</a></li>`` match becomes one
    ``(url, name)`` row. A failed insert is printed and skipped, so one bad
    row does not stop the rest of the page from being stored.

    Args:
        url: Address of the page to scan.
    """
    html = urllib2.urlopen(url).read()
    link_pattern = r'<li ><a href="(.*?)" title="">(.*?)</a></li>'
    print(url)
    # Loop names differ from the ``url`` argument so it is not overwritten.
    for href, name in re.findall(link_pattern, html):
        try:
            # Values are passed separately so the driver quotes them; a name
            # containing a quote character used to break the statement.
            cursor.execute("insert into url(url, name) values (%s, %s)",
                           (href, name))
            db.commit()
        except Error as e:
            print(e)
# TODO download next page
def geturl():
    """Claim up to 10 unprocessed rows from the ``url`` table.

    Rows with ``p = 0`` are selected and each one is updated to ``p = 1``.
    The function does no locking itself; downBookUrl calls it while holding
    ``lock``.

    Returns:
        A list of ``(id, url)`` rows. The list is empty when no row has
        ``p = 0`` or when the select itself failed.
    """
    try:
        cursor.execute('select id,url from url where p = 0 limit 10')
        pending = cursor.fetchall()
    except Error:
        print('error:geturl')
        # Nothing trustworthy to fetch after a failed select.
        return []
    rs = []
    for row in pending:
        try:
            # Parameterized so quotes inside the stored URL cannot break it.
            cursor.execute('update url set p = 1 where url = %s', (row[1],))
            db.commit()
        except Error:
            print('error:geturl')
        # The row is handed out even if the status update failed.
        rs.append(row)
    return rs
def progress():
    """Return the current local time as text, used to prefix log lines."""
    stamp = time.ctime()
    return stamp
def downBookUrl():
# thread = threading.current_thread()
# print thread.getName()
# print threading.activeCount()
thread = threading.current_thread()
print thread.getName(), u':开始'
while True:
if lock.acquire():
rs = geturl()
lock.release()
if not rs:
thread = threading.current_thread()
print thread.getName(), u':完成'
break
#
for r in rs:
url = r[1]
try:
html = urllib2.urlopen('http://www.qisuu.com'+url).read()
thread = threading.current_thread()
print thread.getName()+':http://www.qisuu.com' + url+' y '
sql = 'update url set p = 2 where url = "%s"' % (url)
if lock.acquire():
cursor.execute(sql)
lock.release()
p = 'target="_blank">(.*?)</a></em><a href="(.*?)">(.*?)</a>'
for zname, url, bname in re.findall(p, html):
html = urllib2.urlopen('http://www.qisuu.com' + url).read()
p = r'<a class="downButton" href=\'(.*?)\'.*?Txt格式下载</a>'
url = re.findall(p, html)[0]
sql = 'insert into book (zname, url, bname, cid) values ("%s","%s","%s",%d)'\
% (zname, url, bname, r[0])
if lock.acquire():
try:
cursor.execute(sql)
db.commit()
except:
pass
lock.release()
except:
thread = threading.current_thread()
print thread.getName() + ':http://www.qisuu.com' + url + ' n '
sql = 'update url set p = -1 where url = "%s"' % (url)
if lock.acquire():
cursor.execute(sql)
db.commit()
lock.release()
def createDir():
sql = 'select name from url'
cursor.execute(sql)
for name in cursor.fetchall():
if isLinux:
dirname = dirlinux+name[0]
else:
dirname = (dirwindows+name[0]).decode('utf-8').encode('gbk')
try:
os.makedirs(dirname)
print name[0],u'DIR create success'
except Exception,e:
print name[0],e
def getBook_url():
    """Claim up to ``urlcache`` books that have not been downloaded yet.

    Selects ``book`` rows with ``p = 0`` joined to their ``url`` row and sets
    ``p = 1`` on each of them, all while holding ``lock``.

    Returns:
        A list of ``(book.url, book.bname, book.zname, url.name)`` rows,
        empty when no book has ``p = 0``.
    """
    sql = ('select book.url,book.bname,book.zname,url.name from book,url '
           'where cid = url.id and book.p = 0 limit ' + str(urlcache))
    rs = []
    # The "with" block releases the lock on every exit path. Previously the
    # function returned before reaching release(), so the lock stayed held
    # and all other workers blocked forever.
    with lock:
        cursor.execute(sql)
        for r in cursor.fetchall():
            rs.append(r)
            try:
                cursor.execute('update book set p=1 where url = %s', (r[0],))
                db.commit()
            except Error:
                print('error:mysql')
    return rs
def downBook():
thread = threading.current_thread()
print thread.getName(), u':开始'
while True:
rs = getBook_url()
if not rs:
thread = threading.current_thread()
print thread.getName(),u'完成'
break
for r in rs:
url = r[0]
bname = r[1]
zname = r[2]
menu = r[3]
if isLinux:
booknamedir = dirlinux+menu+"/"+bname+"——"+zname+'.txt'
else:
booknamedir = dirwindows+menu+"/"+bname+"——"+zname+'.txt'
booknamedir = booknamedir.decode('utf-8').encode('gbk')
try:
urllib.urlretrieve(url, booknamedir)
sql = 'update book set p=2 where url = "%s"' % (url)
if lock1.acquire():
cursor.execute(sql)
db.commit()
print progress(),menu," / ",bname,u'成功'
lock1.release()
except:
sql = 'update book set p=-1 where url = "%s"' % (url)
if lock1.acquire():
try:
cursor.execute(sql)
db.commit()
except:
print "updata book:%s status -1 failed"%bname
lock1.release()
print progress(),menu," / ",bname,u'失败'
cursor.execute('truncate table book')
cursor.execute('truncate table url')
db.commit()
for x in xrange(1,11):
url = r'http://www.qisuu.com/soft/sort0%s/' % (x)
print url
downURL(url)
#downBookUrl
threads = []
for x in xrange(7):
t = threading.Thread(target=downBookUrl)
threads.append(t)
for x in xrange(7):
threads[x].start()
for x in xrange(7):
threads[x].join()
#create direction
createDir()
threads = []
for x in xrange(threadnum):
t = threading.Thread(target=downBook)
threads.append(t)
for x in xrange(threadnum):
threads[x].start()
for x in xrange(threadnum):
threads[x].join()
db.close()
发表评论
抢沙发~