我写了一个多线程的爬虫:从 URL 抓取页面中的其他 URL,再继续抓取这些页面。里面用到了 Queue,但是在 XP 命令行下运行的时候经常光标不动。应该是信号同步的问题,调试很久不得其解,贴代码,望大家指正:
- Python code
#coding=utf-8 from __future__ import with_statement from BeautifulSoup import BeautifulSoup import urllib2 from threading import Thread from Queue import Queue import time import socket socket.setdefaulttimeout(5) class Fetcher:#把操作封到一个类里面,从网上搜得例子 def __init__(self,th_num): self.opener = urllib2.build_opener(urllib2.HTTPHandler) self.lock = Lock() #线程锁 self.q_req = Queue() #任务队列 self.q_ans = Queue() #完成队列 self.urls = []#返回抓取页面中的url self.th_num = th_num for i in range(th_num):#抓取线程 t = Thread(target=self.thread_get) t.setDaemon(True) t.start() for i in range(th_num):#处理线程 t = Thread(target=self.thread_put) t.setDaemon(True) t.start() def join(self): #解构时需等待两个队列完成 time.sleep(0.5) print '=====================im done' self.q_req.join() self.q_ans.join() def push(self,req): self.q_req.put(req) def thread_put(self): while True: try: if not self.q_ans.empty(): url = self.q_ans.get() self.urls.extend(url) self.q_ans.task_done() except Queue.empty,qe: print qe,'Queue==========' continue except Exception ,e: print e,'other,excp========' continue def thread_get(self): print 'i am starting------' while True: try: if self.q_req.empty(): continue req = self.q_req.get() except Queue.empty,qe: print 'enmpty-----------' continue urls = [] ans = '' try: ans = self.opener.open(req).read() soup = BeautifulSoup(ans) for a in soup.findAll('a'): try: if a['href'].startswith('http'): urls.append(a['href']) except KeyError, e: print e ,'=======================KeyError=in=soup=findAll' continue except Exception,ex: print ex,'========================Exception=in=soup=findAll' continue self.q_ans.put(urls) self.q_req.task_done() except UnicodeEncodeError, ue: print 'unicode----------------------wrong' print ue print req continue except urllib2.URLError, ue: print 'conn-----------rufuse' print ue print req continue except Exception, what: print 'other--exception----------in- threadget----' print what print req continue time.sleep(0.1) # don't spam print 'get==========' def 
run(links,th_num=10): f = Fetcher(th_num) for url in links: f.push(url) f.join() return f.urls if __name__ == "__main__": links = ['http://kingdowin.com/',] deep = 2#抓取页面的深度 while deep > 0: urls = run(links) deep -= 1 links = urls print links print "Exiting Main Thread"
------解决方案--------------------
E:\project\PyCharmProject\proberServer>python test.py
i am starting------
i am starting------
i am starting------
i am starting------
i am starting------
i am starting------
i am starting------
i am starting------