Basic framework code
The basic idea is to use several CPU cores at the same time, so that pages are crawled and downloaded in parallel.
import multiprocessing as mp
import time
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import re

base_url = 'https://yulizi123.github.io/'


def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)
    return response.read().decode()


def parse(html):
    soup = BeautifulSoup(html, features='html.parser')
    urls = soup.find_all('a', {'href': re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = set([urljoin(base_url, url['href']) for url in urls])
    url = soup.find('meta', {'property': 'og:url'})['content']
    return title, page_urls, url


unseen = set([base_url, ])
seen = set()

count, t1 = 1, time.time()

while len(unseen) != 0:  # still get some url to visit
    if len(seen) > 20:
        break
    print('\nDistributed Crawling...')
    htmls = [crawl(url) for url in unseen]

    print('\nDistributed Parsing...')
    results = [parse(html) for html in htmls]

    print('\nAnalysing...')
    seen.update(unseen)  # seen the crawled
    unseen.clear()       # nothing unseen

    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen.update(page_urls - seen)  # get new url to crawl

print('Total time: %.1f s' % (time.time() - t1,))  # 53 s
As it stands, this is an ordinary single-process crawler. Because of various problems with the multiprocessing module, the parallel version could not be debugged yet; it will be revised once the Python multiprocessing lessons are finished. A rough sketch of what it might look like is given below.
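For reference, here is a minimal sketch of the multiprocess variant, assuming the same crawl() and parse() functions and base_url defined above. The pool size of 4 and the apply_async/get pattern are illustrative assumptions, not the tutorial's final code.

import multiprocessing as mp

if __name__ == '__main__':
    # The pool must be created under the __main__ guard so child
    # processes can import this module safely (required on Windows).
    pool = mp.Pool(4)  # assumption: 4 worker processes
    unseen = set([base_url, ])
    seen = set()
    while len(unseen) != 0:
        if len(seen) > 20:
            break
        # Dispatch every URL to the pool; each crawl runs in its own process.
        crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
        htmls = [j.get() for j in crawl_jobs]  # block until all pages arrive
        # Parsing is also CPU work, so it is farmed out the same way.
        parse_jobs = [pool.apply_async(parse, args=(h,)) for h in htmls]
        results = [j.get() for j in parse_jobs]
        seen.update(unseen)
        unseen.clear()
        for title, page_urls, url in results:
            unseen.update(page_urls - seen)
    pool.close()
    pool.join()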
Asynchronous loading: Asyncio
The idea is to use a single-threaded program to control every step of the crawler: while one request is waiting on the network, the event loop switches to another task instead of blocking.
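A minimal sketch of that idea, assuming the third-party aiohttp package (pip install aiohttp) for non-blocking HTTP; fetching base_url twice is only a placeholder workload.

import asyncio
import aiohttp

base_url = 'https://yulizi123.github.io/'

async def fetch(session, url):
    # While this request waits on the network, the event loop is free
    # to run other fetch() tasks in the same single thread.
    async with session.get(url) as resp:
        return await resp.text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        # gather() schedules all fetches concurrently on one event loop
        return await asyncio.gather(*(fetch(session, u) for u in urls))

if __name__ == '__main__':
    htmls = asyncio.run(main([base_url] * 2))  # asyncio.run needs Python 3.7+
    print([len(h) for h in htmls])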
