基本框架代码
基本原理是同时调用电脑的多个核心,并行地抓取和下载网页。
import multiprocessing as mp
import time
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import re
base_url='https://yulizi123.github.io/'  # site root; relative hrefs found by parse() are resolved against this
def crawl(url):
    """Fetch *url* and return its body decoded as text.

    Parameters
    ----------
    url : str
        Absolute URL of the page to download.

    Returns
    -------
    str
        The decoded HTML of the page.
    """
    # Use a context manager so the HTTP response is always closed
    # (the original leaked the connection by never calling close()).
    with urlopen(url) as response:
        time.sleep(0.1)  # artificial delay kept from the original timing experiment
        return response.read().decode()
def parse(html):
    """Extract the title, internal links, and canonical URL from a page.

    Parameters
    ----------
    html : str
        Raw HTML of a page on the crawled site.

    Returns
    -------
    tuple
        (title, page_urls, canonical_url) where ``title`` is the text of
        the first <h1>, ``page_urls`` is a set of absolute URLs built from
        internal relative links, and ``canonical_url`` comes from the
        page's ``og:url`` meta tag.
    """
    soup = BeautifulSoup(html, features='html.parser')
    # Internal links look like "/section/page/": relative, with a trailing slash.
    # Raw string for the regex avoids any escape-sequence surprises.
    anchors = soup.find_all('a', {'href': re.compile(r'^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    # Set comprehension instead of set([...]); renamed loop variable so it
    # no longer collides with the canonical URL extracted below.
    page_urls = {urljoin(base_url, a['href']) for a in anchors}
    canonical_url = soup.find('meta', {'property': 'og:url'})['content']
    return title, page_urls, canonical_url
# Single-process reference run: breadth-first crawl, capped at ~20 pages.
unseen = {base_url}  # frontier: discovered but not yet fetched (set literal, not set([...]))
seen = set()         # URLs already crawled
count, t1 = 1, time.time()
while unseen:        # truthiness check instead of len(...) != 0
    if len(seen) > 20:  # hard cap so the demo terminates quickly
        break
    print('\nDistributed Crawling...')
    htmls = [crawl(url) for url in unseen]
    print('\nDistributed Parsing...')
    results = [parse(html) for html in htmls]
    print('\nAnalysing...')
    seen.update(unseen)  # the whole batch has now been crawled
    unseen.clear()       # frontier is rebuilt from this batch's results
    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen.update(page_urls - seen)  # enqueue only genuinely new URLs
print('Total time: %.1f s' % (time.time() - t1,))  # 53 s
当前为一个正常的单进程爬虫程序;多进程爬虫由于模块的种种问题暂时无法调试,待学习完 Python 的多进程之后再来进行修改。
异步加载Asyncio
利用一个单线程的程序来控制爬虫的每一步。