前言
asyncio函数:
异步IO采用消息循环的模式,重复“读取消息—处理消息”的过程,也就是说异步IO模型需要一个消息循环,在消息循环中,主线程不断地重复“读取消息-处理消息”这一过程。
event_loop 事件循环:
程序开启一个无限的循环,程序员会把一些函数注册到事件循环上。当满足事件发生的时候,调用相应的协程函数。
coroutine 协程:
协程对象,指一个使用async关键字定义的函数,它的调用不会立即执行函数,而是会返回一个协程对象。协程对象需要注册到事件循环,由事件循环调用。
task 任务:
一个协程对象就是一个原生可以挂起的函数,任务则是对协程进一步封装,其中包含任务的各种状态。
async/await 关键字: 用于定义协程的关键字,async定义一个协程,await用于挂起阻塞的异步调用接口。
参考案例
https://cloud.tencent.com/developer/article/1907288?from=article.detail.1949693
https://cloud.tencent.com/developer/article/1738086?from=article.detail.1949693
https://cloud.tencent.com/developer/article/1785143?from=article.detail.1949693
安装
pip install aiohttp
(asyncio 自 Python 3.4 起已内置于标准库,无需通过 pip 安装;下面的案例需要额外安装的是 aiohttp。)
案例演示
import asyncio
import time

import aiohttp
from aiohttp import ClientSession


async def main(url):
    """Send one GET request to ``url`` and print the start time, end time and difference."""
    started = time.time()
    print("启动时间: %s" % started)
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url) as resp:
            # The end time is taken as soon as the response object is available.
            finished = time.time()
            print("结束时间: %s" % finished)
            print('相差时间 cost: %.6f' % (finished - started))


if __name__ == '__main__':
    url = "http://www.baidu.com"
    asyncio.run(main(url=url))

""" 打印
启动时间: 1651936215.269375
结束时间: 1651936215.3499389
相差时间 cost: 0.080564
"""
import asyncio
import time

import aiohttp
from aiohttp import ClientSession

# Tasks scheduled by run(); the __main__ block waits on all of them.
tasks = []
url = "https://www.baidu.com/{}"


async def main(url):
    """Send one GET request to ``url`` and print how long it took to get a response."""
    t1 = time.time()
    async with ClientSession() as session:
        async with session.get(url) as response:
            # The body is not read here; add ``await response.read()`` if it is needed.
            t2 = time.time()
            print(f'相差时间 cost: {t2 - t1} {url}')


def run(loop=None):
    """Schedule five ``main`` coroutines and append the tasks to ``tasks``.

    Args:
        loop: Event loop to schedule on. Defaults to the current event loop,
            which keeps the original zero-argument call working.
    """
    if loop is None:
        loop = asyncio.get_event_loop()
    for i in range(5):
        tasks.append(loop.create_task(main(url.format(i))))


if __name__ == '__main__':
    # Create the loop explicitly: relying on get_event_loop()/ensure_future()
    # to create one implicitly is deprecated in current Python versions.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        run(loop)
        loop.run_until_complete(asyncio.wait(tasks))
    finally:
        loop.close()

""" 打印
相差时间 cost: 0.1047825813293457 https://www.baidu.com/1
相差时间 cost: 0.1047821044921875 https://www.baidu.com/2
相差时间 cost: 0.2055821418762207 https://www.baidu.com/0
相差时间 cost: 0.11085987091064453 https://www.baidu.com/4
相差时间 cost: 0.11085987091064453 https://www.baidu.com/3
进程已结束,退出代码为 0
"""
threading多线程+aiohttp+多循环
代码 结合了 threading + aiohttp 多线程 + 多循环次数
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # 捕获报错new_loop = asyncio.new_event_loop()th = threading.Thread(target=self.Start_task, args=(new_loop,data[0], data[1], data[2], index,), name=f'threadQt_index:{index}', )self.ThreadList.append(th) # 加入队列th.setDaemon(False) #守护线程th.start()#=========================================================上面的吧new_loop 带进来了new_loop.run_until_complete(self.async_BaiduStatistics(id, Referer_url))#==========================================================async with aiohttp.ClientSession() as seesion:async with seesion.get(url=url, params=payload, headers=headers, timeout=1) as resp:pass
在 for 循环中使用 threading 启动了一个线程,调用了 Start_task 这个函数,这个函数对每个 URL 循环执行指定次数
class Runthread_batch(QtCore.QThread):
    """QThread that starts one worker thread per entry of ``batchlist``.

    Each worker owns its own asyncio event loop and repeatedly runs
    ``async_BaiduStatistics`` on it. ``start_num`` counts the workers that are
    still running and is used to throttle how many run at once.
    """

    start_print = pyqtSignal(int, int, str)
    start_print_starus = pyqtSignal()
    start_print_api = pyqtSignal(dict)

    def __init__(self, batchlist, dataObj):
        super(Runthread_batch, self).__init__()
        # run() iterates self.batchlist, so the argument has to be stored.
        self.batchlist = batchlist
        self.ThreadList = []  # worker threads, joined at the end of run()
        self.start_num = 0  # number of workers currently running
        # start_num is changed from run() and from every worker thread.
        self._start_num_lock = threading.Lock()
        # NOTE(review): run()/Start_task/async_BaiduStatistics also read
        # self.Therade_num, self.For_num and self.requestRate, which are not
        # assigned anywhere in the visible code -- presumably they come from
        # dataObj; confirm against the full source.

    def __del__(self):
        print("del 当程序关闭了会触发这个")

    def run(self):
        """Start one worker per batch entry, throttle them, then join them all."""
        for index, item in enumerate(self.batchlist):
            if stop_status_batch == 1:
                break
            data = item.split("----")

            # The Windows-only policy class does not exist on other platforms.
            policy_cls = getattr(asyncio, "WindowsSelectorEventLoopPolicy", None)
            if policy_cls is not None:
                asyncio.set_event_loop_policy(policy_cls())
            new_loop = asyncio.new_event_loop()

            th = threading.Thread(
                target=self.Start_task,
                args=(new_loop, data[0], data[1], data[2], index,),
                name=f'threadQt_index:{index}',
            )
            self.ThreadList.append(th)
            th.daemon = False  # setDaemon() is deprecated
            # Count the worker before it starts so its own decrement can never
            # run ahead of this increment.
            with self._start_num_lock:
                self.start_num += 1
            th.start()

            # Throttle: wait until a running worker finishes before starting
            # the next entry.
            while self.start_num >= self.Therade_num:
                self.sleep(2)

        for th in self.ThreadList:
            print("销毁_ 投递任务线程:: ", th)
            th.join()
        self.ThreadList = []
        self.start_print_starus.emit()  # signal that the whole job is finished

    def Start_task(self, new_loop, Referer_url, type, id, num):
        """Worker body: run ``async_BaiduStatistics`` ``For_num`` times on ``new_loop``.

        Emits ``start_print`` for each of the first nine rounds and then for
        every tenth round. ``new_loop`` is closed when the worker ends.
        """
        print(f"投递的任务: 索引:{str(num)} {Referer_url} {id} {type}")
        try:
            for i in range(1, self.For_num + 1):
                if stop_status_batch == 1:
                    break
                new_loop.run_until_complete(self.async_BaiduStatistics(id, Referer_url))
                if i < 10 or i % 10 == 0:
                    self.start_print.emit(num, i, Referer_url)
        except Exception as e:
            print("执行任务出错了", e)
        else:
            print("Start_task当前线程数量:", len(threading.enumerate()))
        finally:
            # Always release the slot: if this only happened on success, one
            # failing worker would leave run() waiting in its throttle loop.
            with self._start_num_lock:
                self.start_num -= 1
            new_loop.close()

    async def async_BaiduStatistics(self, si, Referer_url):
        """Send ``requestRate`` GET requests through aiohttp; errors are printed.

        NOTE(review): ``url``, ``payload`` and ``ua`` are not defined in this
        class; presumably module-level names -- confirm.
        """
        # A numeric ``timeout=1`` is deprecated in aiohttp; ClientTimeout is
        # the supported spelling of the same one-second total timeout.
        timeout = aiohttp.ClientTimeout(total=1)
        requestCounter = 0
        while requestCounter < self.requestRate:
            try:
                headers = {
                    'Referer': Referer_url,
                    'User-Agent': ua.random,
                }
                async with aiohttp.ClientSession() as session:
                    async with session.get(url=url, params=payload, headers=headers, timeout=timeout) as resp:
                        # Use ``await resp.text()`` here if the body is needed.
                        pass
            except Exception as e:
                print("执行投递错误:", e)
            requestCounter += 1
threading单线程+aiohttp+多循环
# 初始化self.new_loop = asyncio.new_event_loop()asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # 捕获报错# 在 run 函数中调用 执行asyncioself.new_loop.run_until_complete(self.async_BaiduStatistics(self.id, self.url))# 在实际请求网络的函数代码中使用了 异步请求网络 0.2/s 左右async with aiohttp.ClientSession() as seesion:async with seesion.get(url=url, params=payload, headers=headers, timeout=1) as resp:pass
# Repeated requests for a single URL (original note: ok 0429)
class Runthread_specified(QtCore.QThread):
    """QThread that runs ``async_BaiduStatistics`` ``for_num`` times on one event loop.

    Progress is reported through ``start_printSP_Ui``; ``start_print_starus``
    is emitted once when the run ends.
    """

    start_printSP_Ui = pyqtSignal(int, int, str)
    start_print_starus = pyqtSignal()

    def __init__(self, specified):
        """Copy the settings out of the ``specified`` dict and create the event loop."""
        super(Runthread_specified, self).__init__()
        self.ThreadList = []
        self.Radio_start = specified["Radio_start"]
        self.api_url = specified["api_url"]
        self.requestRate = int(specified["then_mun"])  # requests per round
        self.for_num = int(specified["for_num"])  # number of rounds
        self.id = specified["id"]
        self.url = specified["url"]
        self.type = specified["type"]
        self.UA = specified["UA"]
        self.keyData = specified["key"]
        self.locationX = specified["X"]
        self.start_num = 0  # rounds started so far

        # The policy has to be installed BEFORE the loop is created, otherwise
        # the loop is still built by the previous policy. The class exists only
        # on Windows, so look it up defensively.
        policy_cls = getattr(asyncio, "WindowsSelectorEventLoopPolicy", None)
        if policy_cls is not None:
            asyncio.set_event_loop_policy(policy_cls())
        self.new_loop = asyncio.new_event_loop()

    def run(self):
        """Run ``for_num`` rounds, emitting progress after each of the first nine and every tenth."""
        # Rounds are counted 1..for_num, matching Runthread_batch.Start_task;
        # range(for_num + 1) ran one extra round and reported a count of 0.
        for cnum in range(1, self.for_num + 1):
            if stop_status_specified == 1:
                break
            self.start_num += 1
            try:
                self.new_loop.run_until_complete(self.async_BaiduStatistics(self.id, self.url))
            except Exception as e:
                print("eeee", e)
            if cnum < 10 or cnum % 10 == 0:
                self.start_printSP_Ui.emit(self.locationX, cnum, self.url)
        self.start_print_starus.emit()

    async def async_BaiduStatistics(self, si, Referer_url):
        """Send ``requestRate`` GET requests through aiohttp; errors are printed.

        ``self.key`` is set to a random line of ``self.keyData`` before each
        request.
        """
        url = "https://hm.baidu.com/hm.gif?"
        payload = {}
        # A numeric ``timeout=1`` is deprecated in aiohttp; ClientTimeout is
        # the supported spelling of the same one-second total timeout.
        timeout = aiohttp.ClientTimeout(total=1)
        ua = None
        requestCounter = 0
        while requestCounter < self.requestRate:
            try:
                if ua is None:
                    # Built once and reused; kept inside the try so a failure
                    # is reported the same way as a failed request.
                    ua = UserAgent()
                # split() returns a one-element list when there is no newline,
                # so this covers both the single-key and multi-key case.
                self.key = random.choice(self.keyData.split("\n"))
                headers = {
                    'Referer': Referer_url,
                    'User-Agent': ua.random,
                }
                async with aiohttp.ClientSession() as session:
                    async with session.get(url=url, params=payload, headers=headers, timeout=timeout) as resp:
                        pass
            except Exception as e:
                print("执行投递错误:", e)
            requestCounter += 1
============
aiohttp各种参数设置案例
aiohttp 业务核心功能
发起 get 请求
# -*- encoding: utf-8 -*-
import asyncio

import aiohttp


async def main():
    """GET http://www.baidu.com, then print the status and the first 100 characters."""
    async with aiohttp.ClientSession() as session:
        async with session.get('http://www.baidu.com') as response:
            print(response.status)
            body = await response.text()
            print(body[:100])


if __name__ == '__main__':
    # Note:
    # Python 3.7+ can simply use
    #     asyncio.run(main())
    # The form below is the Python 3.6-and-earlier spelling.
    event_loop = asyncio.get_event_loop()
    result = event_loop.run_until_complete(asyncio.gather(main()))
    event_loop.close()
发起post 请求
# -*- encoding: utf-8 -*-
import asyncio

import aiohttp


async def post_v1(data=None):
    """POST ``data`` to httpbin and print the response status.

    Args:
        data: Request body. Any of these forms is accepted by ``data=``:
            ``b'\\x00Binary-data\\x00'`` (raw bytes, sent unencoded),
            ``'text'`` (plain text) or ``{'key': 'value'}`` (form fields).
            Defaults to the form-field dict.
    """
    if data is None:
        data = {'key': 'value'}
    async with aiohttp.ClientSession() as sess:
        async with sess.post('http://httpbin.org/post', data=data) as resp:
            print(resp.status)


async def post_v2():
    """POST a JSON payload to httpbin and print the response status."""
    payload = {'key': 'value'}
    async with aiohttp.ClientSession() as sess:
        async with sess.post('http://httpbin.org/post', json=payload) as resp:
            print(resp.status)


if __name__ == '__main__':
    # This snippet defines no main(); run the two POST examples it does define.
    event_loop = asyncio.get_event_loop()
    result = event_loop.run_until_complete(asyncio.gather(post_v1(), post_v2()))
    event_loop.close()
向 url 中传递参数
有些场景是需要拼接请求url 在这个时候可以使用本 case 来做处理
# -*- encoding: utf-8 -*-
import asyncio

import aiohttp


async def main():
    """Show three equivalent ways to pass query parameters; the last one is sent."""
    # 1) mapping
    params = {'key1': 'value1', 'key2': 'value2'}
    # 2) list of pairs (allows a repeated key)
    params = [('key', 'value1'), ('key', 'value2')]
    # 3) ready-made query string
    params = 'key=value+1'
    async with aiohttp.ClientSession() as client:
        async with client.get('http://httpbin.org/get', params=params) as response:
            print(response.status)


if __name__ == '__main__':
    event_loop = asyncio.get_event_loop()
    result = event_loop.run_until_complete(asyncio.gather(main()))
    event_loop.close()
向目标服务器上传文件
有时候,我们确实是有向服务器传文件的需求,eg:上传回执单;上传图片…… 100张 10000张的量级的时候我们会想用多线程去处理,但量再大 你再使用 多线程+requests 的方式就会发现有大量的报错,若有类似的使用场景,可以用以下 case 处理
import aiohttp


async def main():
    """Upload report.xls as a multipart form field."""
    # Open the file in a ``with`` block so the handle is closed after the upload.
    with open('report.xls', 'rb') as fh:
        files = {'file': fh}
        async with aiohttp.ClientSession() as sess:
            async with sess.post('http://httpbin.org/post', data=files) as resp:
                print(resp.status)
                print(await resp.text())


async def main2():
    """Upload through FormData, which lets filename and content_type be set."""
    with open('report.xls', 'rb') as fh:
        data = aiohttp.FormData()
        data.add_field(
            'file',
            fh,
            filename='report.xls',
            content_type='application/vnd.ms-excel',
        )
        async with aiohttp.ClientSession() as sess:
            async with sess.post('http://httpbin.org/post', data=data) as resp:
                print(resp.status)
                print(await resp.text())


async def main3():
    """Stream the file as the request body."""
    async with aiohttp.ClientSession() as sess:
        with open('report.xls', 'rb') as f:
            async with sess.post('http://httpbin.org/post', data=f) as resp:
                print(resp.status)
                print(await resp.text())


async def main4():
    """Chain a GET into a POST by passing the GET response's ``content`` as the body.

    Per the original note, ``content`` is a StreamReader (async-iterator
    protocol) and this works on Python 3.6+.
    """
    async with aiohttp.ClientSession() as sess:
        async with sess.get('http://python.org') as resp:
            async with sess.post('http://httpbin.org/post', data=resp.content) as r:
                print(r.status)
                print(await r.text())
设置请求超时
有时候,我们向服务器发送请求,若没有设置超时时间,此请求就会一直阻塞直到系统报错,这对于我们的系统是无法容忍的,所以发请求的时候千万要记得加上超时时间。
import aiohttp

# Total time budget of 60 seconds, applied through the session.
timeout = aiohttp.ClientTimeout(total=60)


async def main():
    """GET httpbin with the session-level timeout and print status and body."""
    async with aiohttp.ClientSession(timeout=timeout) as client:
        async with client.get('http://httpbin.org/get') as response:
            print(response.status)
            print(await response.text())
aiohttp 爬虫核心功能
自定义cookie
import asyncio

import aiohttp

cookies = {'cookies_are': 'working'}


async def main():
    """Send the session-level cookies to httpbin and check that they are echoed back."""
    async with aiohttp.ClientSession(cookies=cookies) as session:
        async with session.get('http://httpbin.org/cookies') as response:
            print(response.status)
            print(await response.text())
            echoed = await response.json()
            assert echoed == {"cookies": {"cookies_are": "working"}}


if __name__ == "__main__":
    event_loop = asyncio.get_event_loop()
    result = event_loop.run_until_complete(asyncio.gather(main()))
    event_loop.close()
在多个请求之间共享cookie
import asyncio

import aiohttp


async def main():
    """Set a cookie with one request and check that the next request on the session sends it."""
    async with aiohttp.ClientSession() as session:
        # Use the response as a context manager so it is released; a bare
        # ``await session.get(...)`` leaves the response unreleased.
        async with session.get('http://httpbin.org/cookies/set?my_cookie=my_value'):
            pass
        filtered = session.cookie_jar.filter_cookies('http://httpbin.org')
        print(filtered)
        assert filtered['my_cookie'].value == 'my_value'
        async with session.get('http://httpbin.org/cookies') as r:
            json_body = await r.json()
            print(json_body)
            assert json_body['cookies']['my_cookie'] == 'my_value'


if __name__ == "__main__":
    event_loop = asyncio.get_event_loop()
    result = event_loop.run_until_complete(asyncio.gather(main()))
    event_loop.close()
Cookie 的安全性问题: 默认情况下 ClientSession 使用严格模式的 aiohttp.CookieJar。RFC 2109 明确禁止接受来自以 IP 地址(而非 DNS 域名)访问的 URL 所设置的 cookie,只接受通过 DNS 域名访问的站点设置的 cookie。
可以通过设置 aiohttp.CookieJar 的 unsafe=True 来配置
# unsafe=True is the switch the surrounding text describes for relaxing the
# strict cookie mode.
jar = aiohttp.CookieJar(
    unsafe=True,
)
session = aiohttp.ClientSession(
    cookie_jar=jar,
)
使用虚假Cookie Jar: 有时不想处理cookie。这时可以在会话中使用aiohttp.DummyCookieJar来达到目的。
# Use DummyCookieJar as the session's cookie jar when cookies should not be handled.
jar = aiohttp.DummyCookieJar()
session = aiohttp.ClientSession(
    cookie_jar=jar,
)
自定义请求头
import asyncio

import aiohttp


async def main():
    """Send session-level custom headers to httpbin and print what it echoes back."""
    # The original dict was missing the comma between its two entries (a
    # syntax error) and spelled the header "refer"; the HTTP header name is
    # "Referer".
    headers = {
        'User-Agent': 'your agent',
        'Referer': 'http://httpbin.org',
    }
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get('http://httpbin.org/headers') as resp:
            print(resp.status)
            print(await resp.text())


if __name__ == '__main__':
    # ``async with`` is only valid inside a coroutine, so the example is
    # wrapped in main() and run here.
    asyncio.run(main())
SSL验证警告问题
默认情况下,aiohttp对HTTPS协议使用严格检查,如果你不想验证SSL证书,可将ssl设置为False。
# Pass ssl=False on the request to turn off the strict HTTPS check described above.
r = await session.get('https://example.com', ssl=False)
代理问题AgentIP
# 第一种async with aiohttp.ClientSession() as session:proxy_auth = aiohttp.BasicAuth('user', 'pass')async with session.get("http://python.org", proxy="http://proxy.com", proxy_auth=proxy_auth) as resp:print(resp.status)# 第二种session.get("http://python.org", proxy="http://user:pass@some.proxy.com")
aiohttp 连接池
1.使用连接器
想要调整请求的传输层,你可以为ClientSession及其同类组件传递自定义的连接器。例如:
conn = aiohttp.TCPConnector()
session = aiohttp.ClientSession(connector=conn)
注:不要给多个会话对象使用同一个连接器,某一会话对象拥有其所有权。
2.限制连接池的容量
限制同一时间打开的连接数可以传递limit参数:conn = aiohttp.TCPConnector(limit=30)
这样就将总数限制在30,默认情况下是100.如果你不想有限制,传递0即可:conn = aiohttp.TCPConnector(limit=0)
小结:
爬虫常用的功能单独来写,主要是 aiohttp 还有一个问题没有解决,通过阅读源码确实是无法很好解决这个问题,在网上搜索了大半天基本没有有效的解决方案,so 笔者会给出一个自己找到的解决方案,在接下来的文章中我会进行分享。
