前言
asyncio函数:
异步IO采用消息循环的模式,重复“读取消息—处理消息”的过程。也就是说,异步IO模型需要一个消息循环,在消息循环中,主线程不断地重复“读取消息—处理消息”这一过程。
event_loop 事件循环:
程序开启一个无限的循环,程序员会把一些函数注册到事件循环上。当满足事件发生的时候,调用相应的协程函数。
coroutine 协程:
协程对象,指一个使用async关键字定义的函数,它的调用不会立即执行函数,而是会返回一个协程对象。协程对象需要注册到事件循环,由事件循环调用。
task 任务:
一个协程对象就是一个原生可以挂起的函数,任务则是对协程进一步封装,其中包含任务的各种状态。
async/await 关键字: 用于定义协程的关键字,async定义一个协程,await用于挂起阻塞的异步调用接口。
参考案例
https://cloud.tencent.com/developer/article/1907288?from=article.detail.1949693
https://cloud.tencent.com/developer/article/1738086?from=article.detail.1949693
https://cloud.tencent.com/developer/article/1785143?from=article.detail.1949693
安装
pip install aiohttp
(asyncio 自 Python 3.4 起已包含在标准库中,无需通过 pip 安装;需要单独安装的是第三方库 aiohttp。)
案例演示
from aiohttp import ClientSession
import aiohttp, time
import asyncio
async def main(url):
    """Request ``url`` once and print timing information.

    Prints the start timestamp, the timestamp taken once the response
    object is available, and the difference between the two.
    """
    started_at = time.time()
    print("启动时间: %s" % started_at)
    # The session and the response are both released when this combined
    # ``async with`` statement exits.
    async with aiohttp.ClientSession() as session, session.get(url=url) as resp:
        finished_at = time.time()
        print("结束时间: %s" % finished_at)
        print('相差时间 cost: %.6f' % (finished_at - started_at))


if __name__ == '__main__':
    # Script entry point: drive the coroutine to completion.
    url = "http://www.baidu.com"
    asyncio.run(main(url=url))
""" 打印
启动时间: 1651936215.269375
结束时间: 1651936215.3499389
相差时间 cost: 0.080564
"""
from aiohttp import ClientSession
import aiohttp, time
import asyncio
# Module-level list that run() fills with the scheduled tasks.
tasks = []
# URL template; run() substitutes the request index for the "{}" placeholder.
url = "https://www.baidu.com/{}"
async def main(url):
    """Request ``url`` and print the elapsed time followed by the URL."""
    begin = time.time()
    async with ClientSession() as session, session.get(url) as response:
        # The body is deliberately not read; the clock stops as soon as the
        # response object is available.
        elapsed = time.time() - begin
        print(f'相差时间 cost: {elapsed} {url}')
def run():
    """Schedule five ``main`` coroutines and collect them in ``tasks``.

    Each coroutine receives the module-level ``url`` template formatted
    with its index (0-4).
    """
    tasks.extend(
        asyncio.ensure_future(main(url.format(index))) for index in range(5)
    )
async def _run_all():
    """Schedule the requests from inside the running loop and wait for them.

    Calling run() here means asyncio.ensure_future() binds the tasks to the
    loop that is actually running, instead of relying on the deprecated
    implicit "current loop" lookup.
    """
    run()
    await asyncio.wait(tasks)


if __name__ == '__main__':
    # asyncio.run() creates the event loop, runs the coroutine and closes the
    # loop afterwards; asyncio.get_event_loop() without a running loop is
    # deprecated and no longer creates a loop on recent Python versions.
    asyncio.run(_run_all())
""" 打印
相差时间 cost: 0.1047825813293457 https://www.baidu.com/1
相差时间 cost: 0.1047821044921875 https://www.baidu.com/2
相差时间 cost: 0.2055821418762207 https://www.baidu.com/0
相差时间 cost: 0.11085987091064453 https://www.baidu.com/4
相差时间 cost: 0.11085987091064453 https://www.baidu.com/3
进程已结束,退出代码为 0
"""
threading多线程+aiohttp+多循环
下面的代码结合了 threading 多线程与 aiohttp:每个线程拥有自己的事件循环,并按指定的次数循环发起请求。
# Excerpt: select the Windows selector event-loop policy, then create a dedicated loop for the new thread.
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # original note says this is done to avoid an error
new_loop = asyncio.new_event_loop()
th = threading.Thread(target=self.Start_task, args=(new_loop,data[0], data[1], data[2], index,), name=f'threadQt_index:{index}', )
self.ThreadList.append(th) # remember the thread in the list
th.setDaemon(False) # daemon flag is False, i.e. this is a non-daemon thread
th.start()
#========================================================= new_loop was handed over via the thread arguments above
new_loop.run_until_complete(self.async_BaiduStatistics(id, Referer_url))
#==========================================================
async with aiohttp.ClientSession() as seesion:
async with seesion.get(url=url, params=payload, headers=headers, timeout=1) as resp:
pass
在 for 循环中,使用 threading 为每个 URL 启动一个线程来调用 Start_task 函数;该函数会对这个 URL 按指定的次数循环执行请求。
class Runthread_batch(QtCore.QThread):
    """Qt thread that fans a batch of entries out to Python worker threads.

    Every entry of ``batchlist`` is a string of the form
    ``"<referer_url>----<type>----<id>"``.  For each entry a
    ``threading.Thread`` with its own asyncio event loop is started; the
    number of concurrently running workers is capped by ``self.Therade_num``.

    Signals:
        start_print(int, int, str): entry index, round number, referer URL.
        start_print_starus(): emitted once after all workers were joined.
        start_print_api(dict): declared here, not emitted in this class.
    """
    start_print = pyqtSignal(int, int, str)
    start_print_starus = pyqtSignal()
    start_print_api = pyqtSignal(dict)

    def __init__(self, batchlist, dataObj):
        super(Runthread_batch, self).__init__()
        # run() iterates self.batchlist, so the argument must be kept.
        self.batchlist = batchlist
        # NOTE(review): self.Therade_num, self.For_num and self.requestRate
        # are read below but never assigned in this excerpt -- presumably
        # they are derived from dataObj; confirm against the full source.
        self.ThreadList = []  # worker threads, joined at the end of run()
        self.start_num = 0    # number of workers currently running

    def __del__(self):
        print("del 当程序关闭了会触发这个")

    def run(self):
        """Start one worker per batch entry, then join all of them."""
        for index, item in enumerate(self.batchlist):
            # stop_status_batch is a flag defined outside this class.
            if stop_status_batch == 1:
                break
            data = item.split("----")
            # Select the Windows selector loop policy before creating the
            # loop (the original note says this avoids an error).
            asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
            new_loop = asyncio.new_event_loop()
            th = threading.Thread(
                target=self.Start_task,
                args=(new_loop, data[0], data[1], data[2], index),
                name=f'threadQt_index:{index}',
            )
            self.ThreadList.append(th)
            th.daemon = False  # non-daemon; Thread.setDaemon() is deprecated
            th.start()
            self.start_num += 1
            # Throttle: poll until a worker slot is free before moving on to
            # the next entry.
            while self.start_num >= self.Therade_num:
                self.sleep(2)
        for th in self.ThreadList:
            print("销毁_ 投递任务线程:: ", th)
            th.join()
        self.ThreadList = []
        self.start_print_starus.emit()  # tell the slot the whole job is done

    def Start_task(self, new_loop, Referer_url, type, id, num):
        """Worker body: run ``self.For_num`` rounds for one entry.

        Args:
            new_loop: event loop created for this worker; it is closed when
                the worker finishes.
            Referer_url: first field of the batch entry.
            type: second field of the batch entry (only printed here).
            id: third field of the batch entry.
            num: index of the entry, used in progress signals.
        """
        print(f"投递的任务: 索引:{str(num)} {Referer_url} {id} {type}")
        try:
            for i in range(1, self.For_num + 1):
                if stop_status_batch == 1:
                    break
                new_loop.run_until_complete(self.async_BaiduStatistics(id, Referer_url))
                # Report every round below 10, afterwards every 10th round.
                if i < 10 or i % 10 == 0:
                    self.start_print.emit(num, i, Referer_url)
            print("Start_task当前线程数量:", len(threading.enumerate()))
        except Exception as e:
            print("执行任务出错了", e)
        finally:
            # Always free the slot, even on failure; otherwise run() would
            # wait forever for a slot that is never released.
            self.start_num -= 1
            new_loop.close()

    async def async_BaiduStatistics(self, si, Referer_url):
        """Send ``self.requestRate`` GET requests, logging any failure.

        ``si`` is accepted but not used by the visible code.
        NOTE(review): ``ua``, ``url`` and ``payload`` are not defined in this
        excerpt -- presumably module-level names; confirm.
        """
        request_timeout = aiohttp.ClientTimeout(total=1)
        requestCounter = 0
        while requestCounter < self.requestRate:
            try:
                headers = {'Referer': Referer_url, 'User-Agent': ua.random}
                async with aiohttp.ClientSession() as session:
                    async with session.get(url=url, params=payload, headers=headers,
                                           timeout=request_timeout) as resp:
                        # The response body is not needed; await resp.text()
                        # here if it ever is.
                        pass
            except Exception as e:
                print("执行投递错误:", e)
            requestCounter += 1
threading单线程+aiohttp+多循环
# Initialization
self.new_loop = asyncio.new_event_loop()
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # original note says this is done to avoid an error
# Called from run(): drive the coroutine on the loop created above
self.new_loop.run_until_complete(self.async_BaiduStatistics(self.id, self.url))
# The function that performs the network request uses an asynchronous request (original note: roughly 0.2/s)
async with aiohttp.ClientSession() as seesion:
async with seesion.get(url=url, params=payload, headers=headers, timeout=1) as resp:
pass
# 单个网址刷 ok 0429
class Runthread_specified(QtCore.QThread):
    """Qt thread that repeatedly runs the request coroutine for one URL.

    The configuration comes from the ``specified`` dict passed to the
    constructor.  All rounds run sequentially on one event loop owned by
    this object.

    Signals:
        start_printSP_Ui(int, int, str): ``locationX``, round number, URL.
        start_print_starus(): emitted once when run() is finished.
    """
    start_printSP_Ui = pyqtSignal(int, int, str)
    start_print_starus = pyqtSignal()

    def __init__(self, specified):
        super(Runthread_specified, self).__init__()
        self.ThreadList = []
        self.Radio_start = specified["Radio_start"]
        self.api_url = specified["api_url"]
        self.requestRate = int(specified["then_mun"])  # requests per round
        self.for_num = int(specified["for_num"])       # number of rounds
        self.id = specified["id"]
        self.url = specified["url"]
        self.type = specified["type"]
        self.UA = specified["UA"]
        self.keyData = specified["key"]
        self.locationX = specified["X"]
        self.start_num = 0  # rounds started so far
        # The policy must be installed before the loop is created, otherwise
        # it has no effect on this loop.
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
        self.new_loop = asyncio.new_event_loop()

    def run(self):
        """Run ``self.for_num`` rounds and emit progress signals."""
        # Rounds are numbered 1..for_num, so exactly for_num rounds are run.
        for cnum in range(1, self.for_num + 1):
            # stop_status_specified is a flag defined outside this class.
            if stop_status_specified == 1:
                break
            self.start_num += 1
            try:
                self.new_loop.run_until_complete(self.async_BaiduStatistics(self.id, self.url))
            except Exception as e:
                print("eeee", e)
            # Report every round below 10, afterwards every 10th round.
            if cnum < 10 or cnum % 10 == 0:
                self.start_printSP_Ui.emit(self.locationX, cnum, self.url)
        self.start_print_starus.emit()

    async def async_BaiduStatistics(self, si, Referer_url):
        """Send ``self.requestRate`` GET requests, logging any failure.

        ``si`` is accepted but not used by the visible code.  For each
        request one line of ``self.keyData`` is picked at random (or the
        whole value when it has a single line) and stored in ``self.key``.
        """
        request_timeout = aiohttp.ClientTimeout(total=1)
        for _ in range(self.requestRate):
            try:
                ua = UserAgent()
                if "\n" in self.keyData:
                    self.key = random.choice(self.keyData.split("\n"))
                else:
                    self.key = self.keyData
                # NOTE(review): the quoted key is not used by the visible
                # code (payload is empty) -- confirm whether it should be
                # part of the query parameters.
                text = urllib.parse.quote(self.key)
                url = "https://hm.baidu.com/hm.gif?"
                payload = {}
                headers = {'Referer': Referer_url, 'User-Agent': ua.random}
                async with aiohttp.ClientSession() as session:
                    async with session.get(url=url, params=payload, headers=headers,
                                           timeout=request_timeout) as resp:
                        pass
            except Exception as e:
                print("执行投递错误:", e)
============
aiohttp各种参数设置案例
aiohttp 业务核心功能
发起 get 请求
# -*- encoding: utf-8 -*-
import asyncio
import aiohttp
async def main():
    """Send a GET request; print the status and the first 100 characters."""
    async with aiohttp.ClientSession() as session, \
            session.get('http://www.baidu.com') as resp:
        print(resp.status)
        body = await resp.text()
        print(body[:100])
if __name__ == '__main__':
    # Python 3.7+: asyncio.run() creates the event loop, runs main() and
    # closes the loop.  Calling asyncio.get_event_loop() when no loop is
    # running is deprecated on newer interpreters, so the manual loop
    # handling is kept only as a fallback.
    if hasattr(asyncio, 'run'):
        result = asyncio.run(main())
    else:
        # Python 3.6 and earlier.
        event_loop = asyncio.get_event_loop()
        try:
            result = event_loop.run_until_complete(asyncio.gather(main()))
        finally:
            event_loop.close()
发起post 请求
# -*- encoding: utf-8 -*-
import asyncio
import aiohttp
async def post_v1():
    """POST through the ``data=`` argument and print the status code.

    Three payload shapes are shown; the form dict is the one that is sent.
    """
    raw_bytes = b'\x00Binary-data\x00'  # unencoded data uploaded as bytes
    plain_text = 'text'                 # text payload
    form_fields = {'key': 'value'}      # form fields
    data = form_fields
    async with aiohttp.ClientSession() as sess, \
            sess.post('http://httpbin.org/post', data=data) as resp:
        print(resp.status)
# A more complex POST request
async def post_v2():
    """POST a dict through the ``json=`` argument and print the status code."""
    json_body = {'key': 'value'}
    async with aiohttp.ClientSession() as sess, \
            sess.post('http://httpbin.org/post', json=json_body) as resp:
        print(resp.status)
async def _run_post_examples():
    """Run both POST examples defined in this snippet concurrently."""
    await asyncio.gather(post_v1(), post_v2())


if __name__ == '__main__':
    # This snippet defines post_v1()/post_v2(); there is no main() here, so
    # the examples are started through a small wrapper.  asyncio.run()
    # creates and closes the event loop itself.
    result = asyncio.run(_run_post_examples())
向 url 中传递参数
有些场景需要拼接请求 url,这时可以参考本例来处理。
# -*- encoding: utf-8 -*-
import asyncio
import aiohttp
async def main():
    """Send a GET request with query parameters and print the status code.

    Three accepted shapes for ``params`` are shown; the last assignment
    (the pre-encoded string) is the one that is sent.
    """
    as_mapping = {'key1': 'value1', 'key2': 'value2'}
    as_pairs = [('key', 'value1'), ('key', 'value2')]
    as_string = 'key=value+1'
    params = as_string
    async with aiohttp.ClientSession() as sess, \
            sess.get('http://httpbin.org/get', params=params) as resp:
        print(resp.status)
if __name__ == '__main__':
    # asyncio.run() creates the event loop, runs main() and closes the loop;
    # asyncio.get_event_loop() without a running loop is deprecated.
    result = asyncio.run(main())
向目标服务器上传文件
有时候,我们确实有向服务器传文件的需求,例如:上传回执单、上传图片……在 100 张、10000 张的量级时我们会想用多线程去处理,但量再大一些,再使用“多线程 + requests”的方式就会出现大量报错。若有类似的使用场景,可以参考以下示例处理。
import aiohttp
async def main():
    """Upload ``report.xls`` under the form field ``file`` and print the reply.

    The file is opened with a context manager so the handle is closed once
    the request has finished, even if the request fails.
    """
    with open('report.xls', 'rb') as report:
        files = {'file': report}
        async with aiohttp.ClientSession() as sess:
            async with sess.post('http://httpbin.org/post', data=files) as resp:
                print(resp.status)
                print(await resp.text())
async def main2():
    """Upload ``report.xls`` through ``aiohttp.FormData``.

    ``FormData.add_field`` lets the caller set the file name and content
    type explicitly.  The file is opened with a context manager so the
    handle is closed once the request has finished.
    """
    with open('report.xls', 'rb') as report:
        data = aiohttp.FormData()
        data.add_field('file',
                       report,
                       filename='report.xls',
                       content_type='application/vnd.ms-excel')
        async with aiohttp.ClientSession() as sess:
            async with sess.post('http://httpbin.org/post', data=data) as resp:
                print(resp.status)
                print(await resp.text())
async def main3():
    """Stream ``report.xls`` as the request body and print the reply."""
    async with aiohttp.ClientSession() as client:
        with open('report.xls', 'rb') as report:
            request = client.post('http://httpbin.org/post', data=report)
            async with request as resp:
                print(resp.status)
                reply = await resp.text()
                print(reply)
async def main4():
    """Chain a GET into a POST by passing the response stream as the body.

    ``resp.content`` is a StreamReader (it provides the async-iterator
    protocol), so it can be handed to ``post`` directly (Python 3.6+).
    """
    async with aiohttp.ClientSession() as sess, \
            sess.get('http://python.org') as resp, \
            sess.post('http://httpbin.org/post', data=resp.content) as r:
        print(r.status)
        print(await r.text())
设置请求超时
有时候,我们向服务器发送请求时,若没有设置合适的超时时间,请求可能会长时间阻塞(aiohttp 默认的总超时时间为 5 分钟),这对于我们的系统往往是无法容忍的,所以发请求的时候千万要记得设置合适的超时时间。
import aiohttp
# Session-wide timeout configuration used by main().
timeout = aiohttp.ClientTimeout(total=60)


async def main():
    """GET httpbin with the module-level ``timeout`` applied to the session."""
    async with aiohttp.ClientSession(timeout=timeout) as sess, \
            sess.get('http://httpbin.org/get') as resp:
        print(resp.status)
        body = await resp.text()
        print(body)
aiohttp 爬虫核心功能
自定义cookie
import aiohttp
import asyncio
# Cookies attached to every request made through the session in main().
cookies = {'cookies_are': 'working'}


async def main():
    """Send the custom cookies to httpbin and check that they are echoed."""
    async with aiohttp.ClientSession(cookies=cookies) as session, \
            session.get('http://httpbin.org/cookies') as resp:
        print(resp.status)
        print(await resp.text())
        echoed = await resp.json()
        assert echoed == {"cookies": {"cookies_are": "working"}}
if __name__ == "__main__":
    # asyncio.run() creates the event loop, runs main() and closes the loop;
    # asyncio.get_event_loop() without a running loop is deprecated.
    result = asyncio.run(main())
在多个请求之间共享cookie
import aiohttp
import asyncio
async def main():
    """Show that cookies set by one request are reused by the next one.

    The first request asks httpbin to set ``my_cookie``; the cookie is then
    looked up in the session's cookie jar and checked again in the echo
    returned by a second request made through the same session.
    """
    async with aiohttp.ClientSession() as session:
        await session.get('http://httpbin.org/cookies/set?my_cookie=my_value')
        filtered = session.cookie_jar.filter_cookies('http://httpbin.org')
        print(filtered)
        stored_cookie = filtered['my_cookie']
        assert stored_cookie.value == 'my_value'
        async with session.get('http://httpbin.org/cookies') as r:
            json_body = await r.json()
            print(json_body)
            echoed_cookies = json_body['cookies']
            assert echoed_cookies['my_cookie'] == 'my_value'
if __name__ == "__main__":
    # asyncio.run() creates the event loop, runs main() and closes the loop;
    # asyncio.get_event_loop() without a running loop is deprecated.
    result = asyncio.run(main())
Cookie 的安全性问题: 默认情况下 ClientSession 使用的是严格模式的 aiohttp.CookieJar。RFC 2109 明确禁止接受来自以 IP 地址(而非 DNS 域名)访问的 URL 的 cookie(例如 http://127.0.0.1:80/cookie),只接受通过域名访问时产生的 cookie。
如果需要接受这类 cookie,可以通过设置 aiohttp.CookieJar 的 unsafe=True 来配置:
# Build a cookie jar with unsafe=True and hand it to the session.
jar = aiohttp.CookieJar(unsafe=True)
session = aiohttp.ClientSession(cookie_jar=jar)
使用虚假Cookie Jar: 有时不想处理cookie。这时可以在会话中使用aiohttp.DummyCookieJar来达到目的。
# Use a DummyCookieJar for a session that should not process cookies.
jar = aiohttp.DummyCookieJar()
session = aiohttp.ClientSession(cookie_jar=jar)
自定义请求头
import aiohttp
import asyncio
async def main():
    """Send a request with session-wide custom headers and print the reply.

    ``async with`` is only valid inside a coroutine, so the example is
    wrapped in ``main()``.  The header dict needs a comma between its two
    entries, and the HTTP header name is ``Referer``.
    """
    custom_headers = {
        'User-Agent': 'your agent',
        'Referer': 'http://httpbin.org',
    }
    async with aiohttp.ClientSession(headers=custom_headers) as session:
        async with session.get('http://httpbin.org/headers') as resp:
            print(resp.status)
            print(await resp.text())


if __name__ == '__main__':
    asyncio.run(main())
SSL验证警告问题
默认情况下,aiohttp 对 HTTPS 协议使用严格的证书校验;如果你不想校验 SSL 证书,可将 ssl 设置为 False。
# ssl=False relaxes the strict HTTPS certificate checks for this request.
r = await session.get('https://example.com', ssl=False)
代理问题AgentIP
# Option 1: pass the proxy URL and its credentials as separate arguments
async with aiohttp.ClientSession() as session:
proxy_auth = aiohttp.BasicAuth('user', 'pass')
async with session.get("http://python.org", proxy="http://proxy.com", proxy_auth=proxy_auth) as resp:
print(resp.status)
# Option 2: embed the credentials in the proxy URL
session.get("http://python.org", proxy="http://user:pass@some.proxy.com")
aiohttp 连接池
1.使用连接器
想要调整请求的传输层,你可以为 ClientSession 及其同类组件传递自定义的连接器。例如:
conn = aiohttp.TCPConnector()
session = aiohttp.ClientSession(connector=conn)
注:不要给多个会话对象使用同一个连接器,某一会话对象拥有其所有权。
2.限制连接池的容量
限制同一时间打开的连接数可以传递limit参数:conn = aiohttp.TCPConnector(limit=30)
这样就将总数限制在30,默认情况下是100.如果你不想有限制,传递0即可:conn = aiohttp.TCPConnector(limit=0)
小结:
爬虫常用的功能单独来写,主要是 aiohttp 还有一个问题没有解决,通过阅读源码确实是无法很好解决这个问题,在网上搜索了大半天基本没有有效的解决方案,so 笔者会给出一个自己找到的解决方案,在接下来的文章中我会进行分享。