相关模块 - aiohttp 模块-异步网络请求 - 《Python 基础知识》

前言
参考案例
安装
案例演示
threading多线程+aiohttp+多循环
threading单线程+aiohttp+多循环
============
aiohttp各种参数设置案例

前言

asyncio函数：

异步IO采用消息循环的模式，重复“读取消息—处理消息”的过程。也就是说，“异步IO模型”需要一个消息循环，在消息循环中，主线程不断地重复“读取消息—处理消息”这一过程。

event_loop 事件循环：

程序开启一个无限的循环，程序员会把一些函数注册到事件循环上。当满足事件发生的时候，调用相应的协程函数。

coroutine 协程：

协程对象，指一个使用async关键字定义的函数，它的调用不会立即执行函数，而是会返回一个协程对象。协程对象需要注册到事件循环，由事件循环调用。

task 任务：

一个协程对象就是一个原生可以挂起的函数，任务则是对协程进一步封装，其中包含任务的各种状态。
async/await 关键字：用于定义协程的关键字，async定义一个协程，await用于挂起阻塞的异步调用接口。

参考案例

https://cloud.tencent.com/developer/article/1907288?from=article.detail.1949693
https://cloud.tencent.com/developer/article/1738086?from=article.detail.1949693
https://cloud.tencent.com/developer/article/1785143?from=article.detail.1949693

安装

pip install aiohttp

（asyncio 自 Python 3.4 起属于标准库，无需单独安装；需要安装的是 aiohttp。）

案例演示


from aiohttp import ClientSession
import aiohttp, time
import asyncio
async def main(url):
    """Request ``url`` once and print the start time, end time and elapsed seconds.

    The response body is never read; the end time is taken as soon as the
    response object is available.
    """
    started = time.time()
    print("启动时间: %s" % started)
    async with aiohttp.ClientSession() as session, session.get(url=url) as resp:
        finished = time.time()
        print("结束时间: %s" % finished)
        print('相差时间 cost: %.6f' % (finished - started))


if __name__ == '__main__':
    url = "http://www.baidu.com"
    asyncio.run(main(url=url))
""" 打印
启动时间: 1651936215.269375
结束时间: 1651936215.3499389
相差时间 cost: 0.080564
"""

from aiohttp import ClientSession
import aiohttp, time
import asyncio
tasks = []
url = "https://www.baidu.com/{}"
async def main(url):
    """Issue one GET to ``url`` on a fresh session and print the elapsed time.

    The response body is left unread (the read call is deliberately not made),
    so the measured time does not include downloading the body.
    """
    t1 = time.time()
    async with ClientSession() as session, session.get(url) as response:
        t2 = time.time()
        print(f'相差时间 cost: {t2 - t1} {url}')
def run():
    """Schedule five ``main`` coroutines (URL suffixes 0-4) and store them in ``tasks``."""
    tasks.extend(
        asyncio.ensure_future(main(url.format(index))) for index in range(5)
    )
if __name__ == '__main__':
    async def _run_all():
        """Create the tasks inside the running loop, then wait for all of them."""
        run()
        await asyncio.wait(tasks)

    # asyncio.run() owns the loop's whole lifecycle.  Calling
    # asyncio.get_event_loop() / ensure_future() with no running loop is
    # deprecated and raises RuntimeError on Python 3.14+.
    asyncio.run(_run_all())
""" 打印
相差时间 cost: 0.1047825813293457 https://www.baidu.com/1
相差时间 cost: 0.1047821044921875 https://www.baidu.com/2
相差时间 cost: 0.2055821418762207 https://www.baidu.com/0
相差时间 cost: 0.11085987091064453 https://www.baidu.com/4
相差时间 cost: 0.11085987091064453 https://www.baidu.com/3
进程已结束，退出代码为 0
"""

threading多线程+aiohttp+多循环

代码结合了 threading + aiohttp 多线程 + 多循环次数

asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) # select the Windows selector loop policy (original note: "catches the error")
new_loop = asyncio.new_event_loop()
th = threading.Thread(target=self.Start_task, args=(new_loop,data[0], data[1], data[2], index,), name=f'threadQt_index:{index}', )
self.ThreadList.append(th)  # keep a reference to the thread in the list
th.setDaemon(False) # daemon flag is False, i.e. a non-daemon thread
th.start()
#========================================================= the new_loop created above is passed in here
new_loop.run_until_complete(self.async_BaiduStatistics(id, Referer_url))
#==========================================================
async with aiohttp.ClientSession() as seesion:
      async with seesion.get(url=url, params=payload, headers=headers, timeout=1) as resp:
            pass

在 for 循环中使用 threading 启动一个线程来调用 Start_task 函数，该函数对每个 URL 循环执行指定的次数。


class Runthread_batch(QtCore.QThread):
    """Worker QThread that fans batch entries out to plain worker threads.

    Each item of ``batchlist`` is a ``"referer----type----id"`` string.  For
    every item a ``threading.Thread`` is started with its own asyncio event
    loop; the thread runs :meth:`Start_task`, which drives
    :meth:`async_BaiduStatistics` on that loop.

    Signals:
        start_print(int, int, str): entry index, iteration number, referer URL.
        start_print_starus(): emitted once :meth:`run` has finished.
        start_print_api(dict): declared here; not emitted by this class.

    The module-level flag ``stop_status_batch`` (defined outside this class)
    stops the work when it equals 1.
    """

    start_print = pyqtSignal(int, int, str)
    start_print_starus = pyqtSignal()
    start_print_api = pyqtSignal(dict)

    def __init__(self, batchlist, dataObj):
        super(Runthread_batch, self).__init__()
        # run() iterates self.batchlist, so the constructor argument is kept.
        self.batchlist = batchlist
        self.dataObj = dataObj
        # NOTE(review): run(), Start_task() and async_BaiduStatistics() also
        # read self.Therade_num, self.For_num and self.requestRate, which this
        # class never assigns — presumably they are derived from dataObj;
        # confirm and set them here.
        self.ThreadList = []  # worker threads started but not yet joined
        self.start_num = 0  # number of worker threads currently running
        # start_num is changed by this thread and by every worker thread.
        self._count_lock = threading.Lock()

    def __del__(self):
        print("del 当程序关闭了会触发这个")

    def run(self):
        """Start one worker thread per batch entry, throttled by ``Therade_num``.

        When the number of running workers reaches ``self.Therade_num`` the
        method polls every two seconds until a worker finishes and then joins
        every thread started so far.  All remaining workers are joined before
        ``start_print_starus`` is emitted.
        """
        # The policy is process-wide, so install it once instead of per item.
        # The class only exists on Windows; skip it elsewhere.
        if hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
            asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
        for index, item in enumerate(self.batchlist):
            if stop_status_batch == 1:
                break
            data = item.split("----")
            task_args = (data[0], data[1], data[2], index)
            # Every worker gets a private loop; Start_task closes it when done.
            new_loop = asyncio.new_event_loop()
            th = threading.Thread(
                target=self.Start_task,
                args=(new_loop,) + task_args,
                name=f'threadQt_index:{index}',
            )
            th.daemon = False  # Thread.setDaemon() is deprecated
            self.ThreadList.append(th)
            # Count the worker before it starts so it can never decrement first.
            with self._count_lock:
                self.start_num += 1
            th.start()
            if self.start_num >= self.Therade_num:
                # Wait for a free slot, then reap the threads started so far.
                while self.start_num >= self.Therade_num:
                    self.sleep(2)
                self._join_workers()
        # Do not announce completion while workers are still sending requests.
        self._join_workers()
        self.start_print_starus.emit()

    def _join_workers(self):
        """Join every thread recorded in ``ThreadList`` and clear the list."""
        for worker in self.ThreadList:
            print("销毁_ 投递任务线程:: ", worker)
            worker.join()
        self.ThreadList = []

    def Start_task(self, new_loop, Referer_url, type, id, num):
        """Run the request coroutine ``self.For_num`` times on ``new_loop``.

        Executed in a worker thread.  ``start_print`` is emitted for each of
        the first nine iterations and afterwards for every tenth one.  The
        running-worker counter is always decremented and ``new_loop`` is
        always closed, even when an iteration raises.

        Args:
            new_loop: Event loop created for this worker; closed on exit.
            Referer_url: Referer passed on to the request coroutine.
            type: Second field of the batch entry; only printed here.
            id: Third field of the batch entry; forwarded to the coroutine.
            num: Index of the batch entry, used in progress reports.
        """
        print(f"投递的任务: 索引:{str(num)} {Referer_url}  {id} {type}")
        try:
            for i in range(1, self.For_num + 1):
                if stop_status_batch == 1:
                    break
                new_loop.run_until_complete(self.async_BaiduStatistics(id, Referer_url))
                if i < 10 or i % 10 == 0:
                    self.start_print.emit(num, i, Referer_url)
            print("Start_task当前线程数量:", len(threading.enumerate()))
        except Exception as e:
            print("执行任务出错了", e)
        finally:
            # Without this a failing worker would leave run() polling forever.
            with self._count_lock:
                self.start_num -= 1
            new_loop.close()

    async def async_BaiduStatistics(self, si, Referer_url):
        """Send ``self.requestRate`` GET requests through aiohttp, one at a time.

        A failed request is printed and does not stop the remaining ones.
        ``si`` is accepted but not used.

        NOTE(review): ``url``, ``payload`` and ``ua`` are not defined in this
        class — presumably module-level names; confirm they exist.
        """
        # A ClientTimeout object replaces the deprecated bare-number timeout.
        timeout = aiohttp.ClientTimeout(total=1)
        requestCounter = 0
        while requestCounter < self.requestRate:
            try:
                headers = {'Referer': Referer_url, 'User-Agent': ua.random}
                async with aiohttp.ClientSession() as session:
                    async with session.get(url=url, params=payload, headers=headers, timeout=timeout) as resp:
                        # The body is not needed; add ``await resp.text()``
                        # here if the response content is required.
                        pass
            except Exception as e:
                print("执行投递错误:", e)
            requestCounter += 1

threading单线程+aiohttp+多循环

# 初始化
self.new_loop = asyncio.new_event_loop()
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # 捕获报错
# 在 run 函数中调用 执行asyncio
 self.new_loop.run_until_complete(self.async_BaiduStatistics(self.id, self.url))
# 在实际请求网络的函数代码中使用了 异步请求网络 0.2/s 左右
async with aiohttp.ClientSession() as seesion:
      async with seesion.get(url=url, params=payload, headers=headers, timeout=1) as resp:
            pass

# 单个网址刷 ok 0429
class Runthread_specified(QtCore.QThread):
    """Worker QThread that repeatedly sends the tracking request for one URL.

    ``specified`` is a mapping from which the keys ``Radio_start``,
    ``api_url``, ``then_mun``, ``for_num``, ``id``, ``url``, ``type``, ``UA``,
    ``key`` and ``X`` are read.

    Signals:
        start_printSP_Ui(int, int, str): ``locationX``, round number, URL.
        start_print_starus(): emitted once :meth:`run` has finished.

    The module-level flag ``stop_status_specified`` (defined outside this
    class) stops the work when it equals 1.
    """

    start_printSP_Ui = pyqtSignal(int, int, str)
    start_print_starus = pyqtSignal()

    def __init__(self, specified):
        super(Runthread_specified, self).__init__()
        self.ThreadList = []
        self.Radio_start = specified["Radio_start"]
        self.api_url = specified["api_url"]
        self.requestRate = int(specified["then_mun"])  # requests per round
        self.for_num = int(specified["for_num"])  # number of rounds
        self.id = specified["id"]
        self.url = specified["url"]
        self.type = specified["type"]
        self.UA = specified["UA"]
        self.keyData = specified["key"]
        self.locationX = specified["X"]
        self.start_num = 0  # rounds started so far
        # The policy must be installed BEFORE the loop is created, otherwise
        # the loop is built by the previous policy.  The class only exists on
        # Windows; skip it elsewhere.
        if hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
            asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
        self.new_loop = asyncio.new_event_loop()

    def run(self):
        """Run ``self.for_num`` rounds of requests and report progress.

        Rounds are numbered from 1, so exactly ``for_num`` rounds run.
        ``start_printSP_Ui`` is emitted for each of the first nine rounds and
        afterwards for every tenth one; ``start_print_starus`` is emitted at
        the end.
        """
        for cnum in range(1, self.for_num + 1):
            if stop_status_specified == 1:
                break
            self.start_num += 1
            try:
                self.new_loop.run_until_complete(self.async_BaiduStatistics(self.id, self.url))
            except Exception as e:
                print("eeee", e)
            if cnum < 10 or cnum % 10 == 0:
                self.start_printSP_Ui.emit(self.locationX, cnum, self.url)
        self.start_print_starus.emit()

    async def async_BaiduStatistics(self, si, Referer_url):
        """Send ``self.requestRate`` GET requests to the hm.gif endpoint.

        Before each request ``self.key`` is refreshed: a random line of
        ``self.keyData`` when it contains newlines, otherwise ``keyData``
        itself.  A failed request is printed and does not stop the remaining
        ones.  ``si`` is accepted but not used.
        """
        url = "https://hm.baidu.com/hm.gif?"
        payload = {}
        # A ClientTimeout object replaces the deprecated bare-number timeout.
        timeout = aiohttp.ClientTimeout(total=1)
        # Built once per call instead of once per request.
        ua = UserAgent()
        for _ in range(self.requestRate):
            try:
                if "\n" in self.keyData:
                    self.key = random.choice(self.keyData.split("\n"))
                else:
                    self.key = self.keyData
                headers = {'Referer': Referer_url, 'User-Agent': ua.random}
                async with aiohttp.ClientSession() as session:
                    async with session.get(url=url, params=payload, headers=headers, timeout=timeout) as resp:
                        pass
            except Exception as e:
                print("执行投递错误:", e)

============

aiohttp各种参数设置案例

aiohttp 业务核心功能

发起 get 请求

# -*- encoding: utf-8 -*-
import asyncio
import aiohttp
async def main():
    """GET the Baidu home page; print the status and the first 100 characters of the body."""
    async with aiohttp.ClientSession() as session, session.get('http://www.baidu.com') as resp:
        print(resp.status)
        body = await resp.text()
        print(body[:100])
if __name__ == '__main__':
    # asyncio.run() (Python 3.7+) creates the loop, runs main() and closes the
    # loop afterwards.  The older get_event_loop()/run_until_complete() form
    # is deprecated when no loop is running and raises RuntimeError on
    # Python 3.14+.
    asyncio.run(main())

发起post 请求

# -*- encoding: utf-8 -*-
import asyncio
import aiohttp
async def post_v1():
    """POST a body to httpbin and print the response status.

    The three assignments show the payload kinds accepted by ``data=``; only
    the last one (the form dict) is actually sent.
    """
    data = b'\x00Binary-data\x00'  # raw bytes are uploaded without any encoding
    data = 'text'  # plain text payload
    data = {'key': 'value'}  # form fields
    async with aiohttp.ClientSession() as session, session.post('http://httpbin.org/post', data=data) as resp:
        print(resp.status)
# A more complex POST request: the payload is sent as JSON
async def post_v2():
    """POST ``{'key': 'value'}`` through the ``json=`` argument and print the status."""
    body = {'key': 'value'}
    async with aiohttp.ClientSession() as session, session.post('http://httpbin.org/post', json=body) as resp:
        print(resp.status)
if __name__ == '__main__':
    async def _run_examples():
        """Run both POST examples concurrently."""
        await asyncio.gather(post_v1(), post_v2())

    # This snippet defines post_v1/post_v2, not main(); calling main() here
    # raised NameError.  asyncio.run() also replaces the deprecated
    # get_event_loop()/run_until_complete() pattern.
    asyncio.run(_run_examples())

向 url 中传递参数

有些场景是需要拼接请求url 在这个时候可以使用本 case 来做处理

# -*- encoding: utf-8 -*-
import asyncio
import aiohttp
async def main():
    """GET httpbin with query parameters and print the response status.

    Any of the three forms below may be passed as ``params``: a dict, a list
    of pairs (which allows a repeated key) or a plain string.  Only the last
    assignment is actually sent.
    """
    params = {'key1': 'value1', 'key2': 'value2'}
    params = [('key', 'value1'), ('key', 'value2')]
    params = 'key=value+1'
    async with aiohttp.ClientSession() as session, session.get('http://httpbin.org/get', params=params) as resp:
        print(resp.status)
if __name__ == '__main__':
    # asyncio.run() manages the loop; get_event_loop() with no running loop is
    # deprecated and raises RuntimeError on Python 3.14+.
    asyncio.run(main())

向目标服务器上传文件

有时候，我们确实有向服务器传文件的需求，eg: 上传回执单、上传图片…… 在 100 张、10000 张的量级时我们会想用多线程去处理，但量再大时，再使用多线程 + requests 的方式就会发现有大量的报错。若有类似的使用场景，可以用以下 case 处理。

import aiohttp
async def main():
    """Upload ``report.xls`` as the form field ``file`` and print status and body."""
    # The ``with`` block closes the file even when the request fails; the
    # bare open() call used before leaked the handle.
    with open('report.xls', 'rb') as report:
        files = {'file': report}
        async with aiohttp.ClientSession() as sess:
            async with sess.post('http://httpbin.org/post', data=files) as resp:
                print(resp.status)
                print(await resp.text())
async def main2():
    """Upload ``report.xls`` through ``aiohttp.FormData``.

    Building the form explicitly lets the file name and content type of the
    part be specified.
    """
    # The ``with`` block closes the file even when the request fails; the
    # bare open() call used before leaked the handle.
    with open('report.xls', 'rb') as report:
        data = aiohttp.FormData()
        data.add_field('file',
                       report,
                       filename='report.xls',
                       content_type='application/vnd.ms-excel')
        async with aiohttp.ClientSession() as sess:
            async with sess.post('http://httpbin.org/post', data=data) as resp:
                print(resp.status)
                print(await resp.text())
async def main3():
    """Stream ``report.xls`` as the request body and print status and body."""
    async with aiohttp.ClientSession() as session:
        with open('report.xls', 'rb') as report:
            async with session.post('http://httpbin.org/post', data=report) as resp:
                print(resp.status)
                reply = await resp.text()
                print(reply)
async def main4():
    """Feed the body of a GET response directly into a POST request.

    The ``content`` attribute of the GET response is passed as ``data=`` of
    the POST (original note: it is a StreamReader providing the async
    iterator protocol, usable on Python 3.6+).
    """
    async with aiohttp.ClientSession() as session, \
            session.get('http://python.org') as source, \
            session.post('http://httpbin.org/post', data=source.content) as r:
        print(r.status)
        reply = await r.text()
        print(reply)

设置请求超时

有时候，我们向服务器发送请求，若没有设置超时时间，此请求就会一直阻塞直到系统报错，这对于我们的系统是无法容忍的，所以发请求的时候千万要记得加上超时时间。

import aiohttp
timeout = aiohttp.ClientTimeout(total=60)
async def main():
    """GET httpbin on a session configured with the module-level ``timeout``; print status and body."""
    async with aiohttp.ClientSession(timeout=timeout) as session, session.get('http://httpbin.org/get') as resp:
        print(resp.status)
        reply = await resp.text()
        print(reply)

aiohttp 爬虫核心功能

自定义cookie

import aiohttp
import asyncio
cookies = {'cookies_are': 'working'}
async def main():
    """Send the module-level ``cookies`` to httpbin and check that they are echoed back."""
    async with aiohttp.ClientSession(cookies=cookies) as session, session.get('http://httpbin.org/cookies') as resp:
        print(resp.status)
        print(await resp.text())
        echoed = await resp.json()
        assert echoed == {"cookies": {"cookies_are": "working"}}
if __name__ == "__main__":
    # asyncio.run() manages the loop; get_event_loop() with no running loop is
    # deprecated and raises RuntimeError on Python 3.14+.
    asyncio.run(main())

在多个请求之间共享cookie

import aiohttp
import asyncio
async def main():
    """Show that cookies set by one request are sent with later requests of the same session."""
    async with aiohttp.ClientSession() as session:
        # Enter the response as a context manager so its connection is
        # released; a bare ``await session.get(...)`` left it unreleased.
        async with session.get('http://httpbin.org/cookies/set?my_cookie=my_value'):
            pass
        filtered = session.cookie_jar.filter_cookies('http://httpbin.org')
        print(filtered)
        assert filtered['my_cookie'].value == 'my_value'
        async with session.get('http://httpbin.org/cookies') as r:
            json_body = await r.json()
            print(json_body)
            assert json_body['cookies']['my_cookie'] == 'my_value'
if __name__ == "__main__":
    # asyncio.run() manages the loop; get_event_loop() with no running loop is
    # deprecated and raises RuntimeError on Python 3.14+.
    asyncio.run(main())

Cookie 的安全性问题: 默认 ClientSession 使用的是严格模式的 aiohttp.CookieJar。RFC 2109 明确禁止接受来自以 IP 地址（而非 DNS 域名）表示的 URL 的 cookie（例如 http://127.0.0.1:80/cookie），只接受来自 DNS 域名的 cookie。
可以通过设置 aiohttp.CookieJar 的 unsafe=True 来放宽这一限制：

# Cookie jar created with unsafe=True, then handed to the session.
# NOTE(review): unsafe=True presumably lifts the strict-mode restriction on
# cookies from IP-address URLs — confirm against the aiohttp CookieJar docs.
jar = aiohttp.CookieJar(unsafe=True)
session = aiohttp.ClientSession(cookie_jar=jar)

使用虚假Cookie Jar: 有时不想处理cookie。这时可以在会话中使用aiohttp.DummyCookieJar来达到目的。

# Session that uses aiohttp.DummyCookieJar instead of the default cookie jar.
jar = aiohttp.DummyCookieJar()
session = aiohttp.ClientSession(cookie_jar=jar)

自定义请求头

import aiohttp
import asyncio
async def main():
    """Send a GET with session-wide custom headers and print status and body."""
    # The dict literal was missing the comma between its two entries, which
    # is a SyntaxError.  The header is spelled "Referer" in HTTP.
    headers = {
        'User-Agent': 'your agent',
        'Referer': 'http://httpbin.org',
    }
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get('http://httpbin.org/headers') as resp:
            print(resp.status)
            print(await resp.text())


if __name__ == '__main__':
    # ``async with`` is only valid inside a coroutine, so the request lives
    # in main() and is driven by asyncio.run().
    asyncio.run(main())

SSL验证警告问题

默认情况下，aiohttp对HTTPS协议使用严格检查，如果你不想验证SSL证书，可将ssl设置为False。

# ssl=False is passed for this request; must run inside a coroutine with an open session.
r = await session.get('https://example.com', ssl=False)

代理问题AgentIP

# Option 1: pass the proxy URL and the credentials separately
async with aiohttp.ClientSession() as session:
    proxy_auth = aiohttp.BasicAuth('user', 'pass')
    async with session.get("http://python.org", proxy="http://proxy.com", proxy_auth=proxy_auth) as resp:
        print(resp.status)
# Option 2: embed the credentials in the proxy URL
session.get("http://python.org", proxy="http://user:pass@some.proxy.com")

aiohttp 连接池

1.使用连接器

想要调整请求的传输层你可以为ClientSession及其同类组件传递自定义的连接器。例如:
conn = aiohttp.TCPConnector()
session = aiohttp.ClientSession(connector=conn)

注:不要给多个会话对象使用同一个连接器，某一会话对象拥有其所有权。

2.限制连接池的容量

限制同一时间打开的连接数可以传递limit参数:
conn = aiohttp.TCPConnector(limit=30)

这样就将总数限制在 30，默认情况下是 100。如果你不想有限制，传递 0 即可:
conn = aiohttp.TCPConnector(limit=0)

小结：

爬虫常用的功能单独来写，主要是 aiohttp 还有一个问题没有解决，通过阅读源码确实是无法很好解决这个问题，在网上搜索了大半天基本没有有效的解决方案，so 笔者会给出一个自己找到的解决方案，在接下来的文章中我会进行分享。