1. 简介
简单优雅,下载次数多,受人喜欢,下载量高,官方文档完善
Requests 官网
2. 发起请求
构建请求和响应报文
import requests

# Minimal GET request: fetch the httpbin echo endpoint and print the body.
url = "http://httpbin.org/get"
response = requests.get(url)
print(response.text)
{
"args": {},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Host": "httpbin.org",
"User-Agent": "python-requests/2.25.1",
"X-Amzn-Trace-Id": "Root=1-60225ec0-04d817761fdafd9459be4c66"
},
"origin": "47.240.65.248",
"url": "http://httpbin.org/get"
}
直接打印文本
import requests

# Same GET request again, printing the response body directly as text.
endpoint = "http://httpbin.org/get"  # test URL
reply = requests.get(endpoint)
print(reply.text)
{
"args": {},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Host": "httpbin.org",
"User-Agent": "python-requests/2.25.1",
"X-Amzn-Trace-Id": "Root=1-6022666c-492947c37cb3a71443f517bd"
},
"origin": "183.223.85.232",
"url": "http://httpbin.org/get"
}
测试其他请求方式
POST方式
import requests

# POST request: the dict passed via ``data=`` is sent form-encoded
# (Content-Type: application/x-www-form-urlencoded).
payload = {"data1": "Spider", "data2": "测试爬虫"}
endpoint = "http://httpbin.org/post"  # test URL
reply = requests.post(endpoint, data=payload)
print(reply.text)
{
"args": {},
"data": "",
"files": {},
"form": {
"data1": "Spider",
"data2": "\u6d4b\u8bd5\u722c\u866b"
},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Content-Length": "55",
"Content-Type": "application/x-www-form-urlencoded",
"Host": "httpbin.org",
"User-Agent": "python-requests/2.25.1",
"X-Amzn-Trace-Id": "Root=1-60227e99-47295bf138329e21631e53d8"
},
"json": null,
"origin": "47.240.65.248",
"url": "http://httpbin.org/post"
}
传递json表单
json和data二选一, 有data则json为null
import requests

# POST request with a JSON body.  ``data=`` and ``json=`` are mutually
# exclusive: because ``json=`` is passed below, ``data`` is intentionally
# left unused here to illustrate the contrast (httpbin then reports
# "form": {} and the body under "json").
data = {"data1": "Spider", "data2": "测试爬虫"}  # intentionally unused in this variant
# Renamed from ``json`` so the standard-library ``json`` module is not shadowed.
json_body = {"json_style": "json-data"}
test_url = "http://httpbin.org/post"  # test URL
res = requests.post(test_url, json=json_body)
print(res.text)
{
"args": {},
"data": "{\"json_style\": \"json-data\"}",
"files": {},
"form": {},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Content-Length": "27",
"Content-Type": "application/json",
"Host": "httpbin.org",
"User-Agent": "python-requests/2.25.1",
"X-Amzn-Trace-Id": "Root=1-60228036-0845b88b3f85bca75085b16c"
},
"json": {
"json_style": "json-data"
},
"origin": "47.240.65.248",
"url": "http://httpbin.org/post"
}
测试request的几个参数
测试自动转码
用户以get访问方式,传入字典形式的参数,特别注意的是字典中的值是中文,所以requests库的一个功能就是中文的自动转码
# Automatic URL-encoding demo: requests percent-encodes the Chinese
# parameter values before appending them to the query string.
import requests
test_url = "http://httpbin.org/get" # test URL
params = {"name1":"网络", "name2":"爬虫"} # query parameters (Chinese values, auto-encoded)
res = requests.get(test_url, params=params)
print(res.text)
{
"args": {
"name1": "\u7f51\u7edc",
"name2": "\u722c\u866b"
},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Host": "httpbin.org",
"User-Agent": "python-requests/2.25.1",
"X-Amzn-Trace-Id": "Root=1-602269a1-670a1cff406e773611659033"
},
"origin": "47.240.65.248",
"url": "http://httpbin.org/get?name1=\u7f51\u7edc&name2=\u722c\u866b"
}
测试headers
响应报文中有headers
# Custom request headers: override the default User-Agent
# ("python-requests/...") with a browser-like one; httpbin echoes it back.
import requests
test_url = "http://httpbin.org/get" # test URL
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56"}
params = {"name1":"网络", "name2":"爬虫"}
res = requests.get(test_url, params=params, headers=headers)
print(res.text)
{
"args": {
"name1": "\u7f51\u7edc",
"name2": "\u722c\u866b"
},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Host": "httpbin.org",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56",
"X-Amzn-Trace-Id": "Root=1-60226c1f-7f595f34039a54485b1ace0d"
},
"origin": "47.240.65.248",
"url": "http://httpbin.org/get?name1=\u7f51\u7edc&name2=\u722c\u866b"
}
直接在requests方法中使用cookies参数
# Sending cookies via the ``cookies=`` parameter; requests serializes the
# dict into a single "Cookie: sessionid=...; userid=..." request header.
import requests
test_url = "http://httpbin.org/get" # test URL
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56"}
cookies = {"sessionid":"hashcode", "userid": "987654321"}  # demo values, not a real session
params = {"name1":"网络", "name2":"爬虫"}
res = requests.get(test_url, params=params, headers=headers, cookies=cookies)
print(res.text)
{
"args": {
"name1": "\u7f51\u7edc",
"name2": "\u722c\u866b"
},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Cookie": "sessionid=hashcode; userid=987654321",
"Host": "httpbin.org",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56",
"X-Amzn-Trace-Id": "Root=1-60226e2a-5a62cdd162da8d122627d70f"
},
"origin": "47.240.65.248",
"url": "http://httpbin.org/get?name1=\u7f51\u7edc&name2=\u722c\u866b"
}
设置超时
# Timeout demo: ``timeout=1`` makes the request give up (raising an
# exception from requests.exceptions) if the server does not respond in time.
import requests
test_url = "http://httpbin.org/get" # test URL
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56"}
cookies = {"sessionid":"hashcode", "userid": "987654321"}
params = {"name1":"网络", "name2":"爬虫"}
res = requests.get(test_url, params=params, headers=headers, cookies=cookies, timeout=1)
print(res.text)
关闭重定向
关闭重定向之后,像github此类存在重定向功能的网站无法正常访问,会直接显示302
302是临时重定向的意思
# Redirects: with allow_redirects=False the 302 response from
# http://github.com is returned as-is instead of being followed,
# so status_code prints 302 rather than 200.
url = "http://github.com"
res_gh = requests.get(url, allow_redirects=False)
print(res_gh.text)
print(res_gh.status_code)
使用代理
仅作演示
# Proxy demo (illustration only): route HTTP traffic through a proxy.
import requests
test_url = "http://httpbin.org/get" # test URL
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56"}
cookies = {"sessionid":"hashcode", "userid": "987654321"}
# NOTE(review): placeholder address; a real proxy entry normally includes
# scheme and port, e.g. "http://123.123.12.123:8080" — confirm before use.
proxies = {"http":"123.123.12.123"}
params = {"name1":"网络", "name2":"爬虫"}
res = requests.get(test_url, params=params, headers=headers, cookies=cookies, timeout=100, proxies=proxies)
print(res.text)
证书验证
# Certificate verification (default verify=True): this site's certificate
# chain fails validation, so the call raises requests.exceptions.SSLError
# (see the traceback pasted below).
url = "https://inv-veri.chinatax.gov.cn/"
res_ca = requests.get(url)
print(res_ca.text)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\41999\Documents\PycharmProjects\Python\TZ—Spyder\第三节Request库的使用\request_demo.py", line 40, in <module>
res_ca = requests.get(url)
File "C:\Users\41999\AppData\Local\Programs\Python\Python39\lib\site-packages\requests\api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\41999\AppData\Local\Programs\Python\Python39\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\41999\AppData\Local\Programs\Python\Python39\lib\site-packages\requests\sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\41999\AppData\Local\Programs\Python\Python39\lib\site-packages\requests\sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "C:\Users\41999\AppData\Local\Programs\Python\Python39\lib\site-packages\requests\adapters.py", line 514, in send
raise SSLError(e, request=request)
requests.exceptions.SSLError: HTTPSConnectionPool(host='inv-veri.chinatax.gov.cn', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1123)')))
对比(设置verify=False之后就不再验证证书,但返回报文前依然会输出 InsecureRequestWarning 警告)
# Certificate verification disabled: verify=False skips the SSL check, so
# the request succeeds, but urllib3 emits an InsecureRequestWarning.
url = "https://inv-veri.chinatax.gov.cn/"
res_ca = requests.get(url, verify=False)
print(res_ca.text)
C:\Users\41999\AppData\Local\Programs\Python\Python39\lib\site-packages\urllib3\connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'inv-veri.chinatax.gov.cn'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
warnings.warn(
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<title>å½å®¶ç¨å¡æ»å±å ¨å½å¢å¼ç¨å票æ¥éªå¹³å°</title>
<meta name="keywords" content="">
<META HTTP-EQUIV="pragma" CONTENT="no-cache">
使用urllib3忽略警告(urllib3.disable_warnings())
# Certificate verification with the InsecureRequestWarning silenced via
# urllib3.disable_warnings().
import requests
import urllib3  # fixed: the original snippet called urllib3 without importing it

urllib3.disable_warnings()
url = "https://inv-veri.chinatax.gov.cn/"
res_ca = requests.get(url, verify=False)
print(res_ca.text)
使用requests包忽略警告(无需导入urllib3包)
# Same warning suppression through requests' bundled urllib3 — no separate
# ``import urllib3`` needed, since requests.packages.urllib3 aliases it.
requests.packages.urllib3.disable_warnings()
url = "https://inv-veri.chinatax.gov.cn/"
res_ca = requests.get(url, verify=False)
print(res_ca.text)
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<title>å½å®¶ç¨å¡æ»å±å ¨å½å¢å¼ç¨å票æ¥éªå¹³å°</title>
<meta name="keywords" content="">
<META HTTP-EQUIV="pragma" CONTENT="no-cache">
<META HTTP-EQUIV="Cache-Control" CONTENT="no-cache, must-revalidate">
<META HTTP-EQUIV="expires" CONTENT="0">
4. 接受响应
正常显示网页中的中文字体
res.encoding = "utf-8" # 为了正常显示网页中的中文信息
将获取的网页转换成字符串,而非字节码
print(res.text) # 直接转换成字符串,非字节码
将获取的数据转换成字节码而非字符串,图片和视频需要此格式
print(res.content) # 字节码,图片视频使用此参数
打印状态码
print(res.status_code) # 打印状态码
print("---"*20)
解析Json并且获取响应头中的数据
print(res.json()["headers"]["User-Agent"]) # 用json取响应报文中headers中的数据,自动转换为字典格式
获取响应头
print(res.headers)
print("---"*20)
获取Cookies
print(res.cookies)
获取URL
print(res.url)
print("---"*20)
这里获取的是请求头(res.request 是实际发送出去的请求对象),不是包 Requests
print(res.request.headers)
# Walkthrough of the main Response attributes, using a POST to httpbin.
import requests
# POST request (``data`` and ``headers`` are defined but unused in this
# variant — only ``json=`` is passed below)
data = {"data1":"Spider", "data2":"测试爬虫"}
json = {"json_style":"json-data"}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56"}
test_url = "http://httpbin.org/post" # test URL
res = requests.post(test_url, json=json)
res.encoding = "utf-8" # so Chinese text in the page displays correctly
print(res.text) # body decoded to str
print(res.content) # raw bytes; use this for images/videos
print(res.status_code) # HTTP status code
print("---"*20)
print(res.json()["headers"]["User-Agent"]) # parse the JSON body into a dict and read the echoed User-Agent
print(res.headers) # response headers
print("---"*20)
print(res.cookies) # cookies set by the server (empty jar here)
print(res.url) # final URL of the request
print("---"*20)
# NOTE: this prints the REQUEST headers — res.request is the prepared
# request object that was sent, not the response.
print(res.request.headers)
{
"args": {},
"data": "{\"json_style\": \"json-data\"}",
"files": {},
"form": {},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Content-Length": "27",
"Content-Type": "application/json",
"Host": "httpbin.org",
"User-Agent": "python-requests/2.25.1",
"X-Amzn-Trace-Id": "Root=1-602342b5-6d003f1f348f62150ec0a2f5"
},
"json": {
"json_style": "json-data"
},
"origin": "47.240.65.248",
"url": "http://httpbin.org/post"
}
b'{\n "args": {}, \n "data": "{\\"json_style\\": \\"json-data\\"}", \n "files": {}, \n "form": {}, \n "headers": {\n "Accept": "*/*", \n "Accept-Encoding": "gzip, deflate", \n "Content-Length": "27", \n "Content-Type": "application/json", \n "Host": "httpbin.org", \n "User-Agent": "python-requests/2.25.1", \n "X-Amzn-Trace-Id": "Root=1-602342b5-6d003f1f348f62150ec0a2f5"\n }, \n "json": {\n "json_style": "json-data"\n }, \n "origin": "47.240.65.248", \n "url": "http://httpbin.org/post"\n}\n'
200
------------------------------------------------------------
python-requests/2.25.1
{'Content-Length': '502', 'Access-Control-Allow-Credentials': 'true', 'Access-Control-Allow-Origin': '*', 'Connection': 'keep-alive', 'Content-Type': 'application/json', 'Date': 'Wed, 10 Feb 2021 02:19:33 GMT', 'Keep-Alive': 'timeout=4', 'Proxy-Connection': 'keep-alive', 'Server': 'gunicorn/19.9.0'}
------------------------------------------------------------
<RequestsCookieJar[]>
http://httpbin.org/post
------------------------------------------------------------
{'User-Agent': 'python-requests/2.25.1', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', 'Content-Length': '27', 'Content-Type': 'application/json'}
Process finished with exit code 0
5. Session对象
功能:自动更新请求头信息,常用在账户登录的时候,先访问登录页面的URL,再访问数据提交的URL
例如12306
使用request请求获取请求头headers,含cookies
import requests  # fixed: original read "import resquests", which raises ModuleNotFoundError

# Session demo: a Session keeps headers/cookies across requests, which is
# useful for login flows (hit the login URL first, then the data URL with
# the same session).
index_url = "https://www.bing.com"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56"}
session = requests.session()
session.headers = headers
res_ss = session.get(index_url)
res_ss.encoding = "utf-8"
# print(res_ss.text)
# After the request, res_ss.request.headers shows the headers actually sent,
# including the Cookie header the session picked up along the way.
print("---"*40)
print(res_ss.request.headers)
{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56',
'Cookie': 'SNRHOP=TS=637485229167516980&I=1; _EDGE_S=F=1&SID=05EF50E1397D6E18095B5F3B383E6F57; _EDGE_V=1; MUID=056E607057156E9C14FE6FAA56566F14'}
20210210 年三十前一天 headers中的cookies无法解决,先学习后面的抓包工具