Python爬虫总结
1.1 urllib库
示例代码如下:
1.1.1发送get请求
from urllib import request

url = 'https://qiubai-video-web.qiushibaike.com/MK11R93Y1A96C557_hd.mp4'

# urlopen() returns an http.client.HTTPResponse object.
resp = request.urlopen(url)

# Only save the body when the server answered 200.
if resp.getcode() == 200:
    with open('a.mp4', 'wb') as f:
        f.write(resp.read())
1.1.2发送post请求
from urllib import request
from urllib import parse

url = "https://study.163.com/mob/search/independent/v1"
data = {
    "keyword": "王顺子",
    "pageIndex": "1",
    "pageSize": "20",
    "searchType": "0",
}

# The form fields are url-encoded and passed to urlopen() as bytes via
# the `data` argument.
params_str = parse.urlencode(data)
resp = request.urlopen(url, data=bytes(params_str, encoding='UTF-8'))
1.1.3获取状态码
resp.getcode()  # status code of the response, e.g. 200
1.1.4获取响应头信息
获取所有头部信息
resp.getheaders()  # every header of the response
获取某一字段信息
resp.getheader('Content-Type')  # value of a single header field, e.g. text/html;charset=UTF-8
1.1.5获取响应内容
resp.read()  # response body; the other examples write it in 'wb' mode or .decode() it
1.1.6url地址编解码
from urllib import parse

params = {
    'kw': 'python教程',
    'searchType': '1',
}

# Encode the dict into a percent-escaped query string. The result is kept
# in params_str so the decoding calls below work on this same data.
params_str = parse.urlencode(params)
print(params_str)  # kw=python%E6%95%99%E7%A8%8B&searchType=1

# Decode the query string back into Python structures.
print(parse.parse_qs(params_str))  # {'kw': ['python教程'], 'searchType': ['1']}
print(parse.parse_qsl(params_str))  # [('kw', 'python教程'), ('searchType', '1')]
1.1.7构建请求头
示例代码如下:
from urllib import request
from urllib import parse

url = "http://www.baidu.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}

# 1. Build a Request object that carries the custom headers.
req = request.Request(url, headers=headers)

# 2. Send the request.
resp = request.urlopen(req)
print(resp.read().decode('UTF-8'))
1.1.8处理不受信任的证书
示例代码如下:
from urllib import request
import ssl

url = "https://www.baidu.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}

# 1. Build a Request object.
req = request.Request(url, headers=headers)

# Two ways to deal with a certificate that is not trusted by default:
# context = ssl._create_unverified_context()  # skip certificate verification
context = ssl.create_default_context(cafile="charles.pem")  # use the given certificate file

# 2. Send the request with the SSL context.
resp = request.urlopen(req, context=context)
# resp = request.urlopen(req, cafile='charles.pem')
print(resp.read().decode('UTF-8'))
1.1.9设置代理
from urllib import request

url = "http://httpbin.org/ip"
proxy = '121.230.209.149:18022'  # free proxy
# proxy = '1924086038:xle4zavg@113.57.97.228:19532'  # private proxy (user:password@host:port)

# 1. Create a ProxyHandler that maps the "http" scheme to the proxy.
proxy_handler = request.ProxyHandler({'http': proxy})

# 2. Wrap the handler in an opener.
proxy_opener = request.build_opener(proxy_handler)

# 3. Send the request through the opener.
resp = proxy_opener.open(url)
print(resp.read().decode('UTF-8'))
1.1.10验证码处理
12306验证码案例
手动处理cookie
from urllib import request
from urllib import parse

# 1. Download the captcha image and keep the cookies the server sets.
url = "https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login&rand=sjrand&0.7367307074501106"
resp = request.urlopen(url)

# 1.1 Save the cookies.
# get_all() gives one entry per Set-Cookie header. Splitting the combined
# getheader() value on "," would also cut through commas inside a cookie
# attribute such as an Expires date, and would fail when no cookie is set.
set_cookie_values = resp.headers.get_all("Set-Cookie") or []
# Each value looks like
#   _passport_session=c1ac8bbf9fc44a0ba06274fdadd588cf2003; Path=/passport
# and only the leading name=value pair is sent back.
headers_result = [value.split(";")[0].strip() for value in set_cookie_values]
headers_result_str = ";".join(headers_result)
print(headers_result_str)

# 1.2 Save the captcha image.
with open("yzm.jpg", "wb") as f:
    f.write(resp.read())

answer = input("请输入验证码答案: ")

# 2. Check the captcha answer with a POST request.
check_url = "https://kyfw.12306.cn/passport/captcha/captcha-check"
data = {
    "answer": answer,
    "login_site": "E",
    "rand": "sjrand",
}
param = parse.urlencode(data)
param_bytes = bytes(param, encoding="utf-8")

# Send the saved cookies back by hand through the Cookie header.
check_headers = {"Cookie": headers_result_str}
req = request.Request(check_url, data=param_bytes, headers=check_headers)
result = request.urlopen(req)
print(result.read().decode("utf-8"))
自动处理cookie
from urllib import request
from urllib import parse
from http.cookiejar import CookieJar, MozillaCookieJar

# 1. Download the captcha image and keep the cookies.
url = "https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login&rand=sjrand&0.7367307074501106"

# 1.1 Create a cookie-aware handler.
# cookie_jar = CookieJar()  # CookieJar only caches cookies in memory
cookie_jar = MozillaCookieJar()  # MozillaCookieJar has a save() method that can write cookies to disk
cookie_handler = request.HTTPCookieProcessor(cookie_jar)

# 1.2 Build an opener from the handler.
cookie_opener = request.build_opener(cookie_handler)

# 1.3 Send the request through the opener.
resp = cookie_opener.open(url)

# print(cookie_jar)
# Show the cookies held by the jar.
for c in cookie_jar:
    print(c)

# cookie_jar.save("cookie.txt", ignore_discard=True, ignore_expires=True)  # save cookies
# print(cookie_jar.load("cookie.txt", ignore_discard=True, ignore_expires=True))  # load cookies

# 1.4 Save the captcha image.
with open("yzm.jpg", "wb") as f:
    f.write(resp.read())

answer = input("请输入验证码答案: ")

# 2. Check the captcha answer with a POST request.
check_url = "https://kyfw.12306.cn/passport/captcha/captcha-check"
data = {
    "answer": answer,
    "login_site": "E",
    "rand": "sjrand",
}
param = parse.urlencode(data)
param_bytes = bytes(param, encoding="utf-8")

# No Cookie header is built by hand here: the request goes through the
# same opener that holds the cookie jar.
req = request.Request(check_url, data=param_bytes)
result = cookie_opener.open(req)
print(result.read().decode("utf-8"))
账号密码登录自动处理
from urllib import request

user = "itlike"
pwd = "123456"
url = "http://httpbin.org/basic-auth/itlike/123456"

# 1. Create the handler: register the credentials for the URL in a
#    password manager and wrap it in a basic-auth handler.
pm = request.HTTPPasswordMgrWithDefaultRealm()
pm.add_password(None, url, user, pwd)
handler = request.HTTPBasicAuthHandler(pm)

# 2. Build an opener from the handler.
opener = request.build_opener(handler)

# 3. Use the opener for every URL that needs authorization.
resp = opener.open(url)
print(resp.read().decode("utf-8"))
创建多处理器opener对象
from urllib import request
from http.cookiejar import CookieJar

# http proxy
proxy = "111.177.177.28:9999"
# url = "http://httpbin.org/ip"
url = "https://www.baidu.com"

# 1. Create the handler objects.
# NOTE(review): only the "http" scheme is mapped to the proxy, while the
# active URL above is https, so that request presumably bypasses the
# proxy. Confirm whether an "https" entry is wanted.
proxy_handler = request.ProxyHandler({"http": proxy})
cookie_jar = CookieJar()
cookie_handler = request.HTTPCookieProcessor(cookie_jar)

# 2. Combine several handlers into one opener.
opener = request.build_opener(proxy_handler, cookie_handler)

# 3. Send the request through the opener (open() accepts a URL or a Request).
req = request.Request(url, headers={
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"
})
resp = opener.open(req)

for cookie in cookie_jar:
    print(cookie)
# print(resp.read().decode("utf-8"))
1.1.11弹窗验证授权处理
from urllib import request
import base64

user = "itlike"
pwd = "123456"

# Build the Authorization header value by hand:
# "Basic " followed by the base64 encoding of "user:password".
result_str = user + ":" + pwd
result = "Basic " + base64.b64encode(bytes(result_str, encoding="utf-8")).decode("utf-8")
# print(result)

url = "http://httpbin.org/basic-auth/itlike/123456"
req = request.Request(url, headers={"Authorization": result})
resp = request.urlopen(req)
print(resp.read().decode("utf-8"))
1.1.12下载进度监听
from urllib import request

url = "https://m801.music.126.net/20210113162434/d250854fc6dd71e3ac7f107df2305932/jdyyaac/0353/055e/565e/4910a621a524e5158013a4ccdad535d9.m4a"


def download_msg(block_num, block_size, total_size):
    """Print and return the download progress as a fraction between 0 and 1.

    Intended as the ``reporthook`` of ``urlretrieve``.

    Args:
        block_num: Number of blocks transferred so far (0 on the first call).
        block_size: Size of one block in bytes.
        total_size: Total size of the file in bytes; not positive when the
            size is unknown.

    Returns:
        The progress capped at 1, or None when the total size is unknown.
    """
    # print(block_num, block_size, total_size)
    if total_size <= 0:
        # Without a usable total there is nothing to divide by.
        return None
    # block_num already counts the blocks received, so no "+ 1" is needed;
    # adding one would report progress before any data has arrived.
    progress = block_num * block_size / total_size
    # The last block is usually only partly filled, so cap at 100%.
    progress = 1 if progress > 1 else progress
    print(progress)
    return progress


if __name__ == "__main__":
    request.urlretrieve(url, "url_test_video.mp4", reporthook=download_msg)
1.1.13异常处理
from urllib import request
from urllib import error
import socket

try:
    url = "http://localhost/test8.mp4"
    resp = request.urlopen(url)
    print(resp.read().decode("utf-8"))
# More specific handlers that can be enabled instead of the catch-all.
# HTTPError is a subclass of URLError, so it has to be listed first.
# except error.HTTPError as he:
#     print("http error", he.code, he.msg)
# except error.URLError as ue:
#     print(ue)
# except socket.timeout as te:
#     print(te)
except Exception as e:
    # Catch-all used by this example: just show what went wrong.
    print(e)
1.2 requests库
1.2.1发送get请求
import requests as rts

# GET request
# mid = 2081377
url = "https://study.163.com/category/480000003131009"
# full_url = url + "?" + "mid=2081377"

# Query parameters, given as a dict ...
pm = {"mid": "2081377"}
# ... or as a list of (name, value) pairs:
# pm = [
#     ("mid", "2081377")
# ]

# rts.request("get", url)
resp = rts.get(url, params=pm)
1.2.2发送post请求
import requests as rts

# POST request
url = "https://study.163.com/mob/search/independent/v1"
data_dic = {
    "keyword": "python",
    "pageIndex": "1",
    "pageSize": "20",
    "searchType": "0",
}
# resp = rts.request("post", url, data=data_dic, verify=False)
cert = r'charles.pem'

# Use json= when the POST body is a JSON string, and data= when the
# parameters are joined with "&".
# resp = rts.post(url, json=data_dic, verify=False)

# verify handles certificate validation: validation can be skipped, or a
# certificate path can be given.
resp = rts.post(url, data=data_dic, verify=cert)
这里需要注意的是,发送post请求时,如果使用的第三方库requests版本为2.25.1,证书验证verify字段的设置不起作用,需要降低该库的版本。使用requests 2.7版本可以解决证书验证的问题。
1.2.3获取状态码
resp.status_code  # status code of the requests response
1.2.4获取响应头信息
print(resp.reason)  # ok
print(resp.ok)  # True
print(resp.headers)  # the response headers
1.2.5获取响应体信息
# print(resp.raw)
# print(type(resp.raw))  # urllib3.response.HTTPResponse
print(resp.content)  # raw body as bytes, not decoded
print(resp.text)  # body decoded to str
1.2.6获取响应编码
print(resp.encoding)  # UTF-8
resp.encoding = 'UTF-8'  # set the encoding explicitly
1.2.7构建请求头
import requests as rts

url = "https://www.baidu.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}

# Custom headers are passed straight to the request call.
resp = rts.get(url, verify=False, headers=headers)
1.2.8处理不受信任的证书
import requests as rts

url = "https://www.1234.com"
resp = rts.get(url, verify=False)  # skip certificate verification
print(resp.text)
1.2.9设置代理
import requests as rts

url = "http://httpbin.org/ip"
proxy = {
    # "http": "110.243.9.204:9999"
    "http": "http://1924086038:xle4zavg@140.250.153.124:21160"  # private proxy (user:password@host:port)
}
resp = rts.get(url, proxies=proxy)
print(resp.text)
1.2.10验证码处理
手动处理cookies
import requests as rts

# Download the captcha image and keep the cookies from that response.
yzm_url = "https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login&rand=sjrand&0.7367307074501106"
yzm_resp = rts.get(yzm_url)
cookie = yzm_resp.cookies
with open("yzm.jpg", "wb") as f:
    f.write(yzm_resp.content)

# Answer the captcha.
answer = input("请输入验证码答案: ")

check_url = "https://kyfw.12306.cn/passport/captcha/captcha-check"
data = {
    "answer": answer,
    "login_site": "E",
    "rand": "sjrand",
}
# Pass the saved cookies along by hand.
check_resp = rts.post(check_url, data=data, cookies=cookie)
print(check_resp.text)
自动处理cookie
import requests as rts

# Both requests go through the same Session, so no cookies argument is
# passed by hand.
session = rts.Session()

yzm_url = "https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login&rand=sjrand&0.7367307074501106"
yzm_resp = session.get(yzm_url)
# cookie = yzm_resp.cookies
with open("yzm.jpg", "wb") as f:
    f.write(yzm_resp.content)

# Answer the captcha.
answer = input("请输入验证码答案: ")

check_url = "https://kyfw.12306.cn/passport/captcha/captcha-check"
data = {
    "answer": answer,
    "login_site": "E",
    "rand": "sjrand",
}
# check_resp = rts.post(check_url, data=data, cookies=cookie)
check_resp = session.post(check_url, data=data)
print(check_resp.text)
1.2.11弹窗验证授权处理
import requests as rts

url = "http://httpbin.org/basic-auth/itlike/123456"
# auth takes a (user, password) tuple.
resp = rts.get(url, auth=("itlike", "123456"))
print(resp.text)
1.2.12流式下载
按字节读取
import requests as rts

url = "https://qiubai-video-web.qiushibaike.com/B1E3Q8C4514VKRJ3_hd.mp4"
# NOTE(review): the empty Accept-Encoding presumably asks for an
# uncompressed body so the reported length matches the bytes received.
resp = rts.get(url, stream=True, verify=False, headers={"Accept-Encoding": ""})

# Report the size from the header. Reading resp.content here would load the
# whole body into memory and defeat stream=True.
print(resp.headers.get("Content-Length"))

# Read the body 1024 bytes at a time.
for chunk in resp.iter_content(1024):
    print(chunk)
按行读取
import requests as rts

url = "https://study.163.com/category/480000003131009"
pm = [("mid", "2081377")]
resp = rts.get(url, params=pm, verify=False, stream=True)
# print(resp.raw.read(30000))
# print(resp.text)

with open("163.html", "w", encoding="utf-8") as f:
    for chunk in resp.iter_lines():
        # iter_lines() yields each line without its terminator, so put the
        # newline back to keep the saved page's line structure.
        f.write(chunk.decode("utf-8") + "\n")
        # print(chunk.decode("utf-8"))
        # Optional: ask after each line whether to stop reading.
        # stop = input("请问是否停止(y/n)")
        # if stop == "y":
        #     break
1.2.14下载进度监听
import requests as rts

url = "https://qiubai-video-web.qiushibaike.com/B1E3Q8C4514VKRJ3_hd.mp4"
resp = rts.get(url, verify=False, stream=True)
print(resp.headers)

# 1. Get the total size of the file. The header can be missing, so fall
#    back to 0 ("unknown") instead of raising KeyError.
total_size = int(resp.headers.get("Content-Length", 0))

# 2. Track how much has been downloaded so far.
current_size = 0
with open("gaoxiao.mp4", "wb") as f:
    for chunk in resp.iter_content(1024 * 10):
        f.write(chunk)
        current_size += len(chunk)
        if total_size:
            print(current_size / total_size)
        else:
            # Total unknown: show the byte count rather than divide by zero.
            print(current_size)
1.2.15异常处理
import requests as rts

url = "https://study.163.com/category/480000003131009"
pm = [("mid", "2081377")]

try:
    # A single number sets one timeout value:
    # resp = rts.get(url, params=pm, timeout=0.001)
    # A tuple sets (connect timeout, read timeout) separately.
    resp = rts.get(url, params=pm, timeout=(0.001, 2))
    print(resp.ok)
except rts.exceptions.ConnectTimeout as cte:
    print("连接异常-超时", cte)
except rts.exceptions.ReadTimeout as rte:
    print("读取异常-超时", rte)
1.3 数据解析
1.3.1json字符串解析
import json

json_str = '{"status":{"code":0,"message":""},"extraInfo":null}'

# json.loads turns the JSON text into Python objects (null becomes None).
result = json.loads(json_str)
print(type(result))  # <class 'dict'>
print(result)  # {'status': {'code': 0, 'message': ''}, 'extraInfo': None}
