Python Web Scraping Summary

1.1 The urllib library

Example code for each feature is shown below.

1.1.1 Sending a GET request

    from urllib import request

    url = 'https://qiubai-video-web.qiushibaike.com/MK11R93Y1A96C557_hd.mp4'
    resp = request.urlopen(url)  # returns an http.client.HTTPResponse object
    if resp.getcode() == 200:
        with open('a.mp4', 'wb') as f:
            f.write(resp.read())

1.1.2 Sending a POST request

    from urllib import request
    from urllib import parse

    url = "https://study.163.com/mob/search/independent/v1"
    data = {
        "keyword": "王顺子",
        "pageIndex": "1",
        "pageSize": "20",
        "searchType": "0"
    }
    params_str = parse.urlencode(data)
    # passing data= makes urlopen send a POST request
    resp = request.urlopen(url, data=bytes(params_str, encoding='UTF-8'))

1.1.3 Getting the status code

    resp.getcode()  # e.g. 200

1.1.4 Getting response headers

Get all header fields:

    resp.getheaders()

Get a single header field:

    resp.getheader('Content-Type')  # text/html;charset=UTF-8

1.1.5 Getting the response body

    resp.read()  # returns bytes
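A minimal end-to-end sketch of reading a body (httpbin.org is used here only as a stand-in URL): read() returns bytes, so decode it before treating the body as text.

    from urllib import request

    resp = request.urlopen("http://httpbin.org/get")  # stand-in URL for illustration
    body = resp.read()                  # raw bytes
    text = body.decode("utf-8")         # decode to str for text responses
    print(text)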

1.1.6 URL encoding and decoding

    from urllib import parse

    params = {
        'kw': 'python教程',
        'searchType': '1'
    }
    params_str = parse.urlencode(params)
    print(params_str)                   # kw=python%E6%95%99%E7%A8%8B&searchType=1
    print(parse.parse_qs(params_str))   # {'kw': ['python教程'], 'searchType': ['1']}
    print(parse.parse_qsl(params_str))  # [('kw', 'python教程'), ('searchType', '1')]
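urllib.parse also provides quote()/unquote() for encoding and decoding a single path or query component rather than a whole dict; a brief sketch (the sample value is my own):

    from urllib import parse

    word = 'python教程'                  # sample value for illustration
    encoded = parse.quote(word)          # 'python%E6%95%99%E7%A8%8B'
    print(encoded)
    print(parse.unquote(encoded))        # 'python教程'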

1.1.7 Building request headers

Sample code:

    from urllib import request

    url = "http://www.baidu.com"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }
    # 1. Build a Request object carrying the headers
    req = request.Request(url, headers=headers)
    # 2. Send the request
    resp = request.urlopen(req)
    print(resp.read().decode('UTF-8'))

1.1.8 Handling untrusted certificates

Sample code:

    from urllib import request
    import ssl

    url = "https://www.baidu.com"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }
    # 1. Build a Request object
    req = request.Request(url, headers=headers)
    # context = ssl._create_unverified_context()                  # skip certificate verification
    context = ssl.create_default_context(cafile="charles.pem")    # trust a specific certificate
    # 2. Send the request
    resp = request.urlopen(req, context=context)
    # resp = request.urlopen(req, cafile='charles.pem')           # alternative; cafile is deprecated in newer Python
    print(resp.read().decode('UTF-8'))

1.1.9 Setting a proxy

    from urllib import request

    url = "http://httpbin.org/ip"
    proxy = '121.230.209.149:18022'                      # free proxy
    # proxy = '1924086038:xle4zavg@113.57.97.228:19532'  # private (authenticated) proxy
    # 1. Create a ProxyHandler object
    proxy_handler = request.ProxyHandler({'http': proxy})
    # 2. Wrap it in an opener object
    proxy_opener = request.build_opener(proxy_handler)
    # 3. Send the request through the opener
    resp = proxy_opener.open(url)
    print(resp.read().decode('UTF-8'))

1.1.10 Captcha handling

12306 captcha example

Handling cookies manually

    from urllib import request
    from urllib import parse

    # 1. Download the captcha image and save the cookies
    url = "https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login&rand=sjrand&0.7367307074501106"
    resp = request.urlopen(url)
    # 1.1 Save the cookies
    headers_str = resp.getheader("Set-Cookie")
    headers = headers_str.split(",")
    headers_result = []
    for header in headers:
        # e.g. _passport_session=c1ac8bbf9fc44a0ba06274fdadd588cf2003; Path=/passport
        headers_result.append(header.split(";")[0])
    headers_result_str = ";".join(headers_result)
    print(headers_result_str)
    # 1.2 Save the captcha image
    with open("yzm.jpg", "wb") as f:
        f.write(resp.read())
    answer = input("Enter the captcha answer: ")
    # 2. Verify the captcha (POST)
    check_url = "https://kyfw.12306.cn/passport/captcha/captcha-check"
    data = {
        "answer": answer,
        "login_site": "E",
        "rand": "sjrand"
    }
    param = parse.urlencode(data)
    param_bytes = bytes(param, encoding="utf-8")
    check_headers = {
        "Cookie": headers_result_str
    }
    req = request.Request(check_url, data=param_bytes, headers=check_headers)
    result = request.urlopen(req)
    print(result.read().decode("utf-8"))

Handling cookies automatically

    from urllib import request
    from urllib import parse
    from http.cookiejar import CookieJar, MozillaCookieJar

    # 1. Download the captcha image and let the opener manage cookies
    url = "https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login&rand=sjrand&0.7367307074501106"
    # 1.1 Create a cookie handler
    # cookie_jar = CookieJar()       # CookieJar keeps cookies in memory only
    cookie_jar = MozillaCookieJar()  # MozillaCookieJar has a save() method to persist cookies to disk
    cookie_handler = request.HTTPCookieProcessor(cookie_jar)
    # 1.2 Build an opener object
    cookie_opener = request.build_opener(cookie_handler)
    # 1.3 Send the request with the opener
    resp = cookie_opener.open(url)
    # Inspect the cookies
    for c in cookie_jar:
        print(c)
    # cookie_jar.save("cookie.txt", ignore_discard=True, ignore_expires=True)  # save cookies
    # cookie_jar.load("cookie.txt", ignore_discard=True, ignore_expires=True)  # load cookies
    # 1.4 Save the captcha image
    with open("yzm.jpg", "wb") as f:
        f.write(resp.read())
    answer = input("Enter the captcha answer: ")
    # 2. Verify the captcha (POST)
    check_url = "https://kyfw.12306.cn/passport/captcha/captcha-check"
    data = {
        "answer": answer,
        "login_site": "E",
        "rand": "sjrand"
    }
    param = parse.urlencode(data)
    param_bytes = bytes(param, encoding="utf-8")
    req = request.Request(check_url, data=param_bytes)
    result = cookie_opener.open(req)
    print(result.read().decode("utf-8"))

Automatic handling of username/password authentication

    from urllib import request

    user = "itlike"
    pwd = "123456"
    url = "http://httpbin.org/basic-auth/itlike/123456"
    # 1. Create a password manager and the matching auth handler
    pm = request.HTTPPasswordMgrWithDefaultRealm()
    pm.add_password(None, url, user, pwd)
    handler = request.HTTPBasicAuthHandler(pm)
    # 2. Build an opener object
    opener = request.build_opener(handler)
    # 3. Use the opener for any URL that requires authentication
    resp = opener.open(url)
    print(resp.read().decode("utf-8"))

Building an opener with multiple handlers

    from urllib import request
    from http.cookiejar import CookieJar

    proxy = "111.177.177.28:9999"
    # url = "http://httpbin.org/ip"
    url = "https://www.baidu.com"
    # 1. Create the handler objects
    proxy_handler = request.ProxyHandler({"http": proxy})
    cookie_jar = CookieJar()
    cookie_handler = request.HTTPCookieProcessor(cookie_jar)
    # 2. Build a single opener from multiple handlers
    opener = request.build_opener(proxy_handler, cookie_handler)
    # 3. Send the request through the opener (accepts a url or a Request object)
    req = request.Request(url, headers={
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"
    })
    resp = opener.open(req)
    for cookie in cookie_jar:
        print(cookie)
    # print(resp.read().decode("utf-8"))

1.1.11 Handling authentication pop-ups (HTTP Basic Auth)

    from urllib import request
    import base64

    user = "itlike"
    pwd = "123456"
    # Build the Authorization header value manually: "Basic " + base64("user:password")
    result_str = user + ":" + pwd
    result = "Basic " + base64.b64encode(bytes(result_str, encoding="utf-8")).decode("utf-8")
    url = "http://httpbin.org/basic-auth/itlike/123456"
    req = request.Request(url, headers={
        "Authorization": result
    })
    resp = request.urlopen(req)
    print(resp.read().decode("utf-8"))

1.1.12 Monitoring download progress

    from urllib import request

    url = "https://m801.music.126.net/20210113162434/d250854fc6dd71e3ac7f107df2305932/jdyyaac/0353/055e/565e/4910a621a524e5158013a4ccdad535d9.m4a"

    def download_msg(block_num, block_size, total_size):
        # reporthook is called with the block number, block size, and total file size
        progress = (block_num + 1) * block_size / total_size
        progress = 1 if progress > 1 else progress
        print(progress)

    request.urlretrieve(url, "url_test_video.mp4", reporthook=download_msg)

1.1.13 Exception handling

    from urllib import request
    from urllib import error
    import socket

    try:
        url = "http://localhost/test8.mp4"
        resp = request.urlopen(url)
        print(resp.read().decode("utf-8"))
    # except error.HTTPError as he:
    #     print("http error", he.code, he.msg)
    # except error.URLError as ue:
    #     print(ue)
    # except socket.timeout as te:
    #     print(te)
    except Exception as e:
        print(e)
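The commented-out handlers above become relevant once a timeout is set: urlopen raises error.HTTPError for 4xx/5xx responses, error.URLError for connection failures, and socket.timeout when a read stalls. A minimal sketch, assuming httpbin.org is reachable (its /delay/3 endpoint waits 3 seconds before responding):

    from urllib import request, error
    import socket

    try:
        # the 1-second timeout is shorter than the server's 3-second delay, so this fails
        resp = request.urlopen("http://httpbin.org/delay/3", timeout=1)
        print(resp.getcode())
    except error.HTTPError as he:
        print("HTTP error:", he.code, he.msg)
    except error.URLError as ue:
        print("URL error:", ue.reason)
    except socket.timeout as te:
        print("Timed out:", te)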

1.2 The requests library

1.2.1 Sending a GET request

    import requests as rts

    # GET https://study.163.com/category/480000003131009?mid=2081377
    url = "https://study.163.com/category/480000003131009"
    # full_url = url + "?" + "mid=2081377"
    pm = {
        "mid": "2081377"
    }
    # Query parameters can also be passed as a list of tuples:
    # pm = [
    #     ("mid", "2081377")
    # ]
    # rts.request("get", url)  # equivalent low-level call
    resp = rts.get(url, params=pm)

1.2.2 Sending a POST request

    import requests as rts

    url = "https://study.163.com/mob/search/independent/v1"
    data_dic = {
        "keyword": "python",
        "pageIndex": "1",
        "pageSize": "20",
        "searchType": "0"
    }
    # resp = rts.request("post", url, data=data_dic, verify=False)
    cert = r'charles.pem'
    # Use json= when the API expects a JSON body, data= when it expects form-encoded key=value&... data
    # resp = rts.post(url, json=data_dic, verify=False)
    resp = rts.post(url, data=data_dic, verify=cert)  # verify=False skips certificate checks; a path points at a CA bundle

Note: when this POST request was sent with requests 2.25.1, the verify setting for certificate validation did not take effect and the library had to be downgraded; using requests 2.7 resolved the certificate-verification problem.
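If you simply skip verification with verify=False, requests (through urllib3) prints an InsecureRequestWarning on every call. A small sketch of silencing it; urllib3.disable_warnings() is a standard urllib3 helper, and the endpoint is the one from the example above:

    import requests as rts
    import urllib3

    # suppress the InsecureRequestWarning emitted when verify=False is used
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    resp = rts.post("https://study.163.com/mob/search/independent/v1",
                    data={"keyword": "python"},
                    verify=False)  # skip certificate verification entirely
    print(resp.status_code)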

1.2.3 Getting the status code

    resp.status_code  # e.g. 200
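requests can also turn error status codes into exceptions via raise_for_status(), which is part of the Response API; a brief sketch using an httpbin.org placeholder endpoint:

    import requests as rts

    resp = rts.get("http://httpbin.org/status/404")  # placeholder URL that returns 404
    print(resp.status_code)  # 404
    resp.raise_for_status()  # raises requests.exceptions.HTTPError for 4xx/5xx responses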

1.2.4 Getting response headers

    print(resp.reason)   # e.g. 'OK'
    print(resp.ok)       # True for status codes below 400
    print(resp.headers)  # response headers as a case-insensitive dict

1.2.5 Getting the response body

    # print(resp.raw)
    # print(type(resp.raw))  # urllib3.response.HTTPResponse
    print(resp.content)      # raw bytes, not decoded
    print(resp.text)         # str, decoded using the detected encoding

1.2.6 Getting the response encoding

    print(resp.encoding)     # e.g. UTF-8
    resp.encoding = 'UTF-8'  # override the encoding used by resp.text
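When the server does not declare a charset, resp.text can come out garbled; resp.apparent_encoding (detected from the body content) can be assigned instead. A small sketch, reusing a URL from earlier sections:

    import requests as rts

    resp = rts.get("https://www.baidu.com")   # page whose declared charset may be missing or wrong
    resp.encoding = resp.apparent_encoding    # use the encoding detected from the content
    print(resp.text[:200])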

1.2.7 Building request headers

    import requests as rts

    url = "https://www.baidu.com"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }
    resp = rts.get(url, verify=False, headers=headers)

1.2.8 Handling untrusted certificates

    import requests as rts

    url = "https://www.1234.com"
    resp = rts.get(url, verify=False)  # skip certificate verification
    print(resp.text)

1.2.9 Setting a proxy

    import requests as rts

    url = "http://httpbin.org/ip"
    proxy = {
        # "http": "110.243.9.204:9999"
        "http": "http://1924086038:xle4zavg@140.250.153.124:21160"  # private (authenticated) proxy
    }
    resp = rts.get(url, proxies=proxy)
    print(resp.text)

1.2.10 Captcha handling

Handling cookies manually

    import requests as rts

    yzm_url = "https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login&rand=sjrand&0.7367307074501106"
    yzm_resp = rts.get(yzm_url)
    cookie = yzm_resp.cookies
    with open("yzm.jpg", "wb") as f:
        f.write(yzm_resp.content)
    # Answer the captcha
    answer = input("Enter the captcha answer: ")
    check_url = "https://kyfw.12306.cn/passport/captcha/captcha-check"
    data = {
        "answer": answer,
        "login_site": "E",
        "rand": "sjrand"
    }
    check_resp = rts.post(check_url, data=data, cookies=cookie)
    print(check_resp.text)

Handling cookies automatically

    import requests as rts

    # A Session object keeps cookies across requests automatically
    session = rts.Session()
    yzm_url = "https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login&rand=sjrand&0.7367307074501106"
    yzm_resp = session.get(yzm_url)
    # cookie = yzm_resp.cookies
    with open("yzm.jpg", "wb") as f:
        f.write(yzm_resp.content)
    # Answer the captcha
    answer = input("Enter the captcha answer: ")
    check_url = "https://kyfw.12306.cn/passport/captcha/captcha-check"
    data = {
        "answer": answer,
        "login_site": "E",
        "rand": "sjrand"
    }
    # check_resp = rts.post(check_url, data=data, cookies=cookie)
    check_resp = session.post(check_url, data=data)
    print(check_resp.text)

1.2.11 Handling authentication pop-ups (HTTP Basic Auth)

    import requests as rts

    url = "http://httpbin.org/basic-auth/itlike/123456"
    resp = rts.get(url, auth=("itlike", "123456"))
    print(resp.text)
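The (user, password) tuple is shorthand for requests.auth.HTTPBasicAuth; a brief equivalent sketch (requests.auth.HTTPDigestAuth can be swapped in the same way for digest-protected endpoints):

    import requests as rts
    from requests.auth import HTTPBasicAuth

    url = "http://httpbin.org/basic-auth/itlike/123456"
    resp = rts.get(url, auth=HTTPBasicAuth("itlike", "123456"))  # same as auth=("itlike", "123456")
    print(resp.status_code)  # 200 when the credentials are accepted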

1.2.12 Streaming downloads

Reading in byte chunks

    import requests as rts

    url = "https://qiubai-video-web.qiushibaike.com/B1E3Q8C4514VKRJ3_hd.mp4"
    resp = rts.get(url, stream=True, verify=False, headers={
        "Accept-Encoding": ""
    })
    print(len(resp.content))  # note: accessing .content downloads the entire body
    for chunk in resp.iter_content(1024):
        print(chunk)

Reading line by line

    import requests as rts

    url = "https://study.163.com/category/480000003131009"
    pm = [
        ("mid", "2081377")
    ]
    resp = rts.get(url, params=pm, verify=False, stream=True)
    # print(resp.raw.read(30000))
    # print(resp.text)
    with open("163.html", "w", encoding="utf-8") as f:
        for chunk in resp.iter_lines():
            f.write(chunk.decode("utf-8"))
            # print(chunk.decode("utf-8"))
            # stop = input("Stop here? (y/n)")
            # if stop == "y":
            #     break

1.2.13 Monitoring download progress

    import requests as rts

    url = "https://qiubai-video-web.qiushibaike.com/B1E3Q8C4514VKRJ3_hd.mp4"
    resp = rts.get(url, verify=False, stream=True)
    print(resp.headers)
    # 1. Get the total file size from the response headers
    total_size = int(resp.headers["Content-Length"])
    # 2. Track how much has been downloaded so far
    current_size = 0
    with open("gaoxiao.mp4", "wb") as f:
        for chunk in resp.iter_content(1024 * 10):
            f.write(chunk)
            current_size += len(chunk)
            print(current_size / total_size)

1.2.14 Exception handling

    import requests as rts

    url = "https://study.163.com/category/480000003131009"
    pm = [
        ("mid", "2081377")
    ]
    try:
        # resp = rts.get(url, params=pm, timeout=0.001)
        # timeout=(connect_timeout, read_timeout)
        resp = rts.get(url, params=pm, timeout=(0.001, 2))
        print(resp.ok)
    except rts.exceptions.ConnectTimeout as cte:
        print("Connection timed out", cte)
    except rts.exceptions.ReadTimeout as rte:
        print("Read timed out", rte)
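All requests exceptions derive from requests.exceptions.RequestException, so a catch-all handler can follow the specific ones; a minimal sketch with a placeholder httpbin.org URL:

    import requests as rts

    try:
        resp = rts.get("http://httpbin.org/get", timeout=3)  # placeholder URL
        print(resp.ok)
    except rts.exceptions.Timeout as te:          # covers both ConnectTimeout and ReadTimeout
        print("Timed out", te)
    except rts.exceptions.RequestException as e:  # base class for all requests errors
        print("Request failed", e)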

1.3 Data parsing

1.3.1 Parsing JSON strings

    import json

    json_str = '{"status":{"code":0,"message":""},"extraInfo":null}'
    result = json.loads(json_str)
    print(type(result))  # <class 'dict'>
    print(result)        # {'status': {'code': 0, 'message': ''}, 'extraInfo': None}
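Going the other way, json.dumps() serializes a Python object back to a JSON string, and requests responses expose a .json() shortcut; a brief sketch (httpbin.org used as a placeholder endpoint that returns JSON):

    import json
    import requests as rts

    data = {"status": {"code": 0, "message": ""}, "extraInfo": None}
    print(json.dumps(data, ensure_ascii=False))  # serialize a dict back to a JSON string

    # requests can parse a JSON response body directly
    resp = rts.get("http://httpbin.org/json")    # placeholder endpoint returning JSON
    print(resp.json())                           # equivalent to json.loads(resp.text)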