①: Basic syntax + request headers

```python
from urllib import parse, request
import string


def get_message(url):
    # Query parameters
    params = {
        "wd": "廖雪峰"
    }
    # Request headers
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
    }
    # URL-encode the parameters
    str_params = parse.urlencode(params)
    # Quote the full URL; required whenever the URL contains Chinese characters
    url_end = parse.quote(url + str_params, safe=string.printable)
    # urlopen has no way to add request headers, so build a Request instead
    req = request.Request(url_end, headers=header)
    # Fetch and print the response
    rt_data = request.urlopen(req)
    print(rt_data.read().decode("utf-8"))


if __name__ == '__main__':
    url = "http://www.baidu.com/s?"
    get_message(url)
```
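
A quick look at what the two encoding steps actually do; the printed values below are what CPython 3 produces:

```python
from urllib import parse
import string

# urlencode percent-encodes the UTF-8 bytes of each value
print(parse.urlencode({"wd": "廖雪峰"}))
# -> wd=%E5%BB%96%E9%9B%AA%E5%B3%B0

# quote with safe=string.printable leaves plain ASCII (including :/?&=)
# untouched and escapes only the non-ASCII characters
print(parse.quote("http://www.baidu.com/s?wd=廖雪峰", safe=string.printable))
# -> http://www.baidu.com/s?wd=%E5%BB%96%E9%9B%AA%E5%B3%B0
```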

②: + Random IP addresses (proxies)

  - urllib.request.urlopen has no way to change the IP address, as its underlying code shows
  - urlopen builds its requests mainly from the HTTPSHandler class and the build_opener() function
  - HTTPSHandler has no method for changing the IP, so swap it for ProxyHandler (see the minimal sketch below)
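
A minimal sketch of the ProxyHandler + build_opener combination; the proxy address is the free one reused in config.py below and may well be dead by now:

```python
from urllib import request

# Map each URL scheme to a proxy address (assumed to be reachable)
handler = request.ProxyHandler({"https": "77.234.220.242:3128"})
opener = request.build_opener(handler)

# Every request made through this opener is routed via the proxy
with opener.open("https://www.baidu.com") as response:
    print(response.status)
```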

Filename: config.py

```python
# User-Agent pool; in practice each entry should be a distinct browser string
HEADER = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
]
# Proxy pool; in practice each entry should be a distinct proxy address
HANDLER = [
    {"https": "77.234.220.242:3128"},
    {"https": "77.234.220.242:3128"},
    {"https": "77.234.220.242:3128"},
    {"https": "77.234.220.242:3128"},
    {"https": "77.234.220.242:3128"},
]
```
The crawler script:

```python
import string
import random
from urllib import parse, request
from config import HEADER, HANDLER


def get_message(url):
    params = {
        "wd": "廖雪峰"
    }
    # URL-encode the parameters
    str_params = parse.urlencode(params)
    str_url = parse.quote(url + str_params, safe=string.printable)
    # Pick a proxy at random
    random_handler = random.choice(HANDLER)
    # For a paid proxy: HANDLER = {"http": "name:password@uri"}
    # Build the proxy handler
    handler = request.ProxyHandler(random_handler)
    # Build an opener around it
    opener = request.build_opener(handler)
    # Pick a User-Agent at random
    random_header = random.choice(HEADER)
    # Attach the request header
    end_url = request.Request(str_url)
    end_url.add_header("User-Agent", random_header)
    # Send the request
    response = opener.open(end_url)
    # Print the page content
    print(response.read().decode("utf-8"))


if __name__ == "__main__":
    url = "http://www.baidu.com/s?"
    get_message(url)
```
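
Free proxies go offline constantly, so the opener.open() call above will often fail. A hedged sketch of a retry wrapper around this approach; the retry count and timeout are arbitrary choices, not anything urllib prescribes:

```python
import random
from urllib import request
from config import HANDLER


def open_with_retry(req, retries=3):
    """Try up to `retries` random proxies before giving up."""
    for _ in range(retries):
        handler = request.ProxyHandler(random.choice(HANDLER))
        opener = request.build_opener(handler)
        try:
            return opener.open(req, timeout=10)
        except OSError:  # URLError and socket timeouts are both OSError subclasses
            continue
    raise RuntimeError("all proxies failed")
```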

Using a paid proxy IP

```python
import random
from urllib import request
from config import HEADER

# Account name
username = "name"
# Password
pwd = "password"
# Proxy address
pay_proxy = "***.***.***.***:****"
# URL to crawl
url = "https://www.baidu.com"
# Request headers
headers = {"User-Agent": random.choice(HEADER)}
# Password manager holding the proxy credentials
password_manager = request.HTTPPasswordMgrWithPriorAuth()
# Signature is add_password(realm, uri, user, passwd)
password_manager.add_password(None, pay_proxy, username, pwd)
# Proxy handler that routes traffic through the paid proxy
proxy_handler = request.ProxyHandler({"https": pay_proxy})
# Auth handler that answers the proxy's 407 challenge with the credentials
proxy_auth_handler = request.ProxyBasicAuthHandler(password_manager)
# Attach the request headers
end_url = request.Request(url, headers=headers)
# Build the opener from both handlers
opener = request.build_opener(proxy_handler, proxy_auth_handler)
# Fetch the data and print it
print(opener.open(end_url).read().decode("utf-8"))
```
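
As the comment in the random-proxy script hints ({"http": "name:password@uri"}), many paid proxies also accept the credentials embedded directly in the proxy URL; ProxyHandler parses them out itself, so no password manager is needed. A minimal sketch; the user, password, host, and port are placeholders:

```python
from urllib import request

# Credentials embedded in the proxy URL: scheme://user:password@host:port
proxy = request.ProxyHandler({"https": "http://name:password@host:port"})
opener = request.build_opener(proxy)
print(opener.open("https://www.baidu.com").read().decode("utf-8"))
```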

③: Logging in with cookie data

  1. Cookies are used for login verification; they require an account name and password.
  2. If one account logs in many times within a short period from different locations (proxy IPs) and on different browsers (headers), the activity is flagged as non-human and the account may be banned.
  3. To crawl pages that require login verification, use multiple accounts:

```python
import string
from http import cookiejar
from urllib import request, parse
from config import HEADER


def get_cookie(url1, url2):
    # Pick a request header
    header = HEADER[0]
    # Form data for the login request
    login_push_data = {
        "username": "stringdecode",
        "pwd": "aijia7721python",
        "country": "86_zh-CN",
        "formhash": "D9DBB6F9BA",
        "backurl": "%2F%2Fwww.yaozh.com%2F",
    }
    # Encode the form data
    login_push_data = parse.urlencode(login_push_data, safe=string.printable).encode("utf-8")
    # CookieJar records the cookies the server sends back
    cookie = cookiejar.CookieJar()
    # Build a cookie-aware opener
    cookie_pro = request.HTTPCookieProcessor(cookie)
    opener = request.build_opener(cookie_pro)
    # Attach the request header
    request_url = request.Request(url1)
    request_url.add_header("User-Agent", header)
    # POST the login form; the session cookie is saved in the jar
    opener.open(request_url, data=login_push_data)
    # Attach the request header for the target page
    end_url = request.Request(url2)
    end_url.add_header("User-Agent", header)
    # Request the target page; the saved cookie authenticates the session
    target_html = opener.open(end_url).read().decode("utf-8")
    # Write the page to a file
    with open("1.html", "w", encoding="utf-8") as f:
        f.write(target_html)


if __name__ == '__main__':
    login_url = "https://www.yaozh.com/login/"
    target_url = "https://report.yaozh.com/?yaozh"
    get_cookie(login_url, target_url)
```
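
The CookieJar above lives only in memory, so the login runs again on every execution. http.cookiejar also provides MozillaCookieJar, which can save cookies to disk and load them back, letting later runs reuse the saved session. A minimal sketch; the file name cookies.txt is an arbitrary choice:

```python
from http import cookiejar
from urllib import request

COOKIE_FILE = "cookies.txt"

# After a successful login POST through a jar-backed opener, save the jar
jar = cookiejar.MozillaCookieJar(COOKIE_FILE)
opener = request.build_opener(request.HTTPCookieProcessor(jar))
# ... perform the login POST with `opener`, as in get_cookie(), then:
jar.save(ignore_discard=True, ignore_expires=True)

# In a later run, load the saved cookies instead of logging in again
jar = cookiejar.MozillaCookieJar()
jar.load(COOKIE_FILE, ignore_discard=True, ignore_expires=True)
opener = request.build_opener(request.HTTPCookieProcessor(jar))
```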