①: Basic syntax + request headers
```python
import string
from urllib import parse, request


def get_message(url):
    # Query parameters
    params = {"wd": "廖雪峰"}
    # Request header
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"}
    # URL-encode the parameters
    str_params = parse.urlencode(params)
    # Percent-encode the full URL; required when it contains Chinese characters
    url_end = parse.quote(url + str_params, safe=string.printable)
    # urlopen has no way to set headers, so build a Request object instead
    requests = request.Request(url_end, headers=header)
    # Fetch and print the page
    rt_data = request.urlopen(requests)
    print(rt_data.read().decode("utf-8"))


if __name__ == '__main__':
    url = "http://www.baidu.com/s?"
    get_message(url)
```
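Requests like this fail routinely (timeouts, 4xx/5xx responses), and `urlopen` reports both through `urllib.error`. A minimal sketch of wrapping the call; the `fetch` helper name is an addition here, not part of the original example:

```python
from urllib import error, request


def fetch(req):
    # req is assumed to be a prepared urllib.request.Request
    try:
        with request.urlopen(req, timeout=10) as resp:
            return resp.read().decode("utf-8")
    except error.HTTPError as e:   # server answered with a 4xx/5xx status
        print("HTTP error:", e.code, e.reason)
    except error.URLError as e:    # DNS, connection, or proxy failure
        print("connection failed:", e.reason)
```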
②: + random IP addresses
- urllib.request.urlopen has no way to change the IP address, as its underlying source shows
- urlopen builds its request machinery mainly from the HTTPSHandler class and the build_opener() function
- HTTPSHandler cannot switch IPs, so swap it for ProxyHandler, as in the sketch below
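A minimal sketch of that swap, with a placeholder proxy address; `request.install_opener()` additionally makes the proxied opener the default, so plain `urlopen()` calls also go through the proxy:

```python
from urllib import request

# Placeholder proxy address; substitute a working one
proxy_handler = request.ProxyHandler({"https": "127.0.0.1:3128"})
opener = request.build_opener(proxy_handler)

# Either use the opener directly ...
# resp = opener.open("https://example.com")

# ... or install it globally so urlopen() itself uses the proxy
request.install_opener(opener)
resp = request.urlopen("https://example.com")
```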
Filename: config.py
```python
# Pools for random.choice(); in practice fill them with distinct values
HEADER = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
]
HANDLER = [
    {"https": "77.234.220.242:3128"},
    {"https": "77.234.220.242:3128"},
    {"https": "77.234.220.242:3128"},
    {"https": "77.234.220.242:3128"},
    {"https": "77.234.220.242:3128"},
]
```
```python
import random
import string
from urllib import parse, request

from config import HEADER, HANDLER


def get_message(url):
    params = {"wd": "廖雪峰"}
    # URL-encode the parameters
    str_params = parse.urlencode(params)
    str_url = parse.quote(url + str_params, safe=string.printable)
    # Pick a proxy at random
    random_handler = random.choice(HANDLER)
    # For a paid proxy: HANDLER = {"http": "name:password@uri"}
    # Build the proxy handler
    handler = request.ProxyHandler(random_handler)
    # Build the opener
    opener = request.build_opener(handler)
    # Pick a User-Agent at random
    random_header = random.choice(HEADER)
    # Attach the request header
    end_url = request.Request(str_url)
    end_url.add_header("User-Agent", random_header)
    # Send the request
    response = opener.open(end_url)
    # Print the page data
    print(response.read().decode("utf-8"))


if __name__ == "__main__":
    url = "http://www.baidu.com/s?"
    get_message(url)
```
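Free proxies go stale quickly, so a single `opener.open()` call often raises. A hedged sketch of retrying across the pool; the `open_with_retry` helper is an addition, not part of the original code:

```python
import random
from urllib import error, request

from config import HANDLER


def open_with_retry(url, proxies=HANDLER, tries=3):
    # Try up to `tries` randomly chosen proxies before giving up
    for _ in range(tries):
        opener = request.build_opener(request.ProxyHandler(random.choice(proxies)))
        try:
            return opener.open(url, timeout=10)
        except (error.URLError, OSError) as e:
            print("proxy failed, retrying:", e)
    raise RuntimeError("all proxies failed")
```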
Using paid proxy IPs
```python
import random
from urllib import request

from config import MY_USER_AGENT

# Account name
username = "name"
# Password
pwd = "password"
# Proxy address
pay_proxy = "***.***.***.***:****"
# Target URL
url = "https://www.baidu.com"
# Request headers
headers = {"User-Agent": random.choice(MY_USER_AGENT)}
# Password manager for the proxy credentials;
# the signature is add_password(realm, uri, user, passwd)
password_manager = request.HTTPPasswordMgrWithPriorAuth()
password_manager.add_password(None, pay_proxy, username, pwd)
# Route traffic through the proxy and answer its auth challenge
proxy_handler = request.ProxyHandler({"https": pay_proxy})
proxy_auth_handler = request.ProxyBasicAuthHandler(password_manager)
# Attach the request headers
end_url = request.Request(url, headers=headers)
# Build the opener
opener = request.build_opener(proxy_handler, proxy_auth_handler)
# Fetch and print the page
print(opener.open(end_url).read().decode("utf-8"))
```
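As the comment in the previous section already hints (`HANDLER = {"http": "name:password@uri"}`), the credentials can often simply be embedded in the proxy URL handed to ProxyHandler, which parses the user:password pair out itself. A sketch with placeholder values:

```python
from urllib import request

# All values below are placeholders
proxy = {"https": "http://name:password@host:port"}
opener = request.build_opener(request.ProxyHandler(proxy))
# resp = opener.open("https://www.baidu.com")
```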
③: Logging in with cookie data
- Cookies are used for login verification, which requires an account and password
- If one account logs in repeatedly within a short period from different locations (proxy IPs) and different browsers (headers), it gets flagged as non-human activity (and the account banned)
- To scrape pages that require login, use multiple accounts

```python
import string
from http import cookiejar
from urllib import request, parse

from config import HEADER


def get_cookie(url1, url2):
    # Request header
    header = HEADER[0]
    # Form data needed for the login
    login_push_data = {
        "username": "stringdecode",
        "pwd": "aijia7721python",
        "country": "86_zh-CN",
        "formhash": "D9DBB6F9BA",
        "backurl": "%2F%2Fwww.yaozh.com%2F",
    }
    # Encode the form data
    login_push_data = parse.urlencode(login_push_data, safe=string.printable).encode("utf-8")
    # cookiejar records the cookies
    cookie = cookiejar.CookieJar()
    # Build the cookie processor
    cookie_pro = request.HTTPCookieProcessor(cookie)
    opener = request.build_opener(cookie_pro)
    # Attach the request header
    request_url = request.Request(url1)
    request_url.add_header("User-Agent", header)
    # POST the login data; the session cookie is captured in the jar
    opener.open(request_url, data=login_push_data)
    # Attach the request header for the target page
    end_url = request.Request(url2)
    end_url.add_header("User-Agent", header)
    # Request the target page
    target_html = opener.open(end_url).read().decode("utf-8")
    # Write it to a file
    with open("1.html", "w", encoding="utf-8") as f:
        f.write(target_html)


if __name__ == '__main__':
    login_url = "https://www.yaozh.com/login/"
    target_url = "https://report.yaozh.com/?yaozh"
    get_cookie(login_url, target_url)
```
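With multiple accounts it helps to persist each login instead of re-authenticating on every run; a minimal sketch using http.cookiejar.MozillaCookieJar (the file name below is a placeholder):

```python
from http import cookiejar
from urllib import request

# Placeholder cookie file name
jar = cookiejar.MozillaCookieJar("yaozh_cookies.txt")
opener = request.build_opener(request.HTTPCookieProcessor(jar))

# ... log in with opener.open(...) as above, then save the cookies:
jar.save(ignore_discard=True, ignore_expires=True)

# On a later run, load them back instead of logging in again:
jar.load(ignore_discard=True, ignore_expires=True)
```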
