①: Basic syntax + request headers
```python
from urllib import parse, request
import string

def get_message(url):
    # Query parameters
    params = {
        "wd": "廖雪峰"
    }
    # Request headers
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
    }
    # Percent-encode the parameters
    str_params = parse.urlencode(params)
    # Percent-encode the full URL; needed when the URL contains Chinese characters
    url_end = parse.quote(url + str_params, safe=string.printable)
    # Attach the headers: urlopen cannot set headers, so build a Request object instead
    req = request.Request(url_end, headers=header)
    # Fetch the response
    rt_data = request.urlopen(req)
    print(rt_data.read().decode("utf-8"))

if __name__ == '__main__':
    url = "http://www.baidu.com/s?"
    get_message(url)
```
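Note that `parse.urlencode` already percent-encodes non-ASCII parameter values, so the extra `parse.quote(..., safe=string.printable)` pass only matters when the base URL itself contains Chinese characters. A quick check:

```python
from urllib import parse

# urlencode percent-encodes the UTF-8 bytes of each value:
print(parse.urlencode({"wd": "廖雪峰"}))  # wd=%E5%BB%96%E9%9B%AA%E5%B3%B0
```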
②: + random proxy IPs
- urllib.request.urlopen has no way to change the outgoing IP address, as its underlying code shows
- urlopen builds its requests mainly from handler classes such as HTTPSHandler, wired together by the build_opener() function
- HTTPSHandler cannot switch IPs, so swap in a ProxyHandler instead (see the sketch below)
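A minimal sketch of that swap (the proxy address is a placeholder, not a known working proxy); the full version below picks proxies at random:

```python
from urllib import request

# ProxyHandler routes all traffic through the given proxy instead of a direct connection
proxy = request.ProxyHandler({"https": "77.234.220.242:3128"})  # placeholder address
opener = request.build_opener(proxy)
# response = opener.open("https://www.baidu.com")
```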
File: config.py

```python
# User-Agent pool; in practice each entry should be a different browser
# string (the same one is repeated here as a placeholder).
HEADER = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
]
# Proxy pool; in practice each entry should be a different proxy address
# (the same one is repeated here as a placeholder).
HANDLER = [
    {"https": "77.234.220.242:3128"},
    {"https": "77.234.220.242:3128"},
    {"https": "77.234.220.242:3128"},
    {"https": "77.234.220.242:3128"},
    {"https": "77.234.220.242:3128"},
]
```
```python
import string
import random
from urllib import parse, request
from config import HEADER, HANDLER

def get_message(url):
    params = {
        "wd": "廖雪峰"
    }
    # Percent-encode the parameters and the URL
    str_params = parse.urlencode(params)
    str_url = parse.quote(url + str_params, safe=string.printable)
    # Pick a proxy at random
    random_handler = random.choice(HANDLER)
    # Paid proxy format: HANDLER = {"http": "name:password@uri"}
    # Build the proxy handler
    handler = request.ProxyHandler(random_handler)
    # Build the opener
    opener = request.build_opener(handler)
    # Pick a User-Agent at random
    random_header = random.choice(HEADER)
    # Attach the header
    end_url = request.Request(str_url)
    end_url.add_header("User-Agent", random_header)
    # Send the request
    response = opener.open(end_url)
    # Print the page data
    print(response.read().decode("utf-8"))

if __name__ == "__main__":
    url = "http://www.baidu.com/s?"
    get_message(url)
```
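Free proxies die often, so in practice the open call should tolerate failures and retry with another address. A minimal sketch, assuming the same HANDLER pool:

```python
import random
from urllib import request
from urllib.error import URLError
from config import HANDLER

def open_with_retry(req, retries=3):
    # Try up to `retries` randomly chosen proxies before giving up
    for _ in range(retries):
        opener = request.build_opener(request.ProxyHandler(random.choice(HANDLER)))
        try:
            return opener.open(req, timeout=10)
        except URLError:
            continue  # proxy is dead or refused the connection; try another
    raise RuntimeError("all proxies failed")
```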
Using a paid proxy IP
```python
import random
from urllib import request
from config import HEADER

# Proxy account name
username = "name"
# Proxy password
pwd = "password"
# Proxy address
pay_proxy = "***.***.***.***:****"
# Target URL
url = "https://www.baidu.com"
# Request headers
headers = {"User-Agent": random.choice(HEADER)}
# Password manager for the proxy credentials
password_manager = request.HTTPPasswordMgrWithPriorAuth()
# add_password(realm, uri, user, passwd) -- the user name comes before the password
password_manager.add_password(None, pay_proxy, username, pwd)
# Proxy handler that routes traffic through the paid proxy
proxy_handler = request.ProxyHandler({"https": pay_proxy})
# Auth handler that answers the proxy's Basic-auth challenge
proxy_auth_handler = request.ProxyBasicAuthHandler(password_manager)
# Attach the headers
end_url = request.Request(url, headers=headers)
# Build the opener with both handlers
opener = request.build_opener(proxy_handler, proxy_auth_handler)
# Fetch the data and print it
print(opener.open(end_url).read().decode("utf-8"))
```
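As the `name:password@uri` comment above suggests, a simpler alternative is to embed the credentials directly in the proxy URL and skip the password manager; a sketch with the same placeholder values:

```python
from urllib import request

# ProxyHandler parses user:password out of the proxy URL and adds the
# Proxy-Authorization header itself (placeholder credentials and address)
proxy = request.ProxyHandler({"https": "http://name:password@***.***.***.***:****"})
opener = request.build_opener(proxy)
# print(opener.open("https://www.baidu.com").read().decode("utf-8"))
```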
③: Logging in with cookie data
- 1. Cookies carry the login state, so an account name and password are needed
- 2. One account logging in many times within a short period from different locations (proxy IPs) and different browsers (headers) will be flagged as non-human activity and banned
- 3. To crawl pages that require login, use multiple accounts

```python
import string
from http import cookiejar
from urllib import request, parse
from config import HEADER
def get_cookie(url1, url2):
    # Pick a User-Agent
    header = HEADER[0]
    # Form data for the login request
    login_push_data = {
        "username": "stringdecode",
        "pwd": "aijia7721python",
        "country": "86_zh-CN",
        "formhash": "D9DBB6F9BA",
        "backurl": "%2F%2Fwww.yaozh.com%2F",
    }
    # Percent-encode the form data and convert it to bytes
    login_push_data = parse.urlencode(login_push_data, safe=string.printable).encode("utf-8")
    # CookieJar records the cookies the server sends back
    cookie = cookiejar.CookieJar()
    # Cookie processor that stores and resends those cookies
    cookie_pro = request.HTTPCookieProcessor(cookie)
    opener = request.build_opener(cookie_pro)
    # Attach the headers to the login request
    request_url = request.Request(url1)
    request_url.add_header("User-Agent", header)
    # POST the login form; the session cookie is kept in the jar
    opener.open(request_url, data=login_push_data)
    # Attach the headers to the target request
    end_url = request.Request(url2)
    end_url.add_header("User-Agent", header)
    # Request the target page; the saved cookie is sent automatically
    target_html = opener.open(end_url).read().decode("utf-8")
    # Write it to a file
    with open("1.html", "w", encoding="utf-8") as f:
        f.write(target_html)

if __name__ == "__main__":
    login_url = "https://www.yaozh.com/login/"
    target_url = "https://report.yaozh.com/?yaozh"
    get_cookie(login_url, target_url)
```
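To reuse a login across runs instead of logging in every time, the jar can be persisted; a minimal sketch using http.cookiejar.MozillaCookieJar with the same opener setup:

```python
from http import cookiejar
from urllib import request

# After logging in, save the cookies to disk
cookie = cookiejar.MozillaCookieJar("cookies.txt")
opener = request.build_opener(request.HTTPCookieProcessor(cookie))
# ... opener.open(request_url, data=login_push_data) ...
cookie.save(ignore_discard=True, ignore_expires=True)

# On a later run, load them back and skip the login step
cookie = cookiejar.MozillaCookieJar()
cookie.load("cookies.txt", ignore_discard=True, ignore_expires=True)
opener = request.build_opener(request.HTTPCookieProcessor(cookie))
```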