# Assignment 2
# Target site: https://www.cnblogs.com/
# Requirements:
#   1. Read the text to search for from the user.
#   2. Crawl the HTML of the matching search-result pages, page by page.
#   3. Save the HTML to a local file.
#
# Each run fetches the single result page chosen by the user and stores it
# as "第<page>页.html" in the current working directory.
import requests

# Read the search keyword and the index of the result page to fetch.
Keyword = input('输入内容:')
pageindex = input('输入页码:')

# Search endpoint. The keyword and page index are passed through `params`
# so that requests URL-encodes them (non-ASCII keywords included) instead of
# the query string being hard-coded to one keyword and page 1.
url = 'https://zzk.cnblogs.com/s/blogpost'
params = {'Keywords': Keyword, 'pageindex': pageindex}

# Browser-like headers (user-agent spoofing plus a captured cookie).
# NOTE(review): the cookie looks like it was copied from a browser session in
# 2022 and has probably expired; some cookie names (e.g. `gads`, `utmz`) may
# have lost leading underscores when the script was pasted -- confirm against
# a fresh browser session before relying on it.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36',
    'cookie': '_gid=GA1.2.522223452.1644990859; gads=ID=19b29d9d008329ac:T=1644990889:S=ALNI_MYgTn7vl1YgmgxQbAWv_CzL0YrnIA; utmz=59123430.1644990907.1.1.utmcsr=cnblogs.com|utmccn=(referral)|utmcmd=referral|utmcct=/; NotRobot=CfDJ8GsLOKiGtk1Au0UP1SouGdWXSSbPJFpRXDfcyWHhDJJjOp6Zd5JzV3pmg-vBAICRzicAqC4PdLAOhYoeZ2r81iekL4dWnqHzmFeS1wmYKhVAD0etHN1mZKfm5Ic-eKVO7A; Hm_lvt_866c9be12d4a814454792b1fd0fed295=1644990858,1644994817; _ga_3Q0DVSGN10=GS1.1.1644994830.1.1.1644994898.60; _ga=GA1.2.1389332791.1644990859; utma=59123430.1389332791.1644990859.1644990907.1645012342.2; utmc=59123430; utmt=1; utmb=59123430.1.10.1645012342',
}

# Send the request. The timeout keeps the script from hanging forever on a
# stalled connection; raise_for_status() stops an HTTP error page from being
# saved as if it were a search-result page.
response = requests.get(url, params=params, headers=headers, timeout=10)
response.raise_for_status()
# print(response.text)

# Persist the page HTML locally, one file per page index.
with open(f'第{pageindex}页.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
