1. 作业1

  • 目标网站:https://fanyi.sogou.com/text
  • 爬取要求:
    • 1、输入要翻译的内容
    • 2、通过post请求拿到json数据(可以用urllib模块,也可以用requests模块)
    • 3、把翻译后的数据提取出来,直接打印到控制台就行

import requests
import json

# Endpoint that returns translation suggestions as JSON.
url = 'https://fanyi.sogou.com/reventondc/suggV3'


def build_payload(text):
    """Return the request fields asking for *text* to be translated to English.

    The source language is left to the service ('auto'); every other field is
    a fixed value sent with each request.
    """
    return {
        'from': 'auto',
        'to': 'en',
        'client': 'web',
        'text': text,
        'uuid': 'ddcc0c84-ab0e-400f-9d01-3a4b58084ebf',
        'pid': 'sogou-dict-vr',
        'addSugg': 'on',
    }


def first_suggestion(payload):
    """Return the 'v' value of the first entry of payload['sugg'].

    Returns None when *payload* is not a dict, has no 'sugg' list, or the
    list is empty, so the caller can report "no result" instead of crashing
    with a KeyError/IndexError.
    """
    if not isinstance(payload, dict):
        return None
    suggestions = payload.get('sugg') or []
    if not suggestions:
        return None
    return suggestions[0].get('v')


def run_translation():
    """Prompt for a word, query the suggestion endpoint and print the result."""
    work = input('请输入要翻译的单词')
    # NOTE(review): the fields are sent in the query string (params=), as in
    # the original script; if the endpoint expects a form body, switch to
    # data= -- confirm against the live service.
    response = requests.post(url=url, params=build_payload(work), timeout=10)
    # Fail with a clear HTTP error instead of a confusing JSON decode error.
    response.raise_for_status()
    zd = json.loads(response.text)
    result = first_suggestion(zd)
    if result is None:
        print('未找到翻译结果')
    else:
        print(result)


if __name__ == '__main__':
    run_translation()

2. 作业2

  • 目标网站:https://www.cnblogs.com/
  • 爬取要求:
    • 1、输入要搜索的内容
    • 2、翻页爬取相关页面html代码
    • 3、保存到本地

import requests
# Request headers sent with every search request.
# NOTE(review): the cookie is a captured browser session and has presumably
# expired; replace it with a current one before running. Some cookie names
# (e.g. gads, utma) may have lost a leading "__" when this text was
# formatted -- confirm against a real browser session.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    'cookie': '_ga=GA1.2.1952270957.1593319406; gads=ID=0472d29db903ce1b:T=1625461152:R:S=ALNI_MbvV2QLCbdqA96krpuOIFOS41ZThg; UM_distinctid=17dd19ddbef5fd-0ba1b545a242f-b7a1438-149c48-17dd19ddbf0531; _gid=GA1.2.1423180869.1641566913; ShitNoRobotCookie=CfDJ8GsLOKiGtk1Au0UP1SouGdUUxCQfD3C3uoJmcz16VICHhFee8lEnRsBh7YeC0XGEq8iuqAzXtyBsO1l6L8l54ZWMZtrrHb-WD6cylaILZgMsYglcAfYFeUWTsvBEGHT4YQ; Hm_lvt_866c9be12d4a814454792b1fd0fed295=1640398175,1641198257,1641566912,1641614990; Hm_lpvt_866c9be12d4a814454792b1fd0fed295=1641614990; _gat_gtag_UA_476124_1=1; DetectCookieSupport=OK; utma=59123430.1952270957.1593319406.1641612123.1641614999.3; utmc=59123430; utmz=59123430.1641614999.3.2.utmcsr=cnblogs.com|utmccn=(referral)|utmcmd=referral|utmcct=/; utmt=1; utmb=59123430.1.10.1641614999',
}


def search_params(key, n):
    """Return the query-string fields for page *n* of a search for *key*."""
    return {'Keywords': key, 'pageindex': n}


def page_path(n):
    """Return the local file path used to store result page *n*."""
    return f'./保存的html/第{n}页.html'


def save_page(n, html):
    """Write *html* to the file for page *n*, creating the folder if needed."""
    import os  # local import: only this helper needs it

    path = page_path(n)
    # open() does not create missing directories, so make sure it exists.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # 'w' creates the file (or truncates an existing one) before writing.
    with open(path, 'w', encoding='utf-8') as file:
        file.write(html)
    return path


def run_search_crawl():
    """Prompt for a keyword, then download and save result pages 1 to 3."""
    key = input('请输入要查询的内容')
    for n in range(1, 4):
        # Passing the keyword via params= lets requests URL-encode it, so
        # characters such as '&' or '#' cannot corrupt the query string.
        rep = requests.get(
            'https://zzk.cnblogs.com/s/blogpost',
            params=search_params(key, n),
            headers=headers,
            timeout=10,
        )
        html = rep.text
        save_page(n, html)
        print(f'第{n}页.html写入成功')


if __name__ == '__main__':
    run_search_crawl()