作业1:
- 目标网站:https://fanyi.sogou.com/text
- 爬取要求:
- 1、输入要翻译的内容
- 2、通过post请求拿到json数据(可以用urllib模块,也可以用requests模块)
- 3、把翻译后的数据提取出来,直接打印到控制台就行
代码:
import requests
import json
from requests.exceptions import HTTPError,ReadTimeout,RequestException
class Transpc:
    """Translate text through the sogou fanyi web API (https://fanyi.sogou.com)."""

    def __init__(self, search):
        # search: the text the user wants translated (source language auto-detected)
        self.search = search

    def trans(self):
        """POST the text to the sogou translate endpoint and extract the result.

        Returns:
            The translated text (str) on success, the literal string
            '请求失败' when the API reports a non-success error code, or the
            caught exception object on a network/HTTP failure (the caller
            simply prints whatever is returned).
        """
        url = 'https://fanyi.sogou.com/api/transpc/text/result'
        data = {
            "from": "auto",
            "to": "en",
            "text": self.search,
            "client": "pc",
            "fr": "browser_pc",
            "needQc": 1,
            # NOTE(review): "s" looks like a request signature and "uuid" a
            # browser session id captured from devtools — they may expire;
            # confirm they are still accepted by the server.
            "s": "7d472ca6863377bfec86011a9486bdd2",
            "uuid": "dd0ea5cc-64ab-40be-a1b4-e577bfbc965a",
            "exchange": "false"
        }
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",
            "Host": "fanyi.sogou.com",
            "Cookie": "ABTEST=7|1641552779|v17;SNUID=39ED2A00B7BD6C4C72F82FAFB889C6C5;IPLOC=CN3301;SUID=81559DB73481A20A0000000061D81B8B;wuid=1641552779646;FUV=31e661252502f474d57365768288e3d1;translate.sess=e33281fc-b170-49e6-992b-47f9c73aa08e;SUV=1641522943266;SGINPUT_UPSCREEN=1641522943282"
        }
        try:
            # timeout added so a dead connection cannot hang the script forever
            response = requests.post(url, data=data, headers=headers, timeout=10)
            result = response.json()
            # NOTE(review): errorCode 10 is treated as "success" by the
            # original author — confirm against the API's actual contract.
            error_code = result['data']['translate']['errorCode']
            if int(error_code) != 10:
                return '请求失败'
            return result['data']['sentencesData']['trans_result'][0]['trans_text']
        except RequestException as error:
            # ReadTimeout and HTTPError are both subclasses of
            # RequestException, so one handler replaces the original three
            # identical except clauses.
            return error
if __name__ == '__main__':
    # Ask for the text to translate, run it through Transpc and show the
    # outcome (either the translation, '请求失败', or a caught exception).
    text = input('请输入您要翻译的内容')
    translated = Transpc(text).trans()
    print(f'翻译的结果为:{translated}')
作业2:
- 目标网站:https://www.cnblogs.com/
- 爬取要求:
- 1、输入要搜索的内容
- 2、翻页爬取相关页面html代码
# Requirement 3: save each fetched result page to a local HTML file.
# (requests and the exception classes are already imported at the top of
# this file, so the duplicate imports from the original paste are dropped.)
class Cnblog:
    """Fetch cnblogs search-result pages and write the raw HTML to disk."""

    def __init__(self, search, page):
        # Markdown rendering ate the dunder underscores in the original
        # paste ("def init"); restored to a real constructor.
        self.search = search  # keyword to search for
        self.page = page      # page index; arrives as a str from input()

    def getPageData(self):
        """Download one search-result page and save it locally.

        Returns None on success, the HTTP reason string on a non-200
        response, or the caught exception object on a network failure.
        """
        # Bug fix: self.page is a string (main passes input() through
        # unconverted), so the original `self.page == 1` was always False
        # and page 1 silently used the paginated URL form.
        if int(self.page) == 1:
            url = 'https://zzk.cnblogs.com/s/blogpost?w=' + self.search
        else:
            url = f'https://zzk.cnblogs.com/s/blogpost?Keywords={self.search}&pageindex={self.page}'
        try:
            # timeout prevents an unresponsive server from hanging the loop
            response = requests.get(url, timeout=10)
        except RequestException as error:
            # The original imported these exception classes but never
            # caught anything; return the error like script 1 does.
            return error
        if response.status_code == 200:
            with open(f'爬取第{self.page}页数据.html', 'w+', encoding='utf-8') as f:
                f.write(response.text)
        else:
            return response.reason
# Reconstructed from the collapsed markdown line: `name`/`main` were the
# mangled dunders and the smart quotes (‘ ’) were syntax errors.
if __name__ == '__main__':
    search = input('请输入您要查询的内容')
    while True:
        page = input('请输入要搜索的页数,输入0退出爬虫')
        if int(page) == 0:
            break
        print(f'开始爬取第{page}页数据')
        obj = Cnblog(search, page)
        obj.getPageData()