作业1:

  • 目标网站:https://fanyi.sogou.com/text
  • 爬取要求:
    • 1、输入要翻译的内容
    • 2、通过post请求拿到json数据(可以用urllib模块,也可以用requests模块)
    • 3、把翻译后的数据提取出来,直接打印到控制台就行

代码:

  1. import requests
  2. import json
  3. from requests.exceptions import HTTPError,ReadTimeout,RequestException
  4. class Transpc:
  5. def __init__(self,search):
  6. self.search = search
  7. def trans(self):
  8. url = 'https://fanyi.sogou.com/api/transpc/text/result'
  9. data = {
  10. "from": "auto",
  11. "to": "en",
  12. "text": self.search,
  13. "client": "pc",
  14. "fr": "browser_pc",
  15. "needQc": 1,
  16. "s": "7d472ca6863377bfec86011a9486bdd2",
  17. "uuid": "dd0ea5cc-64ab-40be-a1b4-e577bfbc965a",
  18. "exchange": "false"
  19. }
  20. headers = {
  21. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",
  22. "Host": "fanyi.sogou.com",
  23. "Cookie":"ABTEST=7|1641552779|v17;SNUID=39ED2A00B7BD6C4C72F82FAFB889C6C5;IPLOC=CN3301;SUID=81559DB73481A20A0000000061D81B8B;wuid=1641552779646;FUV=31e661252502f474d57365768288e3d1;translate.sess=e33281fc-b170-49e6-992b-47f9c73aa08e;SUV=1641522943266;SGINPUT_UPSCREEN=1641522943282"
  24. }
  25. try:
  26. response = requests.post(url, data=data,headers=headers)
  27. result = json.loads(response.text)
  28. errorCode = result['data']['translate']['errorCode']
  29. if int(errorCode) != 10:
  30. return '请求失败'
  31. else:
  32. return result['data']['sentencesData']['trans_result'][0]['trans_text']
  33. except ReadTimeout as error:
  34. return error
  35. except HTTPError as error:
  36. return error
  37. except RequestException as error:
  38. return error
  39. if __name__ == '__main__':
  40. search = input('请输入您要翻译的内容')
  41. transpc = Transpc(search)
  42. result = transpc.trans()
  43. print(f'翻译的结果为:{result}')

作业2:

  • 目标网站:https://www.cnblogs.com/
  • 爬取要求:

    • 1、输入要搜索的内容
    • 2、翻页爬取相关页面html代码
    • 3、保存到本地 ``` import requests from requests.exceptions import HTTPError,ReadTimeout,RequestException class Cnblog: def init(self,search,page): self.search = search self.page = page

      def getPageData(self):
      if self.page == 1:

      1. url = 'https://zzk.cnblogs.com/s/blogpost?w=' + self.search

      else:

      1. url = f'https://zzk.cnblogs.com/s/blogpost?Keywords={self.search}&pageindex={self.page}'

      response = requests.get(url) if response.status_code == 200:

      1. with open(f'爬取第{self.page}页数据.html','w+',encoding='utf-8') as f:
      2. f.write(response.text)

      else:

      1. return response.reason

if __name__ == '__main__':
    # Fix: markdown extraction stripped the dunder underscores
    # (``name``/``main``) and turned string quotes into curly quotes,
    # which is not valid Python; reconstructed with identical prompts.
    search = input('请输入您要查询的内容')
    while True:
        page = input('请输入要搜索的页数,输入0退出爬虫')
        if int(page) == 0:
            break
        print(f'开始爬取第{page}页数据')
        obj = Cnblog(search, page)
        obj.getPageData()