爬取图片作业

代码

  1. import requests
  2. from bs4 import BeautifulSoup
  3. # 解析url翻页操作
  4. # 'https://sc.chinaz.com/tupian/index_1.html' 这样不行
  5. 'https://sc.chinaz.com/tupian/'
  6. 'https://sc.chinaz.com/tupian/index_2.html'
  7. 'https://sc.chinaz.com/tupian/index_3.html'
  class TuPian:
      """Download the pictures listed on the sc.chinaz.com/tupian pages.

      Workflow (see ``main``): fetch each listing page, collect a
      ``{'name': ..., 'url': ...}`` record per picture, then download every
      picture into the current working directory.
      """

      # Browser-like header sent with every request (listing pages and images).
      HEADERS = {
          'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
      }
      # Page 1 has no ``index_1.html`` form; it lives at the bare directory URL.
      FIRST_PAGE_URL = 'https://sc.chinaz.com/tupian/'
      PAGE_URL_TEMPLATE = 'https://sc.chinaz.com/tupian/index_{}.html'
      # Seconds to wait on a request before giving up instead of hanging.
      TIMEOUT = 10
      # Characters that are path separators or illegal in Windows file names.
      _UNSAFE_CHARS = str.maketrans('\\/:*?"<>|', '_' * 9)

      @classmethod
      def _page_url(cls, page):
          """Return the listing URL for the 1-based page number *page*."""
          if page == 1:
              return cls.FIRST_PAGE_URL
          return cls.PAGE_URL_TEMPLATE.format(page)

      @staticmethod
      def _absolute_url(src):
          """Return *src* as an https URL.

          Values that already carry an http(s) scheme are returned unchanged;
          anything else (e.g. protocol-relative ``//host/path``) gets the
          ``https:`` prefix.
          """
          if src.startswith(('http://', 'https://')):
              return src
          return 'https:' + src

      @classmethod
      def _safe_filename(cls, name):
          """Return *name* with unsafe file-name characters replaced by ``_``.

          Falls back to ``'image'`` when nothing usable is left.
          """
          cleaned = name.translate(cls._UNSAFE_CHARS).strip()
          return cleaned or 'image'

      def res_url(self, url):
          """Fetch *url* and return the response body decoded as UTF-8.

          Raises:
              requests.RequestException: on connection errors, timeouts or a
                  non-2xx HTTP status.
          """
          res = requests.get(url, headers=self.HEADERS, timeout=self.TIMEOUT)
          # Fail loudly rather than parse an error page as a listing.
          res.raise_for_status()
          return res.content.decode('utf-8')

      def parse_html(self, html, li):
          """Append one ``{'name', 'url'}`` dict per picture in *html* to *li*.

          *li* is modified in place; nothing is returned. Entries with no
          ``<img>``, no ``alt`` or no ``src2`` attribute are skipped.
          """
          soup = BeautifulSoup(html, 'lxml')
          container = soup.find('div', id="container")
          if container is None:
              # Unexpected markup: nothing to collect from this page.
              return
          # NOTE(review): every third <a> is used, presumably because each
          # picture entry contributes three anchors and the first one wraps
          # the thumbnail -- verify against the live markup.
          for anchor in container.find_all('a')[::3]:
              img = anchor.find('img')
              if img is None:
                  continue
              name = img.get('alt')
              # NOTE(review): the image address is read from 'src2', not
              # 'src'; presumably a lazy-loading attribute -- confirm.
              src = img.get('src2')
              if name is None or not src:
                  continue
              li.append({'name': name, 'url': self._absolute_url(src)})

      def write_data(self, li):
          """Download every record in *li* to ``<name>.png`` in the cwd.

          A failed download is reported and skipped so that one bad picture
          does not abort the remaining ones.
          """
          for index, item in enumerate(li, start=1):
              try:
                  res = requests.get(
                      item['url'], headers=self.HEADERS, timeout=self.TIMEOUT
                  )
                  res.raise_for_status()
              except requests.RequestException as err:
                  print(f"第{index}张爬取失败: {err}")
                  continue
              # The extension is always '.png', whatever the remote format is.
              with open(self._safe_filename(item['name']) + '.png', 'wb') as f:
                  f.write(res.content)
              print(f"第{index}张爬取完成!")

      def main(self):
          """Ask for a page count, collect all pictures, then download them."""
          li = []
          num = int(input("请输入你要爬取的页数:"))
          for page in range(1, num + 1):
              self.parse_html(self.res_url(self._page_url(page)), li)
          self.write_data(li)
  if __name__ == '__main__':
      # Script entry point: build the scraper and run its interactive workflow.
      TuPian().main()

爬取的内容

image.png

图片实例

image.png image.png

image.png