爬取豆瓣电影 Top250

代码

  1. import requests
  2. from bs4 import BeautifulSoup
  3. import csv
  4. '''
  5. 处理翻页操作,分析url
  6. https://movie.douban.com/top250?start=0
  7. https://movie.douban.com/top250?start=25
  8. https://movie.douban.com/top250?start=50
  9. '''
  10. class DouBan:
  11. def __init__(self):
  12. self.headers = {
  13. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
  14. }
  15. self.url = 'https://movie.douban.com/top250?start={}'
  16. self.li = []
  17. self.header = ['name', 'score', 'review']
  18. def read_url(self, url):
  19. res = requests.get(url, headers=self.headers)
  20. html = res.content.decode('utf-8')
  21. # print(html)
  22. # print('-'*100)
  23. return html
  24. def parse_html(self, html):
  25. soup = BeautifulSoup(html, 'lxml')
  26. ol_tag = soup.find('ol', class_="grid_view")
  27. # print(ol_tag)
  28. li_tags = ol_tag.find_all('li')
  29. for li_tag in li_tags:
  30. item = {}
  31. div_hd_tag = li_tag.find('div', class_="hd")
  32. div_bd_tag = li_tag.find('div', class_="bd")
  33. name = div_hd_tag.find('span', class_="title").string
  34. item['name'] = name
  35. score = div_bd_tag.find('span', class_="rating_num").string
  36. item['score'] = score
  37. # 处理没有影评的操作
  38. try:
  39. review = div_bd_tag.find('span', class_="inq").string
  40. item['review'] = review
  41. except:
  42. item['review'] = '无'
  43. self.li.append(item)
  44. def write_data(self):
  45. with open("豆瓣top250.csv", 'w', encoding='utf-8', newline='') as f:
  46. w = csv.DictWriter(f, self.header)
  47. w.writeheader()
  48. w.writerows(self.li)
  49. def main(self):
  50. num = int(input("请输入你要爬取的页数:(一共有十页)"))
  51. for i in range(num):
  52. url = self.url.format(i*25)
  53. html = self.read_url(url)
  54. self.parse_html(html)
  55. # print(self.li)
  56. self.write_data()
  57. if __name__ == '__main__':
  58. db = DouBan()
  59. db.main()

结果展示

image.png
image.png