使用 Selenium 爬取猫眼电影 TOP100 榜单

代码

  1. from selenium import webdriver
  2. from selenium.webdriver.common.by import By
  3. import csv
  4. # 爬取内容
  def parse_data(li):
      """Scrape every movie entry on the current page and append it to ``li``.

      Reads the module-level ``driver`` (created in the ``__main__`` block) and
      looks up each ``<dd>`` element under the board's ``<dl>`` list. For every
      entry one dict with the keys ``name``, ``actor`` and ``score`` is built
      from the stripped text of the matching ``<p>`` elements.

      Args:
          li: List that receives one dict per entry; it is mutated in place.

      Returns:
          None.
      """
      # XPath, relative to a single <dd> entry, for each exported field:
      # movie name, starring actors, and score.
      field_xpaths = {
          'name': './/p[@class="name"]/a',
          'actor': './/p[@class="star"]',
          'score': './/p[@class="score"]',
      }
      entries = driver.find_elements(By.XPATH, '//*[@id="app"]/div/div/div[1]/dl//dd')
      for entry in entries:
          li.append({
              key: entry.find_element(By.XPATH, xpath).text.strip()
              for key, xpath in field_xpaths.items()
          })
  16. # 保存数据
  def write_data(li, path='猫眼电影top100.csv'):
      """Write the scraped rows to a UTF-8 CSV file with a header line.

      Args:
          li: Iterable of dicts keyed by ``name``, ``actor`` and ``score``.
              An empty iterable produces a file containing only the header.
          path: Destination file. Defaults to the file name that used to be
              hard-coded, so existing ``write_data(li)`` calls are unchanged.

      Raises:
          ValueError: If a row contains a key that is not one of the header
              columns (``csv.DictWriter`` rejects extra keys by default).
          OSError: If the file cannot be opened for writing.
      """
      header = ['name', 'actor', 'score']
      # newline='' leaves line-ending handling to the csv module, which avoids
      # blank lines between rows on Windows.
      with open(path, 'w', encoding='utf-8', newline='') as f:
          w = csv.DictWriter(f, header)
          w.writeheader()
          w.writerows(li)
      # Report success only after the file has been closed.
      print("保存成功!")
  24. # 主函数
  if __name__ == '__main__':
      # Start the browser. parse_data() reads this module-level `driver`,
      # so the name must stay as it is.
      driver = webdriver.Chrome()
      try:
          # The implicit wait is a session-wide setting that applies to every
          # later find_element call; it is not a sleep. Set it once, before the
          # first lookup, instead of repeating it after each click.
          driver.implicitly_wait(2)
          driver.get('https://www.maoyan.com/')
          # Click the reload button.
          driver.find_element(By.XPATH, '//*[@id="reload-button"]').click()
          # Navigate to the top-100 board.
          driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/ul/li[5]/a').click()
          driver.find_element(By.XPATH, '/html/body/div[3]/ul/li[5]/a').click()
          li = []
          # Walk through the pages, scraping each one before moving on.
          page_count = 10
          for i in range(page_count):
              parse_data(li)
              # Do not click "next" after the final page: a failed lookup there
              # would raise and discard everything scraped so far.
              if i < page_count - 1:
                  driver.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/ul/li[8]/a').click()
          # Save the data.
          write_data(li)
      finally:
          # Always close the browser and end the WebDriver session, even when
          # scraping fails part-way through.
          driver.quit()

展示

image.png

image.png