使用selenium爬取猫眼top100
代码
from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
# 爬取内容
def parse_data(li):
dd_tags = driver.find_elements(By.XPATH, '//*[@id="app"]/div/div/div[1]/dl//dd')
for dd_tag in dd_tags:
item = {}
# 爬取影名
item['name'] = dd_tag.find_element(By.XPATH, './/p[@class="name"]/a').text.strip()
# 爬取主演
item['actor'] = dd_tag.find_element(By.XPATH, './/p[@class="star"]').text.strip()
# 爬取评分
item['score'] = dd_tag.find_element(By.XPATH, './/p[@class="score"]').text.strip()
li.append(item)
# 保存数据
def write_data(li):
header = ['name', 'actor', 'score']
with open('猫眼电影top100.csv', 'w', encoding='utf-8', newline='') as f:
w = csv.DictWriter(f, header)
w.writeheader()
w.writerows(li)
print("保存成功!")
# 主函数
if __name__ == '__main__':
# 加载驱动
driver = webdriver.Chrome()
driver.get('https://www.maoyan.com/')
# 点击重新加载
driver.find_element(By.XPATH, '//*[@id="reload-button"]').click()
# 隐式等待
driver.implicitly_wait(2)
# 点击top100榜单
driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/ul/li[5]/a').click()
driver.implicitly_wait(2)
driver.find_element(By.XPATH, '/html/body/div[3]/ul/li[5]/a').click()
driver.implicitly_wait(2)
li = []
# 翻页操作
for i in range(10):
parse_data(li)
driver.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/ul/li[8]/a').click()
# 保存数据
write_data(li)
展示