# 目标网站: https://movie.douban.com/top250
# 爬取要求:
# 1、翻页获取到页面的源码
# 2、用bs4解析数据,获取到页面所有电影名、评分、和页面链接
# 3、把数据保存到csv
from fake_useragent import UserAgent
import csv
import requests
from bs4 import BeautifulSoup
# Request headers with a randomized Chrome User-Agent string, to get past
# Douban's basic bot filtering.
headers = {
    'User-Agent': UserAgent().chrome
}

start_page = int(input('开始页码'))
# BUG FIX: the original prompted "开始页码" (start page) twice, so
# range(start, start) was always empty and nothing was ever scraped.
end_page = int(input('结束页码'))

# Open the CSV once, outside the page loop. The original reopened the file
# and called writeheader() for every page, duplicating the header row.
# NOTE: 'a' (append) mode is kept from the original, so re-running the
# script appends a fresh header + rows to an existing file.
with open('豆瓣电影.csv', 'a', newline='', encoding='utf-8') as f:
    # Column names declared up front; each row dict below maps onto them.
    writer = csv.DictWriter(f, fieldnames=['电影名', '评分', '视频链接'])
    writer.writeheader()  # write the header exactly once

    for i in range(start_page, end_page):
        # Douban paginates with a 25-item offset per page.
        # NOTE(review): with this formula, entering page 1 starts at
        # offset 25 (the 2nd page); '(i - 1) * 25' may be the real intent —
        # kept as-is to preserve the original behavior.
        url = f'https://movie.douban.com/top250?start={i * 25}'
        resp = requests.get(url, headers=headers)
        soup = BeautifulSoup(resp.text, 'lxml')

        # Ratings live in <span class="rating_num">; each movie's title and
        # detail link live inside its <div class="hd"> block.
        scores = soup.find_all('span', class_='rating_num')
        links = soup.find_all('div', class_='hd')

        data = []
        for score, link in zip(scores, links):
            data.append({
                '电影名': link.find('span').string,
                '评分': score.text,
                '视频链接': link.find('a').get('href'),
            })

        writer.writerows(data)
        print('数据已经写入成功!!!')