• Target site: https://movie.douban.com/top250
    • Scraping requirements:
      • 1. Page through the list and fetch each page's source
      • 2. Parse it with bs4, extracting every movie's title, rating, and detail-page link
      • 3. Save the data to a CSV file
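
A note on requirement 1: the Top 250 list paginates through a `start` query parameter, 25 titles per page, so 0-based page i begins at item i * 25 (this is what the `i * 25` in the script below relies on). A quick sketch of the URL pattern:

    for n in range(3):
        print(f'https://movie.douban.com/top250?start={n * 25}')
    # https://movie.douban.com/top250?start=0
    # https://movie.douban.com/top250?start=25
    # https://movie.douban.com/top250?start=50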

    from fake_useragent import UserAgent
    import csv
    import requests
    from bs4 import BeautifulSoup
    headers = {
        'User-Agent': UserAgent().chrome
    }
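
fake_useragent hands back a random real-browser User-Agent string on every access, which helps avoid the default `python-requests/x.y` UA that Douban is commonly reported to reject with a 418 status. A small sanity check, just a sketch, to see what actually gets sent:

    print(UserAgent().chrome)  # a random Chrome UA string; the exact value varies per run
    resp = requests.get('https://movie.douban.com/top250', headers=headers)
    print(resp.status_code)    # expect 200; a 418 here usually means the UA was rejected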

    start = int(input('start page (0-based): '))
    end = int(input('end page (exclusive): '))

    with open('豆瓣电影.csv', 'w', newline='', encoding='utf-8') as f:
        # 4. declare the column names up front (title, rating, link); each dict
        # written below is matched to these fields one by one
        writer = csv.DictWriter(f, fieldnames=['电影名', '评分', '视频链接'])
        writer.writeheader()  # write the header row once, not once per page
        for i in range(start, end):
            # 1. fetch the page source; 0-based page i starts at item i * 25
            url = f'https://movie.douban.com/top250?start={i * 25}'
            resp = requests.get(url, headers=headers)
            # 2. parse with bs4: the ratings, plus the <div class="hd"> blocks
            # that hold each title and detail-page link
            soup = BeautifulSoup(resp.text, 'lxml')
            scores = soup.find_all('span', class_='rating_num')
            links = soup.find_all('div', class_='hd')
            # 3. pair each rating with its title/link block
            data = []
            for score, link in zip(scores, links):
                data.append({
                    '电影名': link.find('span').string,
                    '评分': score.text,
                    '视频链接': link.find('a').get('href'),
                })
            writer.writerows(data)  # append this page's rows
    print('Data written successfully!')
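
To verify the selectors without hitting the site, here is a self-contained test against a hand-written snippet. The sample HTML is an assumption modeled on the list markup the script targets, not a captured response:

    from bs4 import BeautifulSoup

    # hypothetical fragment mirroring Douban's rating_num / hd structure
    sample = '''
    <span class="rating_num">9.7</span>
    <div class="hd"><a href="https://movie.douban.com/subject/1292052/">
      <span class="title">肖申克的救赎</span></a></div>
    '''
    soup = BeautifulSoup(sample, 'lxml')
    hd = soup.find('div', class_='hd')
    print(hd.find('span').string)                       # 肖申克的救赎
    print(soup.find('span', class_='rating_num').text)  # 9.7
    print(hd.find('a').get('href'))                     # https://movie.douban.com/subject/1292052/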