- Target site: https://movie.douban.com/top250
- Scraping requirements:
- 1. Page through the list and fetch each page's HTML source
- 2. Parse the pages with bs4 and extract every movie's title, rating, and detail-page link (the markup these selectors rely on is sketched right after this list)
- 3. Save the data to a CSV file
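A quick look at the list page explains the selectors used in the script below: each movie entry has a div with class "hd" holding the link and the title span, and a separate span with class "rating_num" holding the score. The following is a minimal sketch on a hand-written HTML fragment (the real page is more deeply nested, and the title, link, and rating values here are only example data) showing how find / find_all pull the three fields out:

from bs4 import BeautifulSoup

# Simplified, hand-written stand-in for one entry on the Top 250 list page.
snippet = '''
<div class="hd">
  <a href="https://movie.douban.com/subject/1292052/">
    <span class="title">肖申克的救赎</span>
  </a>
</div>
<span class="rating_num">9.7</span>
'''

soup = BeautifulSoup(snippet, 'lxml')
hd = soup.find('div', class_='hd')
print(hd.find('span').string)                        # movie title
print(hd.find('a').get('href'))                      # detail-page link
print(soup.find('span', class_='rating_num').text)   # rating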
import requests
from bs4 import BeautifulSoup
import csv
begin = int(input('请输入开始页:'))
end = int(input('请输入结束页:'))
# A browser-like User-Agent so Douban does not reject the request outright.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/95.0.4638.69 Safari/537.36'
}

data = []  # collect rows from every page, then write the CSV once at the end
for page in range(begin, end + 1):
    # Douban shows 25 movies per page: page 1 starts at 0, page 2 at 25, and so on.
    url = f'https://movie.douban.com/top250?start={(page - 1) * 25}'
    print(url)
    response = requests.get(url, headers=headers)
    html_doc = response.text
    soup = BeautifulSoup(html_doc, 'lxml')
    # Each movie's title and link sit in a div.hd; the score sits in span.rating_num.
    div = soup.find_all('div', class_='hd')
    span = soup.find_all('span', class_='rating_num')
    for name, other in zip(div, span):
        title = name.find('span').string   # first span inside div.hd is the Chinese title
        link = name.find('a').get('href')  # link to the movie's detail page
        score = other.text
        print(title, link, score)
        data.append({
            '电影名': title,
            '链接': link,
            '评分': score
        })

# Write everything in one go; newline='' avoids blank rows on Windows,
# utf-8-sig keeps the Chinese headers readable when the file is opened in Excel.
with open('douban.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=['电影名', '链接', '评分'])
    writer.writeheader()
    writer.writerows(data)
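To confirm the write worked, the file can be read straight back with the same csv module. A minimal check (assuming douban.csv sits in the current working directory) that prints the first few rows:

import csv

with open('douban.csv', 'r', encoding='utf-8-sig') as f:
    reader = csv.DictReader(f)
    for i, row in enumerate(reader):
        print(row['电影名'], row['评分'], row['链接'])
        if i == 4:   # only show the first five rows
            break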