import requests
import csv
from bs4 import BeautifulSoup as bs
from fake_useragent import UserAgent
import os
'''
https://movie.douban.com/top250?start=0&filter=
https://movie.douban.com/top250?start=25&filter=
https://movie.douban.com/top250?start=50&filter=
'''
# Download and cache one listing page
def savePage(page):
    """Download one Douban Top250 listing page and cache it to disk.

    page: 0-based page index. Each listing page holds 25 movies, so the
    ``start`` query parameter steps by 25 (see the sample URLs above).
    The response is saved as '<page+1>页.html'; an existing file is
    reused and no request is made.
    """
    # BUG FIX: the multiplier was 50, but Douban paginates by 25 —
    # with *50, pages 5-9 requested start=250..450, past the end of
    # the 250-movie list, so half the data was never fetched.
    start = page * 25
    url = f'https://movie.douban.com/top250?start={start}&filter='
    headers = {
        'User-Agent': UserAgent().random,  # rotate UA to reduce blocking
        'Referer': url,
    }
    if not os.path.exists(f'{page + 1}页.html'):
        try:
            # timeout keeps a stalled connection from hanging the crawl
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                with open(f'{page + 1}页.html', 'w', encoding='utf-8') as file:
                    file.write(response.text)
            else:
                print(response.reason)
        except Exception as error:
            # best-effort crawl: report the failure and move on
            print(error)
# Parse one cached listing page into movie records
def getPageInfo(page, all_movie):
    """Extract title, rating and one-line quote from the cached file
    '<page+1>页.html', appending one dict per movie to all_movie.

    Returns all_movie (mutated in place) for caller convenience.
    """
    with open(f'{page + 1}页.html', 'r', encoding='utf-8') as fh:
        soup = bs(fh.read(), 'lxml')
    for block in soup.find_all('div', class_='info'):
        strings = list(block.stripped_strings)
        # First two stripped strings are the Chinese title and the
        # alternate/original title fragment.
        name = strings[0] + strings[1]
        rating = block.find('span', class_='rating_num').string
        quote_tags = block.select('span[class="inq"]')  # CSS selector
        quote = quote_tags[0].text if quote_tags else '无影评'
        all_movie.append({'电影名称': name, '评分': rating, '影评分': quote})
    return all_movie
# Write the collected movie records out as CSV
def saveCsv(movie_list):
    """Dump movie_list (dicts keyed 电影名称/评分/影评分) to 'movies.csv',
    header row first, overwriting any previous file."""
    columns = ['电影名称', '评分', '影评分']
    with open('movies.csv', 'w', encoding='utf-8', newline='') as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        writer.writerows(movie_list)
if __name__ == '__main__':
    # Crawl all 10 listing pages, accumulating records in one list,
    # then write a single CSV at the end.
    collected = []
    for page_index in range(10):
        savePage(page_index)
        collected = getPageInfo(page_index, collected)
    saveCsv(collected)
    print('已写入完毕')