import csv,requests,time
from bs4 import BeautifulSoup
def res_html(url, headers):
    """Fetch *url* with the given request headers and pass the response to jiexi().

    Args:
        url: Page URL to download.
        headers: Request headers dict (Cookie / Host / User-Agent) sent as-is.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    res = requests.get(url=url, headers=headers)
    # Fail fast on an error status instead of silently parsing an error page.
    res.raise_for_status()
    jiexi(res=res)
def jiexi(res):
    """Parse one Douban Top250 result page and collect title / rating / link.

    Args:
        res: requests Response whose body is the UTF-8 HTML of a Top250 page.

    Side effects: saves the parsed rows via baocun() and prints them.
    """
    data_list = []
    # NOTE: original source used smart quotes ('utf-8'/'lxml') which were a
    # syntax error; fixed to plain ASCII quotes.
    soup = BeautifulSoup(res.content.decode('utf-8'), 'lxml')
    ol = soup.ol.contents
    # The <ol> contents alternate whitespace text nodes and <li> tags, so
    # indices 1, 3, ..., 49 hit the 25 movie entries on the page.
    for i in range(1, 51, 2):
        li = ol[i]
        data_list.append({
            "名称": li.span.string,
            "评分": li.find(property="v:average").string,
            "详情链接": li.a["href"],
        })
    baocun(data_list=data_list)
    print(data_list)


def baocun(data_list):
    """Append one page of scraped rows to a per-page CSV file.

    Reads the module-level ``page`` counter (set in __main__) to build the
    file name, and ``data_headers`` for the column order.
    """
    # encoding='utf-8-sig' makes the Chinese text portable and lets Excel on
    # Windows detect UTF-8; without it the platform default (often gbk) is used.
    with open(f'D:\桌面\赛博朋克\赛博朋克第{page}页.csv', 'a', newline='',
              encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, data_headers)
        writer.writeheader()
        writer.writerows(data_list)
    print("保存成功")


if __name__ == '__main__':
    # Column names shared with baocun() via module scope.
    data_headers = ("名称", "评分", "详情链接")
    headers = {
        'Cookie': 'bid=Aljwk0oaAgw; __gads=ID=42393351f27d6db0-2243245dd5cf00ea:T=1642169303:RT=1642169303:S=ALNI_MY_IyUpkV8WoRDdS5VU3eiKNaU-gw; ll="108315"; _vwo_uuid_v2=D55AA2ACEDD80B9A8B8DAB7D6E8148C52|9750b7e781258d7f02d44a8f73d2ad4f; __utmz=223695111.1642402783.4.3.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utmz=30149280.1644649389.5.4.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); ap_v=0,6.0; __utmc=30149280; __utmc=223695111; __utma=30149280.1584029429.1642135609.1645799610.1645801838.7; __utmb=30149280.0.10.1645801838; __utma=223695111.1146132265.1642135609.1645799610.1645801838.6; __utmb=223695111.0.10.1645801838; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1645801838%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; _pk_id.100001.4cf6=19f5d90f5952d424.1642135608.6.1645801838.1645799831.; _pk_ses.100001.4cf6=*',
        'Host': 'movie.douban.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    }
    page = 1
    # Top250 is paginated 25 entries per page; fetch the first 10 pages.
    for start in range(0, 10):
        time.sleep(0.5)  # throttle requests to avoid being blocked
        url = f'https://movie.douban.com/top250?start={start*25}&filter='
        res_html(url=url, headers=headers)
        page += 1
