# Target site: https://movie.douban.com/top250
# Requirements:
#   1. Paginate through the listing and fetch each page's HTML source.
#   2. Parse with bs4: extract every movie's title, rating, and detail-page link.
#   3. Save the collected data to a CSV file.
import csv

import requests
from bs4 import BeautifulSoup

data_list = []

# Douban paginates with ?start=0, 25, 50, ... (25 movies per page), so the
# page index must begin at 0. The original range(1, 3) with a broken
# f-string ('start={n}*25') skipped page one; two pages are fetched here,
# matching the original loop count. Use range(10) to cover all 250 movies.
for page in range(2):
    url = f'https://movie.douban.com/top250?start={page * 25}&filter='
    # Douban rejects requests that lack a browser-like User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    result = response.content.decode('utf-8')

    # 'html.parser' ships with the standard library (no lxml dependency).
    # BeautifulSoup filters on CSS class via the 'class_' keyword, and the
    # page's list container uses class 'grid_view' (not 'gridview').
    soup = BeautifulSoup(result, 'html.parser')
    ol_tag = soup.find('ol', class_='grid_view')  # <ol> holding all 25 movie <li>s
    for li_tag in ol_tag.find_all('li'):
        info = li_tag.find('div', class_='info')
        item = {
            # Title: first <span> of the <a> inside div.hd.
            '电影名': info.div.a.span.string,
            # Rating: second <span> inside div.bd > div.star.
            '评分': info.find('div', class_='bd').div.find_all('span')[1].string,
            # Detail-page URL from the title link's href attribute.
            '链接': info.div.a['href'],
        }
        data_list.append(item)
        print(item)

# Write once, after all pages are scraped. utf-8-sig adds a BOM so Excel
# renders the Chinese headers correctly; newline='' stops the csv module
# from emitting blank rows on Windows.
with open('db.csv', 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['电影名', '评分', '链接'])
    writer.writeheader()
    writer.writerows(data_list)