爬取豆瓣
代码
import requests
from bs4 import BeautifulSoup
import csv
'''
处理翻页操作,分析url
https://movie.douban.com/top250?start=0
https://movie.douban.com/top250?start=25
https://movie.douban.com/top250?start=50
'''
class DouBan:
    """Scrape the Douban Top 250 movie listing and save it as a CSV file.

    Each scraped movie is stored in ``self.li`` as a dict with the keys
    listed in ``self.header`` (``name``, ``score``, ``review``).
    """

    # The listing is paginated with the ``start`` query parameter
    # (0, 25, 50, ...), i.e. 25 movies per page.
    PAGE_SIZE = 25
    # The input prompt tells the user there are ten pages in total.
    MAX_PAGES = 10
    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # A browser-like User-Agent is sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
        }
        # URL template; ``{}`` is filled with the offset of the first movie.
        self.url = 'https://movie.douban.com/top250?start={}'
        # Accumulates one dict per movie across all parsed pages.
        self.li = []
        # Column names (and order) of the CSV output.
        self.header = ['name', 'score', 'review']

    def read_url(self, url):
        """Download *url* and return the response body decoded as UTF-8.

        Raises:
            requests.RequestException: on network failure, timeout, or a
                non-success HTTP status.
        """
        res = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        # Fail loudly on an error response instead of parsing an error page
        # as if it were the movie listing.
        res.raise_for_status()
        return res.content.decode('utf-8')

    @staticmethod
    def _tag_text(tag, default=''):
        """Return ``tag.string``, or *default* when the tag or its text is missing."""
        if tag is None:
            return default
        text = tag.string
        return default if text is None else text

    def parse_html(self, html):
        """Extract every movie on one listing page and append it to ``self.li``.

        A page without the ``<ol class="grid_view">`` container contributes
        nothing; list items lacking the ``hd``/``bd`` blocks are skipped.
        """
        soup = BeautifulSoup(html, 'lxml')
        ol_tag = soup.find('ol', class_="grid_view")
        if ol_tag is None:
            # No listing container on this page: nothing to extract.
            return

        for li_tag in ol_tag.find_all('li'):
            div_hd_tag = li_tag.find('div', class_="hd")
            div_bd_tag = li_tag.find('div', class_="bd")
            if div_hd_tag is None or div_bd_tag is None:
                continue

            self.li.append({
                'name': self._tag_text(
                    div_hd_tag.find('span', class_="title")),
                'score': self._tag_text(
                    div_bd_tag.find('span', class_="rating_num")),
                # Some movies have no one-line review; store a placeholder.
                'review': self._tag_text(
                    div_bd_tag.find('span', class_="inq"), '无'),
            })

    def write_data(self):
        """Write all collected movies to ``豆瓣top250.csv`` in the working directory."""
        # newline='' lets the csv module control line endings itself.
        with open("豆瓣top250.csv", 'w', encoding='utf-8', newline='') as f:
            w = csv.DictWriter(f, self.header)
            w.writeheader()
            w.writerows(self.li)

    def main(self):
        """Ask how many pages to scrape, scrape them, then write the CSV."""
        num = int(input("请输入你要爬取的页数:(一共有十页)"))
        # Never request more pages than the listing has.
        for i in range(min(num, self.MAX_PAGES)):
            url = self.url.format(i * self.PAGE_SIZE)
            html = self.read_url(url)
            self.parse_html(html)
        self.write_data()
# Script entry point: run the interactive scraper only when executed directly.
if __name__ == '__main__':
    DouBan().main()
结果展示