from lxml import etree
import time
import random
from requests_html import HTMLSession,UserAgent
# Headers sent with every page request. The User-Agent value is taken from
# UserAgent().random once, when the module is loaded, and then reused.
header = {'User-Agent': UserAgent().random}

# A single HTTP session shared by all page downloads.
session = HTMLSession()
def get_movie_info(url, i):
    """Download one movie list page and print the details of every entry.

    For each entry the cover image URL, title, director/cast line, score
    and one-line quote are printed to stdout. Nothing is returned.

    Args:
        url: Address of the list page to download.
        i: Zero-based page index; only used in the progress messages.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page contains no ``grid_view`` container.
    """
    print(f'爬取第{i + 1}页数据')
    response = session.get(url, headers=header)
    # Fail with a clear HTTP error instead of an obscure parsing error when
    # the request was rejected.
    response.raise_for_status()
    html = etree.HTML(response.text)
    ol = html.xpath('//*[@class="grid_view"]')[0]
    # The leading "." keeps the search inside the list container; a bare
    # "//" would be evaluated against the whole document.
    items = ol.xpath('.//li/div[@class="item"]')
    stars = '*' * 10
    for j, li in enumerate(items, start=1):
        print(f'{stars}第{i + 1}页数据,第{j}条数据{stars}')
        movie_img = _first(li.xpath('./div[@class="pic"]/a/img/@src'))
        print('电影封面图:', movie_img)
        movie_name = processing(
            li.xpath('./div[@class="info"]/div[@class="hd"]/a//span/text()')
        )
        print('电影名称:', movie_name)
        movie_dy = processing(
            li.xpath('./div[@class="info"]/div[@class="bd"]/p/text()')
        )
        print("导演以及主演:", movie_dy)
        movie_score = _first(
            li.xpath('./div[@class="info"]/div[@class="bd"]/div/span[2]/text()')
        )
        print('电影评分:', movie_score)
        # Printed as plain text (empty when the entry has no quote) so it is
        # formatted like the other fields rather than as a Python list.
        movie_txt = _first(
            li.xpath(
                './div[@class="info"]/div[@class="bd"]'
                '/p[@class="quote"]/span/text()'
            )
        )
        print("电影总结:", movie_txt)
    print('-' * 20, f'第{i + 1}页数据已完毕', '-' * 20)


def _first(nodes, default=''):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default
def processing(strs):
    """Concatenate text fragments with all whitespace removed.

    Args:
        strs: Iterable of strings, e.g. the text nodes returned by an XPath
            query.

    Returns:
        A single string made of every fragment, in order, with all
        whitespace (spaces, newlines, non-breaking spaces, ...) stripped
        out. An empty iterable yields ``''``.
    """
    # split() without arguments cuts on any run of whitespace, so re-joining
    # the pieces removes the whitespace entirely.
    return ''.join(''.join(piece.split()) for piece in strs)
if __name__ == '__main__':
    # Ten pages are requested; the `start` query parameter advances by 25
    # per page (0, 25, ..., 225).
    page_count = 10
    page_size = 25
    for i in range(page_count):
        url = f'https://movie.douban.com/top250?start={page_size * i}&filter='
        get_movie_info(url, i)
        if i < page_count - 1:
            # Wait a short, randomized interval between pages so the
            # requests are not fired back-to-back at the server.
            time.sleep(random.uniform(1, 3))