"""Scrape the Douban Top 250 movie list and print the details of each entry."""
import random
import time

from lxml import etree
from requests_html import HTMLSession, UserAgent

# One random User-Agent is chosen at import time and reused for every request.
header = {'User-Agent': UserAgent().random}
session = HTMLSession()

# Seconds to wait for the server before a request is abandoned; without a
# timeout a stalled connection would block the script forever.
REQUEST_TIMEOUT = 10

# The list is paginated through the ``start`` query parameter.
ITEMS_PER_PAGE = 25
PAGE_COUNT = 10


def _first(nodes, default=''):
    """Return the first XPath result in *nodes*, or *default* if it is empty."""
    return nodes[0] if nodes else default


def get_movie_info(url, i):
    """Download one list page and print the details of every movie on it.

    Args:
        url: Address of the list page to download.
        i: Zero-based page index; only used in the progress messages.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    print(f'爬取第{i + 1}页数据')
    response = session.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error response instead of trying to parse an error page.
    response.raise_for_status()

    html = etree.HTML(response.text)
    grids = html.xpath('//*[@class="grid_view"]') if html is not None else []
    if not grids:
        print(f'第{i + 1}页未找到电影列表')
        return

    # The leading "." keeps the query relative to the list element; a bare
    # "//" would search the whole document regardless of the context node.
    items = grids[0].xpath('.//li/div[@class="item"]')
    for j, li in enumerate(items):
        print(f'{"*" * 10}第{i + 1}页数据,第{j + 1}条数据{"*" * 10}')

        movie_img = _first(li.xpath('./div[@class="pic"]/a/img/@src'))
        print('电影封面图:', movie_img)

        movie_name = processing(
            li.xpath('./div[@class="info"]/div[@class="hd"]/a//span/text()'))
        print('电影名称:', movie_name)

        movie_dy = processing(
            li.xpath('./div[@class="info"]/div[@class="bd"]/p/text()'))
        print("导演以及主演:", movie_dy)

        movie_score = _first(li.xpath(
            './div[@class="info"]/div[@class="bd"]/div/span[2]/text()'))
        print('电影评分:', movie_score)

        # Not every entry has a quote; print an empty string in that case.
        movie_txt = _first(li.xpath(
            './div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()'))
        print("电影总结:", movie_txt)

    print('-' * 20, f'第{i + 1}页数据已完毕', '-' * 20)


def processing(strs):
    """Concatenate the strings in *strs* with all whitespace removed.

    Args:
        strs: Iterable of strings (here, text nodes returned by XPath).

    Returns:
        A single string made of every input string with spaces, tabs and
        newlines stripped out, joined without a separator.
    """
    return ''.join(''.join(n.split()) for n in strs)


def main():
    """Walk through every page of the list and print its movies."""
    for i in range(PAGE_COUNT):
        start = ITEMS_PER_PAGE * i
        url = f'https://movie.douban.com/top250?start={start}&filter='
        get_movie_info(url, i)
        # Pause a random short while between pages so the requests are not
        # fired back-to-back; no pause is needed after the last page.
        if i < PAGE_COUNT - 1:
            time.sleep(random.uniform(1, 3))


if __name__ == '__main__':
    main()