1. from lxml import etree
    2. import time
    3. import random
    4. from requests_html import HTMLSession,UserAgent
    5. header = {
    6. 'User-Agent':UserAgent().random
    7. }
    8. session = HTMLSession()
    9. def get_movie_info(url,i):
    10. print(f'爬取第{i + 1}页数据')
    11. response = session.get(url,headers = header)
    12. html = etree.HTML(response.text)
    13. ol = html.xpath('//*[@class="grid_view"]')[0]
    14. allLi = ol.xpath('//li/div[@class="item"]')
    15. j = 0
    16. for li in allLi:
    17. print(f'{"*" * 10}第{i + 1}页数据,第{j + 1}条数据{"*" * 10}')
    18. movieImg = li.xpath('./div[@class="pic"]/a/img/@src')[0]
    19. print('电影封面图:',movieImg)
    20. movieName = li.xpath('./div[@class="info"]/div[@class="hd"]/a//span/text()')
    21. movieName = processing(movieName)
    22. print('电影名称:',movieName)
    23. movieDy = li.xpath('./div[@class="info"]/div[@class="bd"]/p/text()')
    24. movieDy = processing(movieDy)
    25. print("导演以及主演:",movieDy)
    26. movieScore = li.xpath('./div[@class="info"]/div[@class="bd"]/div/span[2]/text()')[0]
    27. print('电影评分:',movieScore)
    28. movieTxt = li.xpath('./div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()')
    29. print("电影总结:",movieTxt)
    30. j += 1
    31. print('-'*20,f'第{i+1}页数据已完毕','-'*20)
    32. def processing(strs):
    33. s = ''
    34. for n in strs:
    35. n = ''.join(n.split())
    36. s += n
    37. return s
    38. if __name__ == '__main__':
    39. for i in range(0,10):
    40. start = 25 * int(i)
    41. url = 'https://movie.douban.com/top250?start={start}&filter='.format(start=start)
    42. get_movie_info(url,i)