抓取ajax请求方式的数据

  1. """
  2. 爬取豆瓣电影信息
  3. 分析url:
  4. https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0
  5. https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=20
  6. https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=40
  7. """
  8. from requests_html import HTMLSession,UserAgent
  9. import json
  10. class GetDbMovies:
  11. def __init__(self):
  12. self.session = HTMLSession()
  13. self.headers = {
  14. 'User-Agent':UserAgent().random,
  15. 'Cookie': 'bid=tE6HN3Ew1o4; douban-fav-remind=1; __utmc=30149280; ll="118172"; __utmc=223695111; __gads=ID=57783c64c61b95f0-222169cb8ece00a5:T=1635952049:RT=1635952049:S=ALNI_MY6y5oy9OjMOtd8IX0CuXxdSF4SXQ; _vwo_uuid_v2=D81A9B83EB7C344F9A07D617FD1AE4887|474257f8af7204c717a5353d945eb818; __yadk_uid=UhvBUj5FHIKYmDv4VhlKTrZDXf1zW5uS; __utmz=30149280.1641476156.11.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; push_doumail_num=0; push_noty_num=0; dbcl2="198087769:Fi8yZABpQu4"; ck=o8Sj; __utmv=30149280.19808; gr_user_id=d4f2576e-6cd8-4af1-b939-2c8645bbdd3c; __utmz=223695111.1641642199.10.3.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1641719347%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.453644371.1620303194.1641641722.1641719347.14; __utmb=30149280.0.10.1641719347; __utma=223695111.1449280834.1635952050.1641642199.1641719347.11; __utmb=223695111.0.10.1641719347; Hm_lvt_16a14f3002af32bf3a75dfe352478639=1641720466; Hm_lpvt_16a14f3002af32bf3a75dfe352478639=1641720466; _pk_id.100001.4cf6=74019193f4d4d88d.1635952050.11.1641721330.1641643747.'
  16. }
  17. # 获取视频
  18. def getMovies(self):
  19. for i in range(0,5):
  20. page_start = i * 20
  21. url = f'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=%E7%94%B5%E5%BD%B1&start={page_start}&year_range=2020,2020'
  22. response = self.session.get(url,headers=self.headers)
  23. # print(response.status_code)
  24. if response.status_code == 200:
  25. response.html.render()
  26. result = json.loads(response.text)['data']
  27. for mv in result:
  28. title = mv['title']
  29. url = mv['url']
  30. casts = mv['casts']
  31. cover = mv['cover']
  32. print(f'电影名称:【{title}】,视频地址:{url},演出名单:{casts},封面图片:{cover}')
  33. obj = GetDbMovies()
  34. obj.getMovies()