#!/usr/bin/env python
    2. # encoding=utf-8
    3. import re
    4. import requests
    5. from bs4 import BeautifulSoup as bs
    6. from time import sleep
    7. import lxml.etree
    8. import pandas as pd
    9. mn = [] # 电影名称
    10. mr = [] # 电影评分
    11. mp = [] # 电影评价人数
    12. mh = [] # 电影链接
    13. mtp = [] # 电影短评
    14. title_1 = [] # 评论1
    15. title_2 = [] # 评论2
    16. title_3 = [] # 评论3
    17. title_4 = [] # 评论4
    18. title_5 = [] # 评论5
    19. def get_url_name(url):
    20. user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    21. header = {}
    22. header['user-agent'] = user_agent
    23. cookie = '''bid=ZNJjsDB1mu4; douban-fav-remind=1; __yadk_uid=EH2JTTZhVzZmeLKBWnLIsMyMuUKyCmlr; ll="108288"; push_noty_num=0; push_doumail_num=0; dbcl2="170609308:/CPcn6wSV8s"; ck=gCQ_; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1583642177%2C%22https%3A%2F%2Faccounts.douban.com%2Fpassport%2Flogin%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.1482876592.1570801355.1583329616.1583642177.4; __utmc=30149280; __utmz=30149280.1583642177.4.3.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; __utmt=1; __utmv=30149280.17060; douban-profile-remind=1; _pk_id.100001.8cb4=0032b1c289fde0dd.1570801350.3.1583642452.1570956509.; __gads=ID=356f8d35ed66b5f6:T=1583642453:S=ALNI_MbmKR7vLQRINc1rm5RCeVkIvGC60w; __utmb=30149280.8.10.1583642177'''
    24. cookie_dic = {i.split("=")[0]: i.split("=")[-1] for i in cookie.split("; ")}
    25. response = requests.get(url, headers=header,cookies=cookie_dic)
    26. selector = lxml.etree.HTML(response.text)
    27. movie_tag = selector.xpath('//ol[@class="grid_view"]')
    28. # bs_info = bs(response.text,'html.parser') #语法分析器,以html的方式进行分析
    29. for data in movie_tag:
    30. movie_name = data.xpath('//div[@class="hd"]//span[@class="title"][1]/text()')
    31. for mn_info in movie_name:
    32. mn.append(mn_info)
    33. movie_rating = data.xpath('//div[@class="bd"]//div[@class="star"]//span[@property="v:average"]/text()')
    34. # print(movie_rating)
    35. for mr_info in movie_rating:
    36. mr.append(mr_info)
    37. movie_pingjia = data.xpath('//div[@class="star"]//span[4]/text()')
    38. # print(movie_pingjia)
    39. for mp_info in movie_pingjia:
    40. mp.append(mp_info)
    41. movie_hrefs = data.xpath('li//div[@class="item"]/div[@class="pic"]/a/@href')
    42. for movie_href in movie_hrefs:
    43. mh.append(movie_href.strip().replace('\n',''))
    44. # print(mh)
    45. response_2 = requests.get(movie_href, headers=header,cookies=cookie_dic)
    46. selector_2 = lxml.etree.HTML(response_2.text)
    47. comments = selector_2.xpath('//*[@id="hot-comments"]')
    48. for topics in comments:
    49. # mtp.clear()
    50. topic = topics.xpath('div//div[@class="comment"]/p/span[@class="short"]/text()')
    51. # print(topic)
    52. # for aa in topic:
    53. mtp.append(tuple(topic))
    54. urls = tuple(f'https://movie.douban.com/top250?start={page * 25}&filter=' for page in range(10))
    55. if __name__ == '__main__':
    56. for page in urls:
    57. print(page)
    58. get_url_name(page)
    59. sleep(10)
    60. dict_info ={}
    61. for i in range(len(mn)):
    62. title_1.append(mtp[i][0])
    63. title_2.append(mtp[i][1])
    64. title_3.append(mtp[i][2])
    65. title_4.append(mtp[i][3])
    66. title_5.append(mtp[i][4])
    67. dict_info = {
    68. '电影名称': mn,
    69. '电影评分': mr,
    70. '评价人数': mp,
    71. '电影链接': mh,
    72. '电影评价1': title_1,
    73. '电影评价2': title_2,
    74. '电影评价3': title_3,
    75. '电影评价4': title_4,
    76. '电影评价5': title_5
    77. }
    78. if dict_info:
    79. print(dict_info)
    80. book1 = pd.DataFrame(dict_info,
    81. columns=['电影名称',
    82. '电影评分',
    83. '评价人数',
    84. '电影链接',
    85. '电影评价1',
    86. '电影评价2',
    87. '电影评价3',
    88. '电影评价4',
    89. '电影评价5'])
    90. book1.to_csv('./movie_info.csv', encoding='utf-8')