import re
import time
import jieba
import requests
import numpy as np
import pandas as pd
from PIL import Image
from wordcloud import WordCloud
import matplotlib.pyplot as plt
data_list = []
for i in range(1, 20):
    print("Scraping page " + str(i))
    # Build the request URL; its structure matters: currentPage is left
    # open at the end of `first` so the loop counter can fill it in
    first = 'https://rate.tmall.com/list_detail_rate.htm?itemId=596452219968&spuId=1240258038&sellerId=1579115485&order=3&currentPage='
    last = '&append=0&content=1&tagId=&posi=&picture=&groupId=&ua=098%23E1hvB9vnvPgvUvCkvvvvvjiPn25pQjlhPFSv0jthPmPy6jiPR2MwAjnjRLF9gjlERphvCvvvphmjvpvhvUCvp8wCvvpvvhHhmphvLvUIUkUaQCAwe1O0747BhCka%2BoHoDOvfjLeAnhjEKBmAdXIaUExreTgcnkxb5ah6Hd8ram56D40OdiUDNrBlHd8reC69D70fd3J18heivpvUvvCCWUB0wV0EvpvVvpCmpJ2vKphv8vvvpHwvvvvvvvCmqvvvv4pvvhZLvvmCvvvvBBWvvvjwvvCHhQvvvxQCvpvVvUCvpvvv2QhvCvvvMMGtvpvhvvCvp86CvChh9P2s3QvvC0ODj6KHkoVQROhCvCLwMbra3rMwznsJWxS5gn1Uzvr4486Cvvyv9mQS7Qvvm4p%3D&needFold=0&_ksTS=1585406932472_453&callback=jsonp454'
    url = first + str(i) + last
    # Request headers; note the cookie travels along with them
    headers = {
        # Which browser the request claims to come from
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        # Which page the request is issued from; this varies slightly from site to site
        'referer': 'https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.464b6bbfQwJmpT&id=596452219968&skuId=4313616443848&areaId=340700&user_id=1579115485&cat_id=2&is_b=1&rn=2aaf4f3d019121cb4b9c1816fe2eb360',
        # Which user is asking for the data, a guest or a registered user; a logged-in cookie is recommended
        'cookie': 'tk_trace=1; cna=BPoFF17G1wkCASShM8zuMe/z; dnk=%5Cu6211%5Cu624B%5Cu673A%5Cu9762%5Cu5305; uc1=tag=10&cookie16=UIHiLt3xCS3yM2h4eKHS9lpEOw%3D%3D&cookie14=UoTUP2Hg22VKGQ%3D%3D&cookie15=URm48syIIVrSKA%3D%3D&cookie21=WqG3DMC9Fb5mPLIQo9kR&lng=zh_CN&existShop=false&pas=0; uc3=nk2=rUtEsEAPxFiBAw%3D%3D&vt3=F8dBxd9vfOFX6TF0nIU%3D&lg2=UtASsssmOIJ0bQ%3D%3D&id2=UU20sOBlt5YjsA%3D%3D; tracknick=%5Cu6211%5Cu624B%5Cu673A%5Cu9762%5Cu5305; lid=%E6%88%91%E6%89%8B%E6%9C%BA%E9%9D%A2%E5%8C%85; _l_g_=Ug%3D%3D; uc4=nk4=0%40r7rCJKnwPLZ3%2FwyNCMllICP5es7j&id4=0%40U2%2Fz9fRgFErUiIbdThLAqnTeryYw; unb=2565225077; lgc=%5Cu6211%5Cu624B%5Cu673A%5Cu9762%5Cu5305; cookie1=VyVfQs3fk3Q1AMa82%2BACjr%2B92r264TDI3Q1c5WQuXXw%3D; login=true; cookie17=UU20sOBlt5YjsA%3D%3D; cookie2=1cf0a583503c0e1120b70f4ef312f5c5; _nk_=%5Cu6211%5Cu624B%5Cu673A%5Cu9762%5Cu5305; sgcookie=EilyrHs60A8pXOSQMCPEY; sg=%E5%8C%857f; t=0f46f0f89d1ad6a09a42a2e03e34c8ad; csg=af40d9de; _tb_token_=7e358e863e33f; enc=m7O0wanabkvr3U2e%2B%2FVwjIRhdoivog54aY5f614N4hBpuXKXuZzuCOP8Wqjk%2FohRVNzechJXzRihNyJDnIQHxw%3D%3D; l=dBOQ8BwlQB9FA9pWBOfwVsUBXgbOgIOb8sPzcQtKtICPOq1wBiJPWZ43uHTeCnGVh6JwR3laeFr4BMsXcnV0x6aNa6Fy_1Dmn; isg=BKOjn8dx-fVsPLXByTRwZsHRMuFNmDfaBnKiX9UB34JaFMI2XWiVKt1CDuQatI_S'
    }
    # Try to fetch the data (the reviews are embedded in a JSONP response)
    try:
        data = requests.get(url, headers=headers).text
        time.sleep(10)  # pause between requests to avoid hammering the server
        # Anchor the pattern on "rateDate", which immediately follows
        # rateContent in the payload, so only the review text is captured
        result = re.findall('"rateContent":"(.*?)","rateDate"', data)
        data_list.extend(result)
    except Exception:
        print("Failed to scrape this page")
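# Optional, more robust alternative (a sketch, not the approach used above):
# the response is JSONP of the form "jsonp454({...})", so one can strip the
# wrapper and parse it with the json module instead of a regex. The
# rateDetail/rateList field names below are assumptions based on the usual
# layout of Tmall's rate endpoint.
import json

def parse_jsonp(text):
    payload = text[text.index('(') + 1:text.rindex(')')]
    body = json.loads(payload)
    return [r['rateContent'] for r in body['rateDetail']['rateList']]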
# Save the collected reviews to Excel
df = pd.DataFrame()
df["评论"] = data_list
df.to_excel("评论_汇总.xlsx")
# Also append them to a CSV (written without a header row,
# so column names must be supplied when reading it back)
df = pd.DataFrame()
df["review"] = data_list
df.to_csv("coms.csv", mode="a+", header=None, index=None, encoding="utf-8")
# Read the raw data back in (no header row in the CSV, hence names=)
raw_comments = pd.read_csv('coms.csv', header=None, names=['评论'])
raw_comments.head()
# Load the stopword list; stopword.txt can be swapped out for your own
with open('stopword.txt', encoding='utf-8') as s:
    stopwords = set(line.strip() for line in s)
# Preprocessing function passed to apply: extracts the Chinese characters,
# tokenizes them with jieba, and drops stopwords and stray spaces
def preprocessing(c):
    c = [word for word in jieba.cut(' '.join(re.findall(r'[\u4e00-\u9fa5]+', c))) if word != ' ' and word not in stopwords]
    return ' '.join(c)
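# Quick sanity check of preprocessing on a made-up review (illustrative only;
# the exact output depends on stopword.txt and jieba's dictionary):
print(preprocessing('这个面包非常好吃，包装也很精致！'))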
# Join the whole corpus into one space-separated string
comments = ' '.join(raw_comments['评论'].apply(preprocessing))
comments[:500]
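# Optional: peek at the most frequent tokens before drawing the cloud
# (a small illustrative check, not part of the original pipeline)
from collections import Counter
print(Counter(comments.split()).most_common(10))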
# --------- Generate the word cloud ----------
usa_mask = np.array(Image.open('flower.png'))
# image_colors = ImageColorGenerator(usa_mask)  # would pick up the image's own colors, but this line raised an error
# Generate the word cloud from the text
wordcloud = WordCloud(background_color='white',  # white background
                      height=400,  # height of 400
                      width=800,  # width of 800
                      scale=1,  # rendering scale factor of 1
                      prefer_horizontal=0.2,  # preference for horizontal text set to 0.2
                      max_words=500,  # show at most 500 words
                      relative_scaling=0.3,  # how strongly font size tracks word frequency
                      max_font_size=50,  # cap the largest font at 50
                      font_path='msyh.ttf',  # Microsoft YaHei, needed to render Chinese
                      mask=usa_mask  # apply the mask image
                      ).generate_from_text(comments)
plt.figure(figsize=[8, 4])
plt.imshow(wordcloud
           # .recolor(color_func=image_colors), alpha=1
           )
plt.axis('off')
# Save to disk (JPEG quality goes through pil_kwargs; the bare quality=
# keyword was removed in newer Matplotlib releases)
plt.savefig('图6.jpg', dpi=600, bbox_inches='tight', pil_kwargs={'quality': 95})
plt.show()
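# Sketch of the recoloring step that the commented-out lines above attempted.
# ImageColorGenerator needs a color (RGB/RGBA) image array, which may be why
# the original line failed; assuming flower.png is a color image, this works:
from wordcloud import ImageColorGenerator
image_colors = ImageColorGenerator(usa_mask)
plt.figure(figsize=[8, 4])
plt.imshow(wordcloud.recolor(color_func=image_colors))
plt.axis('off')
plt.show()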