"""Scrape Tmall product reviews and render them as a word cloud.

Pipeline, executed top to bottom when the script runs:

1. Request pages 1-19 of the review list for one product and pull the
   review text out of each JSONP response with a regular expression.
2. Save the reviews to ``评论_汇总.xlsx`` and append them to ``coms.csv``.
3. Read the CSV back, keep only Chinese characters, segment them with
   jieba and drop stop words listed in ``stopword.txt``.
4. Draw a word cloud shaped by ``flower.png`` and save it as ``图6.jpg``.
"""
import re
import time

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from PIL import Image
from wordcloud import WordCloud

# Seconds to wait for the server before a page is reported as failed.
REQUEST_TIMEOUT = 30

# CSV file the reviews are appended to and later read back from.
COMMENTS_CSV = "coms.csv"

# Captures the text between `rateContent":"` and `"fromMall"` in the response.
RATE_CONTENT_RE = re.compile('rateContent":"(.*?)"fromMall"')

# One or more consecutive Chinese characters.
CHINESE_RE = re.compile('[\u4e00-\u9fa5]+')


def preprocessing(c: str) -> str:
    """Return the Chinese words of *c*, segmented and joined by single spaces.

    Everything that is not a Chinese character is discarded, the remainder
    is segmented with jieba, and separator tokens as well as words found in
    the module-level ``stopwords`` set are dropped.
    """
    chinese_only = ' '.join(CHINESE_RE.findall(c))
    words = [
        word
        for word in jieba.cut(chinese_only)
        if word != ' ' and word not in stopwords
    ]
    return ' '.join(words)


# ---------- Crawl the reviews ----------
# The request URL is `first` + page number + `last`.  `first` must end with
# "currentPage=" so that the appended number is the page actually requested.
first = 'https://rate.tmall.com/list_detail_rate.htm?itemId=596452219968&spuId=1240258038&sellerId=1579115485&order=3&currentPage='
last = '&append=0&content=1&tagId=&posi=&picture=&groupId=&ua=098%23E1hvB9vnvPgvUvCkvvvvvjiPn25pQjlhPFSv0jthPmPy6jiPR2MwAjnjRLF9gjlERphvCvvvphmjvpvhvUCvp8wCvvpvvhHhmphvLvUIUkUaQCAwe1O0747BhCka%2BoHoDOvfjLeAnhjEKBmAdXIaUExreTgcnkxb5ah6Hd8ram56D40OdiUDNrBlHd8reC69D70fd3J18heivpvUvvCCWUB0wV0EvpvVvpCmpJ2vKphv8vvvpHwvvvvvvvCmqvvvv4pvvhZLvvmCvvvvBBWvvvjwvvCHhQvvvxQCvpvVvUCvpvvv2QhvCvvvMMGtvpvhvvCvp86CvChh9P2s3QvvC0ODj6KHkoVQROhCvCLwMbra3rMwznsJWxS5gn1Uzvr4486Cvvyv9mQS7Qvvm4p%3D&needFold=0&_ksTS=1585406932472_453&callback=jsonp454'

# Request headers; they are the same for every page, so build them once.
# NOTE(review): the cookie is a hard-coded login session. Consider loading it
# from an environment variable or a local, untracked file instead.
headers = {
    # Which browser the request claims to come from.
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    # The page the data request is issued from; may differ between sites.
    'referer': 'https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.464b6bbfQwJmpT&id=596452219968&skuId=4313616443848&areaId=340700&user_id=1579115485&cat_id=2&is_b=1&rn=2aaf4f3d019121cb4b9c1816fe2eb360',
    # Identifies the requesting user (guest or logged in); a cookie taken
    # after logging in is recommended.
    'cookie': 'tk_trace=1; cna=BPoFF17G1wkCASShM8zuMe/z; dnk=%5Cu6211%5Cu624B%5Cu673A%5Cu9762%5Cu5305; uc1=tag=10&cookie16=UIHiLt3xCS3yM2h4eKHS9lpEOw%3D%3D&cookie14=UoTUP2Hg22VKGQ%3D%3D&cookie15=URm48syIIVrSKA%3D%3D&cookie21=WqG3DMC9Fb5mPLIQo9kR&lng=zh_CN&existShop=false&pas=0; uc3=nk2=rUtEsEAPxFiBAw%3D%3D&vt3=F8dBxd9vfOFX6TF0nIU%3D&lg2=UtASsssmOIJ0bQ%3D%3D&id2=UU20sOBlt5YjsA%3D%3D; tracknick=%5Cu6211%5Cu624B%5Cu673A%5Cu9762%5Cu5305; lid=%E6%88%91%E6%89%8B%E6%9C%BA%E9%9D%A2%E5%8C%85; _l_g_=Ug%3D%3D; uc4=nk4=0%40r7rCJKnwPLZ3%2FwyNCMllICP5es7j&id4=0%40U2%2Fz9fRgFErUiIbdThLAqnTeryYw; unb=2565225077; lgc=%5Cu6211%5Cu624B%5Cu673A%5Cu9762%5Cu5305; cookie1=VyVfQs3fk3Q1AMa82%2BACjr%2B92r264TDI3Q1c5WQuXXw%3D; login=true; cookie17=UU20sOBlt5YjsA%3D%3D; cookie2=1cf0a583503c0e1120b70f4ef312f5c5; _nk_=%5Cu6211%5Cu624B%5Cu673A%5Cu9762%5Cu5305; sgcookie=EilyrHs60A8pXOSQMCPEY; sg=%E5%8C%857f; t=0f46f0f89d1ad6a09a42a2e03e34c8ad; csg=af40d9de; _tb_token_=7e358e863e33f; enc=m7O0wanabkvr3U2e%2B%2FVwjIRhdoivog54aY5f614N4hBpuXKXuZzuCOP8Wqjk%2FohRVNzechJXzRihNyJDnIQHxw%3D%3D; l=dBOQ8BwlQB9FA9pWBOfwVsUBXgbOgIOb8sPzcQtKtICPOq1wBiJPWZ43uHTeCnGVh6JwR3laeFr4BMsXcnV0x6aNa6Fy_1Dmn; isg=BKOjn8dx-fVsPLXByTRwZsHRMuFNmDfaBnKiX9UB34JaFMI2XWiVKt1CDuQatI_S',
}

data_list = []
for i in range(1, 20):
    print(f"正在爬取第{i}页")
    url = first + str(i) + last
    # Only the request itself is guarded: a network failure skips the page,
    # while programming errors and Ctrl-C are no longer silently swallowed.
    try:
        data = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text
    except requests.RequestException:
        print("本页爬取失败")
    else:
        data_list.extend(RATE_CONTENT_RE.findall(data))
    # Throttle between pages, whether or not the request succeeded.
    time.sleep(10)

# ---------- Save the reviews ----------
df = pd.DataFrame({"评论": data_list})
df.to_excel("评论_汇总.xlsx")

# Appended without header or index, so repeated runs accumulate rows.
df = pd.DataFrame({"review": data_list})
df.to_csv(COMMENTS_CSV, mode="a+", header=False, index=False, encoding="utf-8")

# ---------- Load and clean the text ----------
# The CSV has no header row, so the column name is supplied here; dtype=str
# keeps purely numeric reviews as text for the regex in preprocessing().
raw_comments = pd.read_csv(
    COMMENTS_CSV, header=None, names=['评论'], dtype=str, encoding="utf-8"
)

# Stop-word list, one word per line; the file can be replaced with your own.
# Lines are stripped so that entries compare equal to jieba tokens.
with open('stopword.txt', encoding='utf-8') as s:
    stopwords = {line.strip() for line in s if line.strip()}

# Join the whole corpus into a single space-separated string.
# Empty CSV cells are read back as NaN and are dropped before processing.
comments = ' '.join(raw_comments['评论'].dropna().apply(preprocessing))

# ---------- Build the word cloud ----------
usa_mask = np.array(Image.open('flower.png'))
# NOTE(review): colouring the words from the mask image via
# ImageColorGenerator was disabled by the original author because it raised
# an error; presumably the class was never imported -- confirm before
# re-enabling.

# Generate the word cloud from the text.
wordcloud = WordCloud(
    background_color='white',  # white background
    height=400,                # canvas height
    width=800,                 # canvas width
    scale=1,                   # no extra scaling of the canvas
    prefer_horizontal=0.2,     # tendency to lay words out horizontally
    max_words=500,             # at most 500 words are drawn
    relative_scaling=0.3,      # how strongly font size follows frequency
    max_font_size=50,          # cap on the largest font size
    font_path='msyh.ttf',      # Microsoft YaHei, needed for Chinese glyphs
    mask=usa_mask,             # shape mask
).generate_from_text(comments)

plt.figure(figsize=[8, 4])
plt.imshow(wordcloud, alpha=1)
plt.axis('off')
# Save to disk. JPEG quality is passed through pil_kwargs because the
# `quality` keyword of savefig is not accepted by current Matplotlib.
plt.savefig('图6.jpg', dpi=600, bbox_inches='tight', pil_kwargs={'quality': 95})
plt.show()