# Source: https://blog.csdn.net/qq_40821402/article/details/88178359
    import csv
    import re
    import time

    import pymysql
    from lxml import html
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By
    class Demo:
        """Scrape paginated comments from a page in a Selenium browser into MySQL.

        An instance loads ``url`` in the given browser and tries to enter the
        ``contentFrame`` frame. ``all_pages`` reads the number of comment pages
        from the pager, and ``get_comment`` walks the pages, inserting one row
        per comment into the ``data`` table (columns ``writer`` and ``comment``).
        """

        comment_list = []  # comment list (not used by the methods of this class)
        writer_list = []  # commenter list (not used by the methods of this class)
        # Database handles: created once, when the class body is executed, and
        # shared by every instance through the class attributes.
        conn = pymysql.connect(host='localhost', user='root', passwd='root', db='wyy', charset='utf8mb4')
        curs = conn.cursor()

        def __init__(self, browser, url):
            """Load ``url`` in ``browser`` and switch into the content frame.

            Args:
                browser: Selenium WebDriver instance used for all page access.
                url: Address of the page whose comments are scraped.
            """
            self.browser = browser
            time.sleep(2)
            self.browser.get(url)
            self.browser.implicitly_wait(65)
            try:
                self.browser.switch_to.frame('contentFrame')  # enter the frame
            except Exception as e:
                # Best effort: report the failure and carry on in the top document.
                print(e)

        def all_pages(self):
            """Return the number of the last comment page shown in the pager.

            Only the 4th- through 13th-from-last ``<a href="#">`` anchors are
            inspected, walking backwards; the last three anchors are skipped
            (presumably non-page controls -- TODO confirm against the page).
            The first anchor in that window whose text is a number wins.

            Returns:
                int: The last page number, or 1 when no numeric label is found
                so that the currently displayed page is still scraped.
            """
            etree = html.etree
            tree = etree.HTML(self.browser.page_source)
            anchors = tree.xpath("//a[@href='#']")
            for offset in range(4, 14):
                index = len(anchors) - offset
                if index < 0:
                    # Fewer anchors than the window; a negative index would
                    # wrap around to the end of the list, so stop here.
                    break
                label = (anchors[index].text or '').strip()
                if label.isdigit():
                    return int(label)
            return 1

        def get_comment(self, pages):
            """Scrape up to ``pages`` comment pages and store them in the database.

            Each comment element's text is split at its first ':'; the part
            before it is stored as the writer and the part after it as the
            comment. Rows are committed once per page. Paging stops early when
            the '下一页' (next page) link cannot be found.

            Args:
                pages: Total number of pages to scrape.
            """
            for page_no in range(pages):
                # Locate the "next page" link before scraping, so that the click
                # at the end of the iteration turns the page.
                try:
                    next_button = self.browser.find_element(By.LINK_TEXT, '下一页')
                except NoSuchElementException as e:
                    print(e)
                    next_button = None
                time.sleep(2)
                # Locate the elements holding the comment text.
                for element in self.browser.find_elements(By.XPATH, '//div/div[2]/div[1]/div[1]'):
                    text = element.text  # read once; every access queries the driver
                    writer, colon, comment = text.partition(':')
                    if colon:
                        self.curs.execute("insert into data (writer,comment) values (%s,%s) ", (writer, comment))
                    else:
                        # No 'writer:comment' separator in this element; nothing to store.
                        print('')
                self.conn.commit()
                print('抓取了>>>>' + str(page_no + 1) + '<<<<页')
                if next_button is None:
                    # No pager on this page, so there is nothing further to visit.
                    break
                try:
                    next_button.click()  # advance to the next page
                except Exception as e:
                    print(e)
    if __name__ == '__main__':
        # Script entry point: read song ids from the CSV file, then scrape the
        # comments of every song with one shared Chrome instance.
        # newline='' is what the csv module expects for files handed to a reader.
        with open('F:/music/网易云音乐歌曲信息.csv', 'r', encoding='utf-8-sig', newline='') as song_file:
            columns = list(csv.DictReader(song_file))

        chrome_options = webdriver.ChromeOptions()
        # Content setting 2 for images is applied to the Chrome profile.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        browser = webdriver.Chrome(options=chrome_options)
        try:
            for id_new in columns:
                new_id = id_new['id']
                url = 'https://music.163.com/#/song?id=' + str(new_id)
                dd = Demo(browser=browser, url=url)
                pages = dd.all_pages()
                dd.get_comment(pages)
                print('********************************进入下一个************************************')
                time.sleep(3)
        finally:
            # Always end the browser session, even when the CSV had no rows or a
            # song failed part-way, and release the shared database connection.
            try:
                browser.quit()
            finally:
                Demo.conn.close()

    # 效果图 (result screenshot): image.png