Corpus Description

Intended uses of the corpus: text classification, information extraction, public opinion analysis, emergency management
Corpus table 1 fields: [label, title, publish time, news URL]
Corpus table 2 fields: [title, author, publish time, body text, image paths]

Crawler Source: Installation and Walkthrough

Crawler tooling: selenium, requests, BeautifulSoup, plus gne (GeneralNewsExtractor) for article-body extraction
Crawler pipeline: one scheduled crawl of the listing per run, with an optional second-pass crawl of article details. (To avoid repeated crawling there should be a deduplication step; it has not been written yet, but a minimal sketch follows below. A topic-targeted third-pass crawl is also still to be added.)
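For the missing deduplication step, one workable option is to drop rows whose URL already appears in earlier daily exports. A minimal sketch, assuming the history lives in the dataDaliy/ Excel files; the helper name dedup_against_history is mine:

```python
import glob

import pandas as pd


def dedup_against_history(df, history_dir="dataDaliy"):
    """Drop rows whose `url` already appears in previously saved daily tables."""
    seen = set()
    for path in glob.glob(f"{history_dir}/*.xlsx"):  # every prior daily export
        seen.update(pd.read_excel(path)["url"].tolist())
    return df[~df["url"].isin(seen)].reset_index(drop=True)
```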

Crawler Source Code

Every function's parameters are documented in detail, along with its inputs and outputs.

  • Import dependencies

```python
import os
import random
import pickle
import datetime
import requests
import pandas as pd
import time
from tqdm import tqdm
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service  # for the Selenium 4 driver setup below
import sys

# Pickling deeply nested bs4 trees can exceed Python's default recursion limit
sys.setrecursionlimit(10000)
```
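To cover the "installation" half of the heading: the third-party dependencies can be installed with pip (version pins are not given in the original), e.g.
pip install requests pandas tqdm beautifulsoup4 lxml gne selenium openpyxl
(openpyxl is what pandas needs to write .xlsx files). A chromedriver.exe matching the locally installed Chrome version must also sit next to the script.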

SinaNewsExtractor — Sina rolling-news extractor

  • SinaNewsExtractor

```python
def SinaNewsExtractor(url=None, page_nums=50, stop_time_limit=3, verbose=1, withSave=False):
    """
    url: listing URL template with the fixed format
         https://news.sina.com.cn/roll/#pageid=153&lid=2970&k=&num=50&page={}
    page_nums: number of rolling-news pages to crawl, an integer in [1, 50]; default 50 (the maximum)
    stop_time_limit: upper bound (in seconds) of the random pause used to avoid IP bans; default 3
    verbose: print flag; 0 = silent, 1 = per-page progress, 2 = per-item detail; default 1
    withSave: whether to save the outputs under the current folder; default False
    Input: url
    Output: a pandas DataFrame of news metadata, and a list of bs4 HTML fragments
    """
    day_html = []
    df = pd.DataFrame([], columns=["label", "title", "time", "url"])
    if url is None:  # fall back to the default template instead of silently discarding the argument
        url = "https://news.sina.com.cn/roll/#pageid=153&lid=2970&k=&num=50&page={}"
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    # Start the driver (Selenium 4 style: explicit Service for the local chromedriver)
    driver = webdriver.Chrome(service=Service("chromedriver.exe"), options=chrome_options)
    driver.implicitly_wait(60)
    # Fetch the HTML of each page -- this is where most of the time goes
    for page in range(1, page_nums + 1):
        driver.get(url.format(page))
        driver.refresh()  # the list is rendered client-side; refresh so the requested page loads
        soup = BeautifulSoup(driver.page_source, "lxml")
        frame = soup.find("div", attrs={"class": "d_list_txt", "id": "d_list"})
        day_html.append(frame)
        time.sleep(random.randint(1, stop_time_limit))  # random pause against IP bans
        if verbose != 0:
            print(page, url.format(page), len(day_html))
    # Extract the news metadata -- this part is very fast
    for html in day_html:
        if html is None:  # skip pages whose list frame failed to render
            continue
        for li in html.find_all("li"):
            url = li.a["href"]
            label = li.span.text
            title = li.a.text
            public = li.find("span", attrs={"class": "c_time"}).text
            df.loc[len(df)] = [label, title, public, url]
            if verbose == 2:
                print("{}\t{}\t{}".format(df.shape[0], public, title))
    # Quit the driver so no background process is left behind
    driver.quit()
    # Optional saving
    if withSave:
        if not os.path.isdir("dataDaliy"):
            os.makedirs('dataDaliy')
        if not os.path.isdir("pklDaliy"):
            os.makedirs('pklDaliy')
        curr = datetime.datetime.now()
        curr_pkl = "pklDaliy/{}_{}_{}_{}_{}news.pkl".format(curr.year, curr.month, curr.day, curr.hour, curr.minute)
        curr_excel = "dataDaliy/{}_{}_{}_{}_{}news.xlsx".format(curr.year, curr.month, curr.day, curr.hour, curr.minute)
        with open(curr_pkl, "wb") as f:  # close the file handle properly
            pickle.dump(day_html, f)
        df.to_excel(curr_excel, index=False)
    return df, day_html
```
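Because day_html stores bs4 Tag objects, the pickled snapshots in pklDaliy/ can be reloaded and re-parsed later without re-crawling; this is also why sys.setrecursionlimit(10000) is raised at import time. A minimal reload sketch (the file name is illustrative):

```python
import pickle

with open("pklDaliy/2021_7_17_9_26news.pkl", "rb") as f:  # illustrative file name
    day_html = pickle.load(f)
for frame in day_html:
    if frame is not None:
        print(len(frame.find_all("li")), "items in this page snapshot")
```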

SingleNewsExtractor — detailed extraction of a single news article

  • SingleNewsExtractor

```python
def SingleNewsExtractor(url, verbose=False):
    """
    url: news article URL
    verbose: whether to print the extraction result; default False
    Output: the gne extraction dict (title, author, publish_time, content, images)
    """
    extractor = GeneralNewsExtractor()
    user_agent_pc = [
        # Chrome
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        # Firefox
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        # Opera
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        # QQ Browser
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        # Sogou Browser
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        # 360 Browser
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        # UC Browser
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    ]
    user_agent = {'User-Agent': random.choice(user_agent_pc)}  # rotate a random desktop UA per request
    rep = requests.get(url, headers=user_agent)
    source = rep.content.decode("utf-8", errors='ignore')
    result = extractor.extract(source)
    if verbose:
        print(result)
    return result
```
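A quick usage sketch (the article URL is illustrative; the keys are the fields gne returns):

```python
result = SingleNewsExtractor("https://news.sina.com.cn/c/2021-07-17/doc-example.shtml")  # illustrative URL
print(result["title"], result["publish_time"])
print(result["content"][:100])  # first 100 characters of the extracted body
```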

FileNewsExtractor — file-level (batch) news detail extraction

  • FileNewsExtractor

```python
def FileNewsExtractor(csv_file, save_file, verbose=False):
    """
    csv_file: path of the metadata table produced by SinaNewsExtractor
              (despite the name, it is read with pd.read_excel, so pass an .xlsx file)
    save_file: path of the output file
    verbose: whether to print each extraction result; default False
    Input: metadata file path (must contain a url column), output file path
    Output: a pandas DataFrame of detailed news information
    """
    news = pd.DataFrame([], columns=['title', 'author', 'publish_time', 'content', 'images'])
    data = pd.read_excel(csv_file)
    for idx, news_url in tqdm(enumerate(data["url"]), total=len(data["url"])):
        news_infos = SingleNewsExtractor(news_url, verbose=verbose)
        # align the gne result dict to the column order
        news.loc[idx] = [news_infos.get(col) for col in news.columns]
        if idx % 3 == 0 and idx != 0:  # pause every third request against IP bans
            time.sleep(random.randint(1, 3))
    news.to_excel(save_file, index=False)
    return news
```
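One caveat: a single failed request currently aborts the whole batch and loses all progress. A hedged hardening sketch; the wrapper safe_extract is my addition, not part of the original code:

```python
def safe_extract(news_url, verbose=False):
    """Return the gne result dict, or None if the request or extraction fails."""
    try:
        return SingleNewsExtractor(news_url, verbose=verbose)
    except Exception as err:
        print("skipping", news_url, "->", err)  # log and move on to the next article
        return None
```

Rows whose extraction returns None can then be skipped on the first pass and retried later.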

main — main program

```python
# Test driver
def test(c1=False, c2=False):
    # Test 1: crawl the rolling-news listing
    if c1:
        url = "https://news.sina.com.cn/roll/#pageid=153&lid=2970&k=&num=50&page={}"
        df, day_html = SinaNewsExtractor(url=url, page_nums=50, stop_time_limit=5, verbose=1, withSave=True)
    # Test 2: extract article details for a previously saved listing
    if c2:
        FileNewsExtractor("dataDaliy/2021_7_17_9_26news.xlsx", "detailed_news.xlsx")

if __name__ == '__main__':
    test(c1=False, c2=False)
```
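The "scheduled crawl" mentioned in the pipeline description can be as simple as a loop that sleeps between runs. A minimal sketch; the 24-hour interval and the function name run_daily are assumptions:

```python
import time

def run_daily(interval_hours=24):  # interval is an assumption; tune as needed
    while True:
        df, day_html = SinaNewsExtractor(page_nums=50, withSave=True)
        print("crawled", len(df), "items; sleeping for", interval_hours, "hours")
        time.sleep(interval_hours * 3600)
```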

Complete Code

```python
import os
import random
import pickle
import datetime
import requests
import pandas as pd
import time
from tqdm import tqdm
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import sys

# Pickling deeply nested bs4 trees can exceed Python's default recursion limit
sys.setrecursionlimit(10000)


def SinaNewsExtractor(url=None, page_nums=50, stop_time_limit=3, verbose=1, withSave=False):
    """
    url: listing URL template with the fixed format
         https://news.sina.com.cn/roll/#pageid=153&lid=2970&k=&num=50&page={}
    page_nums: number of rolling-news pages to crawl, an integer in [1, 50]; default 50 (the maximum)
    stop_time_limit: upper bound (in seconds) of the random pause used to avoid IP bans; default 3
    verbose: print flag; 0 = silent, 1 = per-page progress, 2 = per-item detail; default 1
    withSave: whether to save the outputs under the current folder; default False
    Input: url
    Output: a pandas DataFrame of news metadata, and a list of bs4 HTML fragments
    """
    day_html = []
    df = pd.DataFrame([], columns=["label", "title", "time", "url"])
    if url is None:  # fall back to the default template instead of silently discarding the argument
        url = "https://news.sina.com.cn/roll/#pageid=153&lid=2970&k=&num=50&page={}"
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    # Start the driver (Selenium 4 style: explicit Service for the local chromedriver)
    driver = webdriver.Chrome(service=Service("chromedriver.exe"), options=chrome_options)
    driver.implicitly_wait(60)
    # Fetch the HTML of each page -- this is where most of the time goes
    for page in range(1, page_nums + 1):
        driver.get(url.format(page))
        driver.refresh()  # the list is rendered client-side; refresh so the requested page loads
        soup = BeautifulSoup(driver.page_source, "lxml")
        frame = soup.find("div", attrs={"class": "d_list_txt", "id": "d_list"})
        day_html.append(frame)
        time.sleep(random.randint(1, stop_time_limit))  # random pause against IP bans
        if verbose != 0:
            print(page, url.format(page), len(day_html))
    # Extract the news metadata -- this part is very fast
    for html in day_html:
        if html is None:  # skip pages whose list frame failed to render
            continue
        for li in html.find_all("li"):
            url = li.a["href"]
            label = li.span.text
            title = li.a.text
            public = li.find("span", attrs={"class": "c_time"}).text
            df.loc[len(df)] = [label, title, public, url]
            if verbose == 2:
                print("{}\t{}\t{}".format(df.shape[0], public, title))
    # Quit the driver so no background process is left behind
    driver.quit()
    # Optional saving
    if withSave:
        if not os.path.isdir("dataDaliy"):
            os.makedirs('dataDaliy')
        if not os.path.isdir("pklDaliy"):
            os.makedirs('pklDaliy')
        curr = datetime.datetime.now()
        curr_pkl = "pklDaliy/{}_{}_{}_{}_{}news.pkl".format(curr.year, curr.month, curr.day, curr.hour, curr.minute)
        curr_excel = "dataDaliy/{}_{}_{}_{}_{}news.xlsx".format(curr.year, curr.month, curr.day, curr.hour, curr.minute)
        with open(curr_pkl, "wb") as f:  # close the file handle properly
            pickle.dump(day_html, f)
        df.to_excel(curr_excel, index=False)
    return df, day_html


def SingleNewsExtractor(url, verbose=False):
    """
    url: news article URL
    verbose: whether to print the extraction result; default False
    Output: the gne extraction dict (title, author, publish_time, content, images)
    """
    extractor = GeneralNewsExtractor()
    user_agent_pc = [
        # Chrome
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        # Firefox
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        # Opera
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        # QQ Browser
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        # Sogou Browser
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        # 360 Browser
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        # UC Browser
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    ]
    user_agent = {'User-Agent': random.choice(user_agent_pc)}  # rotate a random desktop UA per request
    rep = requests.get(url, headers=user_agent)
    source = rep.content.decode("utf-8", errors='ignore')
    result = extractor.extract(source)
    if verbose:
        print(result)
    return result


def FileNewsExtractor(csv_file, save_file, verbose=False):
    """
    csv_file: path of the metadata table produced by SinaNewsExtractor
              (despite the name, it is read with pd.read_excel, so pass an .xlsx file)
    save_file: path of the output file
    verbose: whether to print each extraction result; default False
    Input: metadata file path (must contain a url column), output file path
    Output: a pandas DataFrame of detailed news information
    """
    news = pd.DataFrame([], columns=['title', 'author', 'publish_time', 'content', 'images'])
    data = pd.read_excel(csv_file)
    for idx, news_url in tqdm(enumerate(data["url"]), total=len(data["url"])):
        news_infos = SingleNewsExtractor(news_url, verbose=verbose)
        # align the gne result dict to the column order
        news.loc[idx] = [news_infos.get(col) for col in news.columns]
        if idx % 3 == 0 and idx != 0:  # pause every third request against IP bans
            time.sleep(random.randint(1, 3))
    news.to_excel(save_file, index=False)
    return news


# Test driver
def test(c1=False, c2=False):
    # Test 1: crawl the rolling-news listing
    if c1:
        url = "https://news.sina.com.cn/roll/#pageid=153&lid=2970&k=&num=50&page={}"
        df, day_html = SinaNewsExtractor(url=url, page_nums=50, stop_time_limit=5, verbose=1, withSave=True)
    # Test 2: extract article details for a previously saved listing
    if c2:
        FileNewsExtractor("dataDaliy/2021_7_17_9_26news.xlsx", "detailed_news.xlsx")


if __name__ == '__main__':
    test(c1=False, c2=False)
```

c1 and c2 are booleans that act as switches; pass True to enable the corresponding test.
Table 1 outputs are all stored in the dataDaliy folder.
Table 2 is saved to a user-specified path.
The raw HTML fragments behind table 1 are also saved to the pklDaliy folder.

Implemented Applications

Emergency-event news extraction and classification [to be written]

  • Rule-based triggering