Corpus description
Intended uses of the corpus: text classification, information extraction, public-opinion analysis, emergency management
Corpus Table 1 fields: [label, title, publish time, news URL]
Corpus Table 2 fields: [title, author, publish time, body text, image paths]
Crawler source: setup and walkthrough
Crawler stack: selenium, requests, BeautifulSoup
Crawler pipeline: runs one scheduled crawl of the rolling-news list, with an optional second pass for article details. (To avoid repeated crawling there should be a de-duplication check, which has not been written yet; see the sketch right below. A third, topic-focused pass is also planned.)
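A minimal sketch of that missing de-duplication step, merging the saved daily tables and dropping repeated URLs; the dedup_daily helper is an assumption, not part of the source:

import glob
import pandas as pd

def dedup_daily(folder="dataDaliy"):
    """Hypothetical helper: merge all saved daily tables and drop already-crawled news by URL."""
    files = glob.glob(folder + "/*.xlsx")
    if not files:
        return pd.DataFrame([], columns=["label", "title", "time", "url"])
    merged = pd.concat([pd.read_excel(f) for f in files], ignore_index=True)
    # a news item counts as a duplicate if its URL appeared in an earlier table
    return merged.drop_duplicates(subset="url", keep="first")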
Crawler source code
Every function documents its parameters as well as its inputs and outputs.
- Imports
import os
import random
import pickle
import datetime
import requests
import pandas as pd
import time
from tqdm import tqdm
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import sys
sys.setrecursionlimit(10000)  # deep BeautifulSoup trees can exceed the default limit when pickled
SinaNewsExtractor: Sina rolling-news list extractor
- SinaNewsExtractor
def SinaNewsExtractor(url=None, page_nums=50, stop_time_limit=3, verbose=1, withSave=False):
    """
    url: listing URL in the fixed format https://news.sina.com.cn/roll/#pageid=153&lid=2970&k=&num=50&page={}
    page_nums: number of rolling-news pages to crawl, an integer in [1, 50]; defaults to 50 (the maximum)
    stop_time_limit: upper bound in seconds of the random sleep used to avoid an IP ban; defaults to 3
    verbose: print verbosity: 0 = silent, 1 = per-page progress, 2 = per-item detail; defaults to 1
    withSave: whether to save the outputs under the current folder; defaults to False
    Input: url
    Output: a pandas DataFrame of news metadata and a list of bs4 html fragments
    """
    day_html = []
    df = pd.DataFrame([], columns=["label", "title", "time", "url"])
    if url is None:  # fall back to the default listing URL
        url = "https://news.sina.com.cn/roll/#pageid=153&lid=2970&k=&num=50&page={}"
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome("chromedriver.exe", options=chrome_options)  # start the driver (chromedriver.exe in the working directory)
    driver.implicitly_wait(60)
    # fetch the html pages (the main time cost)
for page in range(1,page_nums+1):
driver.get(url.format(page))
driver.refresh()
soup = BeautifulSoup(driver.page_source,"lxml")
frame = soup.find("div",attrs={"class":"d_list_txt","id":"d_list"})
day_html.append(frame)
time.sleep(random.randint(1,stop_time_limit))
if verbose != 0:
print(page,url.format(page),len(day_html))
    # extract the news info from the cached html (fast)
    for html in day_html:
        for li in html.find_all("li"):
            news_url = li.a["href"]
            label = li.span.text
            title = li.a.text
            public = li.find("span", attrs={"class": "c_time"}).text
            df.loc[len(df)] = [label, title, public, news_url]
            if verbose == 2:
                print("{}\t{}\t{}".format(df.shape[0], public, title))
    # quit the driver to avoid leftover background processes
driver.quit()
    # save the outputs if requested
    if withSave:
        os.makedirs("dataDaliy", exist_ok=True)
        os.makedirs("pklDaliy", exist_ok=True)
        curr = datetime.datetime.now()
        curr_pkl = "pklDaliy/{}_{}_{}_{}_{}news.pkl".format(curr.year, curr.month, curr.day, curr.hour, curr.minute)
        curr_excel = "dataDaliy/{}_{}_{}_{}_{}news.xlsx".format(curr.year, curr.month, curr.day, curr.hour, curr.minute)
        with open(curr_pkl, "wb") as f:
            pickle.dump(day_html, f)
        df.to_excel(curr_excel, index=False)
return df,day_html
SingleNewsExtractor: detailed extraction of a single news article
- SingleNewsExtractor
def SingleNewsExtractor(url, verbose=False):
    """
    url: news article URL
    verbose: whether to print the extraction result; defaults to False
    """
extractor = GeneralNewsExtractor()
    user_agent_pc = [
        # Chrome
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        # Firefox
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        # Opera
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        # QQ Browser
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        # Sogou Browser
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        # 360 Browser
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        # UC Browser
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    ]
    user_agent = {'User-Agent': random.choice(user_agent_pc)}
    rep = requests.get(url, headers=user_agent, timeout=30)  # timeout so a dead link cannot hang the crawl
    source = rep.content.decode("utf-8", errors='ignore')
result = extractor.extract(source)
if verbose:
print(result)
return result
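A quick usage sketch, feeding it a link taken from Table 1 (the variable names here are illustrative):

# usage sketch: take any link from the "url" column of Table 1
df, day_html = SinaNewsExtractor(withSave=False)
info = SingleNewsExtractor(df["url"][0], verbose=True)
# info is a dict with keys: title, author, publish_time, content, images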
FileNewsExtractor: file-level (batch) detailed news extraction
- FileNewsExtractor
def FileNewsExtractor(csv_file, save_file, verbose=False):
    """
    csv_file: path to the input table; note it is read with pd.read_excel and must contain a "url" column
    save_file: path of the output file
    verbose: whether to print each extraction result; defaults to False
    Input: input table path (must contain url), save path for the extracted news
    Output: a pandas DataFrame with detailed news info
    """
news = pd.DataFrame([],columns=['title', 'author', 'publish_time', 'content', 'images'])
data = pd.read_excel(csv_file)
    for idx, news_url in tqdm(enumerate(data["url"]), total=len(data["url"])):
        news_infos = SingleNewsExtractor(news_url, verbose=verbose)
        news.loc[idx] = news_infos
        if idx % 3 == 0 and idx != 0:  # pause every third article to throttle requests
            time.sleep(random.randint(1, 3))
news.to_excel(save_file,index=False)
return news
main: entry point
# test entry point
def test(c1=False, c2=False):
    # test 1: crawl the rolling-news list and save it
if c1:
url = "https://news.sina.com.cn/roll/#pageid=153&lid=2970&k=&num=50&page={}"
df,day_html = SinaNewsExtractor(url=url,page_nums=50,stop_time_limit=5,verbose=1,withSave=True)
    # test 2: extract detailed info from a saved Table 1 file
if c2:
FileNewsExtractor("dataDaliy/2021_7_17_9_26news.xlsx","detailed_news.xlsx")
if __name__ == '__main__':
test(c1=False,c2=False)
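The source only mentions that the crawl is scheduled; a minimal sketch of one way to schedule it with a plain sleep loop (the one-pass-per-day interval is an assumption, and it reuses SinaNewsExtractor from above):

# minimal scheduling sketch; assumes one pass per day, which the source does not specify
def run_scheduled(interval_hours=24):
    while True:
        SinaNewsExtractor(page_nums=50, stop_time_limit=5, verbose=1, withSave=True)
        time.sleep(interval_hours * 3600)  # wait until the next crawl pass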
Complete code
import os
import random
import pickle
import datetime
import requests
import pandas as pd
import time
from tqdm import tqdm
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import sys
sys.setrecursionlimit(10000)  # deep BeautifulSoup trees can exceed the default limit when pickled
def SinaNewsExtractor(url=None, page_nums=50, stop_time_limit=3, verbose=1, withSave=False):
    """
    url: listing URL in the fixed format https://news.sina.com.cn/roll/#pageid=153&lid=2970&k=&num=50&page={}
    page_nums: number of rolling-news pages to crawl, an integer in [1, 50]; defaults to 50 (the maximum)
    stop_time_limit: upper bound in seconds of the random sleep used to avoid an IP ban; defaults to 3
    verbose: print verbosity: 0 = silent, 1 = per-page progress, 2 = per-item detail; defaults to 1
    withSave: whether to save the outputs under the current folder; defaults to False
    Input: url
    Output: a pandas DataFrame of news metadata and a list of bs4 html fragments
    """
    day_html = []
    df = pd.DataFrame([], columns=["label", "title", "time", "url"])
    if url is None:  # fall back to the default listing URL
        url = "https://news.sina.com.cn/roll/#pageid=153&lid=2970&k=&num=50&page={}"
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome("chromedriver.exe", options=chrome_options)  # start the driver (chromedriver.exe in the working directory)
    driver.implicitly_wait(60)
    # fetch the html pages (the main time cost)
for page in range(1,page_nums+1):
driver.get(url.format(page))
driver.refresh()
soup = BeautifulSoup(driver.page_source,"lxml")
frame = soup.find("div",attrs={"class":"d_list_txt","id":"d_list"})
day_html.append(frame)
time.sleep(random.randint(1,stop_time_limit))
if verbose != 0:
print(page,url.format(page),len(day_html))
    # extract the news info from the cached html (fast)
    for html in day_html:
        for li in html.find_all("li"):
            news_url = li.a["href"]
            label = li.span.text
            title = li.a.text
            public = li.find("span", attrs={"class": "c_time"}).text
            df.loc[len(df)] = [label, title, public, news_url]
            if verbose == 2:
                print("{}\t{}\t{}".format(df.shape[0], public, title))
    # quit the driver to avoid leftover background processes
driver.quit()
    # save the outputs if requested
    if withSave:
        os.makedirs("dataDaliy", exist_ok=True)
        os.makedirs("pklDaliy", exist_ok=True)
        curr = datetime.datetime.now()
        curr_pkl = "pklDaliy/{}_{}_{}_{}_{}news.pkl".format(curr.year, curr.month, curr.day, curr.hour, curr.minute)
        curr_excel = "dataDaliy/{}_{}_{}_{}_{}news.xlsx".format(curr.year, curr.month, curr.day, curr.hour, curr.minute)
        with open(curr_pkl, "wb") as f:
            pickle.dump(day_html, f)
        df.to_excel(curr_excel, index=False)
return df,day_html
def SingleNewsExtractor(url, verbose=False):
    """
    url: news article URL
    verbose: whether to print the extraction result; defaults to False
    """
extractor = GeneralNewsExtractor()
    user_agent_pc = [
        # Chrome
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        # Firefox
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        # Opera
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        # QQ Browser
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        # Sogou Browser
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        # 360 Browser
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        # UC Browser
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    ]
    user_agent = {'User-Agent': random.choice(user_agent_pc)}
    rep = requests.get(url, headers=user_agent, timeout=30)  # timeout so a dead link cannot hang the crawl
    source = rep.content.decode("utf-8", errors='ignore')
result = extractor.extract(source)
if verbose:
print(result)
return result
def FileNewsExtractor(csv_file, save_file, verbose=False):
    """
    csv_file: path to the input table; note it is read with pd.read_excel and must contain a "url" column
    save_file: path of the output file
    verbose: whether to print each extraction result; defaults to False
    Input: input table path (must contain url), save path for the extracted news
    Output: a pandas DataFrame with detailed news info
    """
news = pd.DataFrame([],columns=['title', 'author', 'publish_time', 'content', 'images'])
data = pd.read_excel(csv_file)
    for idx, news_url in tqdm(enumerate(data["url"]), total=len(data["url"])):
        news_infos = SingleNewsExtractor(news_url, verbose=verbose)
        news.loc[idx] = news_infos
        if idx % 3 == 0 and idx != 0:  # pause every third article to throttle requests
            time.sleep(random.randint(1, 3))
news.to_excel(save_file,index=False)
return news
# test entry point
def test(c1=False, c2=False):
    # test 1: crawl the rolling-news list and save it
if c1:
url = "https://news.sina.com.cn/roll/#pageid=153&lid=2970&k=&num=50&page={}"
df,day_html = SinaNewsExtractor(url=url,page_nums=50,stop_time_limit=5,verbose=1,withSave=True)
    # test 2: extract detailed info from a saved Table 1 file
if c2:
FileNewsExtractor("dataDaliy/2021_7_17_9_26news.xlsx","detailed_news.xlsx")
if __name__ == '__main__':
test(c1=False,c2=False)
c1 and c2 are booleans that act as switches; pass True to enable each test.
Table 1 outputs are all stored in the dataDaliy folder.
Table 2 is saved to a user-specified path.
The raw html behind Table 1 is also pickled to the pklDaliy folder, so Table 1 can be re-parsed later without re-crawling; see the sketch below.
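A sketch of rebuilding Table 1 from a saved pickle, using the same parsing logic as SinaNewsExtractor (the pickle file name is illustrative):

import pickle
import pandas as pd

with open("pklDaliy/2021_7_17_9_26news.pkl", "rb") as f:  # file name is illustrative
    day_html = pickle.load(f)
df = pd.DataFrame([], columns=["label", "title", "time", "url"])
for html in day_html:
    for li in html.find_all("li"):
        df.loc[len(df)] = [li.span.text, li.a.text,
                           li.find("span", attrs={"class": "c_time"}).text, li.a["href"]]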
Implemented applications
Emergency-event news extraction and classification [to be written]
- Rule-based triggering (a minimal keyword sketch follows)
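A minimal sketch of what the rule-based trigger could look like, matching emergency keywords against crawled titles; the keyword list and the trigger_emergency helper are assumptions, not part of the source:

# hypothetical rule trigger: flag Table 1 rows whose title contains an emergency keyword
EMERGENCY_KEYWORDS = ["地震", "洪水", "爆炸", "疫情", "事故"]  # illustrative keyword list

def trigger_emergency(df):
    """Return the rows of a Table 1 DataFrame whose title matches any keyword."""
    mask = df["title"].apply(lambda t: any(k in str(t) for k in EMERGENCY_KEYWORDS))
    return df[mask]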