ArxivExtractor.py
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File : ArxivExtractor.py
@Contact : htkstudy@163.com
@Modify Time @Author @Version @Description
------------ ------- -------- -----------
2021/5/10 21:40 Armor(htk) 1.0 None
'''
import re
import time
import numpy as np
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.image import MIMEImage
def seed_email(file_name, receivers_list):
    # SMTP server settings
    # 163 mailbox SMTP server address
    mail_host = 'smtp.163.com'
    # 163 account name
    mail_user = '*****@163.com'
    # password (for some mailboxes this is an authorization code, not the login password)
    mail_pass = '**********'
    # sender address
    sender = '*****@163.com'
    # recipient addresses; note this must be a list, so you can send to several addresses at once
    receivers = receivers_list  # e.g. receivers = ['*****@qq.com']
    # build the message
    message = MIMEMultipart()
    message['From'] = sender
    message['To'] = ','.join(receivers)  # list every recipient in the header, not just the first
    # subject
    message['Subject'] = 'Arxiv每日推送'  # "arXiv daily digest"
    # an HTML body is recommended: it is flexible and can carry image links, formatting, etc.
    with open(file_name, 'r', encoding="utf-8") as f:
        content = f.read()
    # mark the body as HTML
    part1 = MIMEText(content, 'html', 'utf-8')
    # attach the body to the message
    message.attach(part1)
    # log in and send
    try:
        smtpObj = smtplib.SMTP()
        # connect to the server
        smtpObj.connect(mail_host, 25)
        # log in
        smtpObj.login(mail_user, mail_pass)
        # send
        smtpObj.sendmail(sender, receivers, message.as_string())
        # quit
        smtpObj.quit()
        print('success')
    except smtplib.SMTPException as e:
        print('error', e)  # print the error
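# Many 163.com accounts now reject plain SMTP on port 25 and require SSL with an
# authorization code. A minimal SSL variant sketch, under that assumption; the
# credentials below are placeholders, and seed_email_ssl is not part of the original script.
def seed_email_ssl(file_name, receivers_list,
                   mail_host='smtp.163.com', mail_user='*****@163.com',
                   mail_pass='**********', sender='*****@163.com'):
    message = MIMEMultipart()
    message['From'] = sender
    message['To'] = ','.join(receivers_list)
    message['Subject'] = 'Arxiv每日推送'
    with open(file_name, 'r', encoding='utf-8') as f:
        message.attach(MIMEText(f.read(), 'html', 'utf-8'))
    smtp = smtplib.SMTP_SSL(mail_host, 465)  # implicit TLS instead of plain port 25
    smtp.login(mail_user, mail_pass)         # mail_pass is the authorization code here
    smtp.sendmail(sender, receivers_list, message.as_string())
    smtp.quit()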
class GeneralArxivExtractor(object):
    def extract(self, html):
        # stub: pulls the page title and the per-subject <center> blocks, but does not use them yet
        title = re.findall(r'<title>(.*?) - arXiv每日学术速递</title>', html)[0]
        print(title)
        subhtml = re.findall(r'<article data-v-f3c566ae="" data-v-04c933e6="" class="global">(.*?)</article>', html)[0]
        subjects = re.findall(r'<center>(.*?)</center>', subhtml)[1:]
        return 0

    def simple(self, html, spical):
        title = re.findall(r'<title>(.*?) - arXiv每日学术速递</title>', html)[0]
        print(title)
        # keep the page only if the title contains one of the chosen keywords
        flags = [1 if key in title else 0 for key in spical]
        if np.array(flags).sum() > 0:
            subhtml = re.findall(r'<article data-v-f3c566ae="" data-v-04c933e6="" class="global">(.*?)</article>', html)[0]
            front = '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"></head><body>'
            tail = '</body></html>'
            html = front + subhtml + tail  # wrap the article in a complete, closed HTML document
            with open("html/{}.html".format(title), "w", encoding='utf-8') as f:
                f.write(html)
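# The regexes above are pinned to Vue's data-v-* attributes, which change whenever the
# site is redeployed. A more robust sketch using BeautifulSoup; this assumes bs4 is
# installed, and the <title>/<article> structure is taken from the regexes above.
from bs4 import BeautifulSoup

def extract_article(html):
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.title.string if soup.title else ''
    article = soup.find('article')  # matches regardless of the data-v-* attributes
    return title, (article.decode() if article else '')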
class SelenuimLoading(object):
    def __init__(self):
        # Chrome options
        self.chrome_options = Options()
        self.chrome_options.add_argument('--disable-gpu')  # Google's docs recommend this flag to work around a bug
        self.chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
        self.chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip images to speed up loading
        self.driver = webdriver.Chrome(executable_path=r'D:\HuProject\chromedriver_win32\chromedriver.exe',
                                       options=self.chrome_options)

    def Spider_Html(self, url):
        self.driver.implicitly_wait(60)
        self.driver.get(url)
        time.sleep(random.randint(5, 8))
        html = self.driver.page_source
        return html

    def close(self):
        self.driver.quit()  # quit() shuts down the whole session, not just one window
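# A small convenience sketch: wrapping the loader in a context manager guarantees the
# browser is shut down even when a page raises. browser_session is illustrative only,
# not part of the original script.
from contextlib import contextmanager

@contextmanager
def browser_session():
    web = SelenuimLoading()
    try:
        yield web
    finally:
        web.close()
# usage: with browser_session() as web: html = web.Spider_Html(url)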
if __name__ == '__main__':
    org = 12056  # unused in this script
    sp = ['人工智能', '机器学习', '计算机视觉', '自然语言处理']  # AI, ML, CV, NLP keywords
    # build the urls, newest thread first
    url_format = "https://www.arxivdaily.com/thread/{}"
    urls = [url_format.format(i) for i in np.arange(970, 11152)[::-1]]
    # start the browser
    web = SelenuimLoading()
    # create the extractor
    extractor = GeneralArxivExtractor()
    # fetch each page in urls
    for idx, url in enumerate(urls):
        try:
            # fetch the html
            html = web.Spider_Html(url)
            timeput = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print("{} {}/{}".format(timeput, idx + 1, len(urls)))
            # extract and save the page
            extractor.simple(html.replace("\n", ""), sp)
        except Exception as e:
            print(urls[idx], "failed:", e)
    web.close()
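# Crawling ~10k pages will hit transient failures. A hedged retry sketch; the delays
# and attempt count are arbitrary choices, not from the original script, and it would
# need to be defined before the loop above to be used there.
def fetch_with_retry(web, url, attempts=3):
    for i in range(attempts):
        try:
            return web.Spider_Html(url)
        except Exception:
            if i == attempts - 1:
                raise
            time.sleep(2 ** i * 10)  # simple exponential backoff: 10s, 20s, 40s, ...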
SeedArxiv.py
import re
import time
import numpy as np
import random
import os
import datetime
from ArxivExtractor import seed_email
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
class GeneralArxivExtractor(object):
    # duplicated from ArxivExtractor.py
    def extract(self, html):
        # stub: pulls the page title and the per-subject <center> blocks, but does not use them yet
        title = re.findall(r'<title>(.*?) - arXiv每日学术速递</title>', html)[0]
        print(title)
        subhtml = re.findall(r'<article data-v-f3c566ae="" data-v-04c933e6="" class="global">(.*?)</article>', html)[0]
        subjects = re.findall(r'<center>(.*?)</center>', subhtml)[1:]
        return 0

    def simple(self, html, spical):
        title = re.findall(r'<title>(.*?) - arXiv每日学术速递</title>', html)[0]
        print(title)
        # keep the page only if the title contains one of the chosen keywords
        flags = [1 if key in title else 0 for key in spical]
        if np.array(flags).sum() > 0:
            subhtml = re.findall(r'<article data-v-f3c566ae="" data-v-04c933e6="" class="global">(.*?)</article>', html)[0]
            front = '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"></head><body>'
            tail = '</body></html>'
            html = front + subhtml + tail  # wrap the article in a complete, closed HTML document
            with open("html/{}.html".format(title), "w", encoding='utf-8') as f:
                f.write(html)
class SelenuimLoading(object):
    def __init__(self):
        # a single Options object carries every Chrome flag
        self.options = Options()
        self.options.add_argument('--headless')  # run without a visible window
        self.options.add_argument('--disable-gpu')  # Google's docs recommend this flag to work around a bug
        self.options.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
        self.options.add_argument('blink-settings=imagesEnabled=false')  # skip images to speed up loading
        self.driver = webdriver.Chrome(executable_path=r'D:\HuProject\chromedriver_win32\chromedriver.exe',
                                       options=self.options)

    def Spider_Html(self, url):
        self.driver.implicitly_wait(60)
        self.driver.get(url)
        # open the newest digest linked from the category page
        first = self.driver.find_element_by_xpath(r'//*[@id="__layout"]/div/div[2]/div/main/div[2]/div[1]/div[1]/div/div/div/div[2]/a/div/span')
        first.click()
        # the link opens a new tab; switch to it
        self.driver.switch_to.window(self.driver.window_handles[-1])
        time.sleep(random.randint(5, 8))
        html = self.driver.page_source
        return html

    def close(self):
        self.driver.quit()
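# The absolute XPath above breaks easily when the page layout shifts. A hedged sketch
# using an explicit wait and a shorter locator; the CSS selector is illustrative, not
# taken from the live page.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_first_digest(driver, timeout=30):
    link = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'main article a'))
    )
    link.click()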
def scray(choose):
    urls = {'计算机视觉': "https://www.arxivdaily.com/category/19?search_ids=19",   # computer vision
            '自然语言处理': "https://www.arxivdaily.com/category/20?search_ids=20",  # natural language processing
            '人工智能': 'https://www.arxivdaily.com/category/21?search_ids=21',      # artificial intelligence
            '机器学习': 'https://www.arxivdaily.com/category/22?search_ids=22'       # machine learning
            }
    url = urls[choose]
    # start the browser
    web = SelenuimLoading()
    # create the extractor
    extractor = GeneralArxivExtractor()
    # fetch the category page
    try:
        # fetch the html
        html = web.Spider_Html(url)
        timeput = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print(timeput)
        # extract and save the page
        extractor.simple(html.replace("\n", ""), urls.keys())
    except Exception as e:
        # print the failing url and the error details
        print(url, "failed")
        print(e.args)
    web.close()
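# A convenience wrapper, not in the original script: fetch every category in one pass.
# It is sequential on purpose, since each scray() call starts and quits its own browser.
def scray_all():
    for name in ['计算机视觉', '自然语言处理', '人工智能', '机器学习']:
        scray(name)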
if __name__ == '__main__':
    choose = ["自然语言处理", "计算机视觉", "人工智能", "机器学习"]  # candidate categories (only NLP is polled below)
    receiver_lists = ['1939454633@qq.com', 'htkstudy@163.com', 'htkstudy@163.com']
    flag = True
    while True:
        scray("自然语言处理")  # fetch the NLP digest
        current_time = datetime.datetime.now()
        current_time_string = str(current_time.year) + "_" + str(current_time.month) + "_" + str(current_time.day)
        # look for today's NLP digest among the saved pages and mail it once it appears
        for file_name in os.listdir("html"):
            if current_time_string in file_name and "自然语言处理" in file_name and ".html" in file_name:
                seed_email("html/" + file_name, receiver_lists)
                flag = False
                break
        if flag:
            print("not updated yet")
        else:
            break
        time.sleep(20)
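# The loop above polls every 20 seconds until today's digest appears, then exits. A
# hedged stdlib-only sketch for waking the script once per day instead; the 09:00
# target hour is an arbitrary assumption, and sleep_until is not part of the original.
def sleep_until(hour=9):
    now = datetime.datetime.now()
    target = now.replace(hour=hour, minute=0, second=0, microsecond=0)
    if target <= now:
        target += datetime.timedelta(days=1)  # already past today's target; wait for tomorrow
    time.sleep((target - now).total_seconds())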