ArxivExtractor.py

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    '''
    @File        : ArxivExtractor.py
    @Contact     : htkstudy@163.com
    @Modify Time      @Author      @Version  @Description
    ------------      -------      --------  ------------
    2021/5/10 21:40   Armor(htk)   1.0       None
    '''
    import re
    import time
    import numpy as np
    import random
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import smtplib
    from email.mime.text import MIMEText
    from email.mime.multipart import MIMEMultipart
    def seed_email(file_name, receivers_list):
        # Account settings for a 163 mailbox
        mail_host = 'smtp.163.com'   # SMTP server address
        mail_user = '*****@163.com'  # account name
        mail_pass = '**********'     # password (some providers require an authorization code instead)
        # Sender address
        sender = '*****@163.com'
        # Recipient addresses; pass a list such as ['*****@qq.com'] so one call can mail several people
        receivers = receivers_list
        # Build the message
        message = MIMEMultipart()
        message['From'] = sender
        message['To'] = ','.join(receivers)
        # Subject line
        message['Subject'] = 'Arxiv每日推送'  # "arXiv daily digest"
        # An HTML body is the most flexible choice: it can carry image links, styling, and so on
        with open(file_name, 'r', encoding="utf-8") as f:
            content = f.read()
        part1 = MIMEText(content, 'html', 'utf-8')
        # Attach the HTML part to the message body
        message.attach(part1)
        # Log in and send
        try:
            smtpObj = smtplib.SMTP()
            smtpObj.connect(mail_host, 25)  # connect to the server
            smtpObj.login(mail_user, mail_pass)
            smtpObj.sendmail(sender, receivers, message.as_string())
            smtpObj.quit()
            print('success')
        except smtplib.SMTPException as e:
            print('error', e)
    class GeneralArxivExtractor(object):
        def extract(self, html):
            # (stub) pulls out the per-subject blocks; not used by the main loop yet
            title = re.findall(r'<title>(.*?) - arXiv每日学术速递</title>', html)[0]
            print(title)
            subhtml = re.findall(r'<article data-v-f3c566ae="" data-v-04c933e6="" class="global">(.*?)</article>', html)[0]
            subjects = re.findall(r'<center>(.*?)</center>', subhtml)[1:]
            return 0

        def simple(self, html, spical):
            title = re.findall(r'<title>(.*?) - arXiv每日学术速递</title>', html)[0]
            print(title)
            # Keep the page only when the title mentions one of the watched topics
            flags = [1 if key in title else 0 for key in spical]
            if np.array(flags).sum() > 0:
                subhtml = re.findall(r'<article data-v-f3c566ae="" data-v-04c933e6="" class="global">(.*?)</article>', html)[0]
                # Wrap the extracted <article> in a minimal standalone page
                front = '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"></head><body>'
                tail = '</body></html>'
                html = front + subhtml + tail
                with open("html/{}.html".format(title), "w", encoding='utf-8') as f:
                    f.write(html)
    class SelenuimLoading(object):
        def __init__(self):
            # Chrome options
            self.chrome_options = Options()
            self.chrome_options.add_argument('--disable-gpu')      # recommended by the Chrome docs to work around a bug
            self.chrome_options.add_argument('--hide-scrollbars')  # hide the scrollbars on some special pages
            self.chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
            self.driver = webdriver.Chrome(executable_path=r'D:\HuProject\chromedriver_win32\chromedriver.exe',
                                           options=self.chrome_options)

        def Spider_Html(self, url):
            self.driver.implicitly_wait(60)
            self.driver.get(url)
            time.sleep(random.randint(5, 8))  # small random pause so the requests look less mechanical
            html = self.driver.page_source
            return html

        def close(self):
            self.driver.close()
    if __name__ == '__main__':
        sp = ['人工智能', '机器学习', '计算机视觉', '自然语言处理']  # AI / ML / CV / NLP
        # Build the thread URLs, newest first
        url_format = "https://www.arxivdaily.com/thread/{}"
        urls = [url_format.format(i) for i in np.arange(970, 11152)[::-1]]
        # Start the browser
        web = SelenuimLoading()
        # Create the extractor
        extractor = GeneralArxivExtractor()
        # Fetch and parse every page
        for idx, url in enumerate(urls):
            try:
                # Fetch the rendered html
                html = web.Spider_Html(url)
                timeput = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                print("{} {}/{}".format(timeput, idx + 1, len(urls)))
                # Extract and save the digest
                extractor.simple(html.replace("\n", ""), sp)
            except Exception as e:
                print(urls[idx], "failed:", e)
        web.close()
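
Many 163 mailboxes only accept SMTP logins with an authorization code, and plain port 25 is blocked on a lot of networks. If the connect step in seed_email fails, implicit SSL on port 465 is a common drop-in alternative. Below is a minimal sketch assuming the same placeholder credentials as above; the helper name send_via_ssl is ours, not part of the original script.

    # Sketch: same send path as seed_email, but over implicit SSL (port 465).
    import smtplib

    def send_via_ssl(mail_host, mail_user, mail_pass, sender, receivers, message):
        # SMTP_SSL negotiates TLS during connect, so no separate connect() call is needed
        smtpObj = smtplib.SMTP_SSL(mail_host, 465)
        smtpObj.login(mail_user, mail_pass)
        smtpObj.sendmail(sender, receivers, message.as_string())
        smtpObj.quit()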

    SeedArxiv.py

    import re
    import time
    import numpy as np
    import random
    import os
    import datetime
    from ArxivExtractor import seed_email
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    class GeneralArxivExtractor(object):
        def extract(self, html):
            # (stub) pulls out the per-subject blocks; not used by the main loop yet
            title = re.findall(r'<title>(.*?) - arXiv每日学术速递</title>', html)[0]
            print(title)
            subhtml = re.findall(r'<article data-v-f3c566ae="" data-v-04c933e6="" class="global">(.*?)</article>', html)[0]
            subjects = re.findall(r'<center>(.*?)</center>', subhtml)[1:]
            return 0

        def simple(self, html, spical):
            title = re.findall(r'<title>(.*?) - arXiv每日学术速递</title>', html)[0]
            print(title)
            # Keep the page only when the title mentions one of the watched topics
            flags = [1 if key in title else 0 for key in spical]
            if np.array(flags).sum() > 0:
                subhtml = re.findall(r'<article data-v-f3c566ae="" data-v-04c933e6="" class="global">(.*?)</article>', html)[0]
                # Wrap the extracted <article> in a minimal standalone page
                front = '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"></head><body>'
                tail = '</body></html>'
                html = front + subhtml + tail
                with open("html/{}.html".format(title), "w", encoding='utf-8') as f:
                    f.write(html)
    class SelenuimLoading(object):
        def __init__(self):
            # Chrome options: a single Options object carries every flag, since
            # webdriver.Chrome must not be given options and chrome_options at the same time
            self.options = Options()
            self.options.add_argument('--headless')                # run without a visible window
            self.options.add_argument('--disable-gpu')             # recommended by the Chrome docs to work around a bug
            self.options.add_argument('--hide-scrollbars')         # hide the scrollbars on some special pages
            self.options.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
            self.driver = webdriver.Chrome(executable_path=r'D:\HuProject\chromedriver_win32\chromedriver.exe',
                                           options=self.options)

        def Spider_Html(self, url):
            self.driver.implicitly_wait(60)
            self.driver.get(url)
            # Click the newest digest on the category page; it opens in a new tab
            first = self.driver.find_element_by_xpath(r'//*[@id="__layout"]/div/div[2]/div/main/div[2]/div[1]/div[1]/div/div/div/div[2]/a/div/span')
            first.click()
            self.driver.switch_to.window(self.driver.window_handles[-1])
            time.sleep(random.randint(5, 8))
            html = self.driver.page_source
            return html

        def close(self):
            self.driver.quit()
    def scray(choose):
        urls = {'计算机视觉': "https://www.arxivdaily.com/category/19?search_ids=19",    # computer vision
                '自然语言处理': "https://www.arxivdaily.com/category/20?search_ids=20",  # natural language processing
                '人工智能': 'https://www.arxivdaily.com/category/21?search_ids=21',      # artificial intelligence
                '机器学习': 'https://www.arxivdaily.com/category/22?search_ids=22'       # machine learning
                }
        url = urls[choose]
        # Start the browser
        web = SelenuimLoading()
        # Create the extractor
        extractor = GeneralArxivExtractor()
        try:
            # Fetch the rendered html
            html = web.Spider_Html(url)
            timeput = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print(timeput)
            # Extract and save the digest
            extractor.simple(html.replace("\n", ""), urls.keys())
        except Exception as e:
            # Report which URL failed and why
            print(url, "failed")
            print(e.args)
        web.close()
    if __name__ == '__main__':
        choose = ["自然语言处理", "计算机视觉", "人工智能", "机器学习"]
        receiver_lists = ['1939454633@qq.com', 'htkstudy@163.com', 'htkstudy@163.com']
        flag = True
        while True:
            scray("自然语言处理")
            current_time = datetime.datetime.now()
            current_time_string = str(current_time.year) + "_" + str(current_time.month) + "_" + str(current_time.day)
            # Mail out today's digest as soon as it shows up in html/
            for file_name in os.listdir("html"):
                if current_time_string in file_name and "自然语言处理" in file_name and ".html" in file_name:
                    seed_email("html/" + file_name, receiver_lists)
                    flag = False
                    break
            if flag:
                print("no update yet")  # today's digest is not out; retry after a pause
            else:
                break
            time.sleep(20)
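
One portability note: newer Selenium releases (4.x) drop both the executable_path keyword and the find_element_by_xpath method used above. A minimal sketch of the equivalent Selenium 4 construction, assuming the same driver path; this is our adaptation, not part of the original scripts.

    # Sketch: Selenium 4 equivalent of the SelenuimLoading setup above.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.common.by import By

    options = Options()
    options.add_argument('--headless')
    options.add_argument('blink-settings=imagesEnabled=false')
    # The driver path moves into a Service object in Selenium 4
    driver = webdriver.Chrome(service=Service(r'D:\HuProject\chromedriver_win32\chromedriver.exe'),
                              options=options)
    # Element lookups go through By locators instead of find_element_by_* methods
    first = driver.find_element(By.XPATH, r'//*[@id="__layout"]/div/div[2]/div/main/div[2]/div[1]/div[1]/div/div/div/div[2]/a/div/span')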