import requests
import re
# Fetch the HTML source of a web page
def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    The request is sent with a desktop-browser User-Agent, a 30-second
    timeout and redirect following disabled. The HTTP status code is printed
    for diagnostics, and the body is decoded using the encoding that
    ``requests`` detects from the content (``apparent_encoding``).

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response text, or ``None`` when the request fails
        (connection error, timeout, or a 4xx/5xx status).
    """
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"}
    try:
        r = requests.get(url, timeout=30, headers=headers, allow_redirects=False)
        print(r.status_code)
        # Raises requests.HTTPError for 4xx/5xx responses only; a 3xx answer
        # passes through because redirects are not followed.
        r.raise_for_status()
    except requests.RequestException:
        # Catch only request-level failures so that programming errors and
        # KeyboardInterrupt are no longer silently swallowed.
        print("网页访问异常!")
        return None
    r.encoding = r.apparent_encoding
    return r.text
# Baidu news search URL; the ``word`` parameter is the percent-encoded UTF-8
# form of the Chinese keyword for "Alibaba".
url = 'https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=%E9%98%BF%E9%87%8C%E5%B7%B4%E5%B7%B4'
html = getHTMLText(url)

# Split the page into one HTML fragment per news result. re.S lets ``.``
# match newlines, so a fragment may span several lines.
p_news = '<div class="result-op c-container xpath-log new-pmd".*?aria-label="百度快照">百度快照</a></div></div></div>'
# getHTMLText returns None when the download fails; treat that as "no
# results" instead of letting re.findall raise TypeError on a non-string.
news = re.findall(p_news, html, re.S) if html else []
print(len(news))
# Field patterns applied to each news fragment. They are loop-invariant, so
# they are compiled once here; re.S lets ``.`` match across line breaks.
p_title = re.compile(r'<h3 class="news-title_1YtI1">.*?aria-label="标题:(.*?)"', re.S)
p_href = re.compile(r'<h3 class="news-title_1YtI1">.*?<a href="(.*?)"', re.S)
p_date = re.compile(r'aria-label="发布于:.*?">(.*?)</span>', re.S)
p_source = re.compile(r'aria-label="新闻来源:.*?">(.*?)</span>', re.S)


def _first(matches):
    """Return the first element of ``matches``, or '' when it is empty."""
    return matches[0] if matches else ''


# Append the results to Savenews.txt. The ``with`` block closes the file even
# if an iteration raises, and the explicit UTF-8 encoding keeps Chinese text
# writable regardless of the platform's default locale encoding.
with open('Savenews.txt', 'a', encoding='utf-8') as file:
    for index, res in enumerate(news, start=1):
        # Extract the title.
        title = p_title.findall(res)
        print(title)
        # Extract the article URL.
        href = p_href.findall(res)
        print(href)
        # Extract the publication time.
        date = p_date.findall(res)
        print(date)
        # Extract the news source.
        source = p_source.findall(res)
        print(source)
        print('\n')
        # Write "<n>. <title> (<date>: <source>)", then the URL, then a blank
        # separator line. A field whose pattern did not match is written as
        # an empty string instead of raising IndexError.
        newsContent = '{}. {} ({}: {})\n'.format(
            index, _first(title), _first(date), _first(source))
        file.write(newsContent)
        file.write(_first(href) + '\n')
        file.write('\n')