"""Scrape Baidu News search results and append them to ``Savenews.txt``.

The script downloads one search-result page, cuts it into per-article HTML
fragments with regular expressions, prints the title, link, publication time
and source of every article, and appends them to a text file.
"""

import re

import requests

# Each pattern is compiled once with re.S so that "." also matches newlines;
# a result fragment may span several lines of HTML.
_NEWS_RE = re.compile(
    '<div class="result-op c-container xpath-log new-pmd".*?aria-label="百度快照">百度快照</a></div></div></div>',
    re.S,
)
_TITLE_RE = re.compile('<h3 class="news-title_1YtI1">.*?aria-label="标题:(.*?)"', re.S)
_HREF_RE = re.compile('<h3 class="news-title_1YtI1">.*?<a href="(.*?)"', re.S)
_DATE_RE = re.compile('aria-label="发布于:.*?">(.*?)</span>', re.S)
_SOURCE_RE = re.compile('aria-label="新闻来源:.*?">(.*?)</span>', re.S)


def getHTMLText(url):
    """Fetch the page source of *url*.

    The request is sent with a desktop-browser User-Agent, a 30 second
    timeout, and redirects disabled. The HTTP status code is printed.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body, or ``None`` when the request fails or the
        server answers with a 4xx/5xx status (an error message is printed).
    """
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"}
    try:
        r = requests.get(url, timeout=30, headers=headers, allow_redirects=False)
        print(r.status_code)
        # Raises requests.HTTPError for 4xx/5xx responses only; a 3xx
        # response (redirects are disabled above) passes through.
        r.raise_for_status()
    except requests.RequestException as exc:
        print("网页访问异常!", exc)
        return None
    # Decode with the encoding detected from the body instead of the one
    # declared in the response headers.
    r.encoding = r.apparent_encoding
    return r.text


def _first(matches, default=''):
    """Return the first regex match, or *default* when nothing matched."""
    return matches[0] if matches else default


url = 'https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=%E9%98%BF%E9%87%8C%E5%B7%B4%E5%B7%B4'
html = getHTMLText(url)

# Split the page into one HTML fragment per news item. A failed download
# (html is None) is treated as "no results" instead of crashing in the regex.
news = _NEWS_RE.findall(html) if html else []
print(len(news))

# Append mode keeps the results of earlier runs. The encoding is explicit so
# that Chinese text is written the same way on every platform.
with open('Savenews.txt', 'a', encoding='utf-8') as out_file:
    for index, res in enumerate(news, start=1):
        # Title
        title = _TITLE_RE.findall(res)
        print(title)
        # Link
        href = _HREF_RE.findall(res)
        print(href)
        # Publication time
        date = _DATE_RE.findall(res)
        print(date)
        # News source
        source = _SOURCE_RE.findall(res)
        print(source)
        print('\n')

        # Write the entry; a field the page did not contain becomes an empty
        # string rather than raising IndexError.
        newsContent = f"{index}. {_first(title)} ({_first(date)}: {_first(source)})\n"
        out_file.write(newsContent)
        out_file.write(_first(href) + '\n')
        out_file.write('\n')