1. import requests
    2. import re
    def getHTMLText(url):
        """Fetch the HTML source of a web page.

        Sends a GET request with a desktop-browser User-Agent, a 30 second
        timeout, and redirect following disabled, then prints the HTTP status
        code.

        Args:
            url: Address of the page to download.

        Returns:
            The response body decoded with the encoding detected from its
            content, or None when the request fails or the server answers
            with a 4xx/5xx status.
        """
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"}
        try:
            r = requests.get(url, timeout=30, headers=headers, allow_redirects=False)
            print(r.status_code)
            # Raises HTTPError for 4xx/5xx only; because redirects are not
            # followed, a 3xx answer passes through with whatever body it has.
            r.raise_for_status()
        except requests.RequestException as exc:
            # Only network/HTTP failures are handled here; anything else
            # (e.g. KeyboardInterrupt, programming errors) propagates.
            print("网页访问异常!", exc)
            return None
        # Trust the encoding guessed from the body over the header-declared one.
        r.encoding = r.apparent_encoding
        return r.text
    # Baidu news search; the `word` query parameter is the URL-encoded keyword "阿里巴巴".
    NEWS_URL = 'https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=%E9%98%BF%E9%87%8C%E5%B7%B4%E5%B7%B4'
    # Results are appended to this file on every run.
    OUTPUT_FILE = 'Savenews.txt'

    # re.S lets '.' span line breaks, since a result card covers several lines.
    # One whole search-result card, from its container <div> to the snapshot link.
    NEWS_PATTERN = re.compile(
        '<div class="result-op c-container xpath-log new-pmd".*?aria-label="百度快照">百度快照</a></div></div></div>',
        re.S,
    )
    # Headline text, taken from the aria-label that follows the title <h3>.
    TITLE_PATTERN = re.compile('<h3 class="news-title_1YtI1">.*?aria-label="标题:(.*?)"', re.S)
    # Target of the first link inside the title <h3>.
    HREF_PATTERN = re.compile('<h3 class="news-title_1YtI1">.*?<a href="(.*?)"', re.S)
    # Text of the <span> labelled as the publication time.
    DATE_PATTERN = re.compile('aria-label="发布于:.*?">(.*?)</span>', re.S)
    # Text of the <span> labelled as the news source.
    SOURCE_PATTERN = re.compile('aria-label="新闻来源:.*?">(.*?)</span>', re.S)


    def first_match(pattern, text):
        """Return the first captured group of *pattern* in *text*, or '' if absent."""
        match = pattern.search(text)
        return match.group(1) if match else ''


    def parse_news(html):
        """Extract the news entries from a search-result page.

        Args:
            html: Page source as a string; None or '' yields an empty list.

        Returns:
            A list with one dict per result card, each holding the keys
            'title', 'href', 'date' and 'source'. A field that cannot be
            found in its card is stored as an empty string.
        """
        if not html:
            # The download failed or returned nothing; there is nothing to parse.
            return []
        return [
            {
                'title': first_match(TITLE_PATTERN, card),
                'href': first_match(HREF_PATTERN, card),
                'date': first_match(DATE_PATTERN, card),
                'source': first_match(SOURCE_PATTERN, card),
            }
            for card in NEWS_PATTERN.findall(html)
        ]


    def format_news(index, item):
        """Return the text written to the output file for one news entry.

        The layout is "<index>. <title> (<date>: <source>)", the link on the
        next line, then a blank separator line.
        """
        return '{}. {} ({}: {})\n{}\n\n'.format(
            index, item['title'], item['date'], item['source'], item['href']
        )


    def main():
        """Download the search page, print what was found and append it to the output file."""
        html = getHTMLText(NEWS_URL)
        items = parse_news(html)
        print(len(items))
        # Explicit UTF-8 keeps the Chinese text writable regardless of the
        # platform's default encoding; `with` closes the file even on error.
        with open(OUTPUT_FILE, 'a', encoding='utf-8') as out:
            for index, item in enumerate(items, start=1):
                print(item['title'])
                print(item['href'])
                print(item['date'])
                print(item['source'])
                print('\n')
                out.write(format_news(index, item))


    if __name__ == '__main__':
        main()