# CSDN爬取文章保存为Markdown - 《Python实战》

from selenium import webdriver
import os
import time
import html2text as ht
from bs4 import BeautifulSoup
import parsel
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import ChromeOptions
# HTML wrapper that the extracted article is substituted into (as ``content``)
# before conversion; the original author added it to work around garbled text.
html = (
    "\n"
    "    {content}\n"
)
# Directory in which the generated Markdown files are stored.
path = "./文件"
# Initialize the browser
def init(scroll_times: int = 10, scroll_pixels: int = 200):
    """Start headless Chrome, open the CSDN home page and scroll to load the feed.

    Args:
        scroll_times: Number of incremental scroll steps performed after the
            initial jump to the bottom of the page. Raise it to load (and
            therefore crawl) more articles.
        scroll_pixels: Number of pixels scrolled on each step.

    Returns:
        The ``webdriver.Chrome`` instance, left on the scrolled home page.
        The caller owns it and should call ``driver.quit()`` when done.
    """
    # Headless mode and the automation-detection workaround must live on ONE
    # options object. Passing a second object through the deprecated
    # ``chrome_options`` keyword makes Selenium 3 discard ``options`` entirely,
    # and newer Selenium 4 releases no longer accept that keyword at all.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Request the CSDN home page
        driver.get("https://www.csdn.net/")
        time.sleep(0.5)
        # Jump to the bottom of the page first
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        # Then keep scrolling step by step, pausing between steps
        for _ in range(scroll_times):
            driver.execute_script('window.scrollBy(0,{})'.format(scroll_pixels))
            time.sleep(0.5)
        time.sleep(3)
    except BaseException:
        # Do not leave an orphaned browser process behind if setup fails.
        driver.quit()
        raise
    return driver
# Crawl the article feed
def Crawling(driver):
    """Parse the feed on the driver's current page and crawl each article.

    The page source is parsed once up front, so the navigation performed by
    ``page_Crawling`` inside the loop does not affect the list being iterated.

    Args:
        driver: Selenium WebDriver whose current page contains the feed. It is
            also handed to ``page_Crawling`` to load each detail page.
    """
    # Parse with the lxml backend
    data = BeautifulSoup(driver.page_source, "lxml")
    # Locate the "feedlist_mod home" container, then every "clearfix" entry in it
    feed = data.find(class_="feedlist_mod home")
    if feed is None:
        # The page did not load or its layout changed; there is nothing to crawl.
        print("feed container 'feedlist_mod home' not found, nothing to crawl")
        return
    for li in feed.find_all(class_="clearfix"):
        # Parse the entry on its own; the searches below then cover the entry
        # element itself as well as its descendants.
        li_data = BeautifulSoup(str(li), "lxml")
        # An entry without a detail-page link is not a blog article: skip it
        anchor = li_data.find("a")
        page_url = anchor.get("href") if anchor is not None else None
        if not page_url:
            continue
        # Skip official live-stream entries
        name_tag = li_data.find(class_="name")
        author_link = name_tag.find("a") if name_tag is not None else None
        if author_link is not None and author_link.text.strip() == '官方直播':
            continue
        # Article title: text of the first link, with spaces removed
        title = anchor.text.replace(" ", "")
        # Fetch the detail page and save it
        page_Crawling(title, page_url, driver)
# Crawl a detail page and save it as a Markdown file
def _safe_filename(title):
    """Return *title* reduced to something usable as a single file name."""
    # Path separators, characters Windows reserves, and line breaks/tabs.
    unsafe = '\\/:*?"<>|\r\n\t'
    cleaned = title.translate(str.maketrans(unsafe, "_" * len(unsafe))).strip(" .")
    return cleaned or "untitled"


def page_Crawling(title,page_url,driver):
    """Load one article page, convert its ``<article>`` to Markdown and save it.

    The Markdown file is written to ``path`` under a file name derived from
    *title*. The intermediate HTML is also written to ``text.html`` in the
    current working directory, overwriting any previous one.

    Args:
        title: Article title; used (after sanitising) as the output file name.
        page_url: URL of the article detail page.
        driver: Selenium WebDriver used to load the page.
    """
    # Create the output directory if needed (no check-then-create race)
    os.makedirs(path, exist_ok=True)
    # Request the detail page
    driver.get(page_url)
    # Select the article element with a CSS selector
    selector = parsel.Selector(driver.page_source)
    text = selector.css("article").get()
    if text is None:
        # No <article> on this page: skip it rather than saving the text "None".
        print(title + " skipped: no <article> element found")
        return
    # Wrap the article in the HTML template and keep a copy on disk
    html_page = html.format(content=text)
    with open("text.html", "w", encoding="utf-8") as f:
        f.write(html_page)
    # Convert straight from memory; reading the file back adds nothing
    markdown = ht.HTML2Text().handle(html_page)
    # Write the converted content
    md_path = os.path.join(path, _safe_filename(title) + '.md')
    with open(md_path, 'w', encoding="utf-8") as f:
        f.write(markdown)
    print(title + "爬取完毕")
# Entry point
if __name__ == "__main__":
    # Start the browser and load the feed
    driver = init()
    try:
        # Crawl every article in the feed
        Crawling(driver)
    finally:
        # Always shut the browser down, otherwise the headless Chrome
        # process outlives the script.
        driver.quit()