title: Notes on data parsing for Python web crawlers
tags:
date: 2022-04-23
categories: python

A few notes on using bs4 and XPath syntax for data parsing in Python web scrapers.

bs4 syntax example

# Practice bs4 syntax by scraping the novel Journey to the West

import requests
import os

from bs4 import BeautifulSoup

url = 'https://xiyouji.5000yan.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
}
xs_dir = './西游记'

if not os.path.exists(xs_dir):
    os.mkdir(xs_dir)

rp = requests.get(url, headers=headers)
rp.encoding = 'utf-8'
html = rp.text

soup = BeautifulSoup(html, 'lxml')

# find_all() would also work here: soup.find_all("div", class_='book-mulu')
# select() takes a CSS selector; this grabs every chapter link in the table of contents
mulu = soup.select(".sidamingzhu-list-mulu > ul > li > a")

for zj in mulu:
    zj_url = zj.get("href")  # chapter URL, e.g. https://sanguo.5000yan.com/965.html; zj['href'] also works
    zj_name = zj.string + '.txt'  # chapter title, used as the file name
    zj_path = os.path.join(xs_dir, zj_name)  # path the chapter will be saved to
    zj_res = requests.get(zj_url, headers=headers)
    zj_res.encoding = 'utf-8'
    zj_html = zj_res.text
    zj_soup = BeautifulSoup(zj_html, 'lxml')
    content_title = zj_soup.select_one(".section-body > header > h2").string  # chapter heading
    content_body = zj_soup.select_one(".section-body > div").text  # chapter body text
    with open(zj_path, mode='w', encoding='utf-8') as f:
        f.write(content_title)
        f.write(content_body)
    print('[{0}] downloaded successfully'.format(zj_name))
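
For quick reference, here is a minimal bs4 sketch run against a small made-up HTML string (the markup, hrefs and chapter names below are invented for illustration, not taken from the site above); it covers the calls used in the script: find/find_all, the CSS selectors select/select_one, and attribute/text access.

# Minimal bs4 reference sketch; demo_html is a made-up example
from bs4 import BeautifulSoup

demo_html = '''
<div class="book-mulu">
  <ul>
    <li><a href="/1.html">Chapter 1</a></li>
    <li><a href="/2.html">Chapter 2</a></li>
  </ul>
</div>
'''
demo = BeautifulSoup(demo_html, 'lxml')

demo.find('a')                             # first matching tag
demo.find_all('div', class_='book-mulu')   # all matching tags, returned as a list
demo.select('.book-mulu > ul > li > a')    # CSS selector, returns a list
first = demo.select_one('li > a')          # CSS selector, first match only
first['href']                              # attribute access, same as first.get('href')
first.string                               # direct text of the tag
first.text                                 # all text inside the tag, including children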

XPath syntax notes

# Scrape images from the 4kmeinv section and save them locally
import requests
import os
import traceback

from lxml import etree

url = 'https://pic.netbian.com/4kmeinv/index_{0}.html'

headers = {
    'cache-control': 'no-cache',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
}
xs_dir = './图片'

if not os.path.exists(xs_dir):
    os.mkdir(xs_dir)


# Yield each image's file name and binary content
def get_img(html):
    tree = etree.HTML(html)
    # detail-page links for every <li> except the "next page" button
    res = tree.xpath('//*[@id="main"]/div[3]/ul/li[not(@class="nextpage")]/a/@href')
    for href in res:
        try:
            tmp_url = 'https://pic.netbian.com/' + href
            tmp_res = requests.get(tmp_url, headers=headers).text  # headers must be passed as a keyword argument
            tmp_tree = etree.HTML(tmp_res)
            img_uri = tmp_tree.xpath('//*[@id="img"]/img/@src')[0]
            img_name = tmp_tree.xpath('//*[@id="main"]/div[2]/div[1]/div[1]/h1/text()')[0].encode(
                'ISO-8859-1').decode('gbk') + '.jpg'
            # The page is GBK-encoded, so the name extracted above is mojibake;
            # re-encoding with .encode('ISO-8859-1').decode('gbk') recovers the original characters
            img_url = 'https://pic.netbian.com' + img_uri  # https://pic.netbian.com/uploads/allimg/220422/225834-165063951427ae.jpg
            img = requests.get(img_url, headers=headers)
            yield img_name, img.content
        except Exception:
            traceback.print_exc()


# Build the listing-page URL for each page to crawl (page 1 uses index.html, later pages index_{n}.html)
def get_index(url, page_num=1):
    index_list = []
    for i in range(1, page_num + 1):
        index_url = 'https://pic.netbian.com/4kmeinv/index.html' if i == 1 else url.format(i)
        index_list.append(index_url)
    return index_list


# Download each image and save it to the local directory
def main(img_base_dir):
    index_list = get_index(url, page_num=1)
    for index in index_list:
        html = requests.get(index, headers=headers).text
        for image_name, image_content in get_img(html=html):
            img_abs_path = os.path.join(img_base_dir, image_name)
            with open(img_abs_path, mode='wb') as f:
                f.write(image_content)
            print('{0} downloaded'.format(image_name))


if __name__ == '__main__':
    main(xs_dir)
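
For quick reference, here is a minimal lxml XPath sketch against a made-up HTML string (the markup and hrefs below are invented for illustration); it covers the expressions used above: attribute extraction with @href, text(), and the not(@class="nextpage") predicate.

# Minimal XPath reference sketch; demo_html is a made-up example
from lxml import etree

demo_html = '''
<div id="main">
  <ul>
    <li><a href="/a.html"><b>First</b></a></li>
    <li><a href="/b.html"><b>Second</b></a></li>
    <li class="nextpage"><a href="/page2.html">next</a></li>
  </ul>
</div>
'''
tree = etree.HTML(demo_html)

tree.xpath('//li/a/@href')                          # attribute values: ['/a.html', '/b.html', '/page2.html']
tree.xpath('//li[not(@class="nextpage")]/a/@href')  # predicate filters out the "next page" item
tree.xpath('//li[1]/a/b/text()')                    # text() of the first <li>; XPath indexing starts at 1
tree.xpath('//*[@id="main"]//a')                    # // matches descendants at any depth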