title: Notes on focused parsing for Python web scraping  # title
tags:  # tags
date: 2022-04-23
categories: python  # category
Notes on using bs4 and xpath syntax for parsing in Python web scraping.
bs4 syntax usage example
# Practice bs4 syntax: scrape the novel 西游记 (Journey to the West)
import requests
import os
from bs4 import BeautifulSoup

url = 'https://xiyouji.5000yan.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
}
xs_dir = './西游记'
if not os.path.exists(xs_dir):
    os.mkdir(xs_dir)

rp = requests.get(url, headers=headers)
rp.encoding = 'utf-8'
html = rp.text
soup = BeautifulSoup(html, 'lxml')
# print(soup.find_all("div", class_='book-mulu'))
mulu = soup.select(".sidamingzhu-list-mulu > ul > li > a")  # chapter list links
for zj in mulu:
    zj_url = zj.get("href")  # chapter URL, e.g. https://sanguo.5000yan.com/965.html; zj['href'] works too
    zj_name = zj.string + '.txt'  # chapter title
    zj_path = os.path.join(xs_dir, zj_name)  # path the chapter will be saved to
    zj_res = requests.get(zj_url, headers=headers)
    zj_res.encoding = 'utf-8'
    zj_html = zj_res.text
    zj_soup = BeautifulSoup(zj_html, 'lxml')
    zj_title = zj_soup.select(".section-body > header > h2")
    content_title = zj_title[0].string
    content_body = zj_soup.select_one(".section-body > div").text
    with open(zj_path, mode='w', encoding='utf-8') as f:
        f.write(content_title)
        f.write(content_body)
    print('【{0}】downloaded successfully!'.format(zj_name))
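The block above mixes three ways of locating elements in bs4 (find_all, select and select_one). Below is a minimal, self-contained sketch of the same calls against a made-up HTML snippet, so they can be tried without hitting the site.
from bs4 import BeautifulSoup

# the HTML snippet is made up purely for illustration
demo_html = """
<div class="sidamingzhu-list-mulu">
  <ul>
    <li><a href="https://example.com/1.html">第一回</a></li>
    <li><a href="https://example.com/2.html">第二回</a></li>
  </ul>
</div>
"""
soup = BeautifulSoup(demo_html, 'lxml')

# find_all: filter by tag name plus attributes (class_ avoids the 'class' keyword clash)
box = soup.find_all('div', class_='sidamingzhu-list-mulu')

# select: CSS selectors, the same chain the spider above uses for the chapter list
for a in soup.select('.sidamingzhu-list-mulu > ul > li > a'):
    print(a.get('href'), a.string)  # a['href'] is equivalent to a.get('href')

# select_one: only the first match (or None), handy for single elements
first = soup.select_one('.sidamingzhu-list-mulu a')
print(first.text)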
xpath syntax usage notes
# Scrape images from the 4kmeinv category and save them locally
import requests
import os
import traceback
from lxml import etree

url = 'https://pic.netbian.com/4kmeinv/index_{0}.html'
headers = {
    'cache-control': 'no-cache',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
}
xs_dir = './图片'
if not os.path.exists(xs_dir):
    os.mkdir(xs_dir)

# Yield each image's name and binary data from one list page
def get_img(html):
    tree = etree.HTML(html)
    res = tree.xpath('//*[@id="main"]/div[3]/ul/li[not(@class="nextpage")]/a/@href')
    for href in res:
        try:
            tmp_url = 'https://pic.netbian.com/' + href
            tmp_res = requests.get(tmp_url, headers=headers).text
            tmp_tree = etree.HTML(tmp_res)
            img_uri = tmp_tree.xpath('//*[@id="img"]/img/@src')[0]
            img_name = tmp_tree.xpath('//*[@id="main"]/div[2]/div[1]/div[1]/h1/text()')[0].encode(
                'ISO-8859-1').decode('gbk') + '.jpg'
            # The title extracted above is garbled; re-encoding it as ISO-8859-1 and decoding as gbk fixes the mojibake
            img_url = 'https://pic.netbian.com' + img_uri  # e.g. https://pic.netbian.com/uploads/allimg/220422/225834-165063951427ae.jpg
            img = requests.get(img_url, headers=headers)
            yield img_name, img.content
        except Exception:
            traceback.print_exc()

# Build the list-page URLs (the first page has no index suffix)
def get_index(url, page_num=1):
    index_list = []
    for i in range(1, page_num + 1):
        index_url = 'https://pic.netbian.com/4kmeinv/index.html' if i == 1 else url.format(i)
        index_list.append(index_url)
    return index_list

# Download every image and save it under img_base_dir
def main(img_base_dir):
    index_list = get_index(url, page_num=1)
    for index in index_list:
        html = requests.get(index, headers=headers).text
        image = get_img(html=html)
        while 1:
            try:
                tup_image_info = next(image)
                image_name = tup_image_info[0]
                image_content = tup_image_info[1]
                img_abs_path = os.path.join(img_base_dir, image_name)
                with open(img_abs_path, mode='wb') as f:
                    f.write(image_content)
                print('{0} download complete!'.format(image_name))
            except StopIteration:
                break

if __name__ == '__main__':
    main(xs_dir)
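A note on the .encode('ISO-8859-1').decode('gbk') trick used for the image title: when a response does not declare a charset, requests usually falls back to ISO-8859-1, while pic.netbian.com serves gbk pages, so the decoded text comes out garbled. A small sketch of two equivalent fixes, assuming the page really is gbk-encoded:
import requests

resp = requests.get('https://pic.netbian.com/4kmeinv/index.html', headers=headers)

# Fix 1: tell requests the real encoding before touching resp.text
resp.encoding = 'gbk'
page = resp.text

# Fix 2: decode the raw bytes yourself, or undo the wrong ISO-8859-1 decoding
page2 = resp.content.decode('gbk')
# for a string that is already garbled (like the <h1> title above):
# fixed = garbled.encode('ISO-8859-1').decode('gbk')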