I. Preparation
1. Background
Novel site: 新笔趣阁
2. Scraping steps
Roughly three steps:
- Send the request: work out how to issue the HTTP request and get the data back;
- Parse the data: the raw response is messy, so clean it up and extract what you need;
- Save the data: write it out in the format you want.
For sending requests, this article uses requests.
For parsing there are XPath, Beautiful Soup, regular expressions and so on; this article uses BeautifulSoup.
For saving: plain text for now; later posts will save to docx and xlsx.
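Before the full script, here is a minimal end-to-end sketch of the three steps. The URL and the div id in it are placeholders for illustration, not the real site's markup; the actual selectors come from inspecting the page, as done in the next section.

```python
import requests
from bs4 import BeautifulSoup

# 1. Send the request (placeholder URL for illustration)
resp = requests.get('https://example.com/chapter_1.html')
resp.encoding = 'utf-8'  # set the encoding explicitly to avoid mojibake

# 2. Parse the data (placeholder div id; inspect the real page to find it)
soup = BeautifulSoup(resp.text, 'lxml')
tag = soup.find('div', id='content')
content = tag.text if tag else ''

# 3. Save the data as plain text
with open('chapter_1.txt', 'w', encoding='utf-8') as f:
    f.write(content)
```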
3. Beautiful Soup
pip install bs4
The code in this article parses with the lxml parser, so install lxml as well:
pip install lxml
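The Beautiful Soup calls used throughout this article are find / find_all to locate tags, .string / .text to read their contents, and .get('href') to read attributes. A tiny self-contained demo on an inline HTML snippet (made up for illustration):

```python
from bs4 import BeautifulSoup

html = """
<div id="list">
  <a href="/1_1413/1.html">第一章</a>
  <a href="/1_1413/2.html">第二章</a>
</div>
"""

soup = BeautifulSoup(html, 'lxml')
div = soup.find('div', id='list')    # first tag matching the filter
for a in div.find_all('a'):          # every <a> tag inside that div
    print(a.string, a.get('href'))   # tag text and its href attribute
```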
4. A first attempt
Download a handful of chapters of 《斗破苍穹》.
First, inspect the page elements and work out the URL of the first chapter.
```python
import sys

import requests
from bs4 import BeautifulSoup


def get_contents(server, target):
    """Fetch one chapter page and return the cleaned chapter text."""
    url = server + target
    req = requests.get(url=url)
    req.encoding = 'utf-8'
    bf = BeautifulSoup(req.text, 'lxml')
    texts = bf.find('div', id='content')
    # The site pads paragraphs with runs of &nbsp;; turn them into blank lines.
    content = texts.text.replace('\xa0' * 4, '\n\n')
    return content


def get_urls(target):
    """Collect chapter links and titles from the table-of-contents page."""
    chapters = []
    urls = []
    req = requests.get(url=target, verify=False)  # verify=False skips SSL certificate verification
    req.encoding = 'utf-8'  # set the encoding explicitly to avoid mojibake
    bs = BeautifulSoup(req.text, 'lxml')
    a = bs.find('div', id='list')
    a = a.find_all('a')[100:200]  # only take a slice of chapters for this test
    nums = len(a)
    for each in a:
        urls.append(each.get('href'))
        chapters.append(each.string)
    return urls, chapters, nums


def writer(path, name, text):
    """Append one chapter (title plus body) to the output file."""
    with open(path, 'a', encoding='utf-8') as f:
        f.write(name + '\n')
        f.writelines(text)
        f.write('\n\n')


if __name__ == '__main__':
    server = 'https://www.vbiquge.com'
    target = 'https://www.vbiquge.com/1_1413/'
    book_name = '斗破苍穹.txt'
    urls, chapters, nums = get_urls(target)
    for i in range(nums):
        writer(book_name, chapters[i], get_contents(server, urls[i]))
        sys.stdout.write("Downloaded: {0}/{1}{2}".format(i + 1, nums, '\r'))
        sys.stdout.flush()
```
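One thing worth knowing about the verify=False call in get_urls: requests prints an InsecureRequestWarning for every such request. If that noise bothers you, it can be silenced with urllib3 (an optional tweak, not part of the script above):

```python
import urllib3

# Suppress the InsecureRequestWarning emitted when verify=False is used.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
```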
- Object-oriented version: the same idea wrapped in a downloader class, this time fetching 《一念永恒》 from 笔趣网 (www.bqkan.com).
```python
import sys

import requests
from bs4 import BeautifulSoup


class Downloader(object):
    """Download the novel 《一念永恒》 from 笔趣网 (www.bqkan.com).

    Modify: 21.2.18
    """

    def __init__(self):
        self.server = 'https://www.bqkan.com'
        self.target = 'https://www.bqkan.com/1_1094/'
        self.chapters = []  # chapter titles
        self.urls = []      # chapter links
        self.nums = 0       # number of chapters

    def get_download_url(self):
        """Collect chapter titles and download links from the table of contents."""
        req = requests.get(url=self.target)
        # The page source is GBK-encoded. The soup BeautifulSoup returns is
        # already decoded correctly to Unicode; it only looks garbled when
        # printed, because __str__ emits UTF-8 into a GBK console (cmd).
        req.encoding = 'gb18030'  # decode the response correctly
        div_bf = BeautifulSoup(req.text, 'lxml')
        div = div_bf.find_all('div', class_='listmain')
        a_bf = BeautifulSoup(str(div[0]), 'lxml')
        a = a_bf.find_all('a')
        self.nums = len(a[15:])  # drop the first 15 entries
        for each in a[15:]:
            self.chapters.append(each.string)   # chapter title inside the <a> tag
            self.urls.append(each.get('href'))  # chapter link in the href attribute

    def get_contents(self, target):
        """Fetch one chapter.

        parameters: target - chapter link (string)
        returns:    texts  - chapter text (string)
        """
        url = self.server + target
        req = requests.get(url)
        bf = BeautifulSoup(req.text, 'lxml')
        texts = bf.find_all('div', class_='showtxt')
        # Replace the eight-character indents inside the chapter with blank lines.
        texts = texts[0].text.replace('\xa0' * 8, '\n\n')
        return texts

    def writer(self, name, path, text):
        """Append one scraped chapter to the novel file.

        parameters: name - chapter title (string)
                    path - output file name in the current directory (string)
                    text - chapter text (string)
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')


if __name__ == '__main__':
    dl = Downloader()
    dl.get_download_url()
    book_name = '一念永恒.txt'
    print('Start downloading:')
    for i in range(dl.nums):
        dl.writer(dl.chapters[i], book_name, dl.get_contents(dl.urls[i]))
        sys.stdout.write("Downloaded: {0:.2%}{1}".format((i + 1) / dl.nums, '\r'))
        sys.stdout.flush()
    print("Download finished")
```
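A small note on the hard-coded 'gb18030' above: instead of reading the charset out of the page source by hand, requests can also guess the encoding from the response body via apparent_encoding. A sketch, assuming the table-of-contents URL is still reachable:

```python
import requests

req = requests.get('https://www.bqkan.com/1_1094/')
print(req.encoding)                   # encoding taken from the HTTP headers
print(req.apparent_encoding)          # encoding detected from the content itself
req.encoding = req.apparent_encoding  # apply the detected encoding before reading req.text
```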
