import requestsimport refrom bs4 import BeautifulSoupfrom lxml import etreeimport timefrom pip._vendor.retrying import retrydef getBook(book): murl = 'https://www.xbiquge.la/modules/article/waps.php' headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"} data={'searchkey':book} response=requests.post(url=murl,data=data,headers=headers) response.encoding = "utf-8" code=response.text soup=BeautifulSoup(code,'lxml') tab=soup.select('.even') all=re.findall(r'<td class="even">(.*?)</td>',str(tab)) if len(all)==0: return None name=re.findall(r'target="_blank">(.*?)</a>',str(all)) author=[] url=re.findall(r'href="(.*?)"',str(all)) for i,n in enumerate(all): if i%2!=0: author.append(n) for i in range(len(name)): if i == 0: print('序号\t书名\t作者\t网址') print('['+str(i)+']\t'+name[i]+'\t'+author[i]+'\t'+url[i]) burl=input("请输入你想获得txt的书的序号:") return url[int(burl)]def getChap(url): headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"} response = requests.get(url=url,headers=headers) response.encoding = "utf-8" code = response.text tree=etree.HTML(code) nurl = tree.xpath('//div[@id="list"]/dl/dd/a/@href') name=str(tree.xpath('//div[@id="info"]/h1/text()')).split("'")[1] print(name+"共发现"+str(len(nurl))+"章,即将开始爬取") for i in range(len(nurl)): turl='http://www.xbiquge.la'+nurl[i] getContent(turl,name) print("爬取完成,剩余" + str(len(nurl)-i-1)+'章')@retrydef getContent(url,name): headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"} response = requests.get(url=url, headers=headers) response.encoding='utf-8' time.sleep(1) code = response.text tree = etree.HTML(code) chap = tree.xpath('//div[@class="bookname"]/h1/text()')[0] content = tree.xpath('//div[@id="content"]/text()') with open(name + '.txt', 'a+', encoding='utf-8') as file: print("开始爬取:" + chap) file.write(chap + '\n\n') for i in content: text = str(i) text.strip() file.write(text) file.write("\n\n") file.close()book=input("请输入你要获取的书名或作者(请您少字也别输错字):")url=getBook(book)if url==None: print("未找到相关书籍")else: getChap(url) print("爬取完毕")