# Crawl a novel from www.xbiquge.la and save it as a plain .txt file.
import requests
import re
import time

from bs4 import BeautifulSoup
from lxml import etree
from retrying import retry  # pip install retrying
def getBook(book):
    # Search xbiquge.la for the given title/author and let the user pick a result.
    murl = 'https://www.xbiquge.la/modules/article/waps.php'
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"}
    data = {'searchkey': book}
    response = requests.post(url=murl, data=data, headers=headers)
    response.encoding = "utf-8"
    code = response.text
    soup = BeautifulSoup(code, 'lxml')
    tab = soup.select('.even')
    # The "even" table cells alternate between a title link and an author name.
    cells = re.findall(r'<td class="even">(.*?)</td>', str(tab))
    if len(cells) == 0:
        return None
    name = re.findall(r'target="_blank">(.*?)</a>', str(cells))
    url = re.findall(r'href="(.*?)"', str(cells))
    author = []
    for i, n in enumerate(cells):
        if i % 2 != 0:  # odd-numbered cells hold the author names
            author.append(n)
    print('No.\tTitle\tAuthor\tURL')
    for i in range(len(name)):
        print('[' + str(i) + ']\t' + name[i] + '\t' + author[i] + '\t' + url[i])
    burl = input("Enter the number of the book you want as a txt file: ")
    return url[int(burl)]
def getChap(url):
    # Fetch the book's index page and crawl every chapter it lists.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"}
    response = requests.get(url=url, headers=headers)
    response.encoding = "utf-8"
    code = response.text
    tree = etree.HTML(code)
    nurl = tree.xpath('//div[@id="list"]/dl/dd/a/@href')  # relative chapter links
    name = tree.xpath('//div[@id="info"]/h1/text()')[0]   # book title
    print(name + ": found " + str(len(nurl)) + " chapters, starting to crawl")
    for i in range(len(nurl)):
        turl = 'http://www.xbiquge.la' + nurl[i]
        getContent(turl, name)
        print("Chapter saved, " + str(len(nurl) - i - 1) + " chapters left")
@retry  # retry the whole request if it fails (e.g. connection reset or throttling)
def getContent(url, name):
    # Download one chapter and append it to "<book title>.txt".
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"}
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    time.sleep(1)  # pause between requests so the site is not hammered
    code = response.text
    tree = etree.HTML(code)
    chap = tree.xpath('//div[@class="bookname"]/h1/text()')[0]  # chapter title
    content = tree.xpath('//div[@id="content"]/text()')         # paragraph text nodes
    with open(name + '.txt', 'a+', encoding='utf-8') as file:
        print("Crawling: " + chap)
        file.write(chap + '\n\n')
        for i in content:
            text = str(i).strip()  # drop the full-width indentation and trailing line breaks
            if text:
                file.write(text + '\n')
        file.write("\n\n")
book = input("Enter the title or author you want (a partial name is fine, but spell it correctly): ")
url = getBook(book)
if url is None:
    print("No matching book was found")
else:
    getChap(url)
    print("All chapters downloaded")