Code

Project repository: https://github.com/xingstarx/crawl-zsxq

```python
import re
import requests
import json
import os
import pdfkit
from bs4 import BeautifulSoup
from urllib.parse import quote

# Point pdfkit at the wkhtmltopdf executable installed on Windows
config = pdfkit.configuration(wkhtmltopdf=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe")

html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
<h1>{title}</h1>
<p>{text}</p>
</body>
</html>
"""

htmls = []
num = 0


def get_data(url):
    global htmls, num
    headers = {
        'Authorization': '33A1AD5E-D03C-6EE9-1E8C-DDF321294280',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
    }
    rsp = requests.get(url, headers=headers)
    with open('test.json', 'w', encoding='utf-8') as f:  # write the response to test.json for easy inspection
        f.write(json.dumps(rsp.json(), indent=2, ensure_ascii=False))
    with open('test.json', encoding='utf-8') as f:
        for topic in json.loads(f.read()).get('resp_data').get('topics'):
            # a topic is one of: question, talk, task, solution
            content = topic.get('question', topic.get('talk', topic.get('task', topic.get('solution'))))
            # print(content)
            text = content.get('text', '')
            text = re.sub(r'<[^>]*>', '', text).strip()  # strip HTML tags
            text = text.replace('\n', '<br>')
            title = str(num) + text[:9]
            num += 1
            if content.get('images'):
                # append every image to the HTML template, then fill in title and text
                soup = BeautifulSoup(html_template, 'html.parser')
                for img in content.get('images'):
                    url = img.get('large').get('url')
                    img_tag = soup.new_tag('img', src=url)
                    soup.body.append(img_tag)
                html_img = str(soup)
                html = html_img.format(title=title, text=text)
            else:
                html = html_template.format(title=title, text=text)
            if topic.get('question'):
                # for Q&A topics, append the answer as an extra paragraph
                answer = topic.get('answer').get('text', "")
                soup = BeautifulSoup(html, 'html.parser')
                answer_tag = soup.new_tag('p')
                answer_tag.string = answer
                soup.body.append(answer_tag)
                html_answer = str(soup)
                html = html_answer.format(title=title, text=text)
            htmls.append(html)
    next_page = rsp.json().get('resp_data').get('topics')
    if next_page:
        # page backwards in time: derive end_time for the next request
        # from the create_time of the last topic on this page
        create_time = next_page[-1].get('create_time')
        if create_time[20:23] == "000":
            end_time = create_time[:20] + "999" + create_time[23:]
        else:
            res = int(create_time[20:23]) - 1
            end_time = create_time[:20] + str(res).zfill(3) + create_time[23:]  # zfill keeps the millisecond field 3 digits
        end_time = quote(end_time)
        if len(end_time) == 33:
            end_time = end_time[:24] + '0' + end_time[24:]
        next_url = start_url + '&end_time=' + end_time
        print(next_url)
        get_data(next_url)
    return htmls


def make_pdf(htmls):
    html_files = []
    for index, html in enumerate(htmls):
        file = str(index) + ".html"
        html_files.append(file)
        with open(file, "w", encoding="utf-8") as f:
            f.write(html)
    options = {
        "user-style-sheet": "test.css",
        "page-size": "Letter",
        "margin-top": "0.75in",
        "margin-right": "0.75in",
        "margin-bottom": "0.75in",
        "margin-left": "0.75in",
        "encoding": "UTF-8",
        "custom-header": [("Accept-Encoding", "gzip")],
        "cookie": [
            ("cookie-name1", "cookie-value1"), ("cookie-name2", "cookie-value2")
        ],
        "outline-depth": 10,
    }
    try:
        # pass the configuration so pdfkit can locate the wkhtmltopdf executable
        pdfkit.from_file(html_files, "电子书.pdf", options=options, configuration=config)
    except Exception as e:
        # pass
        print(e)
    for file in html_files:
        os.remove(file)
    print("已制作电子书在当前目录!")


if __name__ == '__main__':
    start_url = 'https://api.zsxq.com/v1.10/groups/5825251554/topics?scope=digests&count=20'
    make_pdf(get_data(start_url))
```
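The only non-obvious part of `get_data` is pagination: the zsxq API pages backwards in time, so the `end_time` parameter for the next request is derived from the `create_time` of the last topic on the current page by nudging the millisecond field and URL-encoding the result. Below is a minimal sketch of that derivation, pulled out into a hypothetical helper for clarity; the timestamp value is only an illustration.

```python
from urllib.parse import quote

def next_end_time(create_time):
    """Hypothetical helper mirroring the pagination logic in get_data().

    create_time looks like '2020-01-05T12:30:00.123+0800'; characters 20-22
    are the millisecond field, which is decremented by one (or set to '999'
    when it is already '000'), presumably so the last topic of the current
    page is not returned again on the next page.
    """
    if create_time[20:23] == "000":
        end_time = create_time[:20] + "999" + create_time[23:]
    else:
        ms = int(create_time[20:23]) - 1
        end_time = create_time[:20] + str(ms).zfill(3) + create_time[23:]
    end_time = quote(end_time)           # percent-encode ':' and '+'
    if len(end_time) == 33:              # same length fix-up as the main script
        end_time = end_time[:24] + '0' + end_time[24:]
    return end_time

# Illustrative timestamp only:
print(next_end_time('2020-01-05T12:30:00.123+0800'))
# -> 2020-01-05T12%3A30%3A00.122%2B0800
```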

Result

(screenshot of the generated PDF e-book)

Pitfalls encountered

1. No wkhtmltopdf executable found: "b''"
On the Python side, install the pdfkit module that the script imports.
You also need to install the wkhtmltopdf program itself on Windows.
Download: https://wkhtmltopdf.org/downloads.html
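Once both pieces are installed, a quick sanity check (a minimal sketch, assuming the default Windows install path; adjust it if you installed wkhtmltopdf elsewhere) is to point pdfkit at the executable explicitly and render a trivial page:

```python
import pdfkit

# Assumes the default Windows install location; change the path if needed.
config = pdfkit.configuration(wkhtmltopdf=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe")

# If this produces check.pdf, the "No wkhtmltopdf executable found" error is
# resolved and the main script will work with the same configuration object.
pdfkit.from_string("<h1>hello wkhtmltopdf</h1>", "check.pdf", configuration=config)
```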
2. Finding the key information
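The key information the script needs is the `Authorization` header value and the group id in the API URL; both can be copied from the topics request visible in the browser's developer tools while logged in to the zsxq web client. A minimal sketch for checking that the copied token works (the token below is a placeholder):

```python
import requests

group_id = '5825251554'                   # group id taken from the script above
headers = {
    'Authorization': 'YOUR-TOKEN-HERE',   # placeholder: paste the value from developer tools
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
}
url = 'https://api.zsxq.com/v1.10/groups/' + group_id + '/topics?scope=digests&count=20'
rsp = requests.get(url, headers=headers)
# A working token returns JSON that contains resp_data; otherwise it does not.
print(rsp.status_code, 'resp_data' in rsp.json())
```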

Run PyCharm as administrator.