代码
项目地址:https://github.com/xingstarx/crawl-zsxq
import re
import requests
import json
import os
import pdfkit
config = pdfkit.configuration(wkhtmltopdf=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe")
from bs4 import BeautifulSoup
from urllib.parse import quote
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
<h1>{title}</h1>
<p>{text}</p>
</body>
</html>
"""
htmls = []
num = 0
def get_data(url):
global htmls, num
headers = {
'Authorization': '33A1AD5E-D03C-6EE9-1E8C-DDF321294280',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
}
rsp = requests.get(url, headers=headers)
with open('test.json', 'w', encoding='utf-8') as f: # 将返回数据写入 test.json 方便查看
f.write(json.dumps(rsp.json(), indent=2, ensure_ascii=False))
with open('test.json', encoding='utf-8') as f:
for topic in json.loads(f.read()).get('resp_data').get('topics'):
content = topic.get('question', topic.get('talk', topic.get('task', topic.get('solution'))))
# print(content)
text = content.get('text', '')
text = re.sub(r'<[^>]*>', '', text).strip()
text = text.replace('\n', '<br>')
title = str(num) + text[:9]
num += 1
if content.get('images'):
soup = BeautifulSoup(html_template, 'html.parser')
for img in content.get('images'):
url = img.get('large').get('url')
img_tag = soup.new_tag('img', src=url)
soup.body.append(img_tag)
html_img = str(soup)
html = html_img.format(title=title, text=text)
else:
html = html_template.format(title=title, text=text)
if topic.get('question'):
answer = topic.get('answer').get('text', "")
soup = BeautifulSoup(html, 'html.parser')
answer_tag = soup.new_tag('p')
answer_tag.string = answer
soup.body.append(answer_tag)
html_answer = str(soup)
html = html_answer.format(title=title, text=text)
htmls.append(html)
next_page = rsp.json().get('resp_data').get('topics')
if next_page:
create_time = next_page[-1].get('create_time')
if create_time[20:23] == "000":
end_time = create_time[:20]+"999"+create_time[23:]
else :
res = int(create_time[20:23])-1
end_time = create_time[:20]+str(res).zfill(3)+create_time[23:] # zfill 函数补足结果前面的零,始终为3位数
end_time = quote(end_time)
if len(end_time) == 33:
end_time = end_time[:24] + '0' + end_time[24:]
next_url = start_url + '&end_time=' + end_time
print(next_url)
get_data(next_url)
return htmls
def make_pdf(htmls):
html_files = []
for index, html in enumerate(htmls):
file = str(index) + ".html"
html_files.append(file)
with open(file, "w", encoding="utf-8") as f:
f.write(html)
options = {
"user-style-sheet": "test.css",
"page-size": "Letter",
"margin-top": "0.75in",
"margin-right": "0.75in",
"margin-bottom": "0.75in",
"margin-left": "0.75in",
"encoding": "UTF-8",
"custom-header": [("Accept-Encoding", "gzip")],
"cookie": [
("cookie-name1", "cookie-value1"), ("cookie-name2", "cookie-value2")
],
"outline-depth": 10,
}
try:
pdfkit.from_file(html_files, "电子书.pdf", options=options)
except Exception as e:
#pass
print(e)
for file in html_files:
os.remove(file)
print("已制作电子书在当前目录!")
if __name__ == '__main__':
start_url = 'https://api.zsxq.com/v1.10/groups/5825251554/topics?scope=digests&count=20'
make_pdf(get_data(start_url))
效果
遇到的坑
1.No wkhtmltopdf executable found: “b’’”
python中需要安装wkhtmltopdf模块
然后windows系统中还要安装wkhtmltopdf软件
下载地址:https://wkhtmltopdf.org/downloads.html
2.找关键信息