之前一直使用Typora+各种博客(Wordpress/Hexo/Typecho)来进行笔记和写作,后来接触并爱上了语雀,主要是贴图太方便了。(使用Typora的时候会搭配PicGo+云存储,但是有时候会粘贴了多余的图片或者想替换已有图片时,懒得打开云存储进行删除,久而久之就忘了,造成了一定的空间浪费。)
刚开始用语雀的时候还特地看了下,可以导出md格式的文章。但最近想批量导出知识库时,发现只能选择PDF或者语雀特定的格式,感觉不大放心。于是弄了个脚本通过语雀官方API导出了全部文章,并开始寻找本地存储的笔记软件。
结合个人情况进行筛选后发现Obsidian比较适合,但是由于一开始不会用,不知道怎么处理图片路径的问题。语雀是没有目录这个概念的,所以导出的文章都放到了一起,图片等资源也统一放到了文章目录下的同一个子目录中。这也导致了如果我在Obsidian里通过建立多级文件夹的方式来分类文章,那么所有图片链接都要进行改动,差点弃坑了。还好在B站看了关于Obsidian(ob)的视频,学到了通过索引的方式来进行管理。

先上一张Obsidian的图:
image.png

语雀文章导出

  • 基于ExportMD进行了一些小改动,修复部分Bug以及适配Obsidian
    • 正则去除语雀导出时可能存在的标签
    • 导出的图片从./assets/修改为assets/,用于匹配obsidian
  • 使用方法:
    • NameSpace:访问语雀个人主页https://www.yuque.com/<xxx>中的xxx部分
    • Token:访问语雀Token新建,只需要给读取权限即可。

```shell
$ python3 ExportMD.py
请输入语雀namespace: xxx
请输入语雀Token: xxx
```

  ![image.png](https://cdn.nlark.com/yuque/0/2022/png/520228/1646018580971-e3771e03-fa2c-4c73-8914-ed876edfb1c2.png#clientId=u7bf0b4eb-c38c-4&from=paste&height=410&id=uc35f1821&margin=%5Bobject%20Object%5D&name=image.png&originHeight=820&originWidth=1792&originalType=binary&ratio=1&size=153680&status=done&style=none&taskId=u5b3ba5a2-9ef3-4295-bfff-fbbe94e6819&width=896)
  - `ExportMD.py`完整代码,感谢 [@杜大哥](https://github.com/dzh929/ExportMD-rectify-pics) 协助修复
  3. ```python
  4. # -*- coding: UTF-8 -*-
  5. from prettytable import PrettyTable
  6. import re
  7. import os
  8. import aiohttp
  9. import asyncio
  10. from urllib import parse
  11. from PyInquirer import prompt, Separator
  12. from examples import custom_style_2
  13. from colr import color
  14. from cfonts import render, say
  class ExportMD:
      """Export Yuque knowledge bases as Markdown files with locally stored images.

      Documents are fetched through the Yuque v2 HTTP API, image links that
      point at the Yuque CDN are rewritten to ``assets/<filename>`` and the
      images are downloaded next to the exported Markdown files under
      ``<export_dir>/<repo name>/``.
      """

      # Caps on simultaneous work. Every in-flight document and image holds a
      # socket, so unbounded fan-out can exhaust the per-process
      # file-descriptor limit ("Too many open files").
      MAX_CONCURRENT_DOCS = 5
      MAX_CONNECTIONS_PER_DOC = 10

      # Markdown image whose URL lives on the Yuque CDN. Every part is bounded
      # so that several images on the same line are matched one by one instead
      # of being swallowed by a single greedy match.
      _IMAGE_PATTERN = re.compile(
          r"!\[(?P<img_name>[^\]\n]*)\]"
          r"\((?P<img_src>https://cdn\.nlark\.com/yuque[^)\s#?]*"
          r"/(?P<slug>\d+)/(?P<filename>[^/)\s#?]+\.[a-zA-Z]+))"
          r"[^)\n]*\)"
      )

      def __init__(self):
          """Load the credentials and prepare the request headers and state."""
          self.repo_table = PrettyTable(["知识库ID", "名称"])
          self.namespace, self.Token = self.get_UserInfo()
          self.headers = {
              "Content-Type": "application/json",
              "User-Agent": "ExportMD",
              "X-Auth-Token": self.Token
          }
          # Maps repo name -> repo id (filled by getRepo).
          self.repo = {}
          self.export_dir = './yuque'

      def print_logo(self):
          """Print the ExportMD banner."""
          output = render('ExportMD', colors=['red', 'yellow'], align='center')
          print(output)

      def get_UserInfo(self):
          """Return ``[namespace, token]``, prompting for them when not cached.

          The pair is cached in ``.userinfo`` as ``namespace&token``. A cache
          file that does not contain both parts is ignored and overwritten
          after prompting again.
          """
          f_name = ".userinfo"
          if os.path.isfile(f_name):
              with open(f_name, encoding="utf-8") as f:
                  # strip(): a trailing newline must not end up in the token.
                  # maxsplit=1: only the first "&" separates the two fields.
                  userinfo = f.read().strip().split("&", 1)
              if len(userinfo) == 2 and all(userinfo):
                  return userinfo
          namespace = input("请输入语雀namespace:").strip()
          Token = input("请输入语雀Token:").strip()
          # Written with the same encoding that is used for reading it back.
          with open(f_name, "w", encoding="utf-8") as f:
              f.write(namespace + "&" + Token)
          return [namespace, Token]

      async def req(self, session, api):
          """GET ``api`` (a path below ``/api/v2``) and return the decoded JSON."""
          url = "https://www.yuque.com/api/v2" + api
          async with session.get(url, headers=self.headers) as resp:
              result = await resp.json()
              return result

      async def getRepo(self):
          """Fetch the user's knowledge bases into ``self.repo`` and the table."""
          api = "/users/%s/repos" % self.namespace
          async with aiohttp.ClientSession() as session:
              result = await self.req(session, api)
              for repo in result.get('data'):
                  repo_id = str(repo['id'])
                  repo_name = repo['name']
                  self.repo[repo_name] = repo_id
                  self.repo_table.add_row([repo_id, repo_name])

      async def get_docs(self, repo_id):
          """Return a ``{slug: title}`` dict for the documents of one repo."""
          api = "/repos/%s/docs" % repo_id
          async with aiohttp.ClientSession() as session:
              result = await self.req(session, api)
              return {doc['slug']: doc['title'] for doc in result.get('data')}

      @staticmethod
      def _strip_image_br(body):
          """Turn the ``<br />`` tags Yuque puts around images into newlines."""
          body = re.sub(r'<br />!\[image\.png\]', "\n![image.png]", body)
          return re.sub(r'\)<br />', ")\n", body)

      @staticmethod
      def _clean_body(body):
          """Remove Yuque export artefacts from a Markdown body.

          A missing body (``None``) is treated as an empty document.
          """
          body = body or ""
          # [^"]* keeps each match inside a single tag; a greedy ".*" would
          # also delete the text between two anchors on the same line.
          body = re.sub(r'<a name="[^"]*"></a>', "", body)
          return ExportMD._strip_image_br(body)

      async def get_body(self, repo_id, slug):
          """Fetch one document and return its cleaned Markdown source."""
          api = "/repos/%s/docs/%s" % (repo_id, slug)
          async with aiohttp.ClientSession() as session:
              result = await self.req(session, api)
              return self._clean_body(result['data']['body'])

      def selectRepo(self):
          """Ask which knowledge bases to export; return the chosen names."""
          choices = [{"name": repo_name} for repo_name in self.repo]
          choices.insert(0, Separator('=== 知识库列表 ==='))
          questions = [
              {
                  'type': 'checkbox',
                  'qmark': '>>>',
                  'message': '选择知识库',
                  'name': 'repo',
                  'choices': choices
              }
          ]
          repo_name_list = prompt(questions, style=custom_style_2)
          return repo_name_list["repo"]

      def mkDir(self, dir):
          """Create ``dir`` (and missing parents); do nothing if it exists."""
          os.makedirs(dir, exist_ok=True)

      @staticmethod
      def _safe_name(name):
          """Percent-encode the characters that are not allowed in file names."""
          for char in r'/\<>?:"|*':
              name = name.replace(char, parse.quote_plus(char))
          return name

      async def download_md(self, repo_id, slug, repo_name, title):
          """Download one document together with its images and save it.

          :param repo_id: knowledge base id
          :param slug: document slug
          :param repo_name: knowledge base name
          :param title: document title
          :return: None
          """
          body = await self.get_body(repo_id, slug)
          new_body, image_list = await self.to_local_image_src(body)

          if image_list:
              # Images go to <export_dir>/<repo_name>/assets/<filename>; the
              # repo directory name is sanitised the same way save() does it.
              save_dir = os.path.join(
                  self.export_dir, self._safe_name(repo_name), "assets"
              )
              self.mkDir(save_dir)
              # Limit the simultaneous connections opened for this document.
              connector = aiohttp.TCPConnector(limit=self.MAX_CONNECTIONS_PER_DOC)
              async with aiohttp.ClientSession(connector=connector) as session:
                  await asyncio.gather(
                      *(self.download_image(session, image_info, save_dir)
                        for image_info in image_list)
                  )

          self.save(repo_name, title, new_body)
          print(" %s 导出成功!" % color(title, fore='green', style='bright'))

      async def to_local_image_src(self, body):
          """Rewrite Yuque CDN image links to local ``assets/`` paths.

          :return: ``(new_body, images)`` where ``images`` is a list of dicts
              with the keys ``img_name``, ``img_src``, ``slug`` and ``filename``
          """
          body = self._strip_image_br(body)
          images = [m.groupdict() for m in self._IMAGE_PATTERN.finditer(body)]
          new_body = self._IMAGE_PATTERN.sub(
              r"![\g<img_name>](assets/\g<filename>)", body
          )
          return new_body, images

      async def download_image(self, session, image_info: dict, save_dir: str):
          """Download one image described by ``image_info`` into ``save_dir``."""
          async with session.get(image_info['img_src']) as resp:
              data = await resp.read()
          # The file is opened only once the data is complete, so no file
          # handle stays open while other downloads are being awaited.
          with open(os.path.join(save_dir, image_info["filename"]), 'wb') as f:
              f.write(data)

      def save(self, repo_name, title, body):
          """Write ``body`` to ``<export_dir>/<repo_name>/<title>.md``.

          Characters that are invalid in file names are percent-encoded in both
          ``repo_name`` and ``title``; the directory is created when missing.
          """
          repo_dir = os.path.join(self.export_dir, self._safe_name(repo_name))
          self.mkDir(repo_dir)
          save_path = os.path.join(repo_dir, self._safe_name(title) + ".md")
          with open(save_path, "w", encoding="utf-8") as f:
              f.write(body)

      async def run(self):
          """Interactive entry point: pick knowledge bases and export them."""
          self.print_logo()
          await self.getRepo()
          repo_name_list = self.selectRepo()

          # Root folder that receives one sub-folder per knowledge base.
          self.mkDir(self.export_dir)

          # Created here, inside the running event loop.
          doc_limit = asyncio.Semaphore(self.MAX_CONCURRENT_DOCS)

          async def export_doc(repo_id, slug, repo_name, title):
              async with doc_limit:
                  await self.download_md(repo_id, slug, repo_name, title)

          for repo_name in repo_name_list:
              self.mkDir(os.path.join(self.export_dir, self._safe_name(repo_name)))

              repo_id = self.repo[repo_name]
              docs = await self.get_docs(repo_id)

              await asyncio.gather(
                  *(export_doc(repo_id, slug, repo_name, title)
                    for slug, title in docs.items())
              )

          print("\n" + color('导出完成!', fore='green', style='bright'))
          print("已导出到:" + color(os.path.realpath(self.export_dir), fore='green', style='bright'))
  if __name__ == '__main__':
      # asyncio.run() creates a fresh event loop, runs the coroutine to
      # completion and closes the loop again. asyncio.get_event_loop() is
      # deprecated when no loop is running and fails on recent Python versions.
      export = ExportMD()
      asyncio.run(export.run())

可能出现的报错

  • 运行脚本时出现如下错误:

image.png

  • 原因是默认的最大打开文件数不够,修复方法:

```shell
$ ulimit -n       # 查看当前最大打开文件数
$ ulimit -n 512   # 设置大一点
```
![image.png](https://cdn.nlark.com/yuque/0/2021/png/520228/1640249324034-03ae2609-033a-4f8e-adbb-1c99ec76ea66.png#clientId=ue5d80edb-7f4b-4&from=paste&height=123&id=u2a4e60ea&margin=%5Bobject%20Object%5D&name=image.png&originHeight=246&originWidth=770&originalType=binary&ratio=1&size=35763&status=done&style=none&taskId=uf119372d-6c88-463f-8fd2-48da3fe0d50&width=385)
<a name="cIIQ1"></a>
## 建立索引
> 这里以语雀目录为内容,批量添加`obsidian`的内链格式`[[xxx]]`,以建立索引
- 先复制语雀全部文档的标题,然后利用以下脚本批量添加内链格式,最后根据情况进行手动调整。
  6. ```python
  7. # -*- coding: UTF-8 -*-
  # Wrap every non-empty line of list.txt in Obsidian's [[...]] wikilink
  # syntax and write the result to list2.txt, one link per line.
  file = "list.txt"
  new_file = "list2.txt"
  datas = []
  # The titles are typically non-ASCII, so the encoding is fixed explicitly
  # instead of depending on the platform default.
  with open(file, "r", encoding="utf-8") as f:
      for line in f:
          title = line.strip()
          # Blank lines would otherwise produce empty "[[]]" links.
          if title:
              datas.append("[[" + title + "]]")
  with open(new_file, "w", encoding="utf-8") as f2:
      f2.writelines(link + "\n" for link in datas)
  • obsidian一些配置

image.png