python 读取发票信息;python 电子发票;python 纸质发票;python 发票;

利用百度免费的 OCR API 提取发票信息,支持 PDF 文件以及拍照得到的 PNG 图片。

  1. import requests
  2. import base64
  3. # pip install PyMuPDF
  4. # 安装PyMuPDF才能用fitz
  5. import fitz
  6. # fileName : string
  7. # 例如:"test.png",也可以是pdf
  8. class InvoiceParser:
  9. def __init__(self, fileName):
  10. self.fileName = fileName
  11. self.response = {}
  12. self.invoiceInfo = {"发票号码": -1,
  13. "开票日期": -1,
  14. "发票金额(含税)": -1,
  15. "销售方名称": -1,
  16. "购买物品": -1}
  17. self._set_up()
  18. def _set_up(self):
  19. self.get_response()
  20. # 录入二维码发票信息
  21. self.invoiceInfo["发票号码"] = self.response["InvoiceNum"]
  22. self.invoiceInfo["开票日期"] = self.response["InvoiceDate"]
  23. self.invoiceInfo["发票金额(含税)"] = self.response["AmountInFiguers"]
  24. self.invoiceInfo["销售方名称"] = self.response["SellerName"]
  25. self.invoiceInfo["购买物品"] = self.response["CommodityName"]
  26. # 通过百度云识别发票信息
  27. def get_response(self):
  28. # 发票文件转换为二进制
  29. f = open(self.fileName, 'rb')
  30. invoiceFile = base64.b64encode(f.read())
  31. # 设置url
  32. request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"
  33. access_token = "自己获取"
  34. request_url = request_url + "?access_token=" + access_token
  35. # 设置其他参数
  36. headers = {'content-type': 'application/x-www-form-urlencoded'}
  37. # 默认是图片
  38. params = {"image": invoiceFile}
  39. # 如果是pdf转pdf
  40. if ".pdf" in self.fileName:
  41. params = {"pdf_file": invoiceFile}
  42. response = requests.post(request_url, data=params, headers=headers)
  43. # 保存到self
  44. self.response = response.json()['words_result']
  45. return self.response
  46. def GetAccessToken():
  47. apiKey = "自己获取"
  48. secretKet = "自己获取"
  49. url = "https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id={}&client_secret={}&".format(
  50. apiKey, secretKet)
  51. response = requests.post(url=url)
  52. return eval(response.text)["access_token"]
  53. if __name__ == '__main__':
  54. # 获取get_response()函数中的access_token
  55. # 获取一次就可以注释掉了
  56. # print(GetAccessToken())
  57. tmp = InvoiceParser("u盘.png")
  58. print(str(tmp.invoiceInfo))
  59. # print(str(tmp.response))

参考链接

  1. 百度AI增值税发票识别
  2. 文字识别OCR文档