pdfmine

  • 环境:Python 2.7 + Windows,调试通过。
  • 测试文件test.pdf下载。

test.pdf

  1. import sys
  2. from pdfminer.pdfdocument import PDFDocument
  3. from pdfminer.pdfparser import PDFParser
  4. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  5. from pdfminer.pdfdevice import PDFDevice, TagExtractor
  6. from pdfminer.pdfpage import PDFPage
  7. from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
  8. from pdfminer.cmapdb import CMapDB
  9. from pdfminer.layout import LAParams
  10. from pdfminer.image import ImageWriter
  11. """
  12. print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
  13. ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
  14. ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
  15. ' [-t text|html|xml|tag] [-c codec] [-s scale]'
  16. ' file ...' % argv[0])
  17. """
  18. # main
  19. def extract_text(filename, password_param, output_file):
  20. # debug option
  21. debug = 0
  22. # input option
  23. password = ''
  24. pagenos = set()
  25. maxpages = 0
  26. # output option
  27. outfile = None
  28. outtype = None
  29. imagewriter = None
  30. rotation = 0
  31. stripcontrol = False
  32. layoutmode = 'normal'
  33. codec = 'utf-8'
  34. pageno = 1
  35. scale = 1
  36. caching = True
  37. showpageno = True
  38. laparams = LAParams()
  39. if filename.strip()[-4:] != '.pdf':
  40. print 'file type is not pdf!'
  41. return
  42. elif output_file is not None:
  43. outfile = output_file
  44. else:
  45. outfile = filename.strip()[:-4] + '.txt'
  46. print 'output file path: %s' % outfile
  47. if password_param is not None:
  48. password = password_param
  49. PDFDocument.debug = debug
  50. PDFParser.debug = debug
  51. CMapDB.debug = debug
  52. PDFPageInterpreter.debug = debug
  53. #
  54. rsrcmgr = PDFResourceManager(caching=caching)
  55. if not outtype:
  56. outtype = 'text'
  57. if outfile:
  58. if outfile.endswith('.htm') or outfile.endswith('.html'):
  59. outtype = 'html'
  60. elif outfile.endswith('.xml'):
  61. outtype = 'xml'
  62. elif outfile.endswith('.tag'):
  63. outtype = 'tag'
  64. if outfile:
  65. outfp = file(outfile, 'w')
  66. else:
  67. outfp = sys.stdout
  68. if outtype == 'text':
  69. device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
  70. imagewriter=imagewriter)
  71. elif outtype == 'xml':
  72. device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
  73. imagewriter=imagewriter,
  74. stripcontrol=stripcontrol)
  75. elif outtype == 'html':
  76. device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
  77. layoutmode=layoutmode, laparams=laparams,
  78. imagewriter=imagewriter, debug=debug)
  79. elif outtype == 'tag':
  80. device = TagExtractor(rsrcmgr, outfp, codec=codec)
  81. else:
  82. return
  83. fname = filename
  84. fp = file(fname, 'rb')
  85. interpreter = PDFPageInterpreter(rsrcmgr, device)
  86. print 'extracting text in pdf ... ...'
  87. page_cnt = 1
  88. for page in PDFPage.get_pages(fp, pagenos,
  89. maxpages=maxpages, password=password,
  90. caching=caching, check_extractable=True):
  91. page.rotate = (page.rotate+rotation) % 360
  92. print 'processing page %d ...' % page_cnt
  93. interpreter.process_page(page)
  94. page_cnt += 1
  95. fp.close()
  96. device.close()
  97. outfp.close()
  98. print 'text has been written into %s ' % outfile
  99. return outfile
  100. def check_line_valid(line):# only line like '1 ' will be invalid
  101. line = line[:-1]
  102. if line == '':
  103. return True
  104. digits = '0123456789'
  105. for c in line:
  106. if c != ' ' and c not in digits:
  107. return True
  108. return False
  109. def process_line(line):
  110. if line != '\n': # single line with '\n' will be ignored
  111. line = line[:-1]
  112. if line[-1:] == '-':
  113. line = line[:-1]
  114. else:
  115. line += ' '
  116. else:
  117. line += '\n'
  118. return line
  119. """
  120. """
  121. def reformat_output_file(outfile):
  122. text_reformated = ''
  123. file_handler = open(outfile)
  124. line = file_handler.readline()
  125. recording = False
  126. while line:
  127. if line == 'Abstract\n':
  128. recording = True
  129. if recording is True:
  130. if check_line_valid(line):
  131. line = process_line(line)
  132. text_reformated += line
  133. line = file_handler.readline()
  134. file_handler.close()
  135. print '%s has been reformated.' % outfile
  136. file_reformated_name = outfile[:-4] + '.reformated.txt'
  137. file_handler = open(file_reformated_name, 'w')
  138. file_handler.write(text_reformated)
  139. return text_reformated
  140. of = extract_text('D:/pdfmine/test.pdf', '', None)
  141. tr = reformat_output_file(of)