# pdfmine
# - Environment: debugged successfully with Python 2.7 on Windows.
# - Test file: download test.pdf.
# test.pdf
import sys
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
"""
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
' [-t text|html|xml|tag] [-c codec] [-s scale]'
' file ...' % argv[0])
"""
# main
def extract_text(filename, password_param, output_file):
    """Convert a PDF file with pdfminer and write the result to a file.

    The converter is chosen from the extension of the output path:
    ``.htm``/``.html`` selects the HTML converter, ``.xml`` the XML
    converter, ``.tag`` the tag extractor, and anything else plain text.

    Args:
        filename: Path of the PDF to read. After surrounding whitespace is
            stripped it must end with ``.pdf`` (the check is
            case-sensitive).
        password_param: Password handed to pdfminer. ``None`` means the
            empty password.
        output_file: Path to write to. ``None`` derives the path from
            ``filename`` by replacing the ``.pdf`` suffix with ``.txt``.
            An empty string sends the output to ``sys.stdout``.

    Returns:
        The output path that was used, or ``None`` when ``filename`` does
        not end with ``.pdf``.
    """
    stripped_name = filename.strip()
    # Reject a wrong file name before any pdfminer object is built.
    if stripped_name[-4:] != '.pdf':
        print('file type is not pdf!')
        return None

    if output_file is not None:
        outfile = output_file
    else:
        outfile = stripped_name[:-4] + '.txt'
    print('output file path: %s' % outfile)

    password = password_param if password_param is not None else ''

    # Fixed settings; none of them is configurable by the caller.
    debug = 0
    pagenos = set()
    maxpages = 0
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # Pick the output format from the output path's extension.
    if outfile.endswith(('.htm', '.html')):
        outtype = 'html'
    elif outfile.endswith('.xml'):
        outtype = 'xml'
    elif outfile.endswith('.tag'):
        outtype = 'tag'
    else:
        outtype = 'text'

    # Only a file opened here may be closed here: closing sys.stdout would
    # make the final status message (and every later print) fail.
    owns_output = bool(outfile)
    outfp = open(outfile, 'w') if owns_output else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:  # outtype == 'tag'
            device = TagExtractor(rsrcmgr, outfp, codec=codec)

        try:
            # The input is opened with the name exactly as passed in.
            with open(filename, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                print('extracting text in pdf ... ...')
                pages = PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True)
                for page_cnt, page in enumerate(pages, 1):
                    page.rotate = (page.rotate + rotation) % 360
                    print('processing page %d ...' % page_cnt)
                    interpreter.process_page(page)
        finally:
            # Close the device before the stream it writes to.
            device.close()
    finally:
        if owns_output:
            outfp.close()

    print('text has been written into %s ' % outfile)
    return outfile
def check_line_valid(line):
    """Tell whether a line of extracted text should be kept.

    A line is rejected only when, after its trailing newline is removed,
    it is non-empty and consists solely of spaces and ASCII digits (for
    example a bare page number such as ``'1 '``). An empty line is kept.

    Args:
        line: One line of text, with or without a trailing ``'\\n'``.

    Returns:
        ``True`` if the line should be kept, ``False`` otherwise.
    """
    # Remove the newline only when it is really there; the last line of a
    # file may lack one, and slicing blindly would eat a real character.
    if line.endswith('\n'):
        line = line[:-1]
    if line == '':
        return True
    digits = '0123456789'
    return any(c != ' ' and c not in digits for c in line)
def process_line(line):
    """Prepare one line of extracted text for joining into running text.

    * A line that is just ``'\\n'`` is returned as ``'\\n\\n'``.
    * Otherwise the trailing newline is removed. If the remaining text ends
      with ``'-'`` that hyphen is dropped, so the next line continues
      directly after it; if not, a single space is appended.

    Args:
        line: One line of text, with or without a trailing ``'\\n'``.

    Returns:
        The transformed line.
    """
    if line == '\n':
        return line + '\n'

    # Remove the newline only when it is really there; the last line of a
    # file may lack one, and slicing blindly would eat a real character.
    if line.endswith('\n'):
        line = line[:-1]

    if line[-1:] == '-':
        return line[:-1]
    return line + ' '
"""
"""
def reformat_output_file(outfile):
    """Re-flow the extracted text, starting at the ``Abstract`` line.

    Lines before the first line that is exactly ``'Abstract\\n'`` are
    skipped. From that line on, each line accepted by ``check_line_valid``
    is passed through ``process_line`` and appended to the result. The
    result is written next to the input, in a file whose name is the input
    name with its last four characters replaced by ``.reformated.txt``.

    Args:
        outfile: Path of the text file to read.

    Returns:
        The reformatted text that was written.
    """
    pieces = []
    recording = False
    # Context managers close both files even if reading or writing fails.
    with open(outfile) as source:
        for line in source:
            if line == 'Abstract\n':
                recording = True
            if recording and check_line_valid(line):
                pieces.append(process_line(line))
    text_reformated = ''.join(pieces)
    print('%s has been reformated.' % outfile)

    file_reformated_name = outfile[:-4] + '.reformated.txt'
    with open(file_reformated_name, 'w') as target:
        target.write(text_reformated)
    return text_reformated
if __name__ == '__main__':
    # Run the demo conversion only when this file is executed as a script,
    # so that importing it does not touch the hard-coded Windows path.
    of = extract_text('D:/pdfmine/test.pdf', '', None)
    tr = reformat_output_file(of)