pdfmine
pdfmine

环境：在 Python 2.7 + Windows 下调试通过。
测试文件 test.pdf 可供下载。
import sys
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
"""
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
               ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
               ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
               ' [-t text|html|xml|tag] [-c codec] [-s scale]'
               ' file ...' % argv[0])
"""
# main
def extract_text(filename, password_param, output_file):
    """Extract the contents of a PDF with pdfminer and write them to a file.

    The output format is picked from the extension of the output path:
    ``.htm``/``.html`` selects HTML, ``.xml`` selects XML, ``.tag`` selects
    tag extraction, and anything else selects plain text.

    Args:
        filename: Path of the input file. After stripping surrounding
            whitespace it must end with ``.pdf``.
        password_param: Password used to open the PDF. ``None`` is treated
            as an empty password.
        output_file: Path of the output file. When ``None`` the path is
            derived from ``filename`` by replacing ``.pdf`` with ``.txt``.
            An empty string sends the output to ``sys.stdout``.

    Returns:
        The output path that was used, or ``None`` when ``filename`` does
        not end with ``.pdf``.
    """
    # Reject non-PDF input before touching pdfminer or the file system.
    if filename.strip()[-4:] != '.pdf':
        print('file type is not pdf!')
        return None

    # debug option
    debug = 0
    # input option
    password = password_param if password_param is not None else ''
    pagenos = set()
    maxpages = 0
    # output option
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    if output_file is not None:
        outfile = output_file
    else:
        outfile = filename.strip()[:-4] + '.txt'
        print('output file path: %s' % outfile)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    # Choose the converter from the output file extension.
    outtype = 'text'
    if outfile:
        if outfile.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outfile.endswith('.xml'):
            outtype = 'xml'
        elif outfile.endswith('.tag'):
            outtype = 'tag'

    rsrcmgr = PDFResourceManager(caching=caching)

    # Open the input first so that a missing PDF does not leave an empty
    # output file behind; the nested try/finally blocks guarantee that the
    # device and both files are released even when extraction fails.
    fp = open(filename, 'rb')
    try:
        outfp = open(outfile, 'w') if outfile else sys.stdout
        try:
            if outtype == 'text':
                device = TextConverter(rsrcmgr, outfp, codec=codec,
                                       laparams=laparams,
                                       imagewriter=imagewriter)
            elif outtype == 'xml':
                device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                      laparams=laparams,
                                      imagewriter=imagewriter,
                                      stripcontrol=stripcontrol)
            elif outtype == 'html':
                device = HTMLConverter(rsrcmgr, outfp, codec=codec,
                                       scale=scale, layoutmode=layoutmode,
                                       laparams=laparams,
                                       imagewriter=imagewriter, debug=debug)
            else:
                device = TagExtractor(rsrcmgr, outfp, codec=codec)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                print('extracting text in pdf ... ...')
                pages = PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True)
                for page_cnt, page in enumerate(pages, 1):
                    page.rotate = (page.rotate + rotation) % 360
                    print('processing page %d ...' % page_cnt)
                    interpreter.process_page(page)
            finally:
                # Close the device before the stream it writes to.
                device.close()
        finally:
            # Never close the interpreter's own stdout.
            if outfp is not sys.stdout:
                outfp.close()
    finally:
        fp.close()

    print('text has been written into %s ' % outfile)
    return outfile
def check_line_valid(line):
    """Return True when *line* should be kept in the reformatted text.

    A line is rejected only when, without its trailing newline, it is
    non-empty and made up solely of ASCII digits and spaces (for example
    ``'1 '`` -- presumably a stray page number). Empty lines are kept.

    Args:
        line: One line of text, with or without a trailing newline.

    Returns:
        True if the line is valid, False otherwise.
    """
    # Strip only a real newline: a final line without one must not lose
    # its last character before being inspected.
    if line.endswith('\n'):
        line = line[:-1]
    if line == '':
        return True
    digits = '0123456789'
    return any(c != ' ' and c not in digits for c in line)
def process_line(line):
    """Convert one extracted line into a fragment of running text.

    * A line that is only a newline is returned as two newlines.
    * A line ending in ``-`` loses the hyphen and gets nothing appended,
      so it joins directly with whatever follows.
    * Any other line has its newline replaced by a single space.

    Args:
        line: One line of text, with or without a trailing newline.

    Returns:
        The processed text fragment.
    """
    if line == '\n':
        return line + '\n'
    # Strip only a real newline so that a final line without one keeps
    # its last character.
    if line.endswith('\n'):
        line = line[:-1]
    if line[-1:] == '-':
        return line[:-1]
    return line + ' '
"""
"""
def reformat_output_file(outfile):
    """Rebuild running text from an extracted text file and save it.

    Everything before the first line that is exactly ``Abstract`` is
    skipped. From that line on, each line accepted by
    ``check_line_valid`` is passed through ``process_line`` and the
    results are concatenated. The text is written next to the input, to
    ``outfile`` with its last four characters replaced by
    ``.reformated.txt``.

    Args:
        outfile: Path of the text file to read.

    Returns:
        The reformatted text.
    """
    pieces = []
    recording = False
    # Context managers close both files even if reading or writing fails.
    with open(outfile) as source:
        for line in source:
            if line == 'Abstract\n':
                recording = True
            if recording and check_line_valid(line):
                pieces.append(process_line(line))
    # Join once instead of growing a string line by line.
    text_reformated = ''.join(pieces)
    print('%s has been reformated.' % outfile)
    file_reformated_name = outfile[:-4] + '.reformated.txt'
    with open(file_reformated_name, 'w') as target:
        target.write(text_reformated)
    return text_reformated
# Demo run: extract the sample PDF (empty password, default output path),
# then reformat the text file that extraction produced.
of = extract_text(
    filename='D:/pdfmine/test.pdf',
    password_param='',
    output_file=None,
)
tr = reformat_output_file(outfile=of)
乐谷技术学习笔记（可公开）

【20190126】Python 提取 PDF 中的文字

pdfmine