pdfmine
- 环境:Python 2.7 + Windows,调试成功。
- 测试文件test.pdf下载。
test.pdf
import sys

from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter

# Leftover usage text that was kept as an inert string in the original script.
# The command-line options it lists are NOT parsed here; the corresponding
# settings are hard-coded inside extract_text():
#
#   usage: <prog> [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]
#          [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]
#          [-W word_margin] [-F boxes_flow] [-Y layout_mode] [-O output_dir]
#          [-R rotation] [-S] [-t text|html|xml|tag] [-c codec] [-s scale]
#          file ...


# main
def extract_text(filename, password_param, output_file):
    """Convert one PDF file with pdfminer and write the result to a file.

    Args:
        filename: Path of the PDF. Surrounding whitespace is stripped and the
            path must end with ``.pdf`` (any letter case).
        password_param: Password handed to ``PDFPage.get_pages``; ``None``
            means the empty password.
        output_file: Destination path, or ``None`` to use the input path with
            ``.pdf`` replaced by ``.txt``. An empty string sends the output to
            ``sys.stdout``. The extension selects the converter:
            ``.htm``/``.html`` -> HTML, ``.xml`` -> XML, ``.tag`` -> tag
            extractor, anything else -> plain text.

    Returns:
        The output path that was used, or ``None`` when ``filename`` is not a
        ``.pdf`` path.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    # Use the stripped path for both the extension check and the open() call
    # so that a path accepted here is the one that actually gets opened.
    pdf_path = filename.strip()
    if pdf_path[-4:].lower() != '.pdf':
        print('file type is not pdf!')
        return None
    if output_file is not None:
        outfile = output_file
    else:
        outfile = pdf_path[:-4] + '.txt'
    print('output file path: %s' % outfile)

    if password_param is not None:
        password = password_param

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Only a stream opened here may be closed here; sys.stdout must stay open.
    owns_outfp = bool(outfile)
    outfp = open(outfile, 'w') if owns_outfp else sys.stdout
    device = None
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return None

        interpreter = PDFPageInterpreter(rsrcmgr, device)
        print('extracting text in pdf ... ...')
        with open(pdf_path, 'rb') as fp:
            pages = PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True)
            for page_cnt, page in enumerate(pages, 1):
                page.rotate = (page.rotate + rotation) % 360
                print('processing page %d ...' % page_cnt)
                interpreter.process_page(page)
    finally:
        # Same release order as before (input, device, output), but now it
        # also runs when conversion fails part-way through.
        try:
            if device is not None:
                device.close()
        finally:
            if owns_outfp:
                outfp.close()

    print('text has been written into %s ' % outfile)
    return outfile


def _strip_newline(line):
    """Return ``line`` without one trailing newline, if it has one."""
    return line[:-1] if line.endswith('\n') else line


def check_line_valid(line):
    """Tell whether ``line`` should be kept in the reformatted text.

    A line made up solely of digits and spaces (for example ``'1 '``) is
    rejected; an empty line and any line with other characters is accepted.
    """
    text = _strip_newline(line)
    if text == '':
        return True
    digits = '0123456789'
    return any(c != ' ' and c not in digits for c in text)


def process_line(line):
    """Turn one extracted line into a fragment of running text.

    * A bare ``'\\n'`` becomes ``'\\n\\n'``.
    * A line ending in ``'-'`` loses the newline and the hyphen, so the word
      continues directly with the next fragment.
    * Any other line has its newline replaced by a single space.
    """
    if line == '\n':
        return line + '\n'
    text = _strip_newline(line)
    if text[-1:] == '-':
        return text[:-1]
    return text + ' '


def reformat_output_file(outfile):
    """Re-flow the text in ``outfile`` starting at the ``Abstract`` line.

    Lines before the first line equal to ``'Abstract\\n'`` are skipped. From
    that line on, lines rejected by ``check_line_valid`` are dropped and the
    rest are passed through ``process_line`` and concatenated. The result is
    written next to the input, with the last four characters of ``outfile``
    replaced by ``.reformated.txt``.

    Returns:
        The reformatted text.
    """
    fragments = []
    recording = False
    with open(outfile) as src:
        for line in src:
            if line == 'Abstract\n':
                recording = True
            if recording and check_line_valid(line):
                fragments.append(process_line(line))
    text_reformated = ''.join(fragments)
    print('%s has been reformated.' % outfile)

    file_reformated_name = outfile[:-4] + '.reformated.txt'
    # The context manager guarantees the result is flushed and closed.
    with open(file_reformated_name, 'w') as dst:
        dst.write(text_reformated)
    return text_reformated


# Script driver: runs on import as well as on direct execution.
of = extract_text('D:/pdfmine/test.pdf', '', None)
tr = reformat_output_file(of)