Apache PDFBox是一个开源的、基于Java的、支持PDF文档生成的工具库。它可以用于创建新的PDF文档、修改现有的PDF文档,还可以从PDF文档中提取所需的内容。Apache PDFBox还包含了数个命令行工具。
Apache PDFBox主要有以下特征:
支持PDF的读取、创建、打印、转换、验证、合并与分割等功能。
所需jar包
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.12</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.12</version>
</dependency>
文本内容提取
/**
 * Extracts the text of every page of a PDF file and prints it to standard output.
 *
 * <p>Any exception raised while loading or parsing the file is caught and its stack trace is
 * printed; nothing is rethrown to the caller.
 *
 * @param filePath path of the PDF file to read
 */
public static void PdfReader(String filePath) {
    File pdfFile = new File(filePath);
    // Alternative way to load the file (kept for reference): wrap a FileInputStream in a
    // RandomAccessBuffer, run new PDFParser(buffer).parse(), then call getPDDocument().
    //
    // try-with-resources closes the document (and the file handle behind it) even when text
    // extraction fails; previously the document was never closed.
    try (PDDocument document = PDDocument.load(pdfFile)) {
        // Total number of pages, used as the last page to extract.
        int pages = document.getNumberOfPages();
        PDFTextStripper stripper = new PDFTextStripper();
        // Emit text sorted by its position on the page.
        stripper.setSortByPosition(true);
        stripper.setStartPage(1);
        stripper.setEndPage(pages);
        String content = stripper.getText(document);
        System.out.println(content);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
图片提取
此方法可以取出源PDF中图片对象PDImageXObject,然后可以对该对象进行相关处理,本代码实现了将提取出来的每一个图片对象,插入到一个空白的PDF文档中。
/**
 * Extracts every image XObject from a PDF and draws it onto the first page of a template PDF.
 *
 * <p>The template is read from {@code testout.pdf} in the same directory as the source file.
 * After each image is drawn, the template document is saved next to the source file as
 * {@code test<N>.pdf}, where N is the running image count. Drawing always appends to the same
 * template page, so later output files also contain the images drawn before them.
 *
 * <p>If either document cannot be loaded, the stack trace is printed and nothing is processed.
 *
 * @param filePath path of the PDF file to scan for images
 * @throws IOException kept in the signature for existing callers; I/O failures inside this
 *     method are printed rather than thrown
 */
public static void readImage(String filePath) throws IOException {
    // PDF to scan.
    File pdfFile = new File(filePath);
    // Directory containing the source file; null when filePath has no parent component.
    File directory = pdfFile.getParentFile();
    // Template PDF the images are drawn into. File(parent, child) uses the platform
    // separator and tolerates a null parent, unlike concatenating "\\testout.pdf".
    File pdfFileOut = new File(directory, "testout.pdf");

    // Both documents are closed on every path. The template is declared last so it is
    // closed first, while the source document that owns the image data is still open.
    try (PDDocument document = PDDocument.load(pdfFile);
         PDDocument documentOut = PDDocument.load(pdfFileOut)) {
        int imageCount = 0;
        for (PDPage page : document.getPages()) {
            PDResources resources = page.getResources();
            if (resources == null) {
                // A page without a resource dictionary cannot reference any image.
                continue;
            }
            PDPage pageOut = documentOut.getPage(0);
            for (COSName key : resources.getXObjectNames()) {
                if (!resources.isImageXObject(key)) {
                    continue;
                }
                try {
                    PDImageXObject image = (PDImageXObject) resources.getXObject(key);
                    float scale = 1f;
                    // The content stream must be closed before saving; try-with-resources
                    // also closes it when drawImage fails.
                    try (PDPageContentStream contentStream = new PDPageContentStream(
                            documentOut, pageOut, PDPageContentStream.AppendMode.APPEND, true)) {
                        contentStream.drawImage(image, 20, 20,
                                image.getWidth() * scale, image.getHeight() * scale);
                    }
                    documentOut.save(new File(directory, "test" + imageCount + ".pdf"));
                    System.out.println(image.getSuffix() + "," + image.getHeight() + "," + image.getWidth());
                } catch (IOException e) {
                    // Best effort: report the failure and continue with the next image.
                    e.printStackTrace();
                }
                // Counts every image encountered, including ones that failed to export.
                imageCount++;
            }
        }
    } catch (IOException e) {
        // Loading (or closing) one of the documents failed.
        e.printStackTrace();
    }
}
PDF文件打印
/**
 * Prints a PDF file on a locally installed printer, portrait and duplex, one copy.
 *
 * <p>When no printer can be found (or none matches {@code printerName}) a message is written
 * to standard output and the method returns without printing.
 *
 * @param file the PDF file to print; its name is used as the print job name
 * @param printerName text that the target printer's name must contain; when {@code null} no
 *     printer is selected explicitly
 * @throws Exception if loading the document or submitting the print job fails
 */
public static void PdfPrint(File file, String printerName) throws Exception {
    PDDocument document = null;
    try {
        document = PDDocument.load(file);
        PrinterJob job = PrinterJob.getPrinterJob();
        job.setJobName(file.getName());

        if (printerName != null) {
            // All print services known to this machine.
            PrintService[] available = PrinterJob.lookupPrintServices();
            if (available == null || available.length == 0) {
                System.out.print("打印失败,未找到可用打印机,请检查。");
                return;
            }
            // First service whose name contains the requested text wins.
            PrintService chosen = null;
            for (PrintService candidate : available) {
                if (candidate.getName().contains(printerName)) {
                    chosen = candidate;
                    break;
                }
            }
            if (chosen == null) {
                System.out.print("打印失败,未找到名称为" + printerName + "的打印机,请检查。");
                return;
            }
            job.setPrintService(chosen);
        }

        // Page layout: portrait on the paper returned by getPaper().
        PageFormat layout = new PageFormat();
        layout.setOrientation(PageFormat.PORTRAIT);
        layout.setPaper(getPaper());

        // One Book entry covering every page, rendered at actual size.
        PDFPrintable printable = new PDFPrintable(document, Scaling.ACTUAL_SIZE);
        Book pages = new Book();
        pages.append(printable, layout, document.getNumberOfPages());
        job.setPageable(pages);
        job.setCopies(1);

        // Request double-sided output.
        HashPrintRequestAttributeSet attributes = new HashPrintRequestAttributeSet();
        attributes.add(Sides.DUPLEX);
        job.print(attributes);
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Downloads a PDF from a URL and prints it on a locally installed printer, duplex, one copy.
 *
 * <p>When no printer can be found (or none matches {@code printerName}) a message is written
 * to standard output and the method returns without printing.
 *
 * @param urlStr HTTP(S) address of the PDF
 * @param printerName text that the target printer's name must contain; when {@code null} no
 *     printer is selected explicitly (previously this caused a NullPointerException)
 * @param orientation 1 = portrait, any other value = landscape
 *     ({@link PageFormat#REVERSE_LANDSCAPE}); {@code null} is treated as portrait
 * @throws Exception if the download, PDF parsing or print submission fails
 */
public static void PdfPrintByUrl(String urlStr, String printerName, Integer orientation) throws Exception {
    PDDocument document = null;
    HttpURLConnection httpURLConnection = null;
    try {
        URL url = new URL(urlStr);
        httpURLConnection = (HttpURLConnection) url.openConnection();
        httpURLConnection.setRequestProperty("Charset", "UTF-8");
        httpURLConnection.setConnectTimeout(5 * 1000);
        httpURLConnection.connect();
        // The response stream is closed as soon as the document has been loaded from it;
        // previously it was never closed.
        try (InputStream pdfStream = httpURLConnection.getInputStream()) {
            document = PDDocument.load(pdfStream);
        }

        PrinterJob printJob = PrinterJob.getPrinterJob();
        if (printerName != null) {
            // All print services known to this machine.
            PrintService[] printServices = PrinterJob.lookupPrintServices();
            if (printServices == null || printServices.length == 0) {
                System.out.print("打印失败,未找到可用打印机,请检查。");
                return;
            }
            // First service whose name contains the requested text wins.
            PrintService printService = null;
            for (PrintService candidate : printServices) {
                if (candidate.getName().contains(printerName)) {
                    printService = candidate;
                    break;
                }
            }
            if (printService == null) {
                System.out.print("打印失败,未找到名称为" + printerName + "的打印机,请检查。");
                return;
            }
            printJob.setPrintService(printService);
        }

        // Render pages at actual size.
        PDFPrintable pdfPrintable = new PDFPrintable(document, Scaling.ACTUAL_SIZE);
        Book book = new Book();
        PageFormat pageFormat = new PageFormat();
        // Null-safe: comparing a null Integer with == 1 would throw on unboxing.
        boolean portrait = orientation == null || orientation == 1;
        pageFormat.setOrientation(portrait ? PageFormat.PORTRAIT : PageFormat.REVERSE_LANDSCAPE);
        pageFormat.setPaper(getPaper());
        // One Book entry covering every page of the document.
        book.append(pdfPrintable, pageFormat, document.getNumberOfPages());
        printJob.setPageable(book);
        printJob.setCopies(1);

        // Request double-sided output.
        HashPrintRequestAttributeSet pars = new HashPrintRequestAttributeSet();
        pars.add(Sides.DUPLEX);
        printJob.print(pars);
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (httpURLConnection != null) {
            // Release the underlying connection.
            httpURLConnection.disconnect();
        }
    }
}
/**
 * Builds the {@link Paper} used for printing: a 595 x 842 sheet (A4) with a 10-unit left
 * margin and a 10-unit top margin.
 *
 * @return a new {@code Paper} with its size and imageable area set
 */
public static Paper getPaper() {
    Paper paper = new Paper();
    // A4 expressed in java.awt.print units (1/72 inch): 595 x 842.
    int width = 595;
    int height = 842;
    // Margins in the same units.
    // NOTE(review): the earlier comment spoke of "10 mm = 28 px" margins, but the values
    // have always been 10; they are kept unchanged here - confirm which was intended.
    int marginLeft = 10;
    int marginRight = 0;
    int marginTop = 10;
    int marginBottom = 0;
    paper.setSize(width, height);
    // setImageableArea takes (x, y, width, height), so the origin is (left margin, top
    // margin). The y argument used to be marginRight, which silently dropped the top margin.
    // Setting an explicit imageable area is what avoids blank print output.
    paper.setImageableArea(marginLeft, marginTop,
            width - (marginLeft + marginRight), height - (marginTop + marginBottom));
    return paper;
}
PdfBoxUtils文件完整代码
package com.example.demo.common;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.printing.PDFPrintable;
import org.apache.pdfbox.printing.Scaling;
import org.apache.pdfbox.text.PDFTextStripper;
import javax.print.PrintService;
import javax.print.PrintServiceLookup;
import javax.print.attribute.HashPrintRequestAttributeSet;
import javax.print.attribute.standard.Sides;
import java.awt.print.Book;
import java.awt.print.PageFormat;
import java.awt.print.Paper;
import java.awt.print.PrinterJob;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * PDFBox helper utilities: text extraction, image extraction and printing of PDF documents.
 */
public class PDFBoxUtils {

    /**
     * Demo entry point. Uncomment the call you want to try.
     *
     * @param args unused
     * @throws Exception if the selected demo call fails
     */
    public static void main(String[] args) throws Exception {
        String filePath = "C:\\Users\\92384\\Documents\\test.pdf"; // PDF to process
        //PdfReader(filePath);
        readImage(filePath);
        String printerName = "Microsoft Print to PDF"; // text the printer name must contain
        //PdfPrint(filePath,printerName);
        String urlStr = "https://dlj.51fapiao.cn/dlj/v7/27a91fc541ac427967ef6d8f5019a98928f255";
        //PdfPrintByUrl(urlStr,printerName,2);
    }

    /**
     * Extracts the text of every page of a PDF file and prints it to standard output.
     *
     * <p>Any exception raised while loading or parsing the file is caught and its stack trace
     * is printed; nothing is rethrown to the caller.
     *
     * @param filePath path of the PDF file to read
     */
    public static void PdfReader(String filePath) {
        File pdfFile = new File(filePath);
        // Alternative way to load the file (kept for reference): wrap a FileInputStream in a
        // RandomAccessBuffer, run new PDFParser(buffer).parse(), then call getPDDocument().
        //
        // try-with-resources closes the document even when text extraction fails;
        // previously the document was never closed.
        try (PDDocument document = PDDocument.load(pdfFile)) {
            // Total number of pages, used as the last page to extract.
            int pages = document.getNumberOfPages();
            PDFTextStripper stripper = new PDFTextStripper();
            // Emit text sorted by its position on the page.
            stripper.setSortByPosition(true);
            stripper.setStartPage(1);
            stripper.setEndPage(pages);
            String content = stripper.getText(document);
            System.out.println(content);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts every image XObject from a PDF and draws it onto the first page of a template
     * PDF.
     *
     * <p>The template is read from {@code testout.pdf} in the same directory as the source
     * file. After each image is drawn, the template document is saved next to the source file
     * as {@code test<N>.pdf}, where N is the running image count. Drawing always appends to
     * the same template page, so later output files also contain the images drawn before them.
     *
     * <p>If either document cannot be loaded, the stack trace is printed and nothing is
     * processed.
     *
     * @param filePath path of the PDF file to scan for images
     * @throws IOException kept in the signature for existing callers; I/O failures inside
     *     this method are printed rather than thrown
     */
    public static void readImage(String filePath) throws IOException {
        // PDF to scan.
        File pdfFile = new File(filePath);
        // Directory containing the source file; null when filePath has no parent component.
        File directory = pdfFile.getParentFile();
        // Template PDF the images are drawn into. File(parent, child) uses the platform
        // separator and tolerates a null parent, unlike concatenating "\\testout.pdf".
        File pdfFileOut = new File(directory, "testout.pdf");

        // Both documents are closed on every path. The template is declared last so it is
        // closed first, while the source document that owns the image data is still open.
        try (PDDocument document = PDDocument.load(pdfFile);
             PDDocument documentOut = PDDocument.load(pdfFileOut)) {
            int imageCount = 0;
            for (PDPage page : document.getPages()) {
                PDResources resources = page.getResources();
                if (resources == null) {
                    // A page without a resource dictionary cannot reference any image.
                    continue;
                }
                PDPage pageOut = documentOut.getPage(0);
                for (COSName key : resources.getXObjectNames()) {
                    if (!resources.isImageXObject(key)) {
                        continue;
                    }
                    try {
                        PDImageXObject image = (PDImageXObject) resources.getXObject(key);
                        float scale = 1f;
                        // The content stream must be closed before saving; try-with-resources
                        // also closes it when drawImage fails.
                        try (PDPageContentStream contentStream = new PDPageContentStream(
                                documentOut, pageOut, PDPageContentStream.AppendMode.APPEND, true)) {
                            contentStream.drawImage(image, 20, 20,
                                    image.getWidth() * scale, image.getHeight() * scale);
                        }
                        documentOut.save(new File(directory, "test" + imageCount + ".pdf"));
                        System.out.println(image.getSuffix() + "," + image.getHeight() + "," + image.getWidth());
                    } catch (IOException e) {
                        // Best effort: report the failure and continue with the next image.
                        e.printStackTrace();
                    }
                    // Counts every image encountered, including ones that failed to export.
                    imageCount++;
                }
            }
        } catch (IOException e) {
            // Loading (or closing) one of the documents failed.
            e.printStackTrace();
        }
    }

    /**
     * Prints a PDF file on a locally installed printer, portrait and duplex, one copy.
     *
     * <p>When no printer can be found (or none matches {@code printerName}) a message is
     * written to standard output and the method returns without printing.
     *
     * @param filePath path of the PDF file to print; the file name is used as the job name
     * @param printerName text that the target printer's name must contain; when {@code null}
     *     no printer is selected explicitly
     * @throws Exception if loading the document or submitting the print job fails
     */
    public static void PdfPrint(String filePath, String printerName) throws Exception {
        PDDocument document = null;
        try {
            File file = new File(filePath);
            document = PDDocument.load(file);
            PrinterJob printJob = PrinterJob.getPrinterJob();
            printJob.setJobName(file.getName());
            if (!selectPrinter(printJob, printerName)) {
                return;
            }
            submitPrintJob(printJob, document, PageFormat.PORTRAIT);
        } finally {
            closeQuietly(document);
        }
    }

    /**
     * Downloads a PDF from a URL and prints it on a locally installed printer, duplex, one
     * copy.
     *
     * <p>When no printer can be found (or none matches {@code printerName}) a message is
     * written to standard output and the method returns without printing.
     *
     * @param urlStr HTTP(S) address of the PDF
     * @param printerName text that the target printer's name must contain; when {@code null}
     *     no printer is selected explicitly (previously this caused a NullPointerException)
     * @param orientation 1 = portrait, any other value = landscape
     *     ({@link PageFormat#REVERSE_LANDSCAPE}); {@code null} is treated as portrait
     * @throws Exception if the download, PDF parsing or print submission fails
     */
    public static void PdfPrintByUrl(String urlStr, String printerName, Integer orientation) throws Exception {
        PDDocument document = null;
        HttpURLConnection httpURLConnection = null;
        try {
            URL url = new URL(urlStr);
            httpURLConnection = (HttpURLConnection) url.openConnection();
            httpURLConnection.setRequestProperty("Charset", "UTF-8");
            httpURLConnection.setConnectTimeout(5 * 1000);
            httpURLConnection.connect();
            // The response stream is closed as soon as the document has been loaded from it;
            // previously it was never closed.
            try (InputStream pdfStream = httpURLConnection.getInputStream()) {
                document = PDDocument.load(pdfStream);
            }
            PrinterJob printJob = PrinterJob.getPrinterJob();
            if (!selectPrinter(printJob, printerName)) {
                return;
            }
            // Null-safe: comparing a null Integer with == 1 would throw on unboxing.
            boolean portrait = orientation == null || orientation == 1;
            submitPrintJob(printJob, document,
                    portrait ? PageFormat.PORTRAIT : PageFormat.REVERSE_LANDSCAPE);
        } finally {
            closeQuietly(document);
            if (httpURLConnection != null) {
                // Release the underlying connection.
                httpURLConnection.disconnect();
            }
        }
    }

    /**
     * Builds the {@link Paper} used for printing: a 595 x 842 sheet (A4) with a 10-unit left
     * margin and a 10-unit top margin.
     *
     * @return a new {@code Paper} with its size and imageable area set
     */
    public static Paper getPaper() {
        Paper paper = new Paper();
        // A4 expressed in java.awt.print units (1/72 inch): 595 x 842.
        int width = 595;
        int height = 842;
        // Margins in the same units.
        // NOTE(review): the earlier comment spoke of "10 mm = 28 px" margins, but the values
        // have always been 10; they are kept unchanged here - confirm which was intended.
        int marginLeft = 10;
        int marginRight = 0;
        int marginTop = 10;
        int marginBottom = 0;
        paper.setSize(width, height);
        // setImageableArea takes (x, y, width, height), so the origin is (left margin, top
        // margin). The y argument used to be marginRight, which silently dropped the top
        // margin. Setting an explicit imageable area is what avoids blank print output.
        paper.setImageableArea(marginLeft, marginTop,
                width - (marginLeft + marginRight), height - (marginTop + marginBottom));
        return paper;
    }

    /**
     * Points the job at the first print service whose name contains {@code printerName}.
     *
     * @return {@code true} when printing may proceed (a printer was selected, or
     *     {@code printerName} is {@code null}); {@code false} after a failure message has
     *     been written to standard output
     */
    private static boolean selectPrinter(PrinterJob printJob, String printerName)
            throws java.awt.print.PrinterException {
        if (printerName == null) {
            return true;
        }
        // All print services known to this machine.
        PrintService[] printServices = PrinterJob.lookupPrintServices();
        if (printServices == null || printServices.length == 0) {
            System.out.print("打印失败,未找到可用打印机,请检查。");
            return false;
        }
        for (PrintService candidate : printServices) {
            if (candidate.getName().contains(printerName)) {
                printJob.setPrintService(candidate);
                return true;
            }
        }
        System.out.print("打印失败,未找到名称为" + printerName + "的打印机,请检查。");
        return false;
    }

    /** Prints all pages of the document at actual size, one copy, duplex, in the given orientation. */
    private static void submitPrintJob(PrinterJob printJob, PDDocument document, int orientation)
            throws java.awt.print.PrinterException {
        PDFPrintable pdfPrintable = new PDFPrintable(document, Scaling.ACTUAL_SIZE);
        PageFormat pageFormat = new PageFormat();
        pageFormat.setOrientation(orientation);
        pageFormat.setPaper(getPaper());
        // One Book entry covering every page of the document.
        Book book = new Book();
        book.append(pdfPrintable, pageFormat, document.getNumberOfPages());
        printJob.setPageable(book);
        printJob.setCopies(1);
        // Request double-sided output.
        HashPrintRequestAttributeSet pars = new HashPrintRequestAttributeSet();
        pars.add(Sides.DUPLEX);
        printJob.print(pars);
    }

    /** Closes the document if it was opened; a close failure is printed, not thrown. */
    private static void closeQuietly(PDDocument document) {
        if (document == null) {
            return;
        }
        try {
            document.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}