Apache PDFBox是一个开源的、基于Java的PDF文档处理工具库,可以用于创建新的PDF文档、修改现有的PDF文档,还可以从PDF文档中提取所需的内容。Apache PDFBox还附带了若干命令行工具。

Apache PDFBox的主要功能包括:
PDF文档的读取、创建、打印、转换、验证以及合并与分割等。

所需jar包

  1. <dependency>
  2. <groupId>org.apache.pdfbox</groupId>
  3. <artifactId>pdfbox</artifactId>
  4. <version>2.0.12</version>
  5. </dependency>
  6. <dependency>
  7. <groupId>org.apache.pdfbox</groupId>
  8. <artifactId>fontbox</artifactId>
  9. <version>2.0.12</version>
  10. </dependency>

文本内容提取

  1. //
  2. public static void PdfReader(String filePath){
  3. File pdfFile = new File(filePath);
  4. PDDocument document = null;
  5. try {
  6. // PDF文件加载方式一
  7. /*
  8. InputStream input = null;
  9. input = new FileInputStream( pdfFile );
  10. //加载 pdf 文档
  11. PDFParser parser = new PDFParser(new RandomAccessBuffer(input));
  12. parser.parse();
  13. document = parser.getPDDocument();
  14. */
  15. // PDF文件加载方式二
  16. document=PDDocument.load(pdfFile);
  17. // 获取页码
  18. int pages = document.getNumberOfPages();
  19. // 读文本内容
  20. PDFTextStripper stripper=new PDFTextStripper();
  21. // 设置按顺序输出
  22. stripper.setSortByPosition(true);
  23. stripper.setStartPage(1);
  24. stripper.setEndPage(pages);
  25. String content = stripper.getText(document);
  26. System.out.println(content);
  27. } catch(Exception e) {
  28. e.printStackTrace();
  29. }
  30. }

图片提取

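此方法可以取出源PDF中的图片对象PDImageXObject,之后可以对该对象进行相关处理。本示例代码将提取出来的每一个图片对象依次绘制到一个模板PDF的第一页上:模板文件为与源文件同目录下的 testout.pdf,需要预先存在且至少包含一页;每绘制一张图片就另存为 test0.pdf、test1.pdf……。注意:由于始终向同一页追加内容,后保存的文件中也会包含之前绘制的图片。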

  1. public static void readImage(String filePath) throws IOException {
  2. // 待解析PDF
  3. File pdfFile = new File(filePath);
  4. // 文件所在目录
  5. String fileDirectory = pdfFile.getParent();
  6. // 空白PDF - 内部没有任何内容的PDF
  7. File pdfFileOut = new File(fileDirectory+"\\testout.pdf");
  8. PDDocument document = null;
  9. PDDocument documentOut = null;
  10. try {
  11. document = PDDocument.load(pdfFile);
  12. documentOut = PDDocument.load(pdfFileOut);
  13. } catch (IOException e) {
  14. e.printStackTrace();
  15. }
  16. int pages_size = document == null ? 0:document.getNumberOfPages();
  17. int j=0;
  18. for(int i=0;i<pages_size;i++) {
  19. PDPage page = document.getPage(i);
  20. PDPage pageOut = documentOut ==null?null:documentOut.getPage(0);
  21. PDResources resources = page.getResources();
  22. Iterable xobjects = resources.getXObjectNames();
  23. if (xobjects != null) {
  24. for (Object xobject : xobjects) {
  25. COSName key = (COSName) xobject;
  26. if (resources.isImageXObject(key)) {
  27. try {
  28. PDImageXObject image = (PDImageXObject) resources.getXObject(key);
  29. // 将PDF文档中的图片 分别存到一个空白PDF中。
  30. PDPageContentStream contentStream = new PDPageContentStream(documentOut, pageOut, PDPageContentStream.AppendMode.APPEND, true);
  31. float scale = 1f;
  32. contentStream.drawImage(image, 20, 20, image.getWidth() * scale, image.getHeight() * scale);
  33. contentStream.close();
  34. documentOut.save(fileDirectory + "/test" + j + ".pdf");
  35. System.out.println(image.getSuffix() + "," + image.getHeight() + "," + image.getWidth());
  36. } catch (IOException e) {
  37. // TODO Auto-generated catch block
  38. e.printStackTrace();
  39. }
  40. //image count
  41. j++;
  42. }
  43. }
  44. }
  45. }
  46. }

PDF文件打印

  1. /**
  2. * 根据文档地址打印PDF文件
  3. * @param filePath 文件地址
  4. * @param printerName
  5. * @throws Exception
  6. */
  7. public static void PdfPrint(File file , String printerName) throws Exception {
  8. PDDocument document = null;
  9. try {
  10. document = PDDocument.load(file);
  11. PrinterJob printJob = PrinterJob.getPrinterJob();
  12. printJob.setJobName(file.getName());
  13. // 获取默认打印机 PrintService printService = PrintServiceLookup.lookupDefaultPrintService();
  14. if (printerName != null) {
  15. // 查找并设置打印机
  16. //获得本台电脑连接的所有打印机
  17. PrintService[] printServices = PrinterJob.lookupPrintServices();
  18. if(printServices == null || printServices.length == 0) {
  19. System.out.print("打印失败,未找到可用打印机,请检查。");
  20. return ;
  21. }
  22. PrintService printService = null;
  23. //匹配指定打印机
  24. for (int i = 0;i < printServices.length; i++) {
  25. //System.out.println(printServices[i].getName());
  26. if (printServices[i].getName().contains(printerName)) {
  27. printService = printServices[i];
  28. break;
  29. }
  30. }
  31. if(printService!=null){
  32. printJob.setPrintService(printService);
  33. }else{
  34. System.out.print("打印失败,未找到名称为" + printerName + "的打印机,请检查。");
  35. return ;
  36. }
  37. }
  38. //设置纸张及缩放
  39. PDFPrintable pdfPrintable = new PDFPrintable(document, Scaling.ACTUAL_SIZE);
  40. //设置多页打印
  41. Book book = new Book();
  42. PageFormat pageFormat = new PageFormat();
  43. //设置打印方向
  44. pageFormat.setOrientation(PageFormat.PORTRAIT);//纵向
  45. pageFormat.setPaper(getPaper());//设置纸张
  46. book.append(pdfPrintable, pageFormat, document.getNumberOfPages());
  47. printJob.setPageable(book);
  48. printJob.setCopies(1);//设置打印份数
  49. //添加打印属性
  50. HashPrintRequestAttributeSet pars = new HashPrintRequestAttributeSet();
  51. pars.add(Sides.DUPLEX); //设置单双页
  52. printJob.print(pars);
  53. }finally {
  54. if (document != null) {
  55. try {
  56. document.close();
  57. } catch (IOException e) {
  58. e.printStackTrace();
  59. }
  60. }
  61. }
  62. }
  63. /**
  64. * 根据URL地址打印PDF文件
  65. * @param urlStr url地址
  66. * @param printerName 打印机名字
  67. * @param orientation 1=竖向 2=横向
  68. * @throws Exception
  69. */
  70. public static void PdfPrintByUrl(String urlStr ,String printerName ,Integer orientation) throws Exception {
  71. PDDocument document = null;
  72. try {
  73. URL url = new URL(urlStr);
  74. HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection();
  75. httpURLConnection.setRequestProperty("Charset", "UTF-8");
  76. httpURLConnection.setConnectTimeout(5*1000);
  77. httpURLConnection.connect();
  78. // 构造待打印的文件流
  79. InputStream fis=httpURLConnection.getInputStream();
  80. document = PDDocument.load(fis);
  81. PrinterJob printJob = PrinterJob.getPrinterJob();
  82. // 查找并设置打印机
  83. //获得本台电脑连接的所有打印机
  84. PrintService[] printServices = PrinterJob.lookupPrintServices();
  85. if(printServices == null || printServices.length == 0) {
  86. System.out.print("打印失败,未找到可用打印机,请检查。");
  87. return ;
  88. }
  89. PrintService printService = null;
  90. //匹配指定打印机
  91. for (int i = 0;i < printServices.length; i++) {
  92. //System.out.println(printServices[i].getName());
  93. if (printServices[i].getName().contains(printerName)) {
  94. printService = printServices[i];
  95. break;
  96. }
  97. }
  98. if(printService!=null){
  99. printJob.setPrintService(printService);
  100. }else{
  101. System.out.print("打印失败,未找到名称为" + printerName + "的打印机,请检查。");
  102. return ;
  103. }
  104. //设置纸张及缩放
  105. PDFPrintable pdfPrintable = new PDFPrintable(document, Scaling.ACTUAL_SIZE);
  106. //设置多页打印
  107. Book book = new Book();
  108. PageFormat pageFormat = new PageFormat();
  109. //设置打印方向 PORTRAIT 竖向 REVERSE_LANDSCAPE 横向
  110. pageFormat.setOrientation(orientation == 1 ? PageFormat.PORTRAIT : PageFormat.REVERSE_LANDSCAPE);
  111. //设置纸张
  112. pageFormat.setPaper(getPaper());
  113. book.append(pdfPrintable, pageFormat, document.getNumberOfPages());
  114. printJob.setPageable(book);
  115. //设置打印份数
  116. printJob.setCopies(1);
  117. //添加打印属性
  118. HashPrintRequestAttributeSet pars = new HashPrintRequestAttributeSet();
  119. //设置单双页
  120. pars.add(Sides.DUPLEX);
  121. printJob.print(pars);
  122. }finally {
  123. if (document != null) {
  124. try {
  125. document.close();
  126. } catch (IOException e) {
  127. e.printStackTrace();
  128. }
  129. }
  130. }
  131. }
  132. public static Paper getPaper() {
  133. Paper paper = new Paper();
  134. // 默认为A4纸张,对应像素宽和高分别为 595, 842
  135. int width = 595;
  136. int height = 842;
  137. // 设置边距,单位是像素,10mm边距,对应 28px
  138. int marginLeft = 10;
  139. int marginRight = 0;
  140. int marginTop = 10;
  141. int marginBottom = 0;
  142. paper.setSize(width, height);
  143. // 下面一行代码,解决了打印内容为空的问题
  144. paper.setImageableArea(marginLeft, marginRight, width - (marginLeft + marginRight), height - (marginTop + marginBottom));
  145. return paper;
  146. }

PdfBoxUtils文件完整代码

  1. package com.example.demo.common;
  2. import org.apache.pdfbox.cos.COSName;
  3. import org.apache.pdfbox.pdmodel.PDDocument;
  4. import org.apache.pdfbox.pdmodel.PDPage;
  5. import org.apache.pdfbox.pdmodel.PDPageContentStream;
  6. import org.apache.pdfbox.pdmodel.PDResources;
  7. import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
  8. import org.apache.pdfbox.printing.PDFPrintable;
  9. import org.apache.pdfbox.printing.Scaling;
  10. import org.apache.pdfbox.text.PDFTextStripper;
  11. import javax.print.PrintService;
  12. import javax.print.PrintServiceLookup;
  13. import javax.print.attribute.HashPrintRequestAttributeSet;
  14. import javax.print.attribute.standard.Sides;
  15. import java.awt.print.Book;
  16. import java.awt.print.PageFormat;
  17. import java.awt.print.Paper;
  18. import java.awt.print.PrinterJob;
  19. import java.io.*;
  20. import java.net.HttpURLConnection;
  21. import java.net.URL;
  22. public class PDFBoxUtils {
  23. public static void main(String[] args) throws Exception {
  24. String filePath = "C:\\Users\\92384\\Documents\\test.pdf";//文件路径
  25. //PdfReader(filePath);
  26. readImage(filePath);
  27. String printerName = "Microsoft Print to PDF";//打印机名包含字串
  28. //PdfPrint(filePath,printerName);
  29. String urlStr= "https://dlj.51fapiao.cn/dlj/v7/27a91fc541ac427967ef6d8f5019a98928f255";
  30. //PdfPrintByUrl(urlStr,printerName,2);
  31. }
  32. public static void PdfReader(String filePath){
  33. File pdfFile = new File(filePath);
  34. PDDocument document = null;
  35. try {
  36. // PDF文件加载方式一
  37. /*
  38. InputStream input = null;
  39. input = new FileInputStream( pdfFile );
  40. //加载 pdf 文档
  41. PDFParser parser = new PDFParser(new RandomAccessBuffer(input));
  42. parser.parse();
  43. document = parser.getPDDocument();
  44. */
  45. // PDF文件加载方式二
  46. document=PDDocument.load(pdfFile);
  47. // 获取页码
  48. int pages = document.getNumberOfPages();
  49. // 读文本内容
  50. PDFTextStripper stripper=new PDFTextStripper();
  51. // 设置按顺序输出
  52. stripper.setSortByPosition(true);
  53. stripper.setStartPage(1);
  54. stripper.setEndPage(pages);
  55. String content = stripper.getText(document);
  56. System.out.println(content);
  57. } catch(Exception e) {
  58. e.printStackTrace();
  59. }
  60. }
  61. public static void readImage(String filePath) throws IOException {
  62. // 待解析PDF
  63. File pdfFile = new File(filePath);
  64. // 文件所在目录
  65. String fileDirectory = pdfFile.getParent();
  66. // 空白PDF - 内部没有任何内容的PDF
  67. File pdfFileOut = new File(fileDirectory+"\\testout.pdf");
  68. PDDocument document = null;
  69. PDDocument documentOut = null;
  70. try {
  71. document = PDDocument.load(pdfFile);
  72. documentOut = PDDocument.load(pdfFileOut);
  73. } catch (IOException e) {
  74. e.printStackTrace();
  75. }
  76. int pages_size = document == null ? 0:document.getNumberOfPages();
  77. int j=0;
  78. for(int i=0;i<pages_size;i++) {
  79. PDPage page = document.getPage(i);
  80. PDPage pageOut = documentOut ==null?null:documentOut.getPage(0);
  81. PDResources resources = page.getResources();
  82. Iterable xobjects = resources.getXObjectNames();
  83. if (xobjects != null) {
  84. for (Object xobject : xobjects) {
  85. COSName key = (COSName) xobject;
  86. if (resources.isImageXObject(key)) {
  87. try {
  88. PDImageXObject image = (PDImageXObject) resources.getXObject(key);
  89. // 将PDF文档中的图片 分别存到一个空白PDF中。
  90. PDPageContentStream contentStream = new PDPageContentStream(documentOut, pageOut, PDPageContentStream.AppendMode.APPEND, true);
  91. float scale = 1f;
  92. contentStream.drawImage(image, 20, 20, image.getWidth() * scale, image.getHeight() * scale);
  93. contentStream.close();
  94. documentOut.save(fileDirectory + "/test" + j + ".pdf");
  95. System.out.println(image.getSuffix() + "," + image.getHeight() + "," + image.getWidth());
  96. } catch (IOException e) {
  97. // TODO Auto-generated catch block
  98. e.printStackTrace();
  99. }
  100. //image count
  101. j++;
  102. }
  103. }
  104. }
  105. }
  106. }
  107. /**
  108. * 根据文档地址打印PDF文件
  109. * @param filePath 文件地址
  110. * @param printerName
  111. * @throws Exception
  112. */
  113. public static void PdfPrint(String filePath , String printerName) throws Exception {
  114. File file = null;
  115. PDDocument document = null;
  116. try {
  117. file = new File(filePath);
  118. document = PDDocument.load(file);
  119. PrinterJob printJob = PrinterJob.getPrinterJob();
  120. printJob.setJobName(file.getName());
  121. // 获取默认打印机 PrintService printService = PrintServiceLookup.lookupDefaultPrintService();
  122. if (printerName != null) {
  123. // 查找并设置打印机
  124. //获得本台电脑连接的所有打印机
  125. PrintService[] printServices = PrinterJob.lookupPrintServices();
  126. if(printServices == null || printServices.length == 0) {
  127. System.out.print("打印失败,未找到可用打印机,请检查。");
  128. return ;
  129. }
  130. PrintService printService = null;
  131. //匹配指定打印机
  132. for (int i = 0;i < printServices.length; i++) {
  133. //System.out.println(printServices[i].getName());
  134. if (printServices[i].getName().contains(printerName)) {
  135. printService = printServices[i];
  136. break;
  137. }
  138. }
  139. if(printService!=null){
  140. printJob.setPrintService(printService);
  141. }else{
  142. System.out.print("打印失败,未找到名称为" + printerName + "的打印机,请检查。");
  143. return ;
  144. }
  145. }
  146. //设置纸张及缩放
  147. PDFPrintable pdfPrintable = new PDFPrintable(document, Scaling.ACTUAL_SIZE);
  148. //设置多页打印
  149. Book book = new Book();
  150. PageFormat pageFormat = new PageFormat();
  151. //设置打印方向
  152. pageFormat.setOrientation(PageFormat.PORTRAIT);//纵向
  153. pageFormat.setPaper(getPaper());//设置纸张
  154. book.append(pdfPrintable, pageFormat, document.getNumberOfPages());
  155. printJob.setPageable(book);
  156. printJob.setCopies(1);//设置打印份数
  157. //添加打印属性
  158. HashPrintRequestAttributeSet pars = new HashPrintRequestAttributeSet();
  159. pars.add(Sides.DUPLEX); //设置单双页
  160. printJob.print(pars);
  161. }finally {
  162. if (document != null) {
  163. try {
  164. document.close();
  165. } catch (IOException e) {
  166. e.printStackTrace();
  167. }
  168. }
  169. }
  170. }
  171. /**
  172. * 根据URL地址打印PDF文件
  173. * @param urlStr url地址
  174. * @param printerName 打印机名字
  175. * @param orientation 1=竖向 2=横向
  176. * @throws Exception
  177. */
  178. public static void PdfPrintByUrl(String urlStr ,String printerName ,Integer orientation) throws Exception {
  179. PDDocument document = null;
  180. try {
  181. URL url = new URL(urlStr);
  182. HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection();
  183. httpURLConnection.setRequestProperty("Charset", "UTF-8");
  184. httpURLConnection.setConnectTimeout(5*1000);
  185. httpURLConnection.connect();
  186. // 构造待打印的文件流
  187. InputStream fis=httpURLConnection.getInputStream();
  188. document = PDDocument.load(fis);
  189. PrinterJob printJob = PrinterJob.getPrinterJob();
  190. // 查找并设置打印机
  191. //获得本台电脑连接的所有打印机
  192. PrintService[] printServices = PrinterJob.lookupPrintServices();
  193. if(printServices == null || printServices.length == 0) {
  194. System.out.print("打印失败,未找到可用打印机,请检查。");
  195. return ;
  196. }
  197. PrintService printService = null;
  198. //匹配指定打印机
  199. for (int i = 0;i < printServices.length; i++) {
  200. //System.out.println(printServices[i].getName());
  201. if (printServices[i].getName().contains(printerName)) {
  202. printService = printServices[i];
  203. break;
  204. }
  205. }
  206. if(printService!=null){
  207. printJob.setPrintService(printService);
  208. }else{
  209. System.out.print("打印失败,未找到名称为" + printerName + "的打印机,请检查。");
  210. return ;
  211. }
  212. //设置纸张及缩放
  213. PDFPrintable pdfPrintable = new PDFPrintable(document, Scaling.ACTUAL_SIZE);
  214. //设置多页打印
  215. Book book = new Book();
  216. PageFormat pageFormat = new PageFormat();
  217. //设置打印方向 PORTRAIT 竖向 REVERSE_LANDSCAPE 横向
  218. pageFormat.setOrientation(orientation == 1 ? PageFormat.PORTRAIT : PageFormat.REVERSE_LANDSCAPE);
  219. //设置纸张
  220. pageFormat.setPaper(getPaper());
  221. book.append(pdfPrintable, pageFormat, document.getNumberOfPages());
  222. printJob.setPageable(book);
  223. //设置打印份数
  224. printJob.setCopies(1);
  225. //添加打印属性
  226. HashPrintRequestAttributeSet pars = new HashPrintRequestAttributeSet();
  227. //设置单双页
  228. pars.add(Sides.DUPLEX);
  229. printJob.print(pars);
  230. }finally {
  231. if (document != null) {
  232. try {
  233. document.close();
  234. } catch (IOException e) {
  235. e.printStackTrace();
  236. }
  237. }
  238. }
  239. }
  240. public static Paper getPaper() {
  241. Paper paper = new Paper();
  242. // 默认为A4纸张,对应像素宽和高分别为 595, 842
  243. int width = 595;
  244. int height = 842;
  245. // 设置边距,单位是像素,10mm边距,对应 28px
  246. int marginLeft = 10;
  247. int marginRight = 0;
  248. int marginTop = 10;
  249. int marginBottom = 0;
  250. paper.setSize(width, height);
  251. // 下面一行代码,解决了打印内容为空的问题
  252. paper.setImageableArea(marginLeft, marginRight, width - (marginLeft + marginRight), height - (marginTop + marginBottom));
  253. return paper;
  254. }
  255. }