PDF - 《Python》 - 极客文档

PYPDF2
pdfplumber
- Read


© getcodify.com

由於語法渲染問題而影響閱讀體驗，請移步博客閱讀～
本文GitPage地址

PYPDF2

Read & Write

from PyPDF2 import PdfFileReader, PdfFileWriter
readFile = 'read.pdf'
writeFile = 'write.pdf'
pdfReader = PdfFileReader(open(readFile, 'rb'))
pdfWriter.write(open(writeFile, 'wb'))

Pick First two Page

from PyPDF2 import PdfFileReader, PdfFileWriter
import PyPDF2
readFile = 'SA.pdf'
writeFile = 'write.pdf'
pdfWriter = PyPDF2.PdfFileWriter()
pdfReader = PdfFileReader(open(readFile, 'rb'))
pdfWriter.write(open(writeFile, 'wb'))
for page in range(2):
        pageObj = pdfReader.getPage(page)
        pdfWriter.addPage(pageObj)
newFile = open(writeFile,'wb')
pdfWriter.write(newFile)
newFile.close()

2.1 Double the Pages

from PyPDF2 import PdfFileReader, PdfFileWriter
import PyPDF2
readFile = 'SA.pdf'
writeFile = 'write.pdf'
pdfWriter = PyPDF2.PdfFileWriter()
pdfReader = PdfFileReader(open(readFile, 'rb'))
pdfWriter.write(open(writeFile, 'wb'))
for page in range(2):
        pageObj = pdfReader.getPage(page)
        pdfWriter.addPage(pageObj)
        pageObj = pdfReader.getPage(page)
        pdfWriter.addPage(pageObj)
newFile = open(writeFile,'wb')
pdfWriter.write(newFile)
newFile.close()

PDF - 图2

3. Water Mark

cm =1
def create_watermark(content):
    #默认大小为21cm*29.7cm
    c = canvas.Canvas('mark.pdf', pagesize = (30*cm, 30*cm))   
    c.translate(10*cm, 10*cm) #移动坐标原点(坐标系左下为(0,0)))                                                                                                                             
    #c.setFont('song',22)#设置字体为宋体，大小22号
    c.setFillColorRGB(0.5,0.5,0.5)#灰色                                                                                                                         
    c.rotate(45)#旋转45度，坐标系被旋转
    c.drawString(-7*cm, 0*cm, content)
    c.drawString(7*cm, 0*cm, content)
    c.drawString(0*cm, 7*cm, content)
    c.drawString(0*cm, -7*cm, content)                                                                                                                              
    c.save()#关闭并保存pdf文件

PDF - 图3

from reportlab.pdfgen import canvas
cm =1
def create_watermark(W, H):
    #默认大小为21cm*29.7cm
    c = canvas.Canvas('mark.pdf', pagesize = (W, H))   
    c.translate(10*cm, 10*cm) #移动坐标原点(坐标系左下为(0,0)))                                                                                                                             
    #c.setFont('song',22)#设置字体为宋体，大小22号
    #c.setFillColorRGB(0.5,0.5,0.5)#灰色                                                                                                                         
    #c.rotate(45)#旋转45度，坐标系被旋转
    #c.drawString(-7*cm, 0*cm, content)
    #c.drawString(7*cm, 0*cm, content)
    #c.drawString(0*cm, 7*cm, content)
    #c.drawString(0*cm, -7*cm, content)
    #指定描边的颜色
    #c.setStrokeColorRGB(0, 1, 0)
    #指定填充颜色
    c.setFillColorRGB(255, 255, 255)
    #画一个矩形
    c.rect(0, 0, W, H/2 -10 , fill=1)
    c.save()#关闭并保存pdf文件
create_watermark(580,820)
add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out)

##encoding=utf-8
##author: walker
##date: 2014-03-18
##function:给pdf添加水印
from PyPDF2 import PdfFileWriter, PdfFileReader
from reportlab.pdfgen import canvas
##所有路径为绝对路径
def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    pdf_output = PdfFileWriter()
    input_stream = open(pdf_file_in, 'rb')
    pdf_input = PdfFileReader(pdf_file_in)                                                                                
    # PDF文件被加密了
    if pdf_input.getIsEncrypted():
        print( '该PDF文件被加密了.')
        # 尝试用空密码解密
        try:
            pdf_input.decrypt('')
        except Exception or e:
            print( '尝试用空密码解密失败.')
            return False
        else:
            print( '用空密码解密成功.')
    # 获取PDF文件的页数
    pageNum = pdf_input.getNumPages()
    #读入水印pdf文件
    pdf_watermark = PdfFileReader(open(pdf_file_mark, 'rb'))
    # 给每一页打水印
    for i in range(pageNum):
        page = pdf_input.getPage(i)
        page.mergePage(pdf_watermark.getPage(0))
        page.compressContentStreams()   #压缩内容
        pdf_output.addPage(page)
    return pdf_output
PDF1 = add_watermark("GRE阅读白皮书.pdf", pdf_file_mark, pdf_file_out)
PDF2 = add_watermark("GRE阅读白皮书.pdf", pdf_file_mark2, pdf_file_out)
pdf_output = PdfFileWriter()
for i in range(PDF2.getNumPages()):
    page = PDF1.getPage(i)
    pdf_output.addPage(page)
    page = PDF2.getPage(i)
    pdf_output.addPage(page)
newFile = open(pdf_file_out,'wb')
pdf_output.write(newFile)
newFile.close()

Add Page number

###!/usr/bin/env python3
## -*- coding:utf-8 -*-
## 本示例使用两个第三方库来实现为PDF文件添加文字水印
## 这两个库是pyPdf和reportlab
## 使用的Python版本是Python 3.7
## origing from https://www.cnblogs.com/kayb/p/10846341.html
## 作者：小磊
##链接：https://www.zhihu.com/question/19628465/answer/353504051
##来源：知乎
##著作权归作者所有。商业转载请联系作者获得授权，非商业转载请注明出处。
##!/usr/bin/env python3
## -*- coding: utf-8 -*-
helpDoc = '''
Add Page Number to PDF file with Python
Python 给 PDF 添加 页码
usage:
    python addPageNumberToPDF.py [PDF path]
require:
    pip install reportlab pypdf2
    Support both Python2/3, But more recommend Python3
tips:
    * output file will save at pdfWithNumbers/[PDF path]_page.pdf
    * only support A4 size PDF
    * tested on Python2/Python3@ubuntu
    * more large size of PDF require more RAM
    * if segmentation fault, plaese try use Python 3
    * if generate PDF document is damaged, plaese try use Python 3
Author:
    Lei Yang (ylxx@live.com)
GitHub:
    https://gist.github.com/DIYer22/b9ede6b5b96109788a47973649645c1f
'''
print(helpDoc)
import reportlab
from reportlab.lib.units import mm
from reportlab.pdfgen import canvas
from PyPDF2 import PdfFileWriter, PdfFileReader
path = 'test.pdf'
def createPagePdf(num, tmp):
    c = canvas.Canvas(tmp)
    for i in range(1,num+1):
        c.drawString((210//2)*mm, (4)*mm, str(i))
        c.showPage()
    c.save()
    return
    with open(tmp, 'rb') as f:
        pdf = PdfFileReader(f)
        layer = pdf.getPage(0)
    return layer
if __name__ == "__main__":
    pass
    import sys,os
    if len(sys.argv) == 1:
        if not os.path.isfile(path):
            sys.exit(1)
    else:
        path = sys.argv[1]
    base = os.path.basename(path)
    tmp = "__tmp.pdf"
    batch = 10
    batch = 0
    output = PdfFileWriter()
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f,strict=False)
        n = pdf.getNumPages()
        if batch == 0:
            batch = -n
        createPagePdf(n,tmp)
        if not os.path.isdir('pdfWithNumbers/'):
            os.mkdir('pdfWithNumbers/')
        with open(tmp, 'rb') as ftmp:
            numberPdf = PdfFileReader(ftmp)
            for p in range(n):
                if not p%batch and p:
                    newpath = path.replace(base, 'pdfWithNumbers/'+ base[:-4] + '_page_%d'%(p//batch) + path[-4:])
                    with open(newpath, 'wb') as f:
                        output.write(f)
                    output = PdfFileWriter()
                print('page: %d of %d'%(p, n))
                page = pdf.getPage(p)
                numberLayer = numberPdf.getPage(p)
                page.mergePage(numberLayer)
                output.addPage(page)
            if output.getNumPages():
                newpath = path.replace(base, 'pdfWithNumbers/' + base[:-4] + '_page_%d'%(p//batch + 1)  + path[-4:])
                with open(newpath, 'wb') as f:
                    output.write(f)
        os.remove(tmp)

Access the size of pages

Reference: SUN_SU3 2020

def pdf_size(path, page =0):
    pdf = PdfFileReader(open(path, 'rb'))
    page_1 = pdf.getPage(page)
    if page_1.get('/Rotate', 0) in [90, 270]:
        return page_1['/MediaBox'][2], page_1['/MediaBox'][3]
    else:
        return page_1['/MediaBox'][3], page_1['/MediaBox'][2]
height, width = pdf_size(path)
print('height: %s, width: %s'%(height, width))

height: 767.06, width: 575.29

This is the size of PDF file made by Sony DPT-1

Crop the pages of PDF

For doing this, you need to know the size of your pdf and the width/height ratio.

File = "Improving_Reading_Skills.pdf"
height, width = pdf_size(File,20) # Function from above
C_width =  round(float(690)/(767.06/575.29),2)
with open(File, "rb") as in_f:
    input1 = PdfFileReader(in_f)
    output = PdfFileWriter()
    # number
    numPages = input1.getNumPages()
    print ("document has %s pages." % numPages)
    # Start
    for i in range(10):
        page = input1.getPage(i)
        print( page.mediaBox.getUpperRight_x(), page.mediaBox.getUpperRight_y())
        # (x, y) from left to right, from botton to top
        #page.trimBox.lowerLeft = (400, 700)
        page.cropBox.lowerLeft = (500, 600)
        page.cropBox.upperRight = (100, 200)
        output.addPage(page)
    # End
    with open("out.pdf", "wb") as out_f:
        output.write(out_f)

PDF - 图4

When left and right page is different

with open(File, "rb") as in_f:
    input1 = PdfFileReader(in_f)
    output = PdfFileWriter()
    # number
    numPages = input1.getNumPages()
    print ("document has %s pages." % numPages)
    # Start
    for i in range(numPages):
        page = input1.getPage(i)
        print( page.mediaBox.getUpperRight_x(), page.mediaBox.getUpperRight_y())
        # (x, y) from left to right, from botton to top
        #page.trimBox.lowerLeft = (400, 700)
        if i%2 == 0:
          page.cropBox.lowerLeft = (540, 680)
          page.trimBox.lowerLeft = (540, 680)
          page.cropBox.upperRight = (60, 40)
          page.trimBox.upperRight = (60, 40)
        if i%2 == 1:
          page.cropBox.lowerLeft = (580, 680)
          page.trimBox.lowerLeft = (580, 680)
          page.cropBox.upperRight = (100, 40)
          page.trimBox.upperRight = (100, 40)
        output.addPage(page)
    # End
    with open("out.pdf", "wb") as out_f:
        output.write(out_f)

pdfplumber

Read

import pdfplumber
path = 'MMR.pdf'
pdf =  pdfplumber.open(path)

Reference: SUN_SU3 2020

import pdfplumber
path = 'MMR.pdf'
def run(path):
    with pdfplumber.open(path) as pdf:
        page_1 = pdf.pages[0]
        return page_1.height, page_1.width
height, width = run(path)
print('height: %s, width: %s'%(height, width))

height: 841.920, width: 595.200

Enjoy~

本文由Python腳本GitHub/語雀自動更新

由於語法渲染問題而影響閱讀體驗，請移步博客閱讀～
本文GitPage地址

GitHub: Karobben
Blog:Karobben
BiliBili:史上最不正經的生物狗