forked from py-pdf/pypdf
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request py-pdf#56 from switham/pdfcat
Command-line concatenation tools
- Loading branch information
Showing
7 changed files
with
298 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
from .pdf import PdfFileReader, PdfFileWriter | ||
from .merger import PdfFileMerger | ||
from .pagerange import PageRange, parse_filename_page_ranges | ||
from ._version import __version__ | ||
# Names exported by ``from PyPDF2 import *``.
# NOTE(review): PdfFileReader, PdfFileWriter, PageRange and
# parse_filename_page_ranges are imported into the package namespace but are
# not listed here -- confirm whether they should be part of the star-import API.
__all__ = ["pdf", "PdfFileMerger"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
#!/usr/bin/env python | ||
""" | ||
Representation and utils for ranges of PDF file pages. | ||
Copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>. | ||
All rights reserved. This software is available under a BSD license; | ||
see https://github.com/mstamy2/PyPDF2/LICENSE | ||
""" | ||
|
||
import re | ||
|
||
# "Str" maintains compatibility with Python 2.x. | ||
# The next line is obfuscated like this so 2to3 won't change it. | ||
# The ``__builtins__`` global is an implementation detail: it is the builtins
# module inside ``__main__`` but a plain dict inside imported modules, where
# getattr() can never find "basestring".  Look the name up on the real
# builtins module instead (still through getattr, so 2to3 leaves it alone).
try:
    import __builtin__ as _builtins  # Python 2
except ImportError:
    import builtins as _builtins  # Python 3
Str = getattr(_builtins, "basestring", str)

_INT_RE = r"(0|-?[1-9]\d*)"  # A decimal int, don't allow "-0".
PAGE_RANGE_RE = "^({int}|({int}?(:{int}?(:{int}?)?)))$".format(int=_INT_RE)
# groups:         12     34     5 6     7 8
|
||
|
||
class ParseError(Exception):
    """Raised when a value cannot be parsed as a page range expression."""
|
||
|
||
# Help text shared by the PageRange docstring and the pdfcat --help output.
# It is laid out as two side-by-side columns of "expression  meaning" pairs,
# so the column alignment inside the literal is significant.
PAGE_RANGE_HELP = """Remember, page indices start with zero.
        Page range expression examples:
            :     all pages.                     -1    last page.
            22    just the 23rd page.            :-1   all but the last page.
            0:3   the first three pages.         -2    second-to-last page.
            :3    the first three pages.         -2:   last two pages.
            5:    from the sixth page onward.    -3:-1 third & second to last.
        The third, "stride" or "step" number is also recognized.
            ::2       0 2 4 ... to the end.    3:0:-1    3 2 1 but not 0.
            1:10:2    1 3 5 7 9                2::-1     2 1 0.
            ::-1      all pages in reverse order.
"""
|
||
|
||
class PageRange(object):
    """
    A slice-like representation of a range of page indices,
    i.e. page numbers, only starting at zero.

    The syntax is like what you would put between brackets [ ].
    The slice is one of the few Python types that can't be subclassed,
    but this class converts to and from slices, and allows similar use.

      o  PageRange(str) parses a string representing a page range.
      o  PageRange(slice) directly "imports" a slice.
      o  to_slice() gives the equivalent slice.
      o  str() and repr() allow printing.
      o  indices(n) is like slice.indices(n).
    """

    def __init__(self, arg):
        """
        Initialize with either a slice -- giving the equivalent page range,
        or a PageRange object -- making a copy,
        or a string like
            "int", "[int]:[int]" or "[int]:[int]:[int]",
        where the brackets indicate optional ints.
        {page_range_help}
        Note the difference between this notation and arguments to slice():
            slice(3) means the first three pages;
            PageRange("3") means the range of only the fourth page.
            However PageRange(slice(3)) means the first three pages.

        Raises ParseError if arg is none of the above.
        """
        if isinstance(arg, slice):
            self._slice = arg
            return

        if isinstance(arg, PageRange):
            self._slice = arg.to_slice()
            return

        m = isinstance(arg, Str) and re.match(PAGE_RANGE_RE, arg)
        if not m:
            raise ParseError(arg)
        elif m.group(2):
            # Special case: just an int means a range of one page.
            start = int(m.group(2))
            # For -1 the stop must be open-ended: slice(-1, 0) would be empty.
            stop = start + 1 if start != -1 else None
            self._slice = slice(start, stop)
        else:
            self._slice = slice(*[int(g) if g else None
                                  for g in m.group(4, 6, 8)])

    # Splice the shared help text into the docstring.  Under "python -OO"
    # docstrings are stripped and __doc__ is None, so there is nothing to
    # format (and calling .format on None would break the import).
    if __init__.__doc__:
        __init__.__doc__ = __init__.__doc__.format(
            page_range_help=PAGE_RANGE_HELP)

    @staticmethod
    def valid(input):
        """ True if input is a valid initializer for a PageRange. """
        return isinstance(input, (slice, PageRange)) or \
            (isinstance(input, Str)
             and bool(re.match(PAGE_RANGE_RE, input)))

    def to_slice(self):
        """ Return the slice equivalent of this page range. """
        return self._slice

    def __str__(self):
        """ A string like "1:2:3". """
        s = self._slice
        if s.step is None:
            # A one-page range prints as a bare int.  slice(-1, 0) is
            # excluded: it is an empty range, whereas the string "-1"
            # parses back as the last page.
            if (s.start is not None and s.start != -1
                    and s.stop == s.start + 1):
                return str(s.start)

            indices = s.start, s.stop
        else:
            indices = s.start, s.stop, s.step
        return ':'.join("" if i is None else str(i) for i in indices)

    def __repr__(self):
        """ A string like "PageRange('1:2:3')". """
        return "PageRange(" + repr(str(self)) + ")"

    def indices(self, n):
        """
        n is the length of the list of pages to choose from.
        Returns arguments for range().  See help(slice.indices).
        """
        return self._slice.indices(n)
|
||
|
||
PAGE_RANGE_ALL = PageRange(":")  # The range of all pages.
|
||
|
||
def parse_filename_page_ranges(args):
    """
    Given a list of filenames and page ranges, return a list of
    (filename, page_range) pairs.

    First arg must be a filename; other args are filenames, page-range
    expressions, slice objects, or PageRange objects.
    A filename not followed by a page range indicates all pages of the file.

    args may be any iterable (list, tuple, generator, ...).
    Raises ValueError if a page range comes before any filename.
    """
    pairs = []
    pdf_filename = None
    did_page_range = False
    # The trailing None is a sentinel: it is not a valid page range, so it
    # takes the "new filename" branch and flushes the last real filename.
    # list() lets callers pass tuples or other iterables, not only lists.
    for arg in list(args) + [None]:
        if PageRange.valid(arg):
            if not pdf_filename:
                raise ValueError("The first argument must be a filename, "
                                 "not a page range.")

            pairs.append((pdf_filename, PageRange(arg)))
            did_page_range = True
        else:
            # New filename or end of list--do all of the previous file?
            if pdf_filename and not did_page_range:
                pairs.append((pdf_filename, PAGE_RANGE_ALL))

            pdf_filename = arg
            did_page_range = False
    return pairs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/usr/bin/env python | ||
"Make some simple multipage pdf files." | ||
|
||
from __future__ import print_function | ||
from sys import argv | ||
|
||
from reportlab.pdfgen import canvas | ||
|
||
# Length units used for page size, text position and font size below:
# lengths are counted in points, with 72 of them to the inch.
point = 1
inch = 72

# Template for the lines drawn on each page; make_pdf_file fills it with
# (file name, page number, page count).
TEXT = """%s page %d of %d
a wonderful file
created with Sample_Code/makesimple.py"""
|
||
def make_pdf_file(output_filename, np):
    """
    Write a PDF file named output_filename containing np pages.

    Every page shows the TEXT template filled in with the file name,
    the page number (counting from 1) and the total number of pages,
    one template line per drawn line.
    """
    c = canvas.Canvas(output_filename, pagesize=(8.5 * inch, 11 * inch))
    c.setStrokeColorRGB(0, 0, 0)
    c.setFillColorRGB(0, 0, 0)
    c.setFont("Helvetica", 12 * point)
    for pn in range(1, np + 1):
        v = 10 * inch  # vertical position of the first line on this page
        for subtline in (TEXT % (output_filename, pn, np)).split('\n'):
            c.drawString(1 * inch, v, subtline)
            v -= 12 * point  # step by the font size for the next line
        c.showPage()
    c.save()
|
||
if __name__ == "__main__":
    # Write simple1.pdf, simple2.pdf and simple3.pdf with 5, 11 and 17
    # pages respectively; file numbering starts at 1.
    for i, np in enumerate((5, 11, 17), 1):
        filename = "simple%d.pdf" % i
        make_pdf_file(filename, np)
        print("Wrote", filename)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#!/bin/sh
# Make some simple multipage pdf files: simple1.pdf, simple2.pdf and
# simple3.pdf with 5, 11 and 17 pages.  Requires enscript and ps2pdf.

n=1
for np in 5 11 17; do
    p=1
    f=simple$n.pdf
    # The loop runs on the left side of a pipeline; its output is the text
    # of the whole document, which enscript and ps2pdf turn into "$f".
    while [ "$p" -le "$np" ]; do
        if [ "$p" != 1 ]; then
            # Separate this page from the previous one with a form feed.
            # NOTE(review): the previous `echo "\c"` printed nothing (or a
            # literal "\c", depending on the shell); a form feed looks like
            # what was intended here -- confirm.  printf is used because
            # echo's handling of backslash escapes varies between shells.
            printf '\f'
        fi
        echo "$f page $p of $np"
        echo ""
        echo "an incredible, yet simple example"
        echo "Created with Sample_Code/makesimple.sh"
        p=$((p + 1))
    done | enscript --no-header -o - | ps2pdf - "$f"
    echo "$f"
    n=$((n + 1))
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#!/usr/bin/env python | ||
""" | ||
Concatenate pages from pdf files into a single pdf file. | ||
Page ranges refer to the previously-named file. | ||
A file not followed by a page range means all the pages of the file. | ||
PAGE RANGES are like Python slices. | ||
{page_range_help} | ||
EXAMPLES | ||
pdfcat -o output.pdf head.pdf content.pdf :6 7: tail.pdf -1 | ||
Concatenate all of head.pdf, all but page seven of content.pdf, | ||
and the last page of tail.pdf, producing output.pdf. | ||
pdfcat chapter*.pdf >book.pdf | ||
You can specify the output file by redirection. | ||
pdfcat chapter?.pdf chapter10.pdf >book.pdf | ||
In case you don't want chapter 10 before chapter 2. | ||
""" | ||
# Copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>. | ||
# All rights reserved. This software is available under a BSD license; | ||
# see https://github.com/mstamy2/PyPDF2/LICENSE | ||
|
||
from __future__ import print_function | ||
import argparse | ||
from PyPDF2.pagerange import PAGE_RANGE_HELP | ||
|
||
def parse_args(argv=None):
    """
    Parse the pdfcat command line.

    argv is the list of argument strings to parse; None (the default)
    makes argparse read sys.argv[1:].

    Returns an argparse.Namespace with these attributes:
        args     one or more filenames and page range expressions,
        output   the -o/--output value, or None if it was not given,
        verbose  True if -v/--verbose was given.
    """
    # The module docstring doubles as the --help description.  It is None
    # when docstrings are stripped (python -OO), in which case there is
    # nothing to format.
    description = None
    if __doc__:
        description = __doc__.format(page_range_help=PAGE_RANGE_HELP)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("args", nargs="+",
                        metavar="filename or page range expression")
    parser.add_argument("-o", "--output",
                        metavar="output_file")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="show page ranges as they are being read")
    return parser.parse_args(argv)
|
||
from sys import stderr, stdout, exit | ||
import os | ||
import traceback | ||
from collections import defaultdict | ||
|
||
from PyPDF2 import PdfFileMerger, parse_filename_page_ranges | ||
|
||
|
||
if __name__ == "__main__":
    args = parse_args()
    filename_page_ranges = parse_filename_page_ranges(args.args)
    if args.output:
        output = open(args.output, "wb")
    else:
        # Flush anything already buffered on stdout before writing binary
        # data through a second file object on the same descriptor.
        stdout.flush()
        output = os.fdopen(stdout.fileno(), "wb")

    merger = PdfFileMerger()
    in_fs = dict()  # filename -> open input file; each file is opened once
    try:
        try:
            for (filename, page_range) in filename_page_ranges:
                if args.verbose:
                    print(filename, page_range, file=stderr)
                if filename not in in_fs:
                    in_fs[filename] = open(filename, "rb")
                merger.append(in_fs[filename], pages=page_range)
        except Exception:
            # Not a bare "except:", so Ctrl-C and SystemExit pass through.
            print(traceback.format_exc(), file=stderr)
            print("Error while reading " + filename, file=stderr)
            exit(1)
        merger.write(output)
    finally:
        if args.output:
            output.close()
        else:
            # Leave the process's stdout descriptor open; just push the
            # data out.
            output.flush()
        # In 3.0, input files must stay open until output is written,
        # so they are only closed here.
        for in_f in in_fs.values():
            in_f.close()
|