Skip to content

Commit

Permalink
Merge pull request py-pdf#56 from switham/pdfcat
Browse files Browse the repository at this point in the history
Command-line concatenation tools
  • Loading branch information
mstamy2 committed Jan 13, 2014
2 parents 24b270d + 8e1f566 commit 1d0650d
Show file tree
Hide file tree
Showing 7 changed files with 298 additions and 7 deletions.
1 change: 1 addition & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
Copyright (c) 2006-2008, Mathieu Fenniak
Some contributions copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
Some contributions copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>

All rights reserved.

Expand Down
1 change: 1 addition & 0 deletions PyPDF2/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .pdf import PdfFileReader, PdfFileWriter
from .merger import PdfFileMerger
from .pagerange import PageRange, parse_filename_page_ranges
from ._version import __version__
__all__ = ["pdf", "PdfFileMerger"]
20 changes: 13 additions & 7 deletions PyPDF2/merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

from .generic import *
from .pdf import PdfFileReader, PdfFileWriter
from .pagerange import PageRange
from sys import version_info
if version_info < ( 3, 0 ):
from cStringIO import StringIO
Expand Down Expand Up @@ -82,8 +83,9 @@ def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=T
You may prevent the source document's bookmarks from being imported by
specifying "import_bookmarks" as False.
You may also use the "pages" parameter to merge only the specified range of
pages from the source document into the output document.
The optional "pages" parameter can be a PageRange or a
(start, stop[, step]) tuple to merge only the specified range of pages
from the source document into the output document.
"""

# This parameter is passed to self.inputs.append and means
Expand Down Expand Up @@ -116,11 +118,13 @@ def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=T
# (either file or StringIO) created above
pdfr = PdfFileReader(fileobj, strict=self.strict)

# Find the range of pages to merge
# Find the range of pages to merge.
if pages == None:
pages = (0, pdfr.getNumPages())
elif type(pages) in (int, float, str, str):
raise TypeError('"pages" must be a tuple of (start, end)')
elif isinstance(pages, PageRange):
pages = pages.indices(pdfr.getNumPages())
elif not isinstance(pages, tuple):
raise TypeError('"pages" must be a tuple of (start, stop[, step])')

srcpages = []
if bookmark:
Expand Down Expand Up @@ -234,7 +238,8 @@ def setPageMode(self, mode):

def _trim_dests(self, pdf, dests, pages):
"""
Removes any named destinations that are not a part of the specified page set
Removes any named destinations that are not a part of the specified
page set.
"""
new_dests = []
prev_header_added = True
Expand All @@ -249,7 +254,8 @@ def _trim_dests(self, pdf, dests, pages):

def _trim_outline(self, pdf, outline, pages):
"""
Removes any outline/bookmark entries that are not a part of the specified page set
Removes any outline/bookmark entries that are not a part of the
specified page set.
"""
new_outline = []
prev_header_added = True
Expand Down
153 changes: 153 additions & 0 deletions PyPDF2/pagerange.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#!/usr/bin/env python
"""
Representation and utils for ranges of PDF file pages.
Copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>.
All rights reserved. This software is available under a BSD license;
see https://github.com/mstamy2/PyPDF2/LICENSE
"""

import re

# "Str" is the base string type: basestring on Python 2, str on Python 3.
# The lookup deliberately avoids __builtins__: CPython binds that name to the
# builtins module only in __main__, while in an imported module it is a plain
# dict, so getattr(__builtins__, "basestring", str) always fell back to str
# and unicode arguments were rejected on Python 2.
try:
    Str = basestring  # Python 2.  (If 2to3 rewrites this to str, that is fine.)
except NameError:
    Str = str  # Python 3

_INT_RE = r"(0|-?[1-9]\d*)"  # A decimal int, don't allow "-0".
PAGE_RANGE_RE = "^({int}|({int}?(:{int}?(:{int}?)?)))$".format(int=_INT_RE)
# Capture groups of PAGE_RANGE_RE:
#   1  the whole expression
#   2  a lone int ("just this page")
#   3  the whole "start:stop[:step]" form
#   4  start        5  ":stop[:step]"
#   6  stop         7  ":step"
#   8  step


class ParseError(Exception):
    """Raised when an argument cannot be interpreted as a page range."""


# Human-readable summary of the page-range syntax.  It is spliced into the
# PageRange.__init__ docstring through the "{page_range_help}" placeholder,
# so the text must not contain literal braces.
PAGE_RANGE_HELP = """Remember, page indices start with zero.
Page range expression examples:
: all pages. -1 last page.
22 just the 23rd page. :-1 all but the last page.
0:3 the first three pages. -2 second-to-last page.
:3 the first three pages. -2: last two pages.
5: from the sixth page onward. -3:-1 third & second to last.
The third, "stride" or "step" number is also recognized.
::2 0 2 4 ... to the end. 3:0:-1 3 2 1 but not 0.
1:10:2 1 3 5 7 9 2::-1 2 1 0.
::-1 all pages in reverse order.
"""


class PageRange(object):
    """
    A slice-like representation of a range of page indices,
    i.e. page numbers, only starting at zero.

    The syntax is like what you would put between brackets [ ].
    The slice is one of the few Python types that can't be subclassed,
    but this class converts to and from slices, and allows similar use.
      o  PageRange(str) parses a string representing a page range.
      o  PageRange(slice) directly "imports" a slice.
      o  to_slice() gives the equivalent slice.
      o  str() and repr() allow printing.
      o  indices(n) is like slice.indices(n).
    """

    def __init__(self, arg):
        """
        Initialize with either a slice -- giving the equivalent page range,
        or a PageRange object -- making a copy,
        or a string like
            "int", "[int]:[int]" or "[int]:[int]:[int]",
            where the brackets indicate optional ints.
        {page_range_help}
        Note the difference between this notation and arguments to slice():
            slice(3) means the first three pages;
            PageRange("3") means the range of only the fourth page.
            However PageRange(slice(3)) means the first three pages.

        Raises ParseError if arg is not a slice, a PageRange, or a string
        in the syntax above.
        """
        if isinstance(arg, slice):
            self._slice = arg
            return

        if isinstance(arg, PageRange):
            self._slice = arg.to_slice()
            return

        m = isinstance(arg, Str) and re.match(PAGE_RANGE_RE, arg)
        if not m:
            raise ParseError(arg)
        elif m.group(2):
            # Special case: just an int means a range of one page.
            start = int(m.group(2))
            # slice(-1, 0) would be empty, so the last page needs an open end.
            stop = start + 1 if start != -1 else None
            self._slice = slice(start, stop)
        else:
            # Groups 4, 6 and 8 are start, stop and step; empty means None.
            self._slice = slice(*[int(g) if g else None
                                  for g in m.group(4, 6, 8)])

    # Docstrings are discarded (None) under "python -OO"; only splice in the
    # help text when there is a docstring to splice it into.
    if __init__.__doc__:
        __init__.__doc__ = __init__.__doc__.format(
            page_range_help=PAGE_RANGE_HELP)

    @staticmethod
    def valid(input):
        """ True if input is a valid initializer for a PageRange. """
        return isinstance(input, slice) or \
            isinstance(input, PageRange) or \
            (isinstance(input, Str)
             and bool(re.match(PAGE_RANGE_RE, input)))

    def to_slice(self):
        """ Return the slice equivalent of this page range. """
        return self._slice

    def __str__(self):
        """ A string like "1:2:3". """
        s = self._slice
        if s.step is None:
            # A one-page range prints as the bare index.  start == -1 is
            # excluded: slice(-1, 0) is an *empty* range, whereas the string
            # "-1" parses to slice(-1, None), the last page.
            if (s.start is not None and s.start != -1
                    and s.stop == s.start + 1):
                return str(s.start)

            indices = s.start, s.stop
        else:
            indices = s.start, s.stop, s.step
        return ':'.join("" if i is None else str(i) for i in indices)

    def __repr__(self):
        """ A string like "PageRange('1:2:3')". """
        return "PageRange(" + repr(str(self)) + ")"

    def indices(self, n):
        """
        n is the length of the list of pages to choose from.
        Returns arguments for range().  See help(slice.indices).
        """
        return self._slice.indices(n)


# Shared default used by parse_filename_page_ranges() for a filename that is
# not followed by an explicit page range.
PAGE_RANGE_ALL = PageRange(":") # The range of all pages.


def parse_filename_page_ranges(args):
    """
    Given a list of filenames and page ranges, return a list of
    (filename, page_range) pairs.

    args may be any iterable (list, tuple, ...).  The first arg must be a
    filename; other args are filenames, page-range expressions, slice
    objects, or PageRange objects.
    A filename not followed by a page range indicates all pages of the file.

    Raises ValueError if a page range appears before any filename.
    """
    pairs = []
    pdf_filename = None
    did_page_range = False
    # The trailing None acts as an end-of-list sentinel: it is not a valid
    # page range, so it flushes the last filename through the else branch.
    # list() lets callers pass a tuple or other iterable, not only a list.
    for arg in list(args) + [None]:
        if PageRange.valid(arg):
            if not pdf_filename:
                raise ValueError("The first argument must be a filename, " \
                                 "not a page range.")

            pairs.append((pdf_filename, PageRange(arg)))
            did_page_range = True
        else:
            # New filename or end of list--do all of the previous file?
            if pdf_filename and not did_page_range:
                pairs.append((pdf_filename, PAGE_RANGE_ALL))

            pdf_filename = arg
            did_page_range = False
    return pairs
37 changes: 37 additions & 0 deletions Sample_Code/makesimple.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python
"Make some simple multipage pdf files."

from __future__ import print_function
from sys import argv

from reportlab.pdfgen import canvas

# Length units: everything is expressed in points, with 72 points per inch.
point = 1
inch = 72

# Per-page text template, filled in make_pdf_file() with
# (output filename, page number, total page count).
TEXT = """%s page %d of %d
a wonderful file
created with Sample_Code/makesimple.py"""

def make_pdf_file(output_filename, np):
    """
    Write a PDF named output_filename containing np pages.

    Each page shows the TEXT template filled in with the file name, the
    1-based page number and the page count, one line of text per line of
    the template.
    """
    c = canvas.Canvas(output_filename, pagesize=(8.5 * inch, 11 * inch))
    c.setStrokeColorRGB(0, 0, 0)
    c.setFillColorRGB(0, 0, 0)
    c.setFont("Helvetica", 12 * point)
    for pn in range(1, np + 1):
        # Start one inch below the top of the 11-inch page and move down
        # 12 points per line of text.
        v = 10 * inch
        for subtline in (TEXT % (output_filename, pn, np)).split('\n'):
            c.drawString(1 * inch, v, subtline)
            v -= 12 * point
        c.showPage()  # Finish this page; the next drawString starts a new one.
    c.save()

if __name__ == "__main__":
    # Slot 0 is an empty placeholder so that file numbering starts at
    # simple1.pdf; falsy entries are skipped.
    page_counts = [None, 5, 11, 17]
    for index, count in enumerate(page_counts):
        if not count:
            continue
        filename = "simple%d.pdf" % index
        make_pdf_file(filename, count)
        print ("Wrote", filename)
19 changes: 19 additions & 0 deletions Sample_Code/makesimple.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/sh

# Generate simple1.pdf, simple2.pdf and simple3.pdf with 5, 11 and 17 pages.
# n numbers the output file, np is its page count, p is the current page.
n=1
for np in 5 11 17; do
p=1
f=simple$n.pdf
# Print the text of every page; the loop's combined output is piped through
# enscript and ps2pdf below to produce the PDF file $f.
while expr $p \<= $np > /dev/null; do
if [ $p != 1 ]; then
# NOTE(review): emitted before every page except the first, with "\c"
# suppressing the newline -- presumably a page separator such as a form
# feed; confirm the quoted character against the original file.
echo " \c"
fi
echo "$f page $p of $np"
echo ""
echo "an incredible, yet simple example"
echo "Created with Sample_Code/makesimple.sh"
p=$(expr $p + 1)
done | enscript --no-header -o - |ps2pdf - $f
# Report the name of the file just written.
echo $f
n=$(expr $n + 1)
done
74 changes: 74 additions & 0 deletions Sample_Code/pdfcat
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env python
"""
Concatenate pages from pdf files into a single pdf file.
Page ranges refer to the previously-named file.
A file not followed by a page range means all the pages of the file.
PAGE RANGES are like Python slices.
{page_range_help}
EXAMPLES
pdfcat -o output.pdf head.pdf content.pdf :6 7: tail.pdf -1
Concatenate all of head.pdf, all but page seven of content.pdf,
and the last page of tail.pdf, producing output.pdf.
pdfcat chapter*.pdf >book.pdf
You can specify the output file by redirection.
pdfcat chapter?.pdf chapter10.pdf >book.pdf
In case you don't want chapter 10 before chapter 2.
"""
# Copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>.
# All rights reserved. This software is available under a BSD license;
# see https://github.com/mstamy2/PyPDF2/LICENSE

from __future__ import print_function
import argparse
from PyPDF2.pagerange import PAGE_RANGE_HELP

def parse_args():
    """
    Build the command-line parser and parse the process arguments.

    The help description is this module's docstring with the page-range
    help text substituted in.  Returns the argparse namespace with the
    attributes "args" (one or more filenames / page ranges), "output"
    and "verbose".
    """
    description = __doc__.format(page_range_help=PAGE_RANGE_HELP)
    cli = argparse.ArgumentParser(
        description=description,
        # Keep the docstring's own line breaks and indentation in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument(
        "args",
        nargs="+",
        metavar="filename or page range expression",
    )
    cli.add_argument("-o", "--output", metavar="output_file")
    cli.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="show page ranges as they are being read",
    )
    namespace = cli.parse_args()
    return namespace

from sys import stderr, stdout, exit
import os
import traceback
from collections import defaultdict

from PyPDF2 import PdfFileMerger, parse_filename_page_ranges


if __name__ == "__main__":
    args = parse_args()
    filename_page_ranges = parse_filename_page_ranges(args.args)
    if args.output:
        output = open(args.output, "wb")
    else:
        # Write binary PDF data to stdout: flush anything pending on the
        # text-mode stream, then wrap the same descriptor in binary mode.
        stdout.flush()
        output = os.fdopen(stdout.fileno(), "wb")

    merger = PdfFileMerger()
    in_fs = dict()  # filename -> open input file, so each is opened once.
    try:
        for (filename, page_range) in filename_page_ranges:
            if args.verbose:
                print(filename, page_range, file=stderr)
            if filename not in in_fs:
                in_fs[filename] = open(filename, "rb")
            merger.append(in_fs[filename], pages=page_range)
    except Exception:
        # "except Exception" rather than a bare except, so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit are not reported as read errors.
        print(traceback.format_exc(), file=stderr)
        print("Error while reading " + filename, file=stderr)
        exit(1)
    merger.write(output)
    # Flush explicitly so that a late write error (e.g. a full disk) is
    # raised here instead of being lost during interpreter shutdown.
    output.flush()
    if args.output:
        # Only close a file this script opened; the stdout descriptor is
        # left open for the interpreter.
        output.close()
    # In 3.0, input files must stay open until output is written.
    # Not closing the in_fs because this script exits now.

0 comments on commit 1d0650d

Please sign in to comment.