Merge pull request py-pdf#56 from switham/pdfcat

Command-line concatenation tools
oyv · Jan 13, 2014 · 1d0650d · 1d0650d
2 parents 24b270d + 8e1f566
commit 1d0650d
Show file tree

Hide file tree

Showing 7 changed files with 298 additions and 7 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,5 +1,6 @@
 Copyright (c) 2006-2008, Mathieu Fenniak
 Some contributions copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
+Some contributions copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>
 
 All rights reserved.
 

diff --git a/PyPDF2/__init__.py b/PyPDF2/__init__.py
@@ -1,4 +1,5 @@
 from .pdf import PdfFileReader, PdfFileWriter
 from .merger import PdfFileMerger
+from .pagerange import PageRange, parse_filename_page_ranges
 from ._version import __version__
 __all__ = ["pdf", "PdfFileMerger"]
diff --git a/PyPDF2/merger.py b/PyPDF2/merger.py
@@ -29,6 +29,7 @@
 
 from .generic import *
 from .pdf import PdfFileReader, PdfFileWriter
+from .pagerange import PageRange
 from sys import version_info
 if version_info < ( 3, 0 ):
     from cStringIO import StringIO
@@ -82,8 +83,9 @@ def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=T
         You may prevent the source document's bookmarks from being imported by
         specifying "import_bookmarks" as False.
         
-        You may also use the "pages" parameter to merge only the specified range of 
-        pages from the source document into the output document.
+        The optional "pages" parameter can be a PageRange or a 
+        (start, stop[, step]) tuple to merge only the specified range of pages
+        from the source document into the output document.
         """
 
         # This parameter is passed to self.inputs.append and means
@@ -116,11 +118,13 @@ def merge(self, position, fileobj, bookmark=None, pages=None, import_bookmarks=T
         # (either file or StringIO) created above
         pdfr = PdfFileReader(fileobj, strict=self.strict)
 
-        # Find the range of pages to merge
+        # Find the range of pages to merge.
         if pages == None:
             pages = (0, pdfr.getNumPages())
-        elif type(pages) in (int, float, str, str):
-            raise TypeError('"pages" must be a tuple of (start, end)')
+        elif isinstance(pages, PageRange):
+            pages = pages.indices(pdfr.getNumPages())
+        elif not isinstance(pages, tuple):
+            raise TypeError('"pages" must be a tuple of (start, stop[, step])')
 
         srcpages = []
         if bookmark:
@@ -234,7 +238,8 @@ def setPageMode(self, mode):
 
     def _trim_dests(self, pdf, dests, pages):
         """
-        Removes any named destinations that are not a part of the specified page set
+        Removes any named destinations that are not a part of the specified 
+        page set.
         """
         new_dests = []
         prev_header_added = True
@@ -249,7 +254,8 @@ def _trim_dests(self, pdf, dests, pages):
 
     def _trim_outline(self, pdf, outline, pages):
         """
-        Removes any outline/bookmark entries that are not a part of the specified page set
+        Removes any outline/bookmark entries that are not a part of the 
+        specified page set.
         """
         new_outline = []
         prev_header_added = True

diff --git a/PyPDF2/pagerange.py b/PyPDF2/pagerange.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+"""
+Representation and utils for ranges of PDF file pages.
+
+Copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>.
+All rights reserved. This software is available under a BSD license;
+see https://github.com/mstamy2/PyPDF2/LICENSE
+"""
+
+import re
+
+# "Str" maintains compatibility with Python 2.x.
+# The next line is obfuscated like this so 2to3 won't change it.
+Str = getattr(__builtins__, "basestring", str)
+
+_INT_RE = r"(0|-?[1-9]\d*)"  # A decimal int, don't allow "-0".
+PAGE_RANGE_RE = "^({int}|({int}?(:{int}?(:{int}?)?)))$".format(int=_INT_RE)
+# groups:         12     34     5 6     7 8
+
+
+class ParseError(Exception):
+    pass
+
+
+# Help text describing the page range syntax.  It is substituted for the
+# {page_range_help} placeholder in the PageRange.__init__ docstring.
+PAGE_RANGE_HELP = """Remember, page indices start with zero.
+        Page range expression examples:
+            :     all pages.                   -1    last page.
+            22    just the 23rd page.          :-1   all but the last page.
+            0:3   the first three pages.       -2    second-to-last page.
+            :3    the first three pages.       -2:   last two pages.
+            5:    from the sixth page onward.  -3:-1 third & second to last.
+        The third, "stride" or "step" number is also recognized.
+            ::2       0 2 4 ... to the end.    3:0:-1    3 2 1 but not 0.
+            1:10:2    1 3 5 7 9                2::-1     2 1 0.
+            ::-1      all pages in reverse order.
+"""
+
+
+class PageRange(object):
+    """ 
+    A slice-like representation of a range of page indices,
+        i.e. page numbers, only starting at zero. 
+    The syntax is like what you would put between brackets [ ].
+    The slice is one of the few Python types that can't be subclassed,
+    but this class converts to and from slices, and allows similar use.
+      o  PageRange(str) parses a string representing a page range.
+      o  PageRange(slice) directly "imports" a slice.
+      o  to_slice() gives the equivalent slice.
+      o  str() and repr() allow printing.
+      o  indices(n) is like slice.indices(n).
+    """
+
+    def __init__(self, arg):
+        """
+        Initialize with either a slice -- giving the equivalent page range,
+        or a PageRange object -- making a copy,
+        or a string like
+            "int", "[int]:[int]" or "[int]:[int]:[int]",
+            where the brackets indicate optional ints.
+        {page_range_help}
+        Note the difference between this notation and arguments to slice():
+            slice(3) means the first three pages;
+            PageRange("3") means the range of only the fourth page.
+            However PageRange(slice(3)) means the first three pages.
+        """
+        if isinstance(arg, slice):
+            self._slice = arg
+            return
+
+        if isinstance(arg, PageRange):
+            self._slice = arg.to_slice()
+            return
+
+        m = isinstance(arg, Str) and re.match(PAGE_RANGE_RE, arg)
+        if not m:
+            raise ParseError(arg)
+        elif m.group(2):
+            # Special case: just an int means a range of one page.
+            start = int(m.group(2))
+            stop = start + 1 if start != -1 else None
+            self._slice = slice(start, stop)
+        else:
+            self._slice = slice(*[int(g) if g else None 
+                                  for g in m.group(4, 6, 8)])
+
+    __init__.__doc__ = __init__.__doc__.format(page_range_help=PAGE_RANGE_HELP)
+
+    @staticmethod
+    def valid(input):
+        """ True if input is a valid initializer for a PageRange. """
+        return isinstance(input, slice)  or \
+               isinstance(input, PageRange) or \
+               (isinstance(input, Str)
+                and bool(re.match(PAGE_RANGE_RE, input)))
+
+    def to_slice(self):
+        """ Return the slice equivalent of this page range. """
+        return self._slice
+
+    def __str__(self):
+        """ A string like "1:2:3". """
+        s = self._slice
+        if s.step == None:
+            if s.start != None  and  s.stop == s.start + 1:
+                return str(s.start)
+
+            indices = s.start, s.stop
+        else:
+            indices = s.start, s.stop, s.step
+        return ':'.join("" if i == None else str(i) for i in indices)
+
+    def __repr__(self):
+        """ A string like "PageRange('1:2:3')". """
+        return "PageRange(" + repr(str(self)) + ")"
+
+    def indices(self, n):
+        """
+        n is the length of the list of pages to choose from.
+        Returns arguments for range().  See help(slice.indices).
+        """
+        return self._slice.indices(n)
+
+
+# parse_filename_page_ranges() pairs this shared instance with every
+# filename that is not followed by an explicit page range.
+PAGE_RANGE_ALL = PageRange(":")  # The range of all pages.
+
+
+def parse_filename_page_ranges(args):
+    """
+    Given a list of filenames and page ranges, return a list of
+    (filename, page_range) pairs.
+    First arg must be a filename; other ags are filenames, page-range 
+    expressions, slice objects, or PageRange objects.
+    A filename not followed by a page range indicates all pages of the file.
+    """
+    pairs = []
+    pdf_filename = None
+    did_page_range = False
+    for arg in args + [None]:
+        if PageRange.valid(arg):
+            if not pdf_filename:
+                raise ValueError("The first argument must be a filename, " \
+                                 "not a page range.")
+
+            pairs.append( (pdf_filename, PageRange(arg)) )
+            did_page_range = True
+        else:
+            # New filename or end of list--do all of the previous file?
+            if pdf_filename and not did_page_range:
+                pairs.append( (pdf_filename, PAGE_RANGE_ALL) )
+
+            pdf_filename = arg
+            did_page_range = False
+    return pairs
diff --git a/Sample_Code/makesimple.py b/Sample_Code/makesimple.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+"Make some simple multipage pdf files."
+
+from __future__ import print_function
+from sys import argv
+
+from reportlab.pdfgen import canvas
+
+# Lengths are expressed in points, with 72 points to the inch.
+point = 1
+inch = 72
+
+# Template for the text drawn on each page; make_pdf_file() fills it in
+# with (filename, page number, page count).
+TEXT = """%s    page %d of %d
+
+a wonderful file
+created with Sample_Code/makesimple.py"""
+
+def make_pdf_file(output_filename, np):
+    title = output_filename
+    c = canvas.Canvas(output_filename, pagesize=(8.5 * inch, 11 * inch))
+    c.setStrokeColorRGB(0,0,0)
+    c.setFillColorRGB(0,0,0)
+    c.setFont("Helvetica", 12 * point) 
+    for pn in range(1, np + 1):
+        v = 10 * inch
+        for subtline in (TEXT % (output_filename, pn, np)).split( '\n' ):
+            c.drawString( 1 * inch, v, subtline )
+            v -= 12 * point
+        c.showPage()
+    c.save()
+
+if __name__ == "__main__":
+    nps = [None, 5, 11, 17]
+    for i, np in enumerate(nps):
+        if np:
+            filename = "simple%d.pdf" % i
+            make_pdf_file(filename, np)
+            print ("Wrote", filename)
diff --git a/Sample_Code/makesimple.sh b/Sample_Code/makesimple.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+# Build simple1.pdf, simple2.pdf and simple3.pdf by piping generated text
+# (5, 11 and 17 "page N of M" blocks respectively) through enscript and
+# ps2pdf.  Each finished filename is echoed to stdout.
+
+n=1
+for np in 5 11 17; do
+   p=1
+   f=simple$n.pdf
+   # Emit one block of text per page number; the output of the whole
+   # while loop is piped into enscript as a single document.
+   while expr $p \<= $np > /dev/null; do
+     if [ $p != 1 ]; then
+       # NOTE(review): emitted before every block except the first,
+       # presumably as a page separator.  How echo treats "\c" differs
+       # between shells -- confirm the intended character.
+       echo "\c"
+      fi
+     echo "$f           page $p of $np"
+     echo ""
+     echo "an incredible, yet simple example"
+     echo "Created with Sample_Code/makesimple.sh"
+     p=$(expr $p + 1)
+    done | enscript --no-header -o - |ps2pdf - $f
+   echo $f
+   n=$(expr $n + 1)
+ done
diff --git a/Sample_Code/pdfcat b/Sample_Code/pdfcat
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+"""
+Concatenate pages from pdf files into a single pdf file.
+
+Page ranges refer to the previously-named file.
+A file not followed by a page range means all the pages of the file.
+
+PAGE RANGES are like Python slices.
+        {page_range_help}
+EXAMPLES
+    pdfcat -o output.pdf head.pdf content.pdf :6 7: tail.pdf -1
+        Concatenate all of head.pdf, all but page seven of content.pdf, 
+        and the last page of tail.pdf, producing output.pdf.
+
+    pdfcat chapter*.pdf >book.pdf
+        You can specify the output file by redirection.
+
+    pdfcat chapter?.pdf chapter10.pdf >book.pdf
+        In case you don't want chapter 10 before chapter 2.
+"""
+# Copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>.
+# All rights reserved. This software is available under a BSD license;
+# see https://github.com/mstamy2/PyPDF2/LICENSE
+
+from __future__ import print_function
+import argparse
+from PyPDF2.pagerange import PAGE_RANGE_HELP
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description=__doc__.format(page_range_help=PAGE_RANGE_HELP),
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("args", nargs="+",
+                        metavar="filename or page range expression")
+    parser.add_argument("-o", "--output",
+                        metavar="output_file")
+    parser.add_argument("-v", "--verbose", action="store_true",
+                        help="show page ranges as they are being read")
+    return parser.parse_args()
+
+from sys import stderr, stdout, exit
+import os
+import traceback
+from collections import defaultdict
+
+from PyPDF2 import PdfFileMerger, parse_filename_page_ranges
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    filename_page_ranges = parse_filename_page_ranges(args.args)
+    if args.output:
+        output = open(args.output, "wb")
+    else:
+        stdout.flush()
+        output = os.fdopen(stdout.fileno(), "wb")
+
+    merger = PdfFileMerger()
+    in_fs = dict()
+    try:
+        for (filename, page_range) in filename_page_ranges:
+            if args.verbose:
+                print(filename, page_range, file=stderr)
+            if filename not in in_fs:
+                in_fs[filename] = open(filename, "rb")
+            merger.append(in_fs[filename], pages=page_range)
+    except:
+        print(traceback.format_exc(), file=stderr)
+        print("Error while reading " + filename, file=stderr)
+        exit(1)
+    merger.write(output)
+    # In 3.0, input files must stay open until output is written.
+    # Not closing the in_fs because this script exits now.
+