From e19aea932d10938a995e89aea79fb304da5a82ff Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Fri, 6 May 2022 22:02:32 +0200 Subject: [PATCH 01/11] Bump version 20220506 & fix small issue with types --- CHANGELOG.md | 2 +- pdfminer/layout.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 97b5236f..eedc8fdf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes in pdfminer.six will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [Unreleased] +## [20220506] ### Fixed diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 9196a88e..5158f0eb 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -495,7 +495,8 @@ def __repr__(self) -> str: ) def analyze(self, laparams: LAParams) -> None: - LTTextContainer.analyze(self, laparams) + for obj in self._objs: + obj.analyze(laparams) LTContainer.add(self, LTAnno("\n")) return From f2c967f5000ac26f731936303979fd365f5b56b0 Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Fri, 6 May 2022 16:15:00 -0400 Subject: [PATCH 02/11] Ignore path constructors that do not begin with m (#749) * Ignore path constructors that do not begin with m Per PDF Reference Section 4.4.1, "path construction operators may be invoked in any sequence, but the first one invoked must be m or re to begin a new subpath." Since pdfminer.six already converts all `re` (rectangle) operators to their equivalent `mlllh` representation, paths ingested by `.paint_path(...)` that do not begin with the `m` operator are invalid. In addition to the advantage of hewing to the PDF Reference, this change also avoids the `ValueError: not enough values to unpack (expected 2, got 1)` error raised by the ` pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]` line in `converter.py` when parsing PDFs that (erroneously) include `("h",)` paths. 
* Update CHANGELOG.md Co-authored-by: Pieter Marsman --- CHANGELOG.md | 6 ++++++ pdfminer/converter.py | 11 ++++++++++- tests/test_converter.py | 9 +++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eedc8fdf..76c8dbda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Fixed +- Ignoring (invalid) path constructors that do not begin with `m` ([#749](https://github.com/pdfminer/pdfminer.six/pull/749)) + +## [20220506] + +### Fixed + - `IndexError` when handling invalid bfrange code map in CMap ([#731](https://github.com/pdfminer/pdfminer.six/pull/731)) - `TypeError` in lzw.py when `self.table` is not set ([#732](https://github.com/pdfminer/pdfminer.six/pull/732)) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index a4147994..3da2fcbb 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -109,7 +109,16 @@ def paint_path( """Paint paths described in section 4.4 of the PDF reference manual""" shape = "".join(x[0] for x in path) - if shape.count("m") > 1: + if shape[:1] != "m": + # Per PDF Reference Section 4.4.1, "path construction operators may + # be invoked in any sequence, but the first one invoked must be m + # or re to begin a new subpath." Since pdfminer.six already + # converts all `re` (rectangle) operators to their equivalent + # `mlllh` representation, paths ingested by `.paint_path(...)` that + # do not begin with the `m` operator are invalid. 
+ pass + + elif shape.count("m") > 1: # recurse if there are multiple m's in this shape for m in re.finditer(r"m[^m]+", shape): subpath = path[m.start(0) : m.end(0)] diff --git a/tests/test_converter.py b/tests/test_converter.py index e9d18e84..bae442fe 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -215,6 +215,15 @@ def parse(path): (71.41, 434.89), ] + def test_paint_path_without_starting_m(self): + gs = PDFGraphicState() + analyzer = self._get_analyzer() + analyzer.cur_item = LTContainer([0, 100, 0, 100]) + paths = [[("h",)], [("l", 72.41, 433.89), ("l", 82.41, 433.89), ("h",)]] + for path in paths: + analyzer.paint_path(gs, False, False, False, path) + assert len(analyzer.cur_item._objs) == 0 + class TestBinaryDetector: def test_stringio(self): From 7f97e2686900339f85b9d6a092ebc071f3bea4f5 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Sat, 7 May 2022 20:35:18 +0200 Subject: [PATCH 03/11] Remove upper version bounds (#755) Using an upper bound for dependency versions on a library is a source of troubles for users. Let's not do it as it makes pdfminer wreak havoc downstream. 
Signed-off-by: Philippe Ombredanne --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index e5a0067d..0196b481 100644 --- a/setup.py +++ b/setup.py @@ -17,8 +17,8 @@ packages=["pdfminer"], package_data={"pdfminer": ["cmap/*.pickle.gz", "py.typed"]}, install_requires=[ - "charset-normalizer~=2.0.0", - "cryptography~=36.0.0", + "charset-normalizer >= 2.0.0", + "cryptography >= 36.0.0", ], extras_require={ "dev": ["pytest", "nox", "black", "mypy == 0.931"], From 0b09d5f8db1551d839d748f808eb246b95736b77 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Tue, 24 May 2022 19:41:54 +0200 Subject: [PATCH 04/11] Update CHANGELOG.md for #755 --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76c8dbda..571bb133 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Ignoring (invalid) path constructors that do not begin with `m` ([#749](https://github.com/pdfminer/pdfminer.six/pull/749)) +### Changed + +- Removed upper version bounds ([#755](https://github.com/pdfminer/pdfminer.six/pull/755)) + ## [20220506] ### Fixed From 86e34873e4daeb7329aa7420208e82f8d41e38b6 Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Tue, 24 May 2022 19:07:04 +0100 Subject: [PATCH 05/11] Fix Sphinx warnings and error (#760) * Fix Sphinx warnings howto/acro_forms.rst:4: WARNING: Title underline too short. howto/acro_forms.rst:81: WARNING: Bullet list ends without a blank line; unexpected unindent. howto/acro_forms.rst:88: WARNING: Bullet list ends without a blank line; unexpected unindent. howto/acro_forms.rst:122: WARNING: Bullet list ends without a blank line; unexpected unindent. tutorial/extract_pages.rst:6: WARNING: Failed to create a cross reference. 
A title or caption not found: api_extract_pages * Fix documenting pdf2txt.py reference/commandline.rst:12: ERROR: Module "tools.pdf2txt" has no attribute "maketheparser" Incorrect argparse :module: or :func: values? * Add CHANGELOG.md Co-authored-by: Pieter Marsman --- CHANGELOG.md | 8 +++++++- docs/source/howto/acro_forms.rst | 11 +++++++---- docs/source/reference/commandline.rst | 2 +- docs/source/reference/highlevel.rst | 4 ++-- tools/pdf2txt.py | 8 ++++++-- 5 files changed, 23 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 571bb133..e23fcf0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,13 @@ All notable changes in pdfminer.six will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [20220506] +## [Unreleased] + +### Fixed + +- Sphinx errors during building of documentation ([#760](https://github.com/pdfminer/pdfminer.six/pull/760)) + +## [20220524] ### Fixed diff --git a/docs/source/howto/acro_forms.rst b/docs/source/howto/acro_forms.rst index 23444ff5..276dccff 100644 --- a/docs/source/howto/acro_forms.rst +++ b/docs/source/howto/acro_forms.rst @@ -1,7 +1,7 @@ .. _acro_forms: How to extract AcroForm interactive form fields from a PDF using PDFMiner -******************************** +************************************************************************* Before you start, make sure you have :ref:`installed pdfminer.six`. @@ -78,14 +78,16 @@ How it works: doc = PDFDocument(parser) - Get the catalog -(the catalog contains references to other objects defining the document structure, see section 7.7.2 of PDF 32000-1:2008 specs: https://www.adobe.com/devnet/pdf/pdf_reference.html) + + (the catalog contains references to other objects defining the document structure, see section 7.7.2 of PDF 32000-1:2008 specs: https://www.adobe.com/devnet/pdf/pdf_reference.html) .. 
code-block:: python res = resolve1(doc.catalog) - Check if the catalog contains the AcroForm key and raise ValueError if not -(the PDF does not contain Acroform type of interactive forms if this key is missing in the catalog, see section 12.7.2 of PDF 32000-1:2008 specs) + + (the PDF does not contain Acroform type of interactive forms if this key is missing in the catalog, see section 12.7.2 of PDF 32000-1:2008 specs) .. code-block:: python @@ -119,7 +121,8 @@ How it works: values = resolve1(value) - Call the value(s) decoding method as needed -(a single field can hold multiple values, for example a combo box can hold more than one value at time) + + (a single field can hold multiple values, for example a combo box can hold more than one value at time) .. code-block:: python diff --git a/docs/source/reference/commandline.rst b/docs/source/reference/commandline.rst index 7e4aba31..2195432d 100644 --- a/docs/source/reference/commandline.rst +++ b/docs/source/reference/commandline.rst @@ -11,7 +11,7 @@ pdf2txt.py .. argparse:: :module: tools.pdf2txt - :func: maketheparser + :func: create_parser :prog: python tools/pdf2txt.py .. _api_dumppdf: diff --git a/docs/source/reference/highlevel.rst b/docs/source/reference/highlevel.rst index b764e901..ef7ae768 100644 --- a/docs/source/reference/highlevel.rst +++ b/docs/source/reference/highlevel.rst @@ -21,10 +21,10 @@ extract_text_to_fp .. autofunction:: extract_text_to_fp +.. _api_extract_pages: + extract_pages ============= .. currentmodule:: pdfminer.high_level .. autofunction:: extract_pages - -.. 
_api_extract_pages: \ No newline at end of file diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index e872f5f6..0511b937 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -63,7 +63,7 @@ def extract_text( return outfp -def parse_args(args: Optional[List[str]]) -> argparse.Namespace: +def create_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description=__doc__, add_help=True) parser.add_argument( "files", @@ -272,7 +272,11 @@ def parse_args(args: Optional[List[str]]) -> argparse.Namespace: "Only used when output_type is xml.", ) - parsed_args = parser.parse_args(args=args) + return parser + + +def parse_args(args: Optional[List[str]]) -> argparse.Namespace: + parsed_args = create_parser().parse_args(args=args) # Propagate parsed layout parameters to LAParams object if parsed_args.no_laparams: From 6cbee25b3ed83aedd0581c2ca54b775c6cff22b0 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sat, 25 Jun 2022 23:11:10 +0200 Subject: [PATCH 06/11] Deprecate usage of `if __name__ == "__main__"` in scripts that are not documented. Also deprecate usage of scripts that are only there for testing purposes. (#756) * Deprecate usage of `if __name__ == "__main__"` in scripts that are not documented. Also deprecate usage of scripts that are only there for testing purposes. 
* Add CHANGELOG.md * Cleanup CHANGELOG.md * Cleanup CHANGELOG.md * Undo deleting conv_glyphlist.py and conv_afm.py and add a deprecation warning instead --- CHANGELOG.md | 22 ++++++++------- pdfminer/cmapdb.py | 9 +++++++ pdfminer/fontmetrics.py | 42 +++++++++++++++++++++++++++++ pdfminer/glyphlist.py | 26 ++++++++++++++++++ pdfminer/pdffont.py | 13 +++++++-- tests/test_highlevel_extracttext.py | 4 --- tools/conv_afm.py | 8 ++++++ tools/conv_cmap.py | 5 ++-- tools/conv_glyphlist.py | 7 +++++ tools/pdfdiff.py | 8 ++++++ tools/pdfstats.py | 19 ++++++++----- tools/prof.py | 9 +++++++ 12 files changed, 149 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e23fcf0d..3b0c3ce5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Sphinx errors during building of documentation ([#760](https://github.com/pdfminer/pdfminer.six/pull/760)) +### Deprecated + +- Usage of `if __name__ == "__main__"` where it was only intended for testing purposes ([#756](https://github.com/pdfminer/pdfminer.six/pull/756)) + ## [20220524] ### Fixed @@ -86,7 +90,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Using `io.TextIOBase` as the file to write to ([#616](https://github.com/pdfminer/pdfminer.six/pull/616)) - Parsing \r\n after the escape character in a literal string ([#616](https://github.com/pdfminer/pdfminer.six/pull/616)) -## Removed +### Removed - Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522)) - Unused dependency on `sortedcontainers` package ([#525](https://github.com/pdfminer/pdfminer.six/pull/525)) - Support for non-standard output streams that are not binary ([#523](https://github.com/pdfminer/pdfminer.six/pull/523)) @@ -152,12 +156,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
### Changed - Group text lines if they are centered ([#384](https://github.com/pdfminer/pdfminer.six/pull/384)) -## [20200124] - 2020-01-24 +## [20200124] ### Security - Removed samples/issue-00152-embedded-pdf.pdf because it contains a possible security thread; a javascript enabled object ([#364](https://github.com/pdfminer/pdfminer.six/pull/364)) -## [20200121] - 2020-01-21 +## [20200121] ### Fixed - Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352)) @@ -168,20 +172,20 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Removed - The command-line utility latin2ascii.py ([#360](https://github.com/pdfminer/pdfminer.six/pull/360)) -## [20200104] - 2019-01-04 +## [20200104] -## Removed +### Removed - Support for Python 2 ([#346](https://github.com/pdfminer/pdfminer.six/pull/346)) ### Changed - Enforce pep8 coding style by adding flake8 to CI ([#345](https://github.com/pdfminer/pdfminer.six/pull/345)) -## [20191110] - 2019-11-10 +## [20191110] ### Fixed - Wrong order of text box grouping introduced by PR #315 ([#335](https://github.com/pdfminer/pdfminer.six/pull/335)) -## [20191107] - 2019-11-07 +## [20191107] ### Deprecated - The argument `_py2_no_more_posargs` because Python2 is removed on January @@ -208,7 +212,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Removed - Files for external applications such as django, cgi and pyinstaller ([#320](https://github.com/pdfminer/pdfminer.six/pull/320)) -## [20191020] - 2019-10-20 +## [20191020] ### Deprecated - Support for Python 2 is dropped at January 1st, 2020 ([#307](https://github.com/pdfminer/pdfminer.six/pull/307)) @@ -230,7 +234,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
### Changed - All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306) and [#219](https://github.com/pdfminer/pdfminer.six/pull/219)) -## [20181108] - 2018-11-08 +## [20181108] ### Changed - Speedup layout analysis ([#141](https://github.com/pdfminer/pdfminer.six/pull/141)) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 704a9d34..01306ed2 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -477,6 +477,15 @@ def _warn_once(self, msg: str) -> None: def main(argv: List[str]) -> None: + from warnings import warn + + warn( + "The function main() from cmapdb.py will be removed in 2023. It was probably " + "introduced for testing purposes a long time ago, and no longer relevant. " + "Feel free to create a GitHub issue if you disagree.", + DeprecationWarning, + ) + args = argv[1:] for fname in args: fp = open(fname, "rb") diff --git a/pdfminer/fontmetrics.py b/pdfminer/fontmetrics.py index 4fdf28b0..2ed0f024 100644 --- a/pdfminer/fontmetrics.py +++ b/pdfminer/fontmetrics.py @@ -27,6 +27,48 @@ ### END Verbatim copy of the license part # flake8: noqa +from typing import Dict + + +def convert_font_metrics(path: str) -> None: + """Convert an AFM file to a mapping of font metrics. + + See below for the output. 
+ """ + fonts = {} + with open(path, "r") as fileinput: + for line in fileinput.readlines(): + f = line.strip().split(" ") + if not f: + continue + k = f[0] + if k == "FontName": + fontname = f[1] + props = {"FontName": fontname, "Flags": 0} + chars: Dict[int, int] = {} + fonts[fontname] = (props, chars) + elif k == "C": + cid = int(f[1]) + if 0 <= cid and cid <= 255: + width = int(f[4]) + chars[cid] = width + elif k in ("CapHeight", "XHeight", "ItalicAngle", "Ascender", "Descender"): + k = {"Ascender": "Ascent", "Descender": "Descent"}.get(k, k) + props[k] = float(f[1]) + elif k in ("FontName", "FamilyName", "Weight"): + k = {"FamilyName": "FontFamily", "Weight": "FontWeight"}.get(k, k) + props[k] = f[1] + elif k == "IsFixedPitch": + if f[1].lower() == "true": + props["Flags"] = 64 + elif k == "FontBBox": + props[k] = tuple(map(float, f[1:5])) + print("# -*- python -*-") + print("FONT_METRICS = {") + for (fontname, (props, chars)) in fonts.items(): + print(" {!r}: {!r},".format(fontname, (props, chars))) + print("}") + FONT_METRICS = { "Courier": ( diff --git a/pdfminer/glyphlist.py b/pdfminer/glyphlist.py index 46c32352..9d4eb908 100644 --- a/pdfminer/glyphlist.py +++ b/pdfminer/glyphlist.py @@ -51,6 +51,32 @@ # (1) glyph name # (2) Unicode scalar value + +def convert_glyphlist(path: str) -> None: + """Convert a glyph list into a python representation. + + See output below. 
+ """ + state = 0 + with open(path, "r") as fileinput: + for line in fileinput.readlines(): + line = line.strip() + if not line or line.startswith("#"): + if state == 1: + state = 2 + print("}\n") + print(line) + continue + if state == 0: + print("\nglyphname2unicode = {") + state = 1 + (name, x) = line.split(";") + codes = x.split(" ") + print( + " {!r}: u'{}',".format(name, "".join("\\u%s" % code for code in codes)) + ) + + glyphname2unicode = { "A": "\u0041", "AE": "\u00C6", diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 0b3e00a7..0c337938 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -19,12 +19,12 @@ from . import settings from .cmapdb import CMap -from .cmapdb import IdentityUnicodeMap from .cmapdb import CMapBase from .cmapdb import CMapDB from .cmapdb import CMapParser -from .cmapdb import UnicodeMap from .cmapdb import FileUnicodeMap +from .cmapdb import IdentityUnicodeMap +from .cmapdb import UnicodeMap from .encodingdb import EncodingDB from .encodingdb import name2unicode from .fontmetrics import FONT_METRICS @@ -1187,6 +1187,15 @@ def to_unichr(self, cid: int) -> str: def main(argv: List[str]) -> None: + from warnings import warn + + warn( + "The function main() from pdffont.py will be removed in 2023. It was probably " + "introduced for testing purposes a long time ago, and no longer relevant. 
" + "Feel free to create a GitHub issue if you disagree.", + DeprecationWarning, + ) + for fname in argv[1:]: fp = open(fname, "rb") font = CFFFont(fname, fp) diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py index b7733c01..842459d2 100644 --- a/tests/test_highlevel_extracttext.py +++ b/tests/test_highlevel_extracttext.py @@ -168,7 +168,3 @@ def test_no_boxes_flow(self): elements = [element for element in page if isinstance(element, LTTextContainer)] self.assertEqual(len(elements), 1) self.assertEqual(elements[0].get_text(), "Text1\nText2\nText3\n") - - -if __name__ == "__main__": - unittest.main() diff --git a/tools/conv_afm.py b/tools/conv_afm.py index cb91baa1..f666ee18 100755 --- a/tools/conv_afm.py +++ b/tools/conv_afm.py @@ -2,6 +2,7 @@ import sys import fileinput +from warnings import warn def main(argv): @@ -41,4 +42,11 @@ def main(argv): if __name__ == "__main__": + warn( + "The file conf_afm.py will be removed in 2023. Its functionality is" + "moved to pdfminer/font_metrics.py. 
Feel free to create a GitHub " + "issue if you disagree.", + DeprecationWarning, + ) + sys.exit(main(sys.argv)) # type: ignore[no-untyped-call] diff --git a/tools/conv_cmap.py b/tools/conv_cmap.py index e39c17ed..e265ee4c 100755 --- a/tools/conv_cmap.py +++ b/tools/conv_cmap.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 -import sys -import pickle as pickle import codecs +import pickle as pickle +import sys class CMapConverter: @@ -19,6 +19,7 @@ def get_encs(self): def get_maps(self, enc): if enc.endswith("-H"): + (hmapenc, vmapenc) = (enc, None) elif enc == "H": (hmapenc, vmapenc) = ("H", "V") diff --git a/tools/conv_glyphlist.py b/tools/conv_glyphlist.py index 7a1183fd..a572059e 100755 --- a/tools/conv_glyphlist.py +++ b/tools/conv_glyphlist.py @@ -2,6 +2,7 @@ import sys import fileinput +from warnings import warn def main(argv): @@ -23,4 +24,10 @@ def main(argv): if __name__ == "__main__": + warn( + "The file conf_glpyhlist.py will be removed in 2023. Its functionality" + "is moved to pdfminer/glyphlist.py. Feel free to create a GitHub issue " + "if you disagree.", + DeprecationWarning, + ) sys.exit(main(sys.argv)) # type: ignore[no-untyped-call] diff --git a/tools/pdfdiff.py b/tools/pdfdiff.py index 43156e8d..57ae4ef3 100644 --- a/tools/pdfdiff.py +++ b/tools/pdfdiff.py @@ -7,10 +7,18 @@ import logging import sys from typing import Any, Iterable, List, Optional +from warnings import warn import pdfminer.settings from pdfminer import high_level, layout +warn( + "The file pdfdiff.py will be removed in 2023. It was probably introduced for " + "testing purposes a long time ago, and no longer relevant. 
Feel free to create a " + "GitHub issue if you disagree.", + DeprecationWarning, +) + pdfminer.settings.STRICT = False diff --git a/tools/pdfstats.py b/tools/pdfstats.py index 1b57b809..4eae67f9 100755 --- a/tools/pdfstats.py +++ b/tools/pdfstats.py @@ -4,18 +4,25 @@ # print some stats to stdout # Usage: pdfstats.py -import sys -import os import collections +import os +import sys from typing import Any, Counter, Iterator, List +from warnings import warn -from pdfminer.pdfparser import PDFParser -from pdfminer.pdfdocument import PDFDocument, PDFTextExtractionNotAllowed -from pdfminer.pdfpage import PDFPage -from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.converter import PDFPageAggregator from pdfminer.layout import LAParams, LTContainer +from pdfminer.pdfdocument import PDFDocument, PDFTextExtractionNotAllowed +from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter +from pdfminer.pdfpage import PDFPage +from pdfminer.pdfparser import PDFParser +warn( + "The file pdfstats.py will be removed in 2023. It was probably introduced for " + "testing purposes a long time ago, and no longer relevant. Feel free to create a " + "GitHub issue if you disagree.", + DeprecationWarning, +) _, SCRIPT = os.path.split(__file__) diff --git a/tools/prof.py b/tools/prof.py index 0477fd9f..b725e716 100644 --- a/tools/prof.py +++ b/tools/prof.py @@ -2,6 +2,15 @@ import sys from typing import List +from warnings import warn + +warn( + "The file prof.py will be removed in 2023. It was probably introduced for " + "testing purposes a long time ago, and no longer relevant. 
Feel free to create a " + "GitHub issue if you disagree.", + DeprecationWarning, +) + def prof_main(argv: List[str]) -> int: import hotshot.stats # type: ignore[import] From 1044fc05e8c24ee7cd5805cf184c8ca39da7a030 Mon Sep 17 00:00:00 2001 From: gosiafilipek <90677030+gosiafilipek@users.noreply.github.com> Date: Sat, 25 Jun 2022 23:16:28 +0200 Subject: [PATCH 07/11] Fix `TypeError` when getting default width of font (#772) * Issue #720 resolve1 when getting the default width. * Add CHANGELOG.md Co-authored-by: Pieter Marsman --- CHANGELOG.md | 1 + pdfminer/pdffont.py | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b0c3ce5..7c9d9b61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Fixed - Sphinx errors during building of documentation ([#760](https://github.com/pdfminer/pdfminer.six/pull/760)) +- `TypeError` when getting default width of font ([#720](https://github.com/pdfminer/pdfminer.six/issues/720)) ### Deprecated diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 0c337938..13629c77 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -867,6 +867,7 @@ def __init__( self.default_width = num_value(descriptor.get("MissingWidth", 0)) else: self.default_width = default_width + self.default_width = resolve1(self.default_width) self.leading = num_value(descriptor.get("Leading", 0)) self.bbox = cast( Rect, list_value(resolve_all(descriptor.get("FontBBox", (0, 0, 0, 0)))) From f63e9fbee908a8700826747e8d37c938081d2915 Mon Sep 17 00:00:00 2001 From: Florian Apolloner Date: Sun, 26 Jun 2022 17:25:30 +0200 Subject: [PATCH 08/11] Fix `ValueError` with unencrypted metadata values (Fixes #766). (#774) * Fix crash with unencrypted metadata values (pdfminer#766). 
* Explicitly check for length * Update CHANGELOG.md Co-authored-by: Pieter Marsman --- CHANGELOG.md | 1 + pdfminer/pdftypes.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c9d9b61..98eaaded 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Fixed +- `ValueError` when trying to decrypt empty metadata values ([#766](https://github.com/pdfminer/pdfminer.six/issues/766)) - Sphinx errors during building of documentation ([#760](https://github.com/pdfminer/pdfminer.six/pull/760)) - `TypeError` when getting default width of font ([#720](https://github.com/pdfminer/pdfminer.six/issues/720)) diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index f4543b97..d7a2f412 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -138,6 +138,8 @@ def resolve_all(x: object, default: object = None) -> Any: def decipher_all(decipher: DecipherCallable, objid: int, genno: int, x: object) -> Any: """Recursively deciphers the given object.""" if isinstance(x, bytes): + if len(x) == 0: + return x return decipher(objid, genno, x) if isinstance(x, list): x = [decipher_all(decipher, objid, genno, v) for v in x] From ebf92acf0cad575d51d3205304921f55434eb360 Mon Sep 17 00:00:00 2001 From: Christian Christiansen Date: Sun, 26 Jun 2022 15:46:39 +0000 Subject: [PATCH 09/11] Fix `TypeError` by Ignoring null characters in PSBaseParser (#768) * Ignore null characters in PSBaseParser Beforehand, null characters were encoded as PSKeyword tokens. This caused issue #617, as pdfdevice.py would attempt to decode the null character PSKeyword, when it expects a byte string, as opposed to a PSKeyword, causing pdfminer.six to crash. As null characters are superfluous within PSBaseParser, ignore them. 
* Update CHANGELOG.md Co-authored-by: Pieter Marsman --- CHANGELOG.md | 1 + pdfminer/psparser.py | 2 ++ 2 files changed, 3 insertions(+) mode change 100644 => 100755 pdfminer/psparser.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 98eaaded..84a0c5df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - `ValueError` when trying to decrypt empty metadata values ([#766](https://github.com/pdfminer/pdfminer.six/issues/766)) - Sphinx errors during building of documentation ([#760](https://github.com/pdfminer/pdfminer.six/pull/760)) - `TypeError` when getting default width of font ([#720](https://github.com/pdfminer/pdfminer.six/issues/720)) +- `TypeError` in cmapdb.py when parsing null characters ([#768](https://github.com/pdfminer/pdfminer.six/pull/768)) ### Deprecated diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py old mode 100644 new mode 100755 index c7f8a175..0b94e327 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -334,6 +334,8 @@ def _parse_main(self, s: bytes, i: int) -> int: self._curtoken = b"" self._parse1 = self._parse_wclose return j + 1 + elif c == b"\x00": + return j + 1 else: self._add_token(KWD(c)) return j + 1 From 4733eb333abca8c38736fd4e0b391aa86900f660 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 26 Jun 2022 17:47:28 +0200 Subject: [PATCH 10/11] Install typing_extensions on Python 3.6 and 3.7 (#775) * Install typing_extensions on Python 3.6 and 3.7 * Add CHANGELOG.md * Black setup.py --- CHANGELOG.md | 1 + pdfminer/image.py | 1 + setup.py | 1 + 3 files changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 84a0c5df..dc3ec737 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
- `ValueError` when trying to decrypt empty metadata values ([#766](https://github.com/pdfminer/pdfminer.six/issues/766)) - Sphinx errors during building of documentation ([#760](https://github.com/pdfminer/pdfminer.six/pull/760)) - `TypeError` when getting default width of font ([#720](https://github.com/pdfminer/pdfminer.six/issues/720)) +- Install typing-extensions on Python 3.6 and 3.7 ([#775](https://github.com/pdfminer/pdfminer.six/pull/775)) - `TypeError` in cmapdb.py when parsing null characters ([#768](https://github.com/pdfminer/pdfminer.six/pull/768)) ### Deprecated diff --git a/pdfminer/image.py b/pdfminer/image.py index 2b412534..54b14929 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -7,6 +7,7 @@ try: from typing import Literal except ImportError: + # Literal was introduced in Python 3.8 from typing_extensions import Literal # type: ignore[misc] from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter diff --git a/setup.py b/setup.py index 0196b481..2bcba0bc 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ install_requires=[ "charset-normalizer >= 2.0.0", "cryptography >= 36.0.0", + 'typing_extensions; python_version < "3.8"', ], extras_require={ "dev": ["pytest", "nox", "black", "mypy == 0.931"], From 8f52578e85b27831ab8a68a6d86721ea3348a553 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 26 Jun 2022 18:25:28 +0200 Subject: [PATCH 11/11] Run black locally with nox (#776) * Run black locally with nox * Update contributor instructions * Fix workflow --- .github/pull_request_template.md | 24 +++++++----------------- .github/workflows/actions.yml | 15 ++++++++++++--- CONTRIBUTING.md | 19 +++++++++---------- noxfile.py | 21 +++++++++++++++++++-- setup.py | 3 +-- 5 files changed, 48 insertions(+), 34 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 1159de80..09f14a34 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,25 +1,15 @@ **Pull 
request** -Please remove this paragraph and replace it with a description of your PR. -Also include links to the issues that it fixes. +Please *remove* this paragraph and replace it with a description of your PR. Also include the issue that it fixes. **How Has This Been Tested?** -Please repalce this paragraph with a description of how this PR has been -tested. Include the necessary instructions and files such that other can -reproduce it. +Please *remove* this paragraph and replace it with a description of how this PR has been tested. **Checklist** -- [ ] I have formatted my code with [black](https://github.com/psf/black). -- [ ] I have added tests that prove my fix is effective or that my feature - works -- [ ] I have added docstrings to newly created methods and classes -- [ ] I have optimized the code at least one time after creating the initial - version -- [ ] I have updated the [README.md](../README.md) or verified that this - is not necessary -- [ ] I have updated the [readthedocs](../docs/source) documentation or - verified that this is not necessary -- [ ] I have added a concise human-readable description of the change to - [CHANGELOG.md](../CHANGELOG.md) +- [ ] I have read [CONTRIBUTING.md](../CONTRIBUTING.md). +- [ ] I have added a concise human-readable description of the change to [CHANGELOG.md](../CHANGELOG.md). +- [ ] I have tested that this fix is effective or that this feature works. +- [ ] I have added docstrings to newly created methods and classes. +- [ ] I have updated the [README.md](../README.md) and the [readthedocs](../docs/source) documentation. Or verified that this is not necessary. 
diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml index d0fb1155..25186bae 100644 --- a/.github/workflows/actions.yml +++ b/.github/workflows/actions.yml @@ -16,13 +16,22 @@ env: jobs: check-code-formatting: - name: Check code formatting + name: Check coding style runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v2 - - name: Check code formatting - uses: psf/black@stable + - name: Set up Python ${{ env.default-python }} + uses: actions/setup-python@v2 + with: + python-version: ${{ env.default-python }} + - name: Upgrade pip, Install nox + run: | + python -m pip install --upgrade pip + python -m pip install nox + - name: Check coding style + run: | + nox --error-on-missing-interpreters --non-interactive --session format check-coding-style: name: Check coding style diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index be55249a..493610c1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -26,20 +26,25 @@ Any contribution is appreciated! You might want to: ## Guideline for creating pull request -* A pull request should close an existing issue. -* Pull requests should be merged to master. Version tags are used indicate the releases. +* A pull request should close an existing issue. For example, use "Fix #123" to indicate that your PR fixes issue 123. +* Pull requests should be merged to master. * Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case of features, this will show that your code works correctly. * Code should work for Python 3.6+. -* Code should be formatted with [black](https://github.com/psf/black). +* Test your code by using nox (see below). * New features should be well documented using docstrings. +* Check if the [README.md](../README.md) or [readthedocs](../docs/source) documentation needs to be updated. * Check spelling and grammar. 
-* Don't forget to update the [CHANGELOG.md](CHANGELOG.md#[Unreleased]) +* Don't forget to update the [CHANGELOG.md](CHANGELOG.md#[Unreleased]). ## Guidelines for posting comments * [Be cordial and positive](https://www.kennethreitz.org/essays/be-cordial-or-be-on-your-way) +## Guidelines for publishing + +* Publishing is automated. Add a YYYYMMDD version tag and GitHub workflows will do the rest. + ## Getting started 1. Clone the repository @@ -68,9 +73,3 @@ Any contribution is appreciated! You might want to: ```sh nox -e py36 ``` - -4. After changing the code, run the black formatter. - - ```sh - black . - ``` diff --git a/noxfile.py b/noxfile.py index a0cffe26..f55bbadb 100644 --- a/noxfile.py +++ b/noxfile.py @@ -1,20 +1,37 @@ +import os + import nox PYTHON_ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9", "3.10"] +PYTHON_MODULES = ["pdfminer", "tools", "tests", "noxfile.py", "setup.py"] + + +@nox.session +def format(session): + session.install("black") + # Format files locally with black, but only check in cicd + if "CI" in os.environ: + session.run("black", "--check", *PYTHON_MODULES) + else: + session.run("black", *PYTHON_MODULES) @nox.session def lint(session): session.install("flake8") - session.run("flake8", "pdfminer/", "tools/", "tests/", "--count", "--statistics") + session.run("flake8", *PYTHON_MODULES, "--count", "--statistics") @nox.session def types(session): session.install("mypy") session.run( - "mypy", "--install-types", "--non-interactive", "--show-error-codes", "." + "mypy", + "--install-types", + "--non-interactive", + "--show-error-codes", + *PYTHON_MODULES, ) diff --git a/setup.py b/setup.py index 2bcba0bc..8f257c3f 100644 --- a/setup.py +++ b/setup.py @@ -5,8 +5,7 @@ from os import path sys.path.append(str(Path(__file__).parent)) -import pdfminer as package - +import pdfminer as package # noqa: E402 with open(path.join(path.abspath(path.dirname(__file__)), "README.md")) as f: readme = f.read()