Adjust hex/octal string decoding (#627)

Add a second check to be sure a string is hexadecimal before applying the `pack()` function. This ensures we avoid `illegal hex digit` warnings, and resolves #499. PdfParser currently only decodes triple-digit escaped octal codes, whereas single-, double- and triple-digit codes are all allowed. See PDF Reference 1.7, Section 3.2 Objects, page 55 (https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf). Modify the regexp to search for escaped octal codes of one to three digits, and to exclude escaped backslashes. In sections of text that aren't escaped octal codes, un-escape backslashes and parentheses as described in PDF Reference 1.7, Section 3.2, Table 3.2. This resolves #470. Adjust the unit test `testDecodeOctal()` to escape the valid octal code `\\1` so that the output matches the existing expected value `AB \199`.
smalot · Aug 7, 2023 · f97e38c · f97e38c
1 parent 2608ac3
commit f97e38c
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 8 deletions.
diff --git a/src/Smalot/PdfParser/Font.php b/src/Smalot/PdfParser/Font.php
@@ -319,12 +319,12 @@ public static function decodeHexadecimal(string $hexa, bool $add_braces = false)
         }
 
         $text = '';
-        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
+        $parts = preg_split('/(<[a-f0-9\s]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
 
         foreach ($parts as $part) {
-            if (preg_match('/^<.*>$/s', $part) && false === stripos($part, '<?xml')) {
-                // strip line breaks
-                $part = preg_replace("/[\r\n]/", '', $part);
+            if (preg_match('/^<[a-f0-9\s]+>$/si', $part)) {
+                // strip whitespace
+                $part = preg_replace("/\s/", '', $part);
                 $part = trim($part, '<>');
                 if ($add_braces) {
                     $text .= '(';
@@ -349,14 +349,14 @@ public static function decodeHexadecimal(string $hexa, bool $add_braces = false)
      */
     public static function decodeOctal(string $text): string
     {
-        $parts = preg_split('/(\\\\[0-7]{3})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
+        $parts = preg_split('/(?<!\\\\)(\\\\[0-7]{1,3})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
         $text = '';
 
         foreach ($parts as $part) {
-            if (preg_match('/^\\\\[0-7]{3}$/', $part)) {
+            if (preg_match('/^\\\\[0-7]{1,3}$/', $part)) {
                 $text .= \chr(octdec(trim($part, '\\')));
             } else {
-                $text .= $part;
+                $text .= str_replace(['\\\\', '\\(', '\\)'], ['\\', '(', ')'], $part);
             }
         }
 

diff --git a/tests/PHPUnit/Integration/FontTest.php b/tests/PHPUnit/Integration/FontTest.php
@@ -280,7 +280,7 @@ public function testDecodeOctal(): void
     {
         $this->assertEquals('AB C', Font::decodeOctal('\\101\\102\\040\\103'));
         $this->assertEquals('AB CD', Font::decodeOctal('\\101\\102\\040\\103D'));
-        $this->assertEquals('AB \199', Font::decodeOctal('\\101\\102\\040\\199'));
+        $this->assertEquals('AB \199', Font::decodeOctal('\\101\\102\\040\\\\199'));
     }
 
     public function testDecodeEntities(): void