pdfparser.php

<?php

/**
 * @file
 * Class PdfParser
 * 
 * @author : Sebastien MALOT <sebastien@malot.fr>
 * @date : 2013-08-08
 *
 * References :
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
 * - http://www.php.net/manual/en/ref.pdf.php#74211
 */
class PdfParser
{
  /**
   * Parse a PDF file and return its text content.
   *
   * @param string $filename Path of the PDF file to read.
   * @return string|null The extracted text, or null when the file cannot be
   *                     read or contains no extractable text.
   */
  public static function parseFile($filename)
  {
    $content = file_get_contents($filename);

    if ($content === false) {
      // file_get_contents() has already raised a warning; keep the historical
      // "nothing extracted" return value instead of parsing a boolean.
      return null;
    }

    return self::extractText($content);
  }

  /**
   * Parse raw PDF content (the bytes of a PDF document).
   *
   * @param string $content
   * @return string|null The extracted text, or null when nothing was found.
   */
  public static function parseContent($content)
  {
    return self::extractText($content);
  }

  /**
   * Convert the raw bytes of a PDF document into text.
   *
   * The document is split into "obj ... endobj" sections. For every section
   * holding both a dictionary ("<< ... >>") and a stream, the stream is
   * decoded (FlateDecode only) and scanned for text operators.
   *
   * @param string $data The raw PDF bytes.
   * @return string|null The extracted text with whitespace runs collapsed to
   *                     a single space, or null when no text was found.
   */
  protected static function extractText($data)
  {
    $a_chunks = array();

    // Split the document into objects and keep, for each one, the first
    // dictionary (used as the 'filter' description) and the first stream.
    foreach (self::getDataArray($data, 'obj', 'endobj') as $obj) {
      $a_filter = self::getDataArray($obj, '<<', '>>');
      if (!isset($a_filter[0])) {
        continue;
      }

      $a_data = self::getDataArray($obj, 'stream', 'endstream');
      if (!isset($a_data[0])) {
        // Objects without a stream never contributed any text.
        continue;
      }

      // Keep the bytes between the two keywords untouched: they may be
      // binary, so whitespace handling is decided per filter below.
      $a_chunks[] = array(
        'filter' => $a_filter[0],
        'data'   => (string) substr(
          $a_data[0],
          strlen('stream'),
          strlen($a_data[0]) - strlen('stream') - strlen('endstream')
        ),
      );
    }

    // Start from an empty string (not null) so later string functions never
    // receive null, which is deprecated as of PHP 8.1.
    $result_data = '';

    foreach ($a_chunks as $chunk) {
      // Look at the dictionary to find out which encoding has been used.
      if (strpos($chunk['filter'], 'FlateDecode') !== false) {
        $decoded = self::inflateStream($chunk['data']);
      } else {
        // Unfiltered streams are plain text: surrounding blanks are noise.
        $decoded = trim($chunk['data']);
      }

      if ($decoded !== false && trim($decoded) !== '') {
        $result_data .= ' ' . self::extractTextElements($decoded);
      }
    }

    if (trim($result_data) === '') {
      return null;
    }

    // Re-join words hyphenated across a line break, then collapse every
    // whitespace run into a single space.
    $result_data = preg_replace('/\s*-[\r\n]+\s*/', '', $result_data);
    $result_data = preg_replace('/\s+/', ' ', $result_data);

    return $result_data;
  }

  /**
   * Inflate the body of a FlateDecode stream.
   *
   * The bytes are binary, so a blanket trim() is unsafe: the zlib checksum
   * may legitimately end with a byte that trim() treats as whitespace, and
   * removing it makes the whole stream undecodable. Only the end-of-line
   * markers around the payload are removed. Several candidates are tried,
   * longest first, with the fully trimmed form as the last resort.
   *
   * @param string $raw Bytes found between "stream" and "endstream".
   * @return string|false The inflated data, or false when nothing decodes.
   */
  protected static function inflateStream($raw)
  {
    // Drop the single end-of-line marker that follows the "stream" keyword.
    if (strncmp($raw, "\r\n", 2) === 0) {
      $body = (string) substr($raw, 2);
    } elseif ($raw !== '' && ($raw[0] === "\n" || $raw[0] === "\r")) {
      $body = (string) substr($raw, 1);
    } else {
      $body = $raw;
    }

    // The end-of-line marker before "endstream" is ambiguous (the payload
    // itself may end with CR or LF), hence the progressive candidates.
    $candidates = array($body);

    $last = substr($body, -1);
    if ($last === "\n" || $last === "\r") {
      $candidates[] = (string) substr($body, 0, -1);
    }
    if (substr($body, -2) === "\r\n") {
      $candidates[] = (string) substr($body, 0, -2);
    }

    // Historical behaviour, kept as a fallback for non-conforming files that
    // pad the stream with extra blanks.
    $candidates[] = trim($raw);

    foreach ($candidates as $candidate) {
      if ($candidate === '') {
        continue;
      }

      // Best effort: gzuncompress() raises a warning on invalid data, which
      // is expected while probing candidates, so it is suppressed here.
      $inflated = @gzuncompress($candidate);
      if ($inflated !== false) {
        return $inflated;
      }
    }

    return false;
  }

  /**
   * Extract the visible text from a decoded content stream.
   *
   * The stream is processed line by line; each line is expected to end with
   * a single operator preceded by its operands. Only text-showing and
   * text-positioning operators produce output, everything else is ignored.
   *
   * @param string $content Decoded stream content.
   * @return string The text found in the stream (possibly empty).
   */
  protected static function extractTextElements($content)
  {
    // Character maps (CMap definitions) contain no displayable text.
    if (strpos($content, '/CIDInit') === 0) {
      return '';
    }

    $text = '';

    foreach (explode("\n", $content) as $line) {
      $line    = trim($line);
      $matches = array();

      // Split the line into its operands ('command') and trailing operator.
      if (!preg_match('/^(?<command>.*[\)\] ])(?<operator>[a-z]+[\*]?)$/i', $line, $matches)) {
        // No operator on this line: nothing to emit.
        continue;
      }

      $command  = trim($matches['command']);
      $operator = trim($matches['operator']);

      switch ($operator) {
        // Move text current point.
        case 'Td':
          $values = explode(' ', $command);
          $y = array_pop($values);
          $x = array_pop($values);
          // Explicit casts keep the comparisons numeric on every PHP
          // version, whatever the operands look like.
          if ((float) $x > 0) {
            $text .= ' ';
          }
          if ((float) $y < 0) {
            $text .= ' ';
          }
          break;

        // Move text current point and set leading.
        case 'TD':
          $values = explode(' ', $command);
          $y = array_pop($values);
          if ((float) $y < 0) {
            $text .= "\n";
          }
          break;

        // Set font name and size.
        case 'Tf':
          $text .= ' ';
          break;

        // Display text, allowing individual character positioning.
        case 'TJ':
          $text .= self::parseTextCommand(
            self::extractDelimited(self::decodeOperands($command), '[', ']')
          );
          break;

        // Display text.
        case 'Tj':
          $text .= self::extractDelimited(self::decodeOperands($command), '(', ')');
          break;

        // Move to start of next line.
        case 'T*':
          $text .= "\n";
          break;

        // Every other operator (Tc, TL, Tm, Tr, Ts, Tw, Tz, g, gs, re, f,
        // BT, ET, ...) has no influence on the extracted text.
        default:
          break;
      }
    }

    // Unescape the parentheses that were escaped inside literal strings.
    return str_replace(array('\\(', '\\)'), array('(', ')'), $text);
  }

  /**
   * Decode the escape sequences found in the operands of a text operator
   * and make sure the result is UTF-8.
   *
   * @param string $command Raw operands of the operator.
   * @return string The decoded operands.
   */
  protected static function decodeOperands($command)
  {
    // Convert three-digit octal escapes. Only real octal digits are accepted.
    $command = preg_replace_callback(
      '/\\\\([0-7]{3})/',
      function ($match) {
        $code = octdec($match[1]);

        if ($code < 32) {
          // Skip non printable characters.
          return '';
        }

        $char = chr($code);

        if ($char === '(' || $char === ')') {
          // Keep decoded parentheses escaped so they are not mistaken for
          // string delimiters; they are unescaped once the text is extracted.
          return '\\' . $char;
        }

        return $char;
      },
      $command
    );

    // Remove escaped line breaks, then turn the remaining escaped
    // whitespace-like sequences into a plain space.
    $command = preg_replace('/\\\\[\r\n]/', '', $command);
    $command = preg_replace('/\\\\[rnftb ]/', ' ', $command);

    // Force UTF-8 charset.
    $encoding = mb_detect_encoding($command, array('ASCII', 'UTF-8', 'Windows-1252', 'ISO-8859-1'));
    if (strtoupper((string) $encoding) != 'UTF-8') {
      // Best effort conversion: on failure the operands are kept as they are.
      $converted = @iconv('CP1252', 'UTF-8//TRANSLIT//IGNORE', $command);
      if ($converted !== false && $converted !== '') {
        $command = $converted;
      }
    }

    return $command;
  }

  /**
   * Return what lies between the first $open and the last $close character.
   *
   * @param string $command Decoded operands of a text operator.
   * @param string $open    Opening delimiter, e.g. '(' or '['.
   * @param string $close   Closing delimiter, e.g. ')' or ']'.
   * @return string The enclosed text, or '' when a delimiter is missing or
   *                the delimiters are out of order.
   */
  protected static function extractDelimited($command, $open, $close)
  {
    // Offset 0 is passed explicitly: a null offset is deprecated since PHP 8.1.
    $start = mb_strpos($command, $open, 0, 'UTF-8');
    $end   = mb_strrpos($command, $close, 0, 'UTF-8');

    if ($start === false || $end === false || $end <= $start) {
      return '';
    }

    return mb_substr($command, $start + 1, $end - $start - 1, 'UTF-8');
  }

  /**
   * Strip out the text from the array operand of a TJ operator.
   *
   * The operand is a sequence of "(string)" elements separated by optional
   * numeric adjustments. A space is inserted before a string when the gap
   * preceding it is longer than 8 characters or holds a number below -50.
   *
   * @param string $text      Content of the array, without the brackets.
   * @param int    $font_size Currently not used.
   *
   * @return string
   */
  protected static function parseTextCommand($text, $font_size = 0)
  {
    $result        = '';
    $cur_start_pos = 0;

    while (($cur_start_text = mb_strpos($text, '(', $cur_start_pos, 'UTF-8')) !== false) {
      // New text element found: inspect what precedes it.
      $gap_length = $cur_start_text - $cur_start_pos;

      if ($gap_length > 8) {
        $spacing = ' ';
      } else {
        $gap = mb_substr($text, $cur_start_pos, $gap_length, 'UTF-8');
        // The cast keeps the comparison numeric: on PHP 8 an empty gap
        // compared as a string would be "less than" -50 and add a bogus space.
        $spacing = ((float) $gap < -50) ? ' ' : '';
      }
      $cur_start_text++;

      // Look for the closing parenthesis, skipping the escaped ones.
      $start_search_end = $cur_start_text;
      while (($cur_start_pos = mb_strpos($text, ')', $start_search_end, 'UTF-8')) !== false) {
        if (mb_substr($text, $cur_start_pos - 1, 1, 'UTF-8') != '\\') {
          break;
        }
        $start_search_end = $cur_start_pos + 1;
      }

      // Unterminated string: stop with what has been collected so far.
      if ($cur_start_pos === false) {
        break;
      }

      // Add to result.
      $result .= $spacing . mb_substr($text, $cur_start_text, $cur_start_pos - $cur_start_text, 'UTF-8');
      $cur_start_pos++;
    }

    return $result;
  }

  /**
   * Convert a section of data into an array, separated by the start and end words.
   *
   * Every returned element starts with $start_word and ends with $end_word.
   * The search for the next element resumes after the end word of the
   * previous one, so a start word contained in the end word (such as 'obj'
   * in 'endobj') is not matched again.
   *
   * @param  string $data       The data.
   * @param  string $start_word The start of each section of data.
   * @param  string $end_word   The end of each section of data.
   * @return array              The array of data.
   */
  protected static function getDataArray($data, $start_word, $end_word)
  {
    $a_results = array();
    $data      = (string) $data;

    // Empty delimiters cannot delimit anything and would never advance.
    if ($start_word === '' || $end_word === '') {
      return $a_results;
    }

    $offset  = 0;
    $end_len = strlen($end_word);

    while (($start = strpos($data, $start_word, $offset)) !== false) {
      $end = strpos($data, $end_word, $start);
      if ($end === false) {
        break;
      }

      // Data is between start and end, both words included.
      $a_results[] = substr($data, $start, $end - $start + $end_len);
      $offset      = $end + $end_len;
    }

    return $a_results;
  }
}