From 38ea1e04a833b39e9b44b1951a7841f99b18714f Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 26 Sep 2020 14:20:52 -0400 Subject: [PATCH 1/9] Allow inline parsers to match on more than just single characters --- CHANGELOG.md | 3 + docs/2.0/customization/inline-parsing.md | 59 +++++++++++-------- .../Parser/AttributesInlineParser.php | 8 +-- .../Parser/Inline/AutolinkParser.php | 16 +++-- .../Parser/Inline/BacktickParser.php | 8 +-- .../CommonMark/Parser/Inline/BangParser.php | 8 +-- .../Parser/Inline/CloseBracketParser.php | 8 +-- .../CommonMark/Parser/Inline/EntityParser.php | 8 +-- .../Parser/Inline/EscapableParser.php | 8 +-- .../Parser/Inline/HtmlInlineParser.php | 8 +-- .../Parser/Inline/OpenBracketParser.php | 8 +-- .../Parser/AnonymousFootnoteRefParser.php | 8 +-- .../Footnote/Parser/FootnoteRefParser.php | 8 +-- src/Extension/Mention/MentionParser.php | 8 +-- .../SmartPunct/PunctuationParser.php | 8 +-- src/Extension/SmartPunct/QuoteParser.php | 8 +-- .../TaskList/TaskListItemMarkerParser.php | 8 +-- src/Parser/Inline/InlineParserInterface.php | 5 +- src/Parser/Inline/InlineParserMatch.php | 59 +++++++++++++++++++ src/Parser/Inline/NewlineParser.php | 7 +-- .../FakeInjectableInlineParser.php | 8 +-- .../Parser/Inline/InlineParserMatchTest.php | 43 ++++++++++++++ 22 files changed, 196 insertions(+), 116 deletions(-) create mode 100644 src/Parser/Inline/InlineParserMatch.php create mode 100644 tests/unit/Parser/Inline/InlineParserMatchTest.php diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ce1f4ab62..7f762d0596 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ See for detailed informatio - `DocumentRenderedEvent` - `HtmlRendererInterface` - `InlineParserEngineInterface` + - `InlineParserMatch` - `MarkdownParserState` - `MarkdownParserStateInterface` - `ReferenceableInterface` @@ -58,6 +59,8 @@ See for detailed informatio - `ConfigurableEnvironmentInterface::addBlockParser()` is now 
`ConfigurableEnvironmentInterface::addBlockParserFactory()` - `ReferenceParser` was re-implemented and works completely different than before - The paragraph parser no longer needs to be added manually to the environment + - Implemented a new approach to inline parsing where parsers can now specify longer strings or regular expressions they want to parse (instead of just single characters): + - `InlineParserInterface::getCharacters()` is now `getMatchDefinition()` and returns an instance of `InlineParserMatch` - Changed block and inline rendering to use common methods and interfaces - `BlockRendererInterface` and `InlineRendererInterface` were replaced by `NodeRendererInterface` with slightly different parameters. All core renderers now implement this interface. - `ConfigurableEnvironmentInterface::addBlockRenderer()` and `addInlineRenderer()` are now just `addRenderer()` diff --git a/docs/2.0/customization/inline-parsing.md b/docs/2.0/customization/inline-parsing.md index 16a3c23b24..968ca062bf 100644 --- a/docs/2.0/customization/inline-parsing.md +++ b/docs/2.0/customization/inline-parsing.md @@ -29,28 +29,43 @@ If your syntax looks like that, consider using a [delimiter processor](/2.0/cust Inline parsers should implement `InlineParserInterface` and the following two methods: -### getCharacters() +### getMatchDefinition() -This method should return an array of single characters which the inline parser engine should stop on. When it does find a match in the current line the `parse()` method below may be called. +This method should return an instance of `InlineParserMatch` which defines the text the parser is looking for. 
Examples of this might be something like: + +```php +use League\CommonMark\Parser\Inline\InlineParserMatch; + +InlineParserMatch::string('@'); // Match any '@' characters found in the text +InlineParserMatch::string('foo'); // Match the text 'foo' (case insensitive) + +InlineParserMatch::oneOf('@', '!'); // Match either character +InlineParserMatch::oneOf('http://', 'https://'); // Match either string + +InlineParserMatch::regex('\d+'); // Match the regular expression (omit the regex delimiters and any flags) +``` + +Once a match is found, the `parse()` method below may be called. ### parse() This method will be called if both conditions are met: -1. The engine has stopped at a matching character; and, -2. No other inline parsers have successfully parsed the character +1. The engine has found a matching string in the current line; and, +2. No other inline parsers with a [higher priority](/2.0/customization/environment/#addinlineparser) have successfully parsed the text at this point in the line #### Parameters -* `InlineParserContext $inlineContext` - Encapsulates the current state of the inline parser, including the [`Cursor`](/2.0/customization/cursor/) used to parse the current line. +* `string $match` - Contains the text that matches the start pattern from `getMatchDefinition()` +* `InlineParserContext $inlineContext` - Encapsulates the current state of the inline parser, including the [`Cursor`](/2.0/customization/cursor/) used to parse the current line. (Note that the cursor will be positioned **before** the matching text, so you must advance it yourself if you determine it's a valid match) #### Return value -`parse()` should return `false` if it's unable to handle the current line/character for any reason. (The [`Cursor`](/2.0/customization/cursor/) state should be restored before returning false if modified). Other parsers will then have a chance to try parsing the line. If all registered parsers return false, the character will be added as plain text. 
+`parse()` should return `false` if it's unable to handle the text at the current position for any reason. Other parsers will then have a chance to try parsing that text. If all registered parsers return false, the text will be added as plain text. Returning `true` tells the engine that you've successfully parsed the character (and related ones after it). It is your responsibility to: -1. Advance the cursor to the end of the parsed text +1. Advance the cursor to the end of the parsed/matched text 2. Add the parsed inline to the container (`$inlineContext->getContainer()->appendChild(...)`) ## Inline Parser Examples @@ -65,15 +80,17 @@ Let's say you wanted to autolink Twitter handles without using the link syntax. use League\CommonMark\Environment\Environment; use League\CommonMark\Extension\CommonMark\Node\Inline\Link; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; class TwitterHandleParser implements InlineParserInterface { - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return ['@']; + // Note that you could match the entire regex here instead of in parse() if you wish + return InlineParserMatch::string('@'); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { $cursor = $inlineContext->getCursor(); // The @ symbol must not have any other characters immediately prior @@ -113,33 +130,27 @@ Let's say you want to automatically convert smilies (or "frownies") to emoticon use League\CommonMark\Environment\Environment; use League\CommonMark\Extension\CommonMark\Node\Inline\Image; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; class SmilieParser implements InlineParserInterface { - public 
function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return [':']; + return InlineParserMatch::oneOf(':)', ':('); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { $cursor = $inlineContext->getCursor(); - // The next character must be a paren; if not, then bail - // We use peek() to quickly check without affecting the cursor - $nextChar = $cursor->peek(); - if ($nextChar !== '(' && $nextChar !== ')') { - return false; - } - // Advance the cursor past the 2 matched chars since we're able to parse them successfully $cursor->advanceBy(2); // Add the corresponding image - if ($nextChar === ')') { + if ($match === ':)') { $inlineContext->getContainer()->appendChild(new Image('/img/happy.png')); - } elseif ($nextChar === '(') { + } elseif ($match === ':(') { $inlineContext->getContainer()->appendChild(new Image('/img/sad.png')); } @@ -153,6 +164,8 @@ $environment->addInlineParser(new SmilieParserParser()); ## Tips -* For best performance, `return false` **as soon as possible**. +* For best performance: + * Avoid using overly-complex regular expressions in `getMatchDefinition()` - use the simplest regex you can and have `parse()` do the heavier validation + * Have your `parse()` method `return false` **as soon as possible**. * You can `peek()` without modifying the cursor state. This makes it useful for validating nearby characters as it's quick and you can bail without needed to restore state. * You can look at (and modify) any part of the AST if needed (via `$inlineContext->getContainer()`). 
diff --git a/src/Extension/Attributes/Parser/AttributesInlineParser.php b/src/Extension/Attributes/Parser/AttributesInlineParser.php index 4b7a5041dc..bd4aaa3ecc 100644 --- a/src/Extension/Attributes/Parser/AttributesInlineParser.php +++ b/src/Extension/Attributes/Parser/AttributesInlineParser.php @@ -17,16 +17,14 @@ use League\CommonMark\Extension\Attributes\Node\AttributesInline; use League\CommonMark\Extension\Attributes\Util\AttributesHelper; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; final class AttributesInlineParser implements InlineParserInterface { - /** - * {@inheritdoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return [' ', '{']; + return InlineParserMatch::oneOf(' ', '{'); } public function parse(InlineParserContext $inlineContext): bool diff --git a/src/Extension/CommonMark/Parser/Inline/AutolinkParser.php b/src/Extension/CommonMark/Parser/Inline/AutolinkParser.php index 20d317ce54..48cd7d4dba 100644 --- a/src/Extension/CommonMark/Parser/Inline/AutolinkParser.php +++ b/src/Extension/CommonMark/Parser/Inline/AutolinkParser.php @@ -18,33 +18,31 @@ use League\CommonMark\Extension\CommonMark\Node\Inline\Link; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; use League\CommonMark\Util\UrlEncoder; final class AutolinkParser implements InlineParserInterface { - private const EMAIL_REGEX = '/^<([a-zA-Z0-9.!#$%&\'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>/'; - private const OTHER_LINK_REGEX = '/^<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>/i'; + private const EMAIL_REGEX = 
'<([a-zA-Z0-9.!#$%&\'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>'; + private const OTHER_LINK_REGEX = '<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>'; - /** - * {@inheritdoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return ['<']; + return InlineParserMatch::regex(self::EMAIL_REGEX . '|' . self::OTHER_LINK_REGEX); } public function parse(InlineParserContext $inlineContext): bool { $cursor = $inlineContext->getCursor(); - if ($m = $cursor->match(self::EMAIL_REGEX)) { + if ($m = $cursor->match('/^' . self::EMAIL_REGEX . '/')) { $email = \substr($m, 1, -1); $inlineContext->getContainer()->appendChild(new Link('mailto:' . UrlEncoder::unescapeAndEncode($email), $email)); return true; } - if ($m = $cursor->match(self::OTHER_LINK_REGEX)) { + if ($m = $cursor->match('/^' . self::OTHER_LINK_REGEX . '/')) { $dest = \substr($m, 1, -1); $inlineContext->getContainer()->appendChild(new Link(UrlEncoder::unescapeAndEncode($dest), $dest)); diff --git a/src/Extension/CommonMark/Parser/Inline/BacktickParser.php b/src/Extension/CommonMark/Parser/Inline/BacktickParser.php index bf34e3cb31..3312f22c47 100644 --- a/src/Extension/CommonMark/Parser/Inline/BacktickParser.php +++ b/src/Extension/CommonMark/Parser/Inline/BacktickParser.php @@ -19,16 +19,14 @@ use League\CommonMark\Extension\CommonMark\Node\Inline\Code; use League\CommonMark\Node\Inline\Text; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; final class BacktickParser implements InlineParserInterface { - /** - * {@inheritdoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return ['`']; + return InlineParserMatch::regex('`+'); } public function parse(InlineParserContext $inlineContext): bool diff --git 
a/src/Extension/CommonMark/Parser/Inline/BangParser.php b/src/Extension/CommonMark/Parser/Inline/BangParser.php index e354931906..d5931700fd 100644 --- a/src/Extension/CommonMark/Parser/Inline/BangParser.php +++ b/src/Extension/CommonMark/Parser/Inline/BangParser.php @@ -19,16 +19,14 @@ use League\CommonMark\Delimiter\Delimiter; use League\CommonMark\Node\Inline\Text; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; final class BangParser implements InlineParserInterface { - /** - * {@inheritdoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return ['!']; + return InlineParserMatch::string('!['); } public function parse(InlineParserContext $inlineContext): bool diff --git a/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php b/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php index 92b1d87a3f..5572e19299 100644 --- a/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php +++ b/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php @@ -25,6 +25,7 @@ use League\CommonMark\Node\Inline\AdjacentTextMerger; use League\CommonMark\Parser\Cursor; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; use League\CommonMark\Reference\ReferenceInterface; use League\CommonMark\Reference\ReferenceMapInterface; @@ -40,12 +41,9 @@ final class CloseBracketParser implements InlineParserInterface, EnvironmentAwar */ private $environment; - /** - * {@inheritdoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return [']']; + return InlineParserMatch::string(']'); } public function parse(InlineParserContext $inlineContext): bool diff --git a/src/Extension/CommonMark/Parser/Inline/EntityParser.php 
b/src/Extension/CommonMark/Parser/Inline/EntityParser.php index 5565a567e5..a410852e24 100644 --- a/src/Extension/CommonMark/Parser/Inline/EntityParser.php +++ b/src/Extension/CommonMark/Parser/Inline/EntityParser.php @@ -18,18 +18,16 @@ use League\CommonMark\Node\Inline\Text; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; use League\CommonMark\Util\Html5EntityDecoder; use League\CommonMark\Util\RegexHelper; final class EntityParser implements InlineParserInterface { - /** - * {@inheritdoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return ['&']; + return InlineParserMatch::regex(RegexHelper::PARTIAL_ENTITY); } public function parse(InlineParserContext $inlineContext): bool diff --git a/src/Extension/CommonMark/Parser/Inline/EscapableParser.php b/src/Extension/CommonMark/Parser/Inline/EscapableParser.php index 53716da387..64e6fab851 100644 --- a/src/Extension/CommonMark/Parser/Inline/EscapableParser.php +++ b/src/Extension/CommonMark/Parser/Inline/EscapableParser.php @@ -19,17 +19,15 @@ use League\CommonMark\Node\Inline\Newline; use League\CommonMark\Node\Inline\Text; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; use League\CommonMark\Util\RegexHelper; final class EscapableParser implements InlineParserInterface { - /** - * {@inheritdoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return ['\\']; + return InlineParserMatch::string('\\'); } public function parse(InlineParserContext $inlineContext): bool diff --git a/src/Extension/CommonMark/Parser/Inline/HtmlInlineParser.php b/src/Extension/CommonMark/Parser/Inline/HtmlInlineParser.php index 274f0f2105..62c416e59d 100644 --- 
a/src/Extension/CommonMark/Parser/Inline/HtmlInlineParser.php +++ b/src/Extension/CommonMark/Parser/Inline/HtmlInlineParser.php @@ -18,17 +18,15 @@ use League\CommonMark\Extension\CommonMark\Node\Inline\HtmlInline; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; use League\CommonMark\Util\RegexHelper; final class HtmlInlineParser implements InlineParserInterface { - /** - * {@inheritdoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return ['<']; + return InlineParserMatch::regex(RegexHelper::PARTIAL_HTMLTAG); } public function parse(InlineParserContext $inlineContext): bool diff --git a/src/Extension/CommonMark/Parser/Inline/OpenBracketParser.php b/src/Extension/CommonMark/Parser/Inline/OpenBracketParser.php index b8901289f9..2b52d1cdc6 100644 --- a/src/Extension/CommonMark/Parser/Inline/OpenBracketParser.php +++ b/src/Extension/CommonMark/Parser/Inline/OpenBracketParser.php @@ -19,16 +19,14 @@ use League\CommonMark\Delimiter\Delimiter; use League\CommonMark\Node\Inline\Text; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; final class OpenBracketParser implements InlineParserInterface { - /** - * {@inheritdoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return ['[']; + return InlineParserMatch::string('['); } public function parse(InlineParserContext $inlineContext): bool diff --git a/src/Extension/Footnote/Parser/AnonymousFootnoteRefParser.php b/src/Extension/Footnote/Parser/AnonymousFootnoteRefParser.php index 660cefa48f..78bc141ac5 100644 --- a/src/Extension/Footnote/Parser/AnonymousFootnoteRefParser.php +++ b/src/Extension/Footnote/Parser/AnonymousFootnoteRefParser.php @@ -20,6 +20,7 @@ use 
League\CommonMark\Normalizer\SlugNormalizer; use League\CommonMark\Normalizer\TextNormalizerInterface; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; use League\CommonMark\Reference\Reference; @@ -40,12 +41,9 @@ public function __construct() $this->slugNormalizer = new SlugNormalizer(); } - /** - * {@inheritDoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return ['^']; + return InlineParserMatch::regex('\^\[[^\]]+\]'); } public function parse(InlineParserContext $inlineContext): bool diff --git a/src/Extension/Footnote/Parser/FootnoteRefParser.php b/src/Extension/Footnote/Parser/FootnoteRefParser.php index c8ede30c0b..2209eafb44 100644 --- a/src/Extension/Footnote/Parser/FootnoteRefParser.php +++ b/src/Extension/Footnote/Parser/FootnoteRefParser.php @@ -18,6 +18,7 @@ use League\CommonMark\Configuration\ConfigurationInterface; use League\CommonMark\Extension\Footnote\Node\FootnoteRef; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; use League\CommonMark\Reference\Reference; @@ -26,12 +27,9 @@ final class FootnoteRefParser implements InlineParserInterface, ConfigurationAwa /** @var ConfigurationInterface */ private $config; - /** - * {@inheritDoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return ['[']; + return InlineParserMatch::regex('\[\^([^\s\]]+)\]'); } public function parse(InlineParserContext $inlineContext): bool diff --git a/src/Extension/Mention/MentionParser.php b/src/Extension/Mention/MentionParser.php index 5333734540..7319fd91ac 100644 --- a/src/Extension/Mention/MentionParser.php +++ b/src/Extension/Mention/MentionParser.php @@ -17,6 +17,7 @@ use 
League\CommonMark\Extension\Mention\Generator\MentionGeneratorInterface; use League\CommonMark\Extension\Mention\Generator\StringTemplateLinkGenerator; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; final class MentionParser implements InlineParserInterface @@ -49,12 +50,9 @@ public function __construct(string $symbol, string $mentionRegex, MentionGenerat $this->mentionGenerator = $mentionGenerator; } - /** - * {@inheritDoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return [$this->symbol]; + return InlineParserMatch::string($this->symbol); } public function parse(InlineParserContext $inlineContext): bool diff --git a/src/Extension/SmartPunct/PunctuationParser.php b/src/Extension/SmartPunct/PunctuationParser.php index 73f2940e51..469381fa42 100644 --- a/src/Extension/SmartPunct/PunctuationParser.php +++ b/src/Extension/SmartPunct/PunctuationParser.php @@ -18,16 +18,14 @@ use League\CommonMark\Node\Inline\Text; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; final class PunctuationParser implements InlineParserInterface { - /** - * {@inheritdoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return ['-', '.']; + return InlineParserMatch::oneOf('-', '.'); } public function parse(InlineParserContext $inlineContext): bool diff --git a/src/Extension/SmartPunct/QuoteParser.php b/src/Extension/SmartPunct/QuoteParser.php index 64a0eabf5a..f42821064f 100644 --- a/src/Extension/SmartPunct/QuoteParser.php +++ b/src/Extension/SmartPunct/QuoteParser.php @@ -18,6 +18,7 @@ use League\CommonMark\Delimiter\Delimiter; use League\CommonMark\Parser\Inline\InlineParserInterface; +use 
League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; use League\CommonMark\Util\RegexHelper; @@ -26,12 +27,9 @@ final class QuoteParser implements InlineParserInterface public const DOUBLE_QUOTES = [Quote::DOUBLE_QUOTE, Quote::DOUBLE_QUOTE_OPENER, Quote::DOUBLE_QUOTE_CLOSER]; public const SINGLE_QUOTES = [Quote::SINGLE_QUOTE, Quote::SINGLE_QUOTE_OPENER, Quote::SINGLE_QUOTE_CLOSER]; - /** - * {@inheritdoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return \array_merge(self::DOUBLE_QUOTES, self::SINGLE_QUOTES); + return InlineParserMatch::oneOf(...\array_merge(self::DOUBLE_QUOTES, self::SINGLE_QUOTES)); } /** diff --git a/src/Extension/TaskList/TaskListItemMarkerParser.php b/src/Extension/TaskList/TaskListItemMarkerParser.php index 6a26492f80..fbdfd09e22 100644 --- a/src/Extension/TaskList/TaskListItemMarkerParser.php +++ b/src/Extension/TaskList/TaskListItemMarkerParser.php @@ -16,16 +16,14 @@ use League\CommonMark\Extension\CommonMark\Node\Block\ListItem; use League\CommonMark\Node\Block\Paragraph; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; final class TaskListItemMarkerParser implements InlineParserInterface { - /** - * {@inheritdoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return ['[']; + return InlineParserMatch::oneOf('[ ]', '[x]'); } public function parse(InlineParserContext $inlineContext): bool diff --git a/src/Parser/Inline/InlineParserInterface.php b/src/Parser/Inline/InlineParserInterface.php index e49ba2f3e0..fd13435bcf 100644 --- a/src/Parser/Inline/InlineParserInterface.php +++ b/src/Parser/Inline/InlineParserInterface.php @@ -17,10 +17,7 @@ interface InlineParserInterface { - /** - * @return string[] - */ - public function getCharacters(): array; + public 
function getMatchDefinition(): InlineParserMatch; public function parse(InlineParserContext $inlineContext): bool; } diff --git a/src/Parser/Inline/InlineParserMatch.php b/src/Parser/Inline/InlineParserMatch.php new file mode 100644 index 0000000000..09cd6e16e9 --- /dev/null +++ b/src/Parser/Inline/InlineParserMatch.php @@ -0,0 +1,59 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace League\CommonMark\Parser\Inline; + +final class InlineParserMatch +{ + /** @var string */ + private $regex; + + private function __construct(string $regex) + { + $this->regex = $regex; + } + + /** + * @internal + */ + public function getRegex(): string + { + return $this->regex; + } + + /** + * Match the given string (case-insensitive) + */ + public static function string(string $str): self + { + return new self('/' . \preg_quote($str, '/') . '/i'); + } + + /** + * Match any of the given strings (case-insensitive) + */ + public static function oneOf(string ...$str): self + { + return new self('/' . \implode('|', \array_map(static function (string $str): string { + return \preg_quote($str, '/'); + }, $str)) . '/i'); + } + + /** + * Match a partial regular expression without starting/ending delimiters, anchors, or flags + */ + public static function regex(string $regex): self + { + return new self('/' . $regex . 
'/i'); + } +} diff --git a/src/Parser/Inline/NewlineParser.php b/src/Parser/Inline/NewlineParser.php index f21d0548cc..eb10d917e6 100644 --- a/src/Parser/Inline/NewlineParser.php +++ b/src/Parser/Inline/NewlineParser.php @@ -22,12 +22,9 @@ final class NewlineParser implements InlineParserInterface { - /** - * {@inheritdoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return ["\n"]; + return InlineParserMatch::regex('\\n'); } public function parse(InlineParserContext $inlineContext): bool diff --git a/tests/unit/Environment/FakeInjectableInlineParser.php b/tests/unit/Environment/FakeInjectableInlineParser.php index 655cb9e9c1..75e6b9074e 100644 --- a/tests/unit/Environment/FakeInjectableInlineParser.php +++ b/tests/unit/Environment/FakeInjectableInlineParser.php @@ -16,18 +16,16 @@ use League\CommonMark\Configuration\ConfigurationAwareInterface; use League\CommonMark\Environment\EnvironmentAwareInterface; use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; final class FakeInjectableInlineParser implements InlineParserInterface, ConfigurationAwareInterface, EnvironmentAwareInterface { use FakeInjectableTrait; - /** - * {@inheritdoc} - */ - public function getCharacters(): array + public function getMatchDefinition(): InlineParserMatch { - return []; + return InlineParserMatch::oneOf(''); } public function parse(InlineParserContext $inlineContext): bool diff --git a/tests/unit/Parser/Inline/InlineParserMatchTest.php b/tests/unit/Parser/Inline/InlineParserMatchTest.php new file mode 100644 index 0000000000..559d5e2c96 --- /dev/null +++ b/tests/unit/Parser/Inline/InlineParserMatchTest.php @@ -0,0 +1,43 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ + +namespace League\CommonMark\Tests\Unit\Parser\Inline; + +use League\CommonMark\Parser\Inline\InlineParserMatch; +use PHPUnit\Framework\TestCase; + +final class InlineParserMatchTest extends TestCase +{ + /** + * @dataProvider provideDataForTesting + */ + public function testGetRegex(InlineParserMatch $definition, string $expectedRegex): void + { + $this->assertSame($expectedRegex, $definition->getRegex()); + } + + /** + * @return iterable + */ + public function provideDataForTesting(): iterable + { + yield [InlineParserMatch::string('.'), '/\./i']; + yield [InlineParserMatch::string('...'), '/\.\.\./i']; + yield [InlineParserMatch::string('foo'), '/foo/i']; + yield [InlineParserMatch::string('🎉'), '/🎉/i']; + yield [InlineParserMatch::string('/r/'), '/\/r\//i']; + yield [InlineParserMatch::oneOf('foo', 'bar'), '/foo|bar/i']; + yield [InlineParserMatch::oneOf('foo', '.', '[x]'), '/foo|\.|\[x\]/i']; + yield [InlineParserMatch::regex('[\w-_]{3,}'), '/[\w-_]{3,}/i']; + } +} From 5ae1602e6ac8b293b5781c061b47a141080df76a Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 26 Sep 2020 14:31:08 -0400 Subject: [PATCH 2/9] Simplify handling of inline parsers within the Environment --- CHANGELOG.md | 3 + src/Environment/Environment.php | 71 ++------------- src/Environment/EnvironmentInterface.php | 9 +- tests/unit/Environment/EnvironmentTest.php | 100 +-------------------- 4 files changed, 15 insertions(+), 168 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f762d0596..4f1f8ded1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ See for detailed informatio - `RenderedContentInterface` - Added several new methods: - `Environment::setEventDispatcher()` + - `EnvironmentInterface::getInlineParsers()` - `FencedCode::setInfo()` - `Heading::setLevel()` - `HtmlRenderer::renderDocument()` @@ -162,6 +163,8 @@ See for detailed informatio - `AbstractBlock::finalize()` - `ConfigurableEnvironmentInterface::addBlockParser()` - `Delimiter::setCanClose()` + 
- `EnvironmentInterface::getInlineParsersForCharacter()` + - `EnvironmentInterface::getInlineParserCharacterRegex()` - `HtmlRenderer::renderBlock()` - `HtmlRenderer::renderBlocks()` - `HtmlRenderer::renderInline()` diff --git a/src/Environment/Environment.php b/src/Environment/Environment.php index 55fa68de5a..83556476c6 100644 --- a/src/Environment/Environment.php +++ b/src/Environment/Environment.php @@ -70,13 +70,6 @@ final class Environment implements ConfigurableEnvironmentInterface, ListenerPro */ private $inlineParsers; - /** - * @var array> - * - * @psalm-readonly-allow-private-mutation - */ - private $inlineParsersByCharacter = []; - /** * @var DelimiterProcessorCollection * @@ -108,13 +101,6 @@ final class Environment implements ConfigurableEnvironmentInterface, ListenerPro */ private $config; - /** - * @var string - * - * @psalm-readonly-allow-private-mutation - */ - private $inlineParserCharacterRegex; - /** * @param array $config */ @@ -173,14 +159,6 @@ public function addInlineParser(InlineParserInterface $parser, int $priority = 0 $this->inlineParsers->add($parser, $priority); $this->injectEnvironmentAndConfigurationIfNeeded($parser); - foreach ($parser->getCharacters() as $character) { - if (! isset($this->inlineParsersByCharacter[$character])) { - $this->inlineParsersByCharacter[$character] = new PrioritizedList(); - } - - $this->inlineParsersByCharacter[$character]->add($parser, $priority); - } - return $this; } @@ -219,22 +197,6 @@ public function getBlockStartParsers(): iterable return $this->blockStartParsers->getIterator(); } - /** - * {@inheritdoc} - */ - public function getInlineParsersForCharacter(string $character): iterable - { - if (! $this->extensionsInitialized) { - $this->initializeExtensions(); - } - - if (! isset($this->inlineParsersByCharacter[$character])) { - return []; - } - - return $this->inlineParsersByCharacter[$character]->getIterator(); - } - public function getDelimiterProcessors(): DelimiterProcessorCollection { if (! 
$this->extensionsInitialized) { @@ -308,9 +270,6 @@ private function initializeExtensions(): void $this->extensionsInitialized = true; - // Lastly, let's build a regex which matches non-inline characters - // This will enable a huge performance boost with inline parsing - $this->buildInlineParserCharacterRegex(); } private function injectEnvironmentAndConfigurationIfNeeded(object $object): void @@ -350,11 +309,6 @@ public static function createGFMEnvironment(): ConfigurableEnvironmentInterface return $environment; } - public function getInlineParserCharacterRegex(): string - { - return $this->inlineParserCharacterRegex; - } - public function addEventListener(string $eventClass, callable $listener, int $priority = 0): ConfigurableEnvironmentInterface { $this->assertUninitialized('Failed to add event listener.'); @@ -423,25 +377,16 @@ public function getListenersForEvent(object $event): iterable } } - private function buildInlineParserCharacterRegex(): void + /** + * @return iterable + */ + public function getInlineParsers(): iterable { - $chars = \array_unique(\array_merge( - \array_keys($this->inlineParsersByCharacter), - $this->delimiterProcessors->getDelimiterCharacters() - )); - - if (\count($chars) === 0) { - // If no special inline characters exist then parse the whole line - $this->inlineParserCharacterRegex = '/^.+$/'; - } else { - // Match any character which inline parsers are not interested in - $this->inlineParserCharacterRegex = '/^[^' . \preg_quote(\implode('', $chars), '/') . ']+/'; - - // Only add the u modifier (which slows down performance) if we have a multi-byte UTF-8 character in our regex - if (\strlen($this->inlineParserCharacterRegex) > \mb_strlen($this->inlineParserCharacterRegex)) { - $this->inlineParserCharacterRegex .= 'u'; - } + if (! 
$this->extensionsInitialized) { + $this->initializeExtensions(); } + + return $this->inlineParsers->getIterator(); } /** diff --git a/src/Environment/EnvironmentInterface.php b/src/Environment/EnvironmentInterface.php index 7b1c15beaf..830c02b674 100644 --- a/src/Environment/EnvironmentInterface.php +++ b/src/Environment/EnvironmentInterface.php @@ -37,7 +37,7 @@ public function getBlockStartParsers(): iterable; /** * @return iterable */ - public function getInlineParsersForCharacter(string $character): iterable; + public function getInlineParsers(): iterable; public function getDelimiterProcessors(): DelimiterProcessorCollection; @@ -47,11 +47,4 @@ public function getDelimiterProcessors(): DelimiterProcessorCollection; * @return iterable */ public function getRenderersForClass(string $nodeClass): iterable; - - /** - * Regex which matches any character which doesn't indicate an inline element - * - * This allows us to parse multiple non-special characters at once - */ - public function getInlineParserCharacterRegex(): string; } diff --git a/tests/unit/Environment/EnvironmentTest.php b/tests/unit/Environment/EnvironmentTest.php index f00bd55c0b..74050d16ee 100644 --- a/tests/unit/Environment/EnvironmentTest.php +++ b/tests/unit/Environment/EnvironmentTest.php @@ -186,21 +186,6 @@ public function testAddRendererFailsAfterInitialization(): void $environment->addRenderer('MyClass', $renderer); } - public function testInlineParserCanMatchRegexDelimiter(): void - { - $environment = new Environment(); - - $parser = $this->createMock(InlineParserInterface::class); - $parser->expects($this->any()) - ->method('getCharacters') - ->will($this->returnValue(['/'])); - - $environment->addInlineParser($parser); - $environment->getInlineParsersForCharacter('/'); - - $this->assertEquals(1, \preg_match($environment->getInlineParserCharacterRegex(), 'foo/bar')); - } - public function testAddInlineParserFailsAfterInitialization(): void { 
$this->expectException(\RuntimeException::class); @@ -208,33 +193,12 @@ public function testAddInlineParserFailsAfterInitialization(): void $environment = new Environment(); // This triggers the initialization - $environment->getInlineParsersForCharacter(''); + $environment->getInlineParsers(); $parser = $this->createMock(InlineParserInterface::class); $environment->addInlineParser($parser); } - public function testGetInlineParsersForCharacter(): void - { - $environment = new Environment(); - - $parser = $this->createMock(InlineParserInterface::class); - $parser->expects($this->any()) - ->method('getCharacters') - ->will($this->returnValue(['a'])); - - $environment->addInlineParser($parser); - - $this->assertContains($parser, $environment->getInlineParsersForCharacter('a')); - } - - public function testGetInlineParsersForNonExistantCharacter(): void - { - $environment = new Environment(); - - $this->assertEmpty($environment->getInlineParsersForCharacter('a')); - } - public function testAddDelimiterProcessor(): void { $environment = new Environment(); @@ -303,61 +267,6 @@ public function testAddExtensionFailsAfterInitialization(): void $environment->addExtension($extension); } - public function testGetInlineParserCharacterRegexForEmptyEnvironment(): void - { - $environment = new Environment(); - - // This triggers the initialization which builds the regex - $environment->getInlineParsersForCharacter(''); - - $regex = $environment->getInlineParserCharacterRegex(); - - $test = '*This* should match **everything** including chars like `[`.'; - $matches = []; - \preg_match($regex, $test, $matches); - $this->assertSame($test, $matches[0]); - } - - public function testGetInlineParserCharacterRegexForAsciiCharacters(): void - { - $environment = new Environment(); - - $parser1 = $this->createMock(InlineParserInterface::class); - $parser1->method('getCharacters')->willReturn(['*']); - $environment->addInlineParser($parser1); - - $parser2 = 
$this->createMock(InlineParserInterface::class); - $parser2->method('getCharacters')->willReturn(['[']); - $environment->addInlineParser($parser2); - - // This triggers the initialization which builds the regex - $environment->getInlineParsersForCharacter(''); - - $regex = $environment->getInlineParserCharacterRegex(); - - $this->assertSame('/^[^\*\[]+/', $regex); - } - - public function testGetInlineParserCharacterRegexForMultibyteCharacters(): void - { - $environment = new Environment(); - - $parser1 = $this->createMock(InlineParserInterface::class); - $parser1->method('getCharacters')->willReturn(['*']); - $environment->addInlineParser($parser1); - - $parser2 = $this->createMock(InlineParserInterface::class); - $parser2->method('getCharacters')->willReturn(['★']); - $environment->addInlineParser($parser2); - - // This triggers the initialization which builds the regex - $environment->getInlineParsersForCharacter(''); - - $regex = $environment->getInlineParserCharacterRegex(); - - $this->assertSame('/^[^\*★]+/u', $regex); - } - public function testInjectableBlockStartParsersGetInjected(): void { $environment = new Environment(); @@ -451,22 +360,19 @@ public function testBlockParserPrioritization(): void $this->assertSame($parser3, $parsers[2]); } - public function testInlineParserPrioritization(): void + public function testGetInlineParsersWithPrioritization(): void { $environment = new Environment(); $parser1 = $this->createMock(InlineParserInterface::class); - $parser1->method('getCharacters')->willReturn(['a']); $parser2 = $this->createMock(InlineParserInterface::class); - $parser2->method('getCharacters')->willReturn(['a']); $parser3 = $this->createMock(InlineParserInterface::class); - $parser3->method('getCharacters')->willReturn(['a']); $environment->addInlineParser($parser1); $environment->addInlineParser($parser2, 50); $environment->addInlineParser($parser3); - $parsers = \iterator_to_array($environment->getInlineParsersForCharacter('a')); + $parsers = 
\iterator_to_array($environment->getInlineParsers()); $this->assertSame($parser2, $parsers[0]); $this->assertSame($parser1, $parsers[1]); From d135ba8c8be09d9719d262738c068fe8214c3f0d Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 26 Sep 2020 14:36:14 -0400 Subject: [PATCH 3/9] Optimization: provide already-matched text to the inline parser --- CHANGELOG.md | 1 + .../Parser/AttributesInlineParser.php | 8 ++----- .../Parser/Inline/AutolinkParser.php | 2 +- .../Parser/Inline/BacktickParser.php | 8 +++---- .../CommonMark/Parser/Inline/BangParser.php | 19 +++++++-------- .../Parser/Inline/CloseBracketParser.php | 2 +- .../CommonMark/Parser/Inline/EntityParser.php | 11 ++++----- .../Parser/Inline/EscapableParser.php | 2 +- .../Parser/Inline/HtmlInlineParser.php | 11 ++++----- .../Parser/Inline/OpenBracketParser.php | 2 +- .../Parser/AnonymousFootnoteRefParser.php | 24 +++++-------------- .../Footnote/Parser/FootnoteRefParser.php | 23 ++++-------------- src/Extension/Mention/MentionParser.php | 2 +- .../SmartPunct/PunctuationParser.php | 7 +++--- src/Extension/SmartPunct/QuoteParser.php | 11 +++------ .../TaskList/TaskListItemMarkerParser.php | 9 +++---- src/Parser/Inline/InlineParserInterface.php | 2 +- src/Parser/Inline/NewlineParser.php | 2 +- .../FakeInjectableInlineParser.php | 2 +- .../Parser/Inline/BacktickParserTest.php | 2 +- 20 files changed, 51 insertions(+), 99 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f1f8ded1f..7d78276c3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,7 @@ See for detailed informatio - The paragraph parser no longer needs to be added manually to the environment - Implemented a new approach to inline parsing where parsers can now specify longer strings or regular expressions they want to parse (instead of just single characters): - `InlineParserInterface::getCharacters()` is now `getMatchDefinition()` and returns an instance of `InlineParserMatch` + - `InlineParserInterface::parse()` has a new parameter 
containing the pre-matched text - Changed block and inline rendering to use common methods and interfaces - `BlockRendererInterface` and `InlineRendererInterface` were replaced by `NodeRendererInterface` with slightly different parameters. All core renderers now implement this interface. - `ConfigurableEnvironmentInterface::addBlockRenderer()` and `addInlineRenderer()` are now just `addRenderer()` diff --git a/src/Extension/Attributes/Parser/AttributesInlineParser.php b/src/Extension/Attributes/Parser/AttributesInlineParser.php index bd4aaa3ecc..5b316b311d 100644 --- a/src/Extension/Attributes/Parser/AttributesInlineParser.php +++ b/src/Extension/Attributes/Parser/AttributesInlineParser.php @@ -27,14 +27,10 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::oneOf(' ', '{'); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { + $char = $match; $cursor = $inlineContext->getCursor(); - if ($cursor->getNextNonSpaceCharacter() !== '{') { - return false; - } - - $char = $cursor->getCharacter(); if ($char === '{') { $char = (string) $cursor->getCharacter($cursor->getPosition() - 1); } diff --git a/src/Extension/CommonMark/Parser/Inline/AutolinkParser.php b/src/Extension/CommonMark/Parser/Inline/AutolinkParser.php index 48cd7d4dba..5a78e9e357 100644 --- a/src/Extension/CommonMark/Parser/Inline/AutolinkParser.php +++ b/src/Extension/CommonMark/Parser/Inline/AutolinkParser.php @@ -32,7 +32,7 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::regex(self::EMAIL_REGEX . '|' . self::OTHER_LINK_REGEX); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { $cursor = $inlineContext->getCursor(); if ($m = $cursor->match('/^' . self::EMAIL_REGEX . 
'/')) { diff --git a/src/Extension/CommonMark/Parser/Inline/BacktickParser.php b/src/Extension/CommonMark/Parser/Inline/BacktickParser.php index 3312f22c47..440746a410 100644 --- a/src/Extension/CommonMark/Parser/Inline/BacktickParser.php +++ b/src/Extension/CommonMark/Parser/Inline/BacktickParser.php @@ -29,13 +29,11 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::regex('`+'); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { + $ticks = $match; $cursor = $inlineContext->getCursor(); - $ticks = $cursor->match('/^`+/'); - if ($ticks === null) { - return false; // This should never happen - } + $cursor->advanceBy(\mb_strlen($ticks)); $currentPosition = $cursor->getPosition(); $previousState = $cursor->saveState(); diff --git a/src/Extension/CommonMark/Parser/Inline/BangParser.php b/src/Extension/CommonMark/Parser/Inline/BangParser.php index d5931700fd..92ede4e27e 100644 --- a/src/Extension/CommonMark/Parser/Inline/BangParser.php +++ b/src/Extension/CommonMark/Parser/Inline/BangParser.php @@ -29,21 +29,18 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::string('!['); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { $cursor = $inlineContext->getCursor(); - if ($cursor->peek() === '[') { - $cursor->advanceBy(2); - $node = new Text('![', ['delim' => true]); - $inlineContext->getContainer()->appendChild($node); - // Add entry to stack for this opener - $delimiter = new Delimiter('!', 1, $node, true, false, $cursor->getPosition()); - $inlineContext->getDelimiterStack()->push($delimiter); + $cursor->advanceBy(2); + $node = new Text('![', ['delim' => true]); + $inlineContext->getContainer()->appendChild($node); - return true; - } + // Add entry to stack for this opener + $delimiter = new Delimiter('!', 
1, $node, true, false, $cursor->getPosition()); + $inlineContext->getDelimiterStack()->push($delimiter); - return false; + return true; } } diff --git a/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php b/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php index 5572e19299..6e2ad348b7 100644 --- a/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php +++ b/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php @@ -46,7 +46,7 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::string(']'); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { // Look through stack of delimiters for a [ or ! $opener = $inlineContext->getDelimiterStack()->searchByCharacter(['[', '!']); diff --git a/src/Extension/CommonMark/Parser/Inline/EntityParser.php b/src/Extension/CommonMark/Parser/Inline/EntityParser.php index a410852e24..b1af159238 100644 --- a/src/Extension/CommonMark/Parser/Inline/EntityParser.php +++ b/src/Extension/CommonMark/Parser/Inline/EntityParser.php @@ -30,14 +30,11 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::regex(RegexHelper::PARTIAL_ENTITY); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { - if ($m = $inlineContext->getCursor()->match('/^' . RegexHelper::PARTIAL_ENTITY . 
'/i')) { - $inlineContext->getContainer()->appendChild(new Text(Html5EntityDecoder::decode($m))); + $inlineContext->getCursor()->advanceBy(\mb_strlen($match)); + $inlineContext->getContainer()->appendChild(new Text(Html5EntityDecoder::decode($match))); - return true; - } - - return false; + return true; } } diff --git a/src/Extension/CommonMark/Parser/Inline/EscapableParser.php b/src/Extension/CommonMark/Parser/Inline/EscapableParser.php index 64e6fab851..fd48adb6ab 100644 --- a/src/Extension/CommonMark/Parser/Inline/EscapableParser.php +++ b/src/Extension/CommonMark/Parser/Inline/EscapableParser.php @@ -30,7 +30,7 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::string('\\'); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { $cursor = $inlineContext->getCursor(); $nextChar = $cursor->peek(); diff --git a/src/Extension/CommonMark/Parser/Inline/HtmlInlineParser.php b/src/Extension/CommonMark/Parser/Inline/HtmlInlineParser.php index 62c416e59d..fa9934f2f9 100644 --- a/src/Extension/CommonMark/Parser/Inline/HtmlInlineParser.php +++ b/src/Extension/CommonMark/Parser/Inline/HtmlInlineParser.php @@ -29,14 +29,11 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::regex(RegexHelper::PARTIAL_HTMLTAG); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { - if ($m = $inlineContext->getCursor()->match('/^' . RegexHelper::PARTIAL_HTMLTAG . 
'/i')) { - $inlineContext->getContainer()->appendChild(new HtmlInline($m)); + $inlineContext->getCursor()->advanceBy(\mb_strlen($match)); + $inlineContext->getContainer()->appendChild(new HtmlInline($match)); - return true; - } - - return false; + return true; } } diff --git a/src/Extension/CommonMark/Parser/Inline/OpenBracketParser.php b/src/Extension/CommonMark/Parser/Inline/OpenBracketParser.php index 2b52d1cdc6..04b1cbeaae 100644 --- a/src/Extension/CommonMark/Parser/Inline/OpenBracketParser.php +++ b/src/Extension/CommonMark/Parser/Inline/OpenBracketParser.php @@ -29,7 +29,7 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::string('['); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { $inlineContext->getCursor()->advanceBy(1); $node = new Text('[', ['delim' => true]); diff --git a/src/Extension/Footnote/Parser/AnonymousFootnoteRefParser.php b/src/Extension/Footnote/Parser/AnonymousFootnoteRefParser.php index 78bc141ac5..24456cb3fd 100644 --- a/src/Extension/Footnote/Parser/AnonymousFootnoteRefParser.php +++ b/src/Extension/Footnote/Parser/AnonymousFootnoteRefParser.php @@ -46,30 +46,18 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::regex('\^\[[^\]]+\]'); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { - $container = $inlineContext->getContainer(); - $cursor = $inlineContext->getCursor(); - $nextChar = $cursor->peek(); - if ($nextChar !== '[') { + if (\preg_match('#\^\[([^\]]+)\]#', $match, $matches) <= 0) { return false; } - $state = $cursor->saveState(); + $inlineContext->getCursor()->advanceBy(\mb_strlen($match)); - $m = $cursor->match('#\^\[[^\]]+\]#'); - if ($m !== null) { - if (\preg_match('#\^\[([^\]]+)\]#', $m, $matches) > 0) { - $reference = $this->createReference($matches[1]); - 
$container->appendChild(new FootnoteRef($reference, $matches[1])); + $reference = $this->createReference($matches[1]); + $inlineContext->getContainer()->appendChild(new FootnoteRef($reference, $matches[1])); - return true; - } - } - - $cursor->restoreState($state); - - return false; + return true; } /** diff --git a/src/Extension/Footnote/Parser/FootnoteRefParser.php b/src/Extension/Footnote/Parser/FootnoteRefParser.php index 2209eafb44..b196143802 100644 --- a/src/Extension/Footnote/Parser/FootnoteRefParser.php +++ b/src/Extension/Footnote/Parser/FootnoteRefParser.php @@ -32,29 +32,16 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::regex('\[\^([^\s\]]+)\]'); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { - $container = $inlineContext->getContainer(); - $cursor = $inlineContext->getCursor(); - $nextChar = $cursor->peek(); - if ($nextChar !== '^') { + if (\preg_match('#\[\^([^\s\]]+)\]#', $match, $matches) <= 0) { return false; } - $state = $cursor->saveState(); + $inlineContext->getCursor()->advanceBy(\mb_strlen($match)); + $inlineContext->getContainer()->appendChild(new FootnoteRef($this->createReference($matches[1]))); - $m = $cursor->match('#\[\^([^\s\]]+)\]#'); - if ($m !== null) { - if (\preg_match('#\[\^([^\s\]]+)\]#', $m, $matches) > 0) { - $container->appendChild(new FootnoteRef($this->createReference($matches[1]))); - - return true; - } - } - - $cursor->restoreState($state); - - return false; + return true; } private function createReference(string $label): Reference diff --git a/src/Extension/Mention/MentionParser.php b/src/Extension/Mention/MentionParser.php index 7319fd91ac..a18f238fbb 100644 --- a/src/Extension/Mention/MentionParser.php +++ b/src/Extension/Mention/MentionParser.php @@ -55,7 +55,7 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::string($this->symbol); } - public 
function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { $cursor = $inlineContext->getCursor(); diff --git a/src/Extension/SmartPunct/PunctuationParser.php b/src/Extension/SmartPunct/PunctuationParser.php index 469381fa42..a2918ac1b3 100644 --- a/src/Extension/SmartPunct/PunctuationParser.php +++ b/src/Extension/SmartPunct/PunctuationParser.php @@ -28,20 +28,19 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::oneOf('-', '.'); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { $cursor = $inlineContext->getCursor(); - $ch = $cursor->getCharacter(); // Ellipses - if ($ch === '.' && $matched = $cursor->match('/^\\.( ?\\.)\\1/')) { + if ($match === '.' && $matched = $cursor->match('/^\\.( ?\\.)\\1/')) { $inlineContext->getContainer()->appendChild(new Text('…')); return true; } // Em/En-dashes - if ($ch === '-' && $matched = $cursor->match('/^(?match('/^(?getCursor(); - $character = $cursor->getCharacter(); + $cursor = $inlineContext->getCursor(); - if ($character === null) { - return false; - } - - $normalizedCharacter = $this->getNormalizedQuoteCharacter($character); + $normalizedCharacter = $this->getNormalizedQuoteCharacter($match); $charBefore = $cursor->peek(-1); if ($charBefore === null) { diff --git a/src/Extension/TaskList/TaskListItemMarkerParser.php b/src/Extension/TaskList/TaskListItemMarkerParser.php index fbdfd09e22..9b787a4c6a 100644 --- a/src/Extension/TaskList/TaskListItemMarkerParser.php +++ b/src/Extension/TaskList/TaskListItemMarkerParser.php @@ -26,7 +26,7 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::oneOf('[ ]', '[x]'); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { $container = 
$inlineContext->getContainer(); @@ -38,10 +38,7 @@ public function parse(InlineParserContext $inlineContext): bool $cursor = $inlineContext->getCursor(); $oldState = $cursor->saveState(); - $m = $cursor->match('/\[[ xX]\]/'); - if ($m === null) { - return false; - } + $cursor->advanceBy(3); if ($cursor->getNextNonSpaceCharacter() === null) { $cursor->restoreState($oldState); @@ -49,7 +46,7 @@ public function parse(InlineParserContext $inlineContext): bool return false; } - $isChecked = $m !== '[ ]'; + $isChecked = $match !== '[ ]'; $container->appendChild(new TaskListItemMarker($isChecked)); diff --git a/src/Parser/Inline/InlineParserInterface.php b/src/Parser/Inline/InlineParserInterface.php index fd13435bcf..c5cd3b5d36 100644 --- a/src/Parser/Inline/InlineParserInterface.php +++ b/src/Parser/Inline/InlineParserInterface.php @@ -19,5 +19,5 @@ interface InlineParserInterface { public function getMatchDefinition(): InlineParserMatch; - public function parse(InlineParserContext $inlineContext): bool; + public function parse(string $match, InlineParserContext $inlineContext): bool; } diff --git a/src/Parser/Inline/NewlineParser.php b/src/Parser/Inline/NewlineParser.php index eb10d917e6..8afcc45eb2 100644 --- a/src/Parser/Inline/NewlineParser.php +++ b/src/Parser/Inline/NewlineParser.php @@ -27,7 +27,7 @@ public function getMatchDefinition(): InlineParserMatch return InlineParserMatch::regex('\\n'); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { $inlineContext->getCursor()->advanceBy(1); diff --git a/tests/unit/Environment/FakeInjectableInlineParser.php b/tests/unit/Environment/FakeInjectableInlineParser.php index 75e6b9074e..5dcc1dde99 100644 --- a/tests/unit/Environment/FakeInjectableInlineParser.php +++ b/tests/unit/Environment/FakeInjectableInlineParser.php @@ -28,7 +28,7 @@ public function getMatchDefinition(): InlineParserMatch return 
InlineParserMatch::oneOf(''); } - public function parse(InlineParserContext $inlineContext): bool + public function parse(string $match, InlineParserContext $inlineContext): bool { return false; } diff --git a/tests/unit/Extension/CommonMark/Parser/Inline/BacktickParserTest.php b/tests/unit/Extension/CommonMark/Parser/Inline/BacktickParserTest.php index 57d5f1c673..633b92c880 100644 --- a/tests/unit/Extension/CommonMark/Parser/Inline/BacktickParserTest.php +++ b/tests/unit/Extension/CommonMark/Parser/Inline/BacktickParserTest.php @@ -38,7 +38,7 @@ public function testParse(string $string, string $expectedContents): void $inlineContext->getCursor()->advanceBy($firstBacktickPos); $parser = new BacktickParser(); - $this->assertTrue($parser->parse($inlineContext)); + $this->assertTrue($parser->parse('`', $inlineContext)); $codeBlock = $paragraph->firstChild(); \assert($codeBlock instanceof Code); From f5ac044a752aa5ec5134bddff314872bbba9e0b8 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 26 Sep 2020 14:37:17 -0400 Subject: [PATCH 4/9] Require the cursor to be injected into the context --- CHANGELOG.md | 1 + src/Parser/InlineParserContext.php | 4 ++-- .../CommonMark/Parser/Inline/BacktickParserTest.php | 8 +++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d78276c3a..2c79f94da3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -63,6 +63,7 @@ See for detailed informatio - Implemented a new approach to inline parsing where parsers can now specify longer strings or regular expressions they want to parse (instead of just single characters): - `InlineParserInterface::getCharacters()` is now `getMatchDefinition()` and returns an instance of `InlineParserMatch` - `InlineParserInterface::parse()` has a new parameter containing the pre-matched text + - `InlineParserContext::__construct()` now requires the contents to be provided as a `Cursor` instead of a `string` - Changed block and inline rendering to use common 
methods and interfaces - `BlockRendererInterface` and `InlineRendererInterface` were replaced by `NodeRendererInterface` with slightly different parameters. All core renderers now implement this interface. - `ConfigurableEnvironmentInterface::addBlockRenderer()` and `addInlineRenderer()` are now just `addRenderer()` diff --git a/src/Parser/InlineParserContext.php b/src/Parser/InlineParserContext.php index 97fee43b42..d2f3940022 100644 --- a/src/Parser/InlineParserContext.php +++ b/src/Parser/InlineParserContext.php @@ -50,11 +50,11 @@ final class InlineParserContext */ private $delimiterStack; - public function __construct(string $contents, AbstractBlock $container, ReferenceMapInterface $referenceMap) + public function __construct(Cursor $contents, AbstractBlock $container, ReferenceMapInterface $referenceMap) { $this->referenceMap = $referenceMap; $this->container = $container; - $this->cursor = new Cursor(\trim($contents)); + $this->cursor = $contents; $this->delimiterStack = new DelimiterStack(); } diff --git a/tests/unit/Extension/CommonMark/Parser/Inline/BacktickParserTest.php b/tests/unit/Extension/CommonMark/Parser/Inline/BacktickParserTest.php index 633b92c880..fdabf9c231 100644 --- a/tests/unit/Extension/CommonMark/Parser/Inline/BacktickParserTest.php +++ b/tests/unit/Extension/CommonMark/Parser/Inline/BacktickParserTest.php @@ -19,6 +19,7 @@ use League\CommonMark\Extension\CommonMark\Node\Inline\Code; use League\CommonMark\Extension\CommonMark\Parser\Inline\BacktickParser; use League\CommonMark\Node\Block\Paragraph; +use League\CommonMark\Parser\Cursor; use League\CommonMark\Parser\InlineParserContext; use League\CommonMark\Reference\ReferenceMapInterface; use PHPUnit\Framework\TestCase; @@ -31,14 +32,15 @@ class BacktickParserTest extends TestCase public function testParse(string $string, string $expectedContents): void { $paragraph = new Paragraph(); - $inlineContext = new InlineParserContext($string, $paragraph, 
$this->createMock(ReferenceMapInterface::class)); + $cursor = new Cursor($string); + $inlineContext = new InlineParserContext($cursor, $paragraph, $this->createMock(ReferenceMapInterface::class)); // Move to just before the first backtick $firstBacktickPos = \mb_strpos($string, '`', 0, 'utf-8'); - $inlineContext->getCursor()->advanceBy($firstBacktickPos); + $cursor->advanceBy($firstBacktickPos); $parser = new BacktickParser(); - $this->assertTrue($parser->parse('`', $inlineContext)); + $this->assertTrue($parser->parse($cursor->getCharacter(), $inlineContext)); $codeBlock = $paragraph->firstChild(); \assert($codeBlock instanceof Code); From 225c357e53e9fe578079624b70bbfbdbeb66960c Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 26 Sep 2020 14:40:37 -0400 Subject: [PATCH 5/9] Implement delimiter parsing as a special type of inline parser --- CHANGELOG.md | 2 + src/Delimiter/DelimiterParser.php | 105 ++++++++++++++++++++++++++++++ src/Environment/Environment.php | 3 + 3 files changed, 110 insertions(+) create mode 100644 src/Delimiter/DelimiterParser.php diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c79f94da3..995b766ac7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ See for detailed informatio - `BlockStartParserInterface` - `ChildNodeRendererInterface` - `CursorState` + - `DelimiterParser` - `DocumentBlockParser` - `DocumentRenderedEvent` - `HtmlRendererInterface` @@ -64,6 +65,7 @@ See for detailed informatio - `InlineParserInterface::getCharacters()` is now `getMatchDefinition()` and returns an instance of `InlineParserMatch` - `InlineParserInterface::parse()` has a new parameter containing the pre-matched text - `InlineParserContext::__construct()` now requires the contents to be provided as a `Cursor` instead of a `string` + - Implemented delimiter parsing as a special type of inline parser (via the new `DelimiterParser` class) - Changed block and inline rendering to use common methods and interfaces - `BlockRendererInterface` and 
`InlineRendererInterface` were replaced by `NodeRendererInterface` with slightly different parameters. All core renderers now implement this interface. - `ConfigurableEnvironmentInterface::addBlockRenderer()` and `addInlineRenderer()` are now just `addRenderer()` diff --git a/src/Delimiter/DelimiterParser.php b/src/Delimiter/DelimiterParser.php new file mode 100644 index 0000000000..5205268a90 --- /dev/null +++ b/src/Delimiter/DelimiterParser.php @@ -0,0 +1,105 @@ +collection = $collection; + } + + public function getMatchDefinition(): InlineParserMatch + { + return InlineParserMatch::oneOf(...$this->collection->getDelimiterCharacters()); + } + + public function parse(string $match, InlineParserContext $inlineContext): bool + { + $character = $match; + $numDelims = 0; + $cursor = $inlineContext->getCursor(); + $processor = $this->collection->getDelimiterProcessor($character); + + if ($processor === null) { + throw new \LogicException('Delimiter processor should never be null here'); + } + + $charBefore = $cursor->peek(-1); + if ($charBefore === null) { + $charBefore = "\n"; + } + + while ($cursor->peek($numDelims) === $character) { + ++$numDelims; + } + + if ($numDelims < $processor->getMinLength()) { + return false; + } + + $cursor->advanceBy($numDelims); + + $charAfter = $cursor->getCharacter(); + if ($charAfter === null) { + $charAfter = "\n"; + } + + [$canOpen, $canClose] = self::determineCanOpenOrClose($charBefore, $charAfter, $character, $processor); + + $node = new Text(\str_repeat($character, $numDelims), [ + 'delim' => true, + ]); + $inlineContext->getContainer()->appendChild($node); + + // Add entry to stack for this opener + if ($canOpen || $canClose) { + $delimiter = new Delimiter($character, $numDelims, $node, $canOpen, $canClose); + $inlineContext->getDelimiterStack()->push($delimiter); + } + + return true; + } + + /** + * @return bool[] + */ + private static function determineCanOpenOrClose(string $charBefore, string $charAfter, string $character, 
DelimiterProcessorInterface $delimiterProcessor): array + { + $afterIsWhitespace = \preg_match(RegexHelper::REGEX_UNICODE_WHITESPACE_CHAR, $charAfter); + $afterIsPunctuation = \preg_match(RegexHelper::REGEX_PUNCTUATION, $charAfter); + $beforeIsWhitespace = \preg_match(RegexHelper::REGEX_UNICODE_WHITESPACE_CHAR, $charBefore); + $beforeIsPunctuation = \preg_match(RegexHelper::REGEX_PUNCTUATION, $charBefore); + + $leftFlanking = ! $afterIsWhitespace && (! $afterIsPunctuation || $beforeIsWhitespace || $beforeIsPunctuation); + $rightFlanking = ! $beforeIsWhitespace && (! $beforeIsPunctuation || $afterIsWhitespace || $afterIsPunctuation); + + if ($character === '_') { + $canOpen = $leftFlanking && (! $rightFlanking || $beforeIsPunctuation); + $canClose = $rightFlanking && (! $leftFlanking || $afterIsPunctuation); + } else { + $canOpen = $leftFlanking && $character === $delimiterProcessor->getOpeningCharacter(); + $canClose = $rightFlanking && $character === $delimiterProcessor->getClosingCharacter(); + } + + return [$canOpen, $canClose]; + } +} diff --git a/src/Environment/Environment.php b/src/Environment/Environment.php index 83556476c6..45ec444921 100644 --- a/src/Environment/Environment.php +++ b/src/Environment/Environment.php @@ -18,6 +18,7 @@ use League\CommonMark\Configuration\Configuration; use League\CommonMark\Configuration\ConfigurationAwareInterface; +use League\CommonMark\Delimiter\DelimiterParser; use League\CommonMark\Delimiter\Processor\DelimiterProcessorCollection; use League\CommonMark\Delimiter\Processor\DelimiterProcessorInterface; use League\CommonMark\Event\ListenerData; @@ -270,6 +271,8 @@ private function initializeExtensions(): void $this->extensionsInitialized = true; + // Create the special delimiter parser + $this->inlineParsers->add(new DelimiterParser($this->delimiterProcessors), PHP_INT_MIN); } private function injectEnvironmentAndConfigurationIfNeeded(object $object): void From 55b911bcd4c031326b7e0a06d049af448450212f Mon Sep 17 00:00:00 
2001 From: Colin O'Dell Date: Sat, 26 Sep 2020 14:41:25 -0400 Subject: [PATCH 6/9] Only search for delimiters if any were given --- CHANGELOG.md | 1 + src/Delimiter/Processor/DelimiterProcessorCollection.php | 5 +++++ .../Processor/DelimiterProcessorCollectionInterface.php | 2 +- src/Environment/Environment.php | 6 ++++-- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 995b766ac7..219d8f574b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -114,6 +114,7 @@ See for detailed informatio - Footnote event listeners now have numbered priorities (but still execute in the same order) - Footnotes must now be separated from previous content by a blank line - The line numbers (keys) returned via `MarkdownInput::getLines()` now start at 1 instead of 0 + - `DelimiterProcessorCollectionInterface` now extends `Countable` ### Fixed diff --git a/src/Delimiter/Processor/DelimiterProcessorCollection.php b/src/Delimiter/Processor/DelimiterProcessorCollection.php index 0fb1744a82..580577ac4b 100644 --- a/src/Delimiter/Processor/DelimiterProcessorCollection.php +++ b/src/Delimiter/Processor/DelimiterProcessorCollection.php @@ -79,4 +79,9 @@ private function addStaggeredDelimiterProcessorForChar(string $opening, Delimite $s->add($new); $this->processorsByChar[$opening] = $s; } + + public function count(): int + { + return \count($this->processorsByChar); + } } diff --git a/src/Delimiter/Processor/DelimiterProcessorCollectionInterface.php b/src/Delimiter/Processor/DelimiterProcessorCollectionInterface.php index fac8bc495f..2a12075c33 100644 --- a/src/Delimiter/Processor/DelimiterProcessorCollectionInterface.php +++ b/src/Delimiter/Processor/DelimiterProcessorCollectionInterface.php @@ -19,7 +19,7 @@ namespace League\CommonMark\Delimiter\Processor; -interface DelimiterProcessorCollectionInterface +interface DelimiterProcessorCollectionInterface extends \Countable { /** * Add the given delim processor to the collection diff --git 
a/src/Environment/Environment.php b/src/Environment/Environment.php index 45ec444921..eae06d776d 100644 --- a/src/Environment/Environment.php +++ b/src/Environment/Environment.php @@ -271,8 +271,10 @@ private function initializeExtensions(): void $this->extensionsInitialized = true; - // Create the special delimiter parser - $this->inlineParsers->add(new DelimiterParser($this->delimiterProcessors), PHP_INT_MIN); + // Create the special delimiter parser if any processors were registered + if ($this->delimiterProcessors->count() > 0) { + $this->inlineParsers->add(new DelimiterParser($this->delimiterProcessors), PHP_INT_MIN); + } } private function injectEnvironmentAndConfigurationIfNeeded(object $object): void From 4a40802d8fb56e1eb2d8fb79687df1f5899bc1ac Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 26 Sep 2020 14:42:42 -0400 Subject: [PATCH 7/9] Make regular expressions case-insensitive --- CHANGELOG.md | 1 + src/Util/RegexHelper.php | 13 ++++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 219d8f574b..ca5e32229b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -115,6 +115,7 @@ See for detailed informatio - Footnotes must now be separated from previous content by a blank line - The line numbers (keys) returned via `MarkdownInput::getLines()` now start at 1 instead of 0 - `DelimiterProcessorCollectionInterface` now extends `Countable` + - `RegexHelper::PARTIAL_` constants must always be used in case-insensitive contexts ### Fixed diff --git a/src/Util/RegexHelper.php b/src/Util/RegexHelper.php index 6b08005439..fb1d6fcbc2 100644 --- a/src/Util/RegexHelper.php +++ b/src/Util/RegexHelper.php @@ -21,13 +21,16 @@ /** * Provides regular expressions and utilities for parsing Markdown * + * All of the PARTIAL_ regex constants assume that they'll be used in case-insensitive searches + * All other complete regexes provided by this class (either via constants or methods) will have case-insensitivity enabled. 
+ * * @phpcs:disable Generic.Strings.UnnecessaryStringConcat.Found * * @psalm-immutable */ final class RegexHelper { - // Partial regular expressions (wrap with `/` on each side before use) + // Partial regular expressions (wrap with `/` on each side and add the case-insensitive `i` flag before use) public const PARTIAL_ENTITY = '&(?:#x[a-f0-9]{1,6}|#[0-9]{1,7}|[a-z][a-z0-9]{1,31});'; public const PARTIAL_ESCAPABLE = '[!"#$%&\'()*+,.\/:;<=>?@[\\\\\]^_`{|}~-]'; public const PARTIAL_ESCAPED_CHAR = '\\\\' . self::PARTIAL_ESCAPABLE; @@ -36,9 +39,9 @@ final class RegexHelper public const PARTIAL_IN_PARENS = '\\((' . self::PARTIAL_ESCAPED_CHAR . '|[^)\x00])*\\)'; public const PARTIAL_REG_CHAR = '[^\\\\()\x00-\x20]'; public const PARTIAL_IN_PARENS_NOSP = '\((' . self::PARTIAL_REG_CHAR . '|' . self::PARTIAL_ESCAPED_CHAR . '|\\\\)*\)'; - public const PARTIAL_TAGNAME = '[A-Za-z][A-Za-z0-9-]*'; + public const PARTIAL_TAGNAME = '[a-z][a-z0-9-]*'; public const PARTIAL_BLOCKTAGNAME = '(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h1|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)'; - public const PARTIAL_ATTRIBUTENAME = '[a-zA-Z_:][a-zA-Z0-9:._-]*'; + public const PARTIAL_ATTRIBUTENAME = '[a-z_:][a-z0-9:._-]*'; public const PARTIAL_UNQUOTEDVALUE = '[^"\'=<>`\x00-\x20]+'; public const PARTIAL_SINGLEQUOTEDVALUE = '\'[^\']*\''; public const PARTIAL_DOUBLEQUOTEDVALUE = '"[^"]*"'; @@ -168,9 +171,9 @@ public static function getHtmlBlockOpenRegex(int $type): string case HtmlBlock::TYPE_3: return '/^<[?]/'; case HtmlBlock::TYPE_4: - return '/^]|$)%i'; case HtmlBlock::TYPE_7_MISC_ELEMENT: From ee66a64dc40fa19f6a0caf0f817fa343b0d9bfe5 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 26 Sep 2020 14:43:47 -0400 Subject: [PATCH 8/9] Rewrite 
the InlineParserEngine to use the new approach --- src/Parser/InlineParserEngine.php | 184 +++++++++--------- tests/unit/Parser/Inline/FakeInlineParser.php | 56 ++++++ tests/unit/Parser/InlineParserEngineTest.php | 65 +++++++ 3 files changed, 212 insertions(+), 93 deletions(-) create mode 100644 tests/unit/Parser/Inline/FakeInlineParser.php create mode 100644 tests/unit/Parser/InlineParserEngineTest.php diff --git a/src/Parser/InlineParserEngine.php b/src/Parser/InlineParserEngine.php index 8daf8f05bf..cb99b18df5 100644 --- a/src/Parser/InlineParserEngine.php +++ b/src/Parser/InlineParserEngine.php @@ -16,15 +16,13 @@ namespace League\CommonMark\Parser; -use League\CommonMark\Delimiter\Delimiter; -use League\CommonMark\Delimiter\Processor\DelimiterProcessorInterface; use League\CommonMark\Environment\EnvironmentInterface; use League\CommonMark\Node\Block\AbstractBlock; use League\CommonMark\Node\Inline\AdjacentTextMerger; use League\CommonMark\Node\Inline\Text; use League\CommonMark\Node\Node; +use League\CommonMark\Parser\Inline\InlineParserInterface; use League\CommonMark\Reference\ReferenceMapInterface; -use League\CommonMark\Util\RegexHelper; /** * @internal @@ -45,104 +43,83 @@ final class InlineParserEngine implements InlineParserEngineInterface */ private $referenceMap; + /** + * @var array + * @psalm-var list + * @phpstan-var array + */ + private $parsers; + public function __construct(EnvironmentInterface $environment, ReferenceMapInterface $referenceMap) { $this->environment = $environment; $this->referenceMap = $referenceMap; - } - - public function parse(string $contents, AbstractBlock $block): void - { - $inlineParserContext = new InlineParserContext($contents, $block, $this->referenceMap); - $cursor = $inlineParserContext->getCursor(); - while (($character = $cursor->getCharacter()) !== null) { - if (! 
$this->parseCharacter($character, $inlineParserContext)) { - $this->addPlainText($character, $block, $inlineParserContext); - } - } - - $delimiterStack = $inlineParserContext->getDelimiterStack(); - $delimiterStack->processDelimiters(null, $this->environment->getDelimiterProcessors()); - $delimiterStack->removeAll(); - AdjacentTextMerger::mergeChildNodes($block); - } + foreach ($environment->getInlineParsers() as $parser) { + \assert($parser instanceof InlineParserInterface); + $regex = $parser->getMatchDefinition()->getRegex(); - /** - * @return bool Whether we successfully parsed a character at that position - */ - private function parseCharacter(string $character, InlineParserContext $inlineParserContext): bool - { - foreach ($this->environment->getInlineParsersForCharacter($character) as $parser) { - if ($parser->parse($inlineParserContext)) { - return true; - } + $this->parsers[] = [$parser, $regex, \strlen($regex) !== \mb_strlen($regex)]; } - - if ($delimiterProcessor = $this->environment->getDelimiterProcessors()->getDelimiterProcessor($character)) { - return $this->parseDelimiters($delimiterProcessor, $inlineParserContext); - } - - return false; } - private function parseDelimiters(DelimiterProcessorInterface $delimiterProcessor, InlineParserContext $inlineContext): bool + public function parse(string $contents, AbstractBlock $block): void { - $cursor = $inlineContext->getCursor(); - $character = $cursor->getCharacter(); - $numDelims = 0; + $contents = \trim($contents); + $cursor = new Cursor($contents); - if ($character === null) { - throw new \RuntimeException('Cannot parse delimiters without a valid character'); - } + $inlineParserContext = new InlineParserContext($cursor, $block, $this->referenceMap); - $charBefore = $cursor->peek(-1); - if ($charBefore === null) { - $charBefore = "\n"; - } + // Have all parsers look at the line to determine what they might want to parse and what positions they exist at + foreach ($this->matchParsers($contents) as 
$matchPosition => $parsers) { + $currentPosition = $cursor->getPosition(); + // We've already gone past this point + if ($currentPosition > $matchPosition) { + continue; + } - while ($cursor->peek($numDelims) === $character) { - ++$numDelims; - } + // We've skipped over some uninteresting text that should be added as a plain text node + if ($currentPosition < $matchPosition) { + $cursor->advanceBy($matchPosition - $currentPosition); + $this->addPlainText($cursor->getPreviousText(), $block); + } - if ($numDelims < $delimiterProcessor->getMinLength()) { - return false; - } + // We're now at a potential start - see which of the current parsers can handle it + $parsed = false; + foreach ($parsers as [$parser, $match]) { + \assert($parser instanceof InlineParserInterface); + if ($parser->parse($match, $inlineParserContext)) { + // A parser has successfully handled the text at the given position; don't consider any others at this position + $parsed = true; + break; + } + } - $cursor->advanceBy($numDelims); + if ($parsed) { + continue; + } - $charAfter = $cursor->getCharacter(); - if ($charAfter === null) { - $charAfter = "\n"; + // Despite potentially being interested, nothing actually parsed text here, so add the current character and continue onwards + $this->addPlainText((string) $cursor->getCharacter(), $block); + $cursor->advance(); } - [$canOpen, $canClose] = self::determineCanOpenOrClose($charBefore, $charAfter, $character, $delimiterProcessor); - - $node = new Text(\str_repeat($character, $numDelims), [ - 'delim' => true, - ]); - $inlineContext->getContainer()->appendChild($node); - - // Add entry to stack to this opener - if ($canOpen || $canClose) { - $delimiter = new Delimiter($character, $numDelims, $node, $canOpen, $canClose); - $inlineContext->getDelimiterStack()->push($delimiter); + // Add any remaining text that wasn't parsed + if (! 
$cursor->isAtEnd()) { + $this->addPlainText($cursor->getRemainder(), $block); } - return true; + // Process any delimiters that were found + $delimiterStack = $inlineParserContext->getDelimiterStack(); + $delimiterStack->processDelimiters(null, $this->environment->getDelimiterProcessors()); + $delimiterStack->removeAll(); + + // Combine adjacent text nodes into one + AdjacentTextMerger::mergeChildNodes($block); } - private function addPlainText(string $character, Node $container, InlineParserContext $inlineParserContext): void + private function addPlainText(string $text, Node $container): void { - // We reach here if none of the parsers can handle the input - // Attempt to match multiple non-special characters at once - $text = $inlineParserContext->getCursor()->match($this->environment->getInlineParserCharacterRegex()); - // This might fail if we're currently at a special character which wasn't parsed; if so, just add that character - if ($text === null) { - $inlineParserContext->getCursor()->advanceBy(1); - $text = $character; - } - $lastInline = $container->lastChild(); if ($lastInline instanceof Text && ! isset($lastInline->data['delim'])) { $lastInline->append($text); @@ -152,26 +129,47 @@ private function addPlainText(string $character, Node $container, InlineParserCo } /** - * @return bool[] + * Given the current line, ask all the parsers which parts of the text they would be interested in parsing. + * + * The resulting array provides a list of character positions, which parsers are interested in trying to parse + * the text at those points, and (for convenience/optimization) what the matching text happened to be.
+ * + * @return array> + * + * @psalm-return array> + * + * @phpstan-return array> */ - private static function determineCanOpenOrClose(string $charBefore, string $charAfter, string $character, DelimiterProcessorInterface $delimiterProcessor): array + private function matchParsers(string $contents): array { - $afterIsWhitespace = \preg_match(RegexHelper::REGEX_UNICODE_WHITESPACE_CHAR, $charAfter); - $afterIsPunctuation = \preg_match(RegexHelper::REGEX_PUNCTUATION, $charAfter); - $beforeIsWhitespace = \preg_match(RegexHelper::REGEX_UNICODE_WHITESPACE_CHAR, $charBefore); - $beforeIsPunctuation = \preg_match(RegexHelper::REGEX_PUNCTUATION, $charBefore); + $contents = \trim($contents); + $isMultibyte = \mb_strlen($contents) !== \strlen($contents); - $leftFlanking = ! $afterIsWhitespace && (! $afterIsPunctuation || $beforeIsWhitespace || $beforeIsPunctuation); - $rightFlanking = ! $beforeIsWhitespace && (! $beforeIsPunctuation || $afterIsWhitespace || $afterIsPunctuation); + $ret = []; - if ($character === '_') { - $canOpen = $leftFlanking && (! $rightFlanking || $beforeIsPunctuation); - $canClose = $rightFlanking && (! $leftFlanking || $afterIsPunctuation); - } else { - $canOpen = $leftFlanking && $character === $delimiterProcessor->getOpeningCharacter(); - $canClose = $rightFlanking && $character === $delimiterProcessor->getClosingCharacter(); + foreach ($this->parsers as [$parser, $regex, $isRegexMultibyte]) { + if ($isMultibyte || $isRegexMultibyte) { + $regex .= 'u'; + } + + if (! 
\preg_match_all($regex, $contents, $matches, \PREG_OFFSET_CAPTURE | \PREG_SET_ORDER)) { + continue; + } + + foreach ($matches as $match) { + if ($isMultibyte) { + // PREG_OFFSET_CAPTURE always returns the byte offset, not the char offset, which is annoying + $offset = \mb_strlen(\substr($contents, 0, $match[0][1]), 'UTF-8'); + } else { + $offset = (int) $match[0][1]; + } + + $ret[$offset][] = [$parser, (string) $match[0][0]]; + } } - return [$canOpen, $canClose]; + \ksort($ret); + + return $ret; } } diff --git a/tests/unit/Parser/Inline/FakeInlineParser.php b/tests/unit/Parser/Inline/FakeInlineParser.php new file mode 100644 index 0000000000..3034ce90b6 --- /dev/null +++ b/tests/unit/Parser/Inline/FakeInlineParser.php @@ -0,0 +1,56 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace League\CommonMark\Tests\Unit\Parser\Inline; + +use League\CommonMark\Node\Inline\Text; +use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; +use League\CommonMark\Parser\InlineParserContext; + +final class FakeInlineParser implements InlineParserInterface +{ + /** @var string[] */ + private $matches = []; + + /** @var InlineParserMatch */ + private $start; + + public function __construct(InlineParserMatch $start) + { + $this->start = $start; + } + + public function getMatchDefinition(): InlineParserMatch + { + return $this->start; + } + + public function parse(string $match, InlineParserContext $inlineContext): bool + { + $this->matches[] = $match; + + $inlineContext->getCursor()->advanceBy(\mb_strlen($match)); + $inlineContext->getContainer()->appendChild(new Text($match)); + + return true; + } + + /** + * @return string[] + */ + public function getMatches(): array + { + return $this->matches; + } +} diff --git a/tests/unit/Parser/InlineParserEngineTest.php b/tests/unit/Parser/InlineParserEngineTest.php new file mode 
100644 index 0000000000..e47e9b40de --- /dev/null +++ b/tests/unit/Parser/InlineParserEngineTest.php @@ -0,0 +1,65 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace League\CommonMark\Tests\Unit\Parser; + +use League\CommonMark\Environment\Environment; +use League\CommonMark\Node\Block\Paragraph; +use League\CommonMark\Parser\Inline\InlineParserMatch; +use League\CommonMark\Parser\InlineParserEngine; +use League\CommonMark\Reference\ReferenceMap; +use League\CommonMark\Tests\Unit\Parser\Inline\FakeInlineParser; +use PHPUnit\Framework\TestCase; + +final class InlineParserEngineTest extends TestCase +{ + public function testParseWithDefaultPriorityOrder(): void + { + $colorParser = new FakeInlineParser(InlineParserMatch::string('brown')); + $adjectiveParser = new FakeInlineParser(InlineParserMatch::oneOf('quick', 'brown', 'lazy')); + $fiveLetterParser = new FakeInlineParser(InlineParserMatch::regex('\b\w{5}\b')); + + $environment = new Environment(); + $environment->addInlineParser($colorParser); + $environment->addInlineParser($adjectiveParser); + $environment->addInlineParser($fiveLetterParser); + + $engine = new InlineParserEngine($environment, new ReferenceMap()); + $paragraph = new Paragraph(); + $engine->parse('The quick brown fox jumps over the lazy dog', $paragraph); + + $this->assertSame(['brown'], $colorParser->getMatches()); + $this->assertSame(['quick', 'lazy'], $adjectiveParser->getMatches()); + $this->assertSame(['jumps'], $fiveLetterParser->getMatches()); + } + + public function testParseWithDifferentPriorityOrder(): void + { + $colorParser = new FakeInlineParser(InlineParserMatch::string('brown')); + $adjectiveParser = new FakeInlineParser(InlineParserMatch::oneOf('quick', 'brown', 'lazy')); + $fiveLetterParser = new FakeInlineParser(InlineParserMatch::regex('\b\w{5}\b')); + + $environment = new Environment(); + 
$environment->addInlineParser($colorParser, 100); + $environment->addInlineParser($adjectiveParser, -100); + $environment->addInlineParser($fiveLetterParser); + + $engine = new InlineParserEngine($environment, new ReferenceMap()); + $paragraph = new Paragraph(); + $engine->parse('The quick brown fox jumps over the lazy dog', $paragraph); + + $this->assertSame(['brown'], $colorParser->getMatches()); + $this->assertSame(['lazy'], $adjectiveParser->getMatches()); + $this->assertSame(['quick', 'jumps'], $fiveLetterParser->getMatches()); + } +} From 00649fb7bb06d133508d6e48caff7acfd9d2d849 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 26 Sep 2020 16:36:44 -0400 Subject: [PATCH 9/9] Re-implement the GFM Autolink extension using the new inline parser approach Fixes #492 --- CHANGELOG.md | 4 + src/Extension/Autolink/AutolinkExtension.php | 5 +- .../Autolink/EmailAutolinkParser.php | 47 ++++++++ .../Autolink/EmailAutolinkProcessor.php | 75 ------------ ...inkProcessor.php => UrlAutolinkParser.php} | 112 ++++++++---------- .../Autolink/EmailAutolinkParserTest.php} | 4 +- .../Autolink/UrlAutolinkParserTest.php} | 7 +- 7 files changed, 112 insertions(+), 142 deletions(-) create mode 100644 src/Extension/Autolink/EmailAutolinkParser.php delete mode 100644 src/Extension/Autolink/EmailAutolinkProcessor.php rename src/Extension/Autolink/{UrlAutolinkProcessor.php => UrlAutolinkParser.php} (64%) rename tests/{unit/Extension/Autolink/EmailAutolinkProcessorTest.php => functional/Extension/Autolink/EmailAutolinkParserTest.php} (95%) rename tests/{unit/Extension/Autolink/UrlAutolinkProcessorTest.php => functional/Extension/Autolink/UrlAutolinkParserTest.php} (92%) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca5e32229b..4767d12aac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -70,6 +70,9 @@ See for detailed informatio - `BlockRendererInterface` and `InlineRendererInterface` were replaced by `NodeRendererInterface` with slightly different parameters. 
All core renderers now implement this interface. - `ConfigurableEnvironmentInterface::addBlockRenderer()` and `addInlineRenderer()` are now just `addRenderer()` - `EnvironmentInterface::getBlockRenderersForClass()` and `getInlineRenderersForClass()` are now just `getRenderersForClass()` + - Re-implemented the GFM Autolink extension using the new inline parser approach instead of document processors + - `EmailAutolinkProcessor` is now `EmailAutolinkParser` + - `UrlAutolinkProcessor` is now `UrlAutolinkParser` - Combined separate classes/interfaces into one: - `DisallowedRawHtmlRenderer` replaces `DisallowedRawHtmlBlockRenderer` and `DisallowedRawHtmlInlineRenderer` - `NodeRendererInterface` replaces `BlockRendererInterface` and `InlineRendererInterface` @@ -121,6 +124,7 @@ See for detailed informatio - Fixed parsing of footnotes without content - Fixed rendering of orphaned footnotes and footnote refs + - Fixed some URL autolinks breaking too early (#492) ### Removed diff --git a/src/Extension/Autolink/AutolinkExtension.php b/src/Extension/Autolink/AutolinkExtension.php index 7a5b1128f6..db1f47c321 100644 --- a/src/Extension/Autolink/AutolinkExtension.php +++ b/src/Extension/Autolink/AutolinkExtension.php @@ -14,14 +14,13 @@ namespace League\CommonMark\Extension\Autolink; use League\CommonMark\Environment\ConfigurableEnvironmentInterface; -use League\CommonMark\Event\DocumentParsedEvent; use League\CommonMark\Extension\ExtensionInterface; final class AutolinkExtension implements ExtensionInterface { public function register(ConfigurableEnvironmentInterface $environment): void { - $environment->addEventListener(DocumentParsedEvent::class, new EmailAutolinkProcessor()); - $environment->addEventListener(DocumentParsedEvent::class, new UrlAutolinkProcessor()); + $environment->addInlineParser(new EmailAutolinkParser()); + $environment->addInlineParser(new UrlAutolinkParser()); } } diff --git a/src/Extension/Autolink/EmailAutolinkParser.php 
b/src/Extension/Autolink/EmailAutolinkParser.php new file mode 100644 index 0000000000..6e2c3cdab1 --- /dev/null +++ b/src/Extension/Autolink/EmailAutolinkParser.php @@ -0,0 +1,47 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace League\CommonMark\Extension\Autolink; + +use League\CommonMark\Extension\CommonMark\Node\Inline\Link; +use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; +use League\CommonMark\Parser\InlineParserContext; + +final class EmailAutolinkParser implements InlineParserInterface +{ + private const REGEX = '[A-Za-z0-9.\-_+]+@[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_.]+'; + + public function getMatchDefinition(): InlineParserMatch + { + return InlineParserMatch::regex(self::REGEX); + } + + public function parse(string $match, InlineParserContext $inlineContext): bool + { + // The last character cannot be - or _ + if (\in_array(\substr($match, -1), ['-', '_'], true)) { + return false; + } + + // Does the email address end with punctuation that should be stripped? + if (\substr($match, -1) === '.') { + $match = \substr($match, 0, -1); + } + + $inlineContext->getCursor()->advanceBy(\strlen($match)); + $inlineContext->getContainer()->appendChild(new Link('mailto:' . $match, $match)); + + return true; + } +} diff --git a/src/Extension/Autolink/EmailAutolinkProcessor.php b/src/Extension/Autolink/EmailAutolinkProcessor.php deleted file mode 100644 index 1b444f7416..0000000000 --- a/src/Extension/Autolink/EmailAutolinkProcessor.php +++ /dev/null @@ -1,75 +0,0 @@ - - * - * For the full copyright and license information, please view the LICENSE - * file that was distributed with this source code.
- */ - -namespace League\CommonMark\Extension\Autolink; - -use League\CommonMark\Event\DocumentParsedEvent; -use League\CommonMark\Extension\CommonMark\Node\Inline\Link; -use League\CommonMark\Node\Inline\Text; - -final class EmailAutolinkProcessor -{ - private const REGEX = '/([A-Za-z0-9.\-_+]+@[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_.]+)/'; - - public function __invoke(DocumentParsedEvent $e): void - { - $walker = $e->getDocument()->walker(); - - while ($event = $walker->next()) { - $node = $event->getNode(); - if ($node instanceof Text && ! ($node->parent() instanceof Link)) { - self::processAutolinks($node); - } - } - } - - private static function processAutolinks(Text $node): void - { - $contents = \preg_split(self::REGEX, $node->getLiteral(), -1, PREG_SPLIT_DELIM_CAPTURE); - - if ($contents === false || \count($contents) === 1) { - return; - } - - $leftovers = ''; - foreach ($contents as $i => $content) { - if ($i % 2 === 0) { - $text = $leftovers . $content; - if ($text !== '') { - $node->insertBefore(new Text($leftovers . $content)); - } - - $leftovers = ''; - continue; - } - - // Does the URL end with punctuation that should be stripped? - if (\substr($content, -1) === '.') { - // Add the punctuation later - $content = \substr($content, 0, -1); - $leftovers = '.'; - } - - // The last character cannot be - or _ - if (\in_array(\substr($content, -1), ['-', '_'], true)) { - $node->insertBefore(new Text($content . $leftovers)); - $leftovers = ''; - continue; - } - - $node->insertBefore(new Link('mailto:' . 
$content, $content)); - } - - $node->detach(); - } -} diff --git a/src/Extension/Autolink/UrlAutolinkProcessor.php b/src/Extension/Autolink/UrlAutolinkParser.php similarity index 64% rename from src/Extension/Autolink/UrlAutolinkProcessor.php rename to src/Extension/Autolink/UrlAutolinkParser.php index 48fff3a6bb..1d242f2b45 100644 --- a/src/Extension/Autolink/UrlAutolinkProcessor.php +++ b/src/Extension/Autolink/UrlAutolinkParser.php @@ -13,15 +13,17 @@ namespace League\CommonMark\Extension\Autolink; -use League\CommonMark\Event\DocumentParsedEvent; use League\CommonMark\Extension\CommonMark\Node\Inline\Link; -use League\CommonMark\Node\Inline\Text; +use League\CommonMark\Parser\Inline\InlineParserInterface; +use League\CommonMark\Parser\Inline\InlineParserMatch; +use League\CommonMark\Parser\InlineParserContext; -final class UrlAutolinkProcessor +final class UrlAutolinkParser implements InlineParserInterface { + private const ALLOWED_AFTER = [null, ' ', "\t", "\n", "\x0b", "\x0c", "\x0d", '*', '_', '~', '(']; + // RegEx adapted from https://github.com/symfony/symfony/blob/4.2/src/Symfony/Component/Validator/Constraints/UrlValidator.php private const REGEX = '~ - (?<=^|[ \\t\\n\\x0b\\x0c\\x0d*_\\~\\(]) # Can only come at the beginning of a line, after whitespace, or certain delimiting characters ( # Must start with a supported scheme + auth, or "www" (?: @@ -43,6 +45,13 @@ final class UrlAutolinkProcessor (?:\# (?:[\pL\pN\-._\~!$&\'()*+,;=:@/?]|%%[0-9A-Fa-f]{2})* )? # a fragment (optional) )~ixu'; + /** + * @var string[] + * + * @psalm-readonly + */ + private $prefixes = ['www']; + /** * @var string * @@ -56,79 +65,62 @@ final class UrlAutolinkProcessor public function __construct(array $allowedProtocols = ['http', 'https', 'ftp']) { $this->finalRegex = \sprintf(self::REGEX, \implode('|', $allowedProtocols)); + + foreach ($allowedProtocols as $protocol) { + $this->prefixes[] = $protocol . 
'://'; + } } - public function __invoke(DocumentParsedEvent $e): void + public function getMatchDefinition(): InlineParserMatch { - $walker = $e->getDocument()->walker(); - - while ($event = $walker->next()) { - $node = $event->getNode(); - if ($node instanceof Text && ! ($node->parent() instanceof Link)) { - self::processAutolinks($node, $this->finalRegex); - } - } + return InlineParserMatch::oneOf(...$this->prefixes); } - private static function processAutolinks(Text $node, string $regex): void + public function parse(string $match, InlineParserContext $inlineContext): bool { - $contents = \preg_split($regex, $node->getLiteral(), -1, PREG_SPLIT_DELIM_CAPTURE); + $cursor = $inlineContext->getCursor(); - if ($contents === false || \count($contents) === 1) { - return; + // Autolinks can only come at the beginning of a line, after whitespace, or certain delimiting characters + $previousChar = $cursor->peek(-1); + if (! \in_array($previousChar, self::ALLOWED_AFTER, true)) { + return false; } - $leftovers = ''; - foreach ($contents as $i => $content) { - // Even-indexed elements are things before/after the URLs - if ($i % 2 === 0) { - // Insert any left-over characters here as well - $text = $leftovers . $content; - if ($text !== '') { - $node->insertBefore(new Text($leftovers . $content)); - } - - $leftovers = ''; - continue; - } - - $leftovers = ''; - - // Does the URL end with punctuation that should be stripped? - if (\preg_match('/(.+)([?!.,:*_~]+)$/', $content, $matches)) { - // Add the punctuation later - $content = $matches[1]; - $leftovers = $matches[2]; - } - - // Does the URL end with something that looks like an entity reference? - if (\preg_match('/(.+)(&[A-Za-z0-9]+;)$/', $content, $matches)) { - $content = $matches[1]; - $leftovers = $matches[2] . $leftovers; - } - - // Does the URL need unmatched parens chopped off? 
- if (\substr($content, -1) === ')' && ($diff = self::diffParens($content)) > 0) { - $content = \substr($content, 0, -$diff); - $leftovers = \str_repeat(')', $diff) . $leftovers; - } - - self::addLink($node, $content); + // Check if we have a valid URL + if (! \preg_match($this->finalRegex, $cursor->getRemainder(), $matches)) { + return false; } - $node->detach(); - } + $url = $matches[0]; + + // Does the URL end with punctuation that should be stripped? + if (\preg_match('/(.+)([?!.,:*_~]+)$/', $url, $matches)) { + // Leave the trailing punctuation unconsumed so it is not part of the link + $url = $matches[1]; + } + + // Does the URL end with something that looks like an entity reference? + if (\preg_match('/(.+)(&[A-Za-z0-9]+;)$/', $url, $matches)) { + $url = $matches[1]; + } + + // Does the URL need unmatched parens chopped off? + if (\substr($url, -1) === ')' && ($diff = self::diffParens($url)) > 0) { + $url = \substr($url, 0, -$diff); + } + + $cursor->advanceBy(\mb_strlen($url)); - private static function addLink(Text $node, string $url): void - { // Auto-prefix 'http://' onto 'www' URLs if (\substr($url, 0, 4) === 'www.') { - $node->insertBefore(new Link('http://' . $url, $url)); + $inlineContext->getContainer()->appendChild(new Link('http://' . $url, $url)); - return; + return true; } - $node->insertBefore(new Link($url, $url)); + $inlineContext->getContainer()->appendChild(new Link($url, $url)); + + return true; } /** diff --git a/tests/unit/Extension/Autolink/EmailAutolinkProcessorTest.php b/tests/functional/Extension/Autolink/EmailAutolinkParserTest.php similarity index 95% rename from tests/unit/Extension/Autolink/EmailAutolinkProcessorTest.php rename to tests/functional/Extension/Autolink/EmailAutolinkParserTest.php index 9c0007c7b4..bd69064ba1 100644 --- a/tests/unit/Extension/Autolink/EmailAutolinkProcessorTest.php +++ b/tests/functional/Extension/Autolink/EmailAutolinkParserTest.php @@ -11,14 +11,14 @@ * file that was distributed with this source code.
*/ -namespace League\CommonMark\Tests\Unit\Extension\Autolink; +namespace League\CommonMark\Tests\Functional\Extension\Autolink; use League\CommonMark\CommonMarkConverter; use League\CommonMark\Environment\Environment; use League\CommonMark\Extension\Autolink\AutolinkExtension; use PHPUnit\Framework\TestCase; -final class EmailAutolinkProcessorTest extends TestCase +final class EmailAutolinkParserTest extends TestCase { /** * @dataProvider dataProviderForEmailAutolinks diff --git a/tests/unit/Extension/Autolink/UrlAutolinkProcessorTest.php b/tests/functional/Extension/Autolink/UrlAutolinkParserTest.php similarity index 92% rename from tests/unit/Extension/Autolink/UrlAutolinkProcessorTest.php rename to tests/functional/Extension/Autolink/UrlAutolinkParserTest.php index 14c677e29c..5817251a41 100644 --- a/tests/unit/Extension/Autolink/UrlAutolinkProcessorTest.php +++ b/tests/functional/Extension/Autolink/UrlAutolinkParserTest.php @@ -11,14 +11,14 @@ * file that was distributed with this source code. */ -namespace League\CommonMark\Tests\Unit\Extension\Autolink; +namespace League\CommonMark\Tests\Functional\Extension\Autolink; use League\CommonMark\CommonMarkConverter; use League\CommonMark\Environment\Environment; use League\CommonMark\Extension\Autolink\AutolinkExtension; use PHPUnit\Framework\TestCase; -final class UrlAutolinkProcessorTest extends TestCase +final class UrlAutolinkParserTest extends TestCase { /** * @dataProvider dataProviderForAutolinkTests @@ -80,5 +80,8 @@ public function dataProviderForAutolinkTests(): iterable // Regression: CommonMark autolinks should not be double-linked yield ['', '

https://www.google.com

']; + + // Issue 492: underscores in URLs (see https://github.com/thephpleague/commonmark/issues/492) + yield ['http://wiki/Puncutation_in_links:_why_its_bad_(and_should_be_avoided)', '

http://wiki/Puncutation_in_links:_why_its_bad_(and_should_be_avoided)

']; } }