Skip to content

Commit

Permalink
Re-implement the GFM Autolink extension using the new inline parser a…
Browse files Browse the repository at this point in the history
…pproach

Fixes #492
  • Loading branch information
colinodell committed Sep 26, 2020
1 parent 8097a58 commit 0e5ed0d
Show file tree
Hide file tree
Showing 7 changed files with 112 additions and 142 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ See <https://commonmark.thephpleague.com/2.0/upgrading/> for detailed informatio
- `BlockRendererInterface` and `InlineRendererInterface` were replaced by `NodeRendererInterface` with slightly different parameters. All core renderers now implement this interface.
- `ConfigurableEnvironmentInterface::addBlockRenderer()` and `addInlineRenderer()` are now just `addRenderer()`
- `EnvironmentInterface::getBlockRenderersForClass()` and `getInlineRenderersForClass()` are now just `getRenderersForClass()`
- Re-implemented the GFM Autolink extension using the new inline parser approach instead of document processors
- `EmailAutolinkProcessor` is now `EmailAutolinkParser`
- `UrlAutolinkProcessor` is now `UrlAutolinkParser`
- Combined separate classes/interfaces into one:
- `DisallowedRawHtmlRenderer` replaces `DisallowedRawHtmlBlockRenderer` and `DisallowedRawHtmlInlineRenderer`
- `NodeRendererInterface` replaces `BlockRendererInterface` and `InlineRendererInterface`
Expand Down Expand Up @@ -121,6 +124,7 @@ See <https://commonmark.thephpleague.com/2.0/upgrading/> for detailed informatio

- Fixed parsing of footnotes without content
- Fixed rendering of orphaned footnotes and footnote refs
- Fixed some URL autolinks breaking too early (#492)

### Removed

Expand Down
5 changes: 2 additions & 3 deletions src/Extension/Autolink/AutolinkExtension.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,13 @@
namespace League\CommonMark\Extension\Autolink;

use League\CommonMark\Environment\ConfigurableEnvironmentInterface;
use League\CommonMark\Event\DocumentParsedEvent;
use League\CommonMark\Extension\ExtensionInterface;

final class AutolinkExtension implements ExtensionInterface
{
public function register(ConfigurableEnvironmentInterface $environment): void
{
$environment->addEventListener(DocumentParsedEvent::class, new EmailAutolinkProcessor());
$environment->addEventListener(DocumentParsedEvent::class, new UrlAutolinkProcessor());
$environment->addInlineParser(new EmailAutolinkParser());
$environment->addInlineParser(new UrlAutolinkParser());
}
}
47 changes: 47 additions & 0 deletions src/Extension/Autolink/EmailAutolinkParser.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
<?php

declare(strict_types=1);

/*
* This file is part of the league/commonmark package.
*
* (c) Colin O'Dell <colinodell@gmail.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

namespace League\CommonMark\Extension\Autolink;

use League\CommonMark\Extension\CommonMark\Node\Inline\Link;
use League\CommonMark\Parser\Inline\InlineParserInterface;
use League\CommonMark\Parser\Inline\InlineParserMatch;
use League\CommonMark\Parser\InlineParserContext;

/**
 * Inline parser that converts bare email addresses into `mailto:` links.
 */
final class EmailAutolinkParser implements InlineParserInterface
{
    private const REGEX = '[A-Za-z0-9.\-_+]+@[A-Za-z0-9\-_]+\.[A-Za-z0-9\-_.]+';

    /**
     * Matches anything that looks like an email address according to REGEX.
     */
    public function getMatchDefinition(): InlineParserMatch
    {
        return InlineParserMatch::regex(self::REGEX);
    }

    /**
     * Appends a `mailto:` link for the matched address to the current container.
     *
     * @param string              $match         The text matched by REGEX
     * @param InlineParserContext $inlineContext Provides the cursor to advance and the container to append to
     *
     * @return bool False (nothing consumed) when the match ends in `-` or `_`, true once the link was added
     */
    public function parse(string $match, InlineParserContext $inlineContext): bool
    {
        $lastChar = \substr($match, -1);

        // An address ending in a hyphen or underscore is not linked at all
        if ($lastChar === '-' || $lastChar === '_') {
            return false;
        }

        // A single trailing dot is not part of the address; the cursor is not advanced past it
        $email = $lastChar === '.' ? \substr($match, 0, -1) : $match;

        $inlineContext->getCursor()->advanceBy(\strlen($email));
        $inlineContext->getContainer()->appendChild(new Link('mailto:' . $email, $email));

        return true;
    }
}
75 changes: 0 additions & 75 deletions src/Extension/Autolink/EmailAutolinkProcessor.php

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,17 @@

namespace League\CommonMark\Extension\Autolink;

use League\CommonMark\Event\DocumentParsedEvent;
use League\CommonMark\Extension\CommonMark\Node\Inline\Link;
use League\CommonMark\Node\Inline\Text;
use League\CommonMark\Parser\Inline\InlineParserInterface;
use League\CommonMark\Parser\Inline\InlineParserMatch;
use League\CommonMark\Parser\InlineParserContext;

final class UrlAutolinkProcessor
final class UrlAutolinkParser implements InlineParserInterface
{
private const ALLOWED_AFTER = [null, ' ', "\t", "\n", "\x0b", "\x0c", "\x0d", '*', '_', '~', '('];

// RegEx adapted from https://github.com/symfony/symfony/blob/4.2/src/Symfony/Component/Validator/Constraints/UrlValidator.php
private const REGEX = '~
(?<=^|[ \\t\\n\\x0b\\x0c\\x0d*_\\~\\(]) # Can only come at the beginning of a line, after whitespace, or certain delimiting characters
(
# Must start with a supported scheme + auth, or "www"
(?:
Expand All @@ -43,6 +45,13 @@ final class UrlAutolinkProcessor
(?:\# (?:[\pL\pN\-._\~!$&\'()*+,;=:@/?]|%%[0-9A-Fa-f]{2})* )? # a fragment (optional)
)~ixu';

/**
* @var string[]
*
* @psalm-readonly
*/
private $prefixes = ['www'];

/**
* @var string
*
Expand All @@ -56,79 +65,62 @@ final class UrlAutolinkProcessor
public function __construct(array $allowedProtocols = ['http', 'https', 'ftp'])
{
$this->finalRegex = \sprintf(self::REGEX, \implode('|', $allowedProtocols));

foreach ($allowedProtocols as $protocol) {
$this->prefixes[] = $protocol . '://';
}
}

public function __invoke(DocumentParsedEvent $e): void
public function getMatchDefinition(): InlineParserMatch
{
$walker = $e->getDocument()->walker();

while ($event = $walker->next()) {
$node = $event->getNode();
if ($node instanceof Text && ! ($node->parent() instanceof Link)) {
self::processAutolinks($node, $this->finalRegex);
}
}
return InlineParserMatch::oneOf(...$this->prefixes);
}

private static function processAutolinks(Text $node, string $regex): void
public function parse(string $match, InlineParserContext $inlineContext): bool
{
$contents = \preg_split($regex, $node->getLiteral(), -1, PREG_SPLIT_DELIM_CAPTURE);
$cursor = $inlineContext->getCursor();

if ($contents === false || \count($contents) === 1) {
return;
// Autolinks can only come at the beginning of a line, after whitespace, or certain delimiting characters
$previousChar = $cursor->peek(-1);
if (! \in_array($previousChar, self::ALLOWED_AFTER, true)) {
return false;
}

$leftovers = '';
foreach ($contents as $i => $content) {
// Even-indexed elements are things before/after the URLs
if ($i % 2 === 0) {
// Insert any left-over characters here as well
$text = $leftovers . $content;
if ($text !== '') {
$node->insertBefore(new Text($leftovers . $content));
}

$leftovers = '';
continue;
}

$leftovers = '';

// Does the URL end with punctuation that should be stripped?
if (\preg_match('/(.+)([?!.,:*_~]+)$/', $content, $matches)) {
// Add the punctuation later
$content = $matches[1];
$leftovers = $matches[2];
}

// Does the URL end with something that looks like an entity reference?
if (\preg_match('/(.+)(&[A-Za-z0-9]+;)$/', $content, $matches)) {
$content = $matches[1];
$leftovers = $matches[2] . $leftovers;
}

// Does the URL need unmatched parens chopped off?
if (\substr($content, -1) === ')' && ($diff = self::diffParens($content)) > 0) {
$content = \substr($content, 0, -$diff);
$leftovers = \str_repeat(')', $diff) . $leftovers;
}

self::addLink($node, $content);
// Check if we have a valid URL
if (! \preg_match($this->finalRegex, $cursor->getRemainder(), $matches)) {
return false;
}

$node->detach();
}
$url = $matches[0];

// Does the URL end with punctuation that should be stripped?
if (\preg_match('/(.+)([?!.,:*_~]+)$/', $url, $matches)) {
// The stripped punctuation is not consumed; the cursor only advances past the URL itself
$url = $matches[1];
}

// Does the URL end with something that looks like an entity reference?
if (\preg_match('/(.+)(&[A-Za-z0-9]+;)$/', $url, $matches)) {
$url = $matches[1];
}

// Does the URL need unmatched parens chopped off?
if (\substr($url, -1) === ')' && ($diff = self::diffParens($url)) > 0) {
$url = \substr($url, 0, -$diff);
}

$cursor->advanceBy(\mb_strlen($url));

private static function addLink(Text $node, string $url): void
{
// Auto-prefix 'http://' onto 'www' URLs
if (\substr($url, 0, 4) === 'www.') {
$node->insertBefore(new Link('http://' . $url, $url));
$inlineContext->getContainer()->appendChild(new Link('http://' . $url, $url));

return;
return true;
}

$node->insertBefore(new Link($url, $url));
$inlineContext->getContainer()->appendChild(new Link($url, $url));

return true;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@
* file that was distributed with this source code.
*/

namespace League\CommonMark\Tests\Unit\Extension\Autolink;
namespace League\CommonMark\Tests\Functional\Extension\Autolink;

use League\CommonMark\CommonMarkConverter;
use League\CommonMark\Environment\Environment;
use League\CommonMark\Extension\Autolink\AutolinkExtension;
use PHPUnit\Framework\TestCase;

final class EmailAutolinkProcessorTest extends TestCase
final class EmailAutolinkParserTest extends TestCase
{
/**
* @dataProvider dataProviderForEmailAutolinks
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@
* file that was distributed with this source code.
*/

namespace League\CommonMark\Tests\Unit\Extension\Autolink;
namespace League\CommonMark\Tests\Functional\Extension\Autolink;

use League\CommonMark\CommonMarkConverter;
use League\CommonMark\Environment\Environment;
use League\CommonMark\Extension\Autolink\AutolinkExtension;
use PHPUnit\Framework\TestCase;

final class UrlAutolinkProcessorTest extends TestCase
final class UrlAutolinkParserTest extends TestCase
{
/**
* @dataProvider dataProviderForAutolinkTests
Expand Down Expand Up @@ -80,5 +80,8 @@ public function dataProviderForAutolinkTests(): iterable

// Regression: CommonMark autolinks should not be double-linked
yield ['<https://www.google.com>', '<p><a href="https://www.google.com">https://www.google.com</a></p>'];

// Issue 492: underscores in URLs (see https://github.com/thephpleague/commonmark/issues/492)
yield ['http://wiki/Puncutation_in_links:_why_its_bad_(and_should_be_avoided)', '<p><a href="http://wiki/Puncutation_in_links:_why_its_bad_(and_should_be_avoided)">http://wiki/Puncutation_in_links:_why_its_bad_(and_should_be_avoided)</a></p>'];
}
}

0 comments on commit 0e5ed0d

Please sign in to comment.