Skip to content

Commit

Permalink
update domain parser
Browse files Browse the repository at this point in the history
  • Loading branch information
tacman committed Jul 15, 2022
1 parent aefbb36 commit 6de56a5
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 9 deletions.
9 changes: 5 additions & 4 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@
}
],
"require": {
"php": "^7.3 | ^8.0",
"php": "^8.0",
"ext-intl": "*",
"donatello-za/rake-php-plus": "^1.0.15",
"fabpot/goutte": "^4.0",
"symfony/dom-crawler": "^5.4 || ^6.0 ",
"jeremykendall/php-domain-parser": "^5.6",
"donatello-za/rake-php-plus": "^1.0.15"
"jeremykendall/php-domain-parser": "^6.1.1",
"symfony/cache": "^6.1",
"symfony/dom-crawler": "^5.4 || ^6.0 "
},
"require-dev": {
"symfony/thanks": "*",
Expand Down
32 changes: 27 additions & 5 deletions src/phpscraper.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@

// https://github.com/Donatello-za/rake-php-plus
use DonatelloZa\RakePlus\RakePlus;
use Pdp\Rules;
use Pdp\Storage\PsrStorageFactory;
use phpDocumentor\Reflection\Types\Integer;
use Symfony\Contracts\Cache\CacheInterface;
use Symfony\Contracts\Cache\ItemInterface;

class phpscraper
{
Expand All @@ -27,7 +32,7 @@ class phpscraper
/**
* Constructor
*/
public function __construct()
public function __construct(private CacheInterface $cache)
{
$this->core = new core();
}
Expand Down Expand Up @@ -758,12 +763,12 @@ public function links()
public function internalLinks()
{
// Get the current host - to compare against for internal links
$manager = new Manager(new Cache(), new CurlHttpClient());
$rules = $manager->getRules();
$rules = $this->getTldCollection();


$root_domain = $rules
->resolve(parse_url($this->currentURL(), PHP_URL_HOST))
->getRegistrableDomain();
->registrableDomain();


// Filter the array
Expand All @@ -772,13 +777,30 @@ public function internalLinks()
function ($link) use (&$root_domain, &$rules) {
$link_root_domain = $rules
->resolve(parse_url($link, PHP_URL_HOST))
->getRegistrableDomain();
->registrableDomain();

return ($root_domain === $link_root_domain);
}
));
}

/**
 * Build the Public Suffix List rule set used to resolve registrable domains.
 *
 * The raw list text is read through `$this->cache` under the key `pdp_rules`.
 * On a cache miss it is requested from `PsrStorageFactory::PUBLIC_SUFFIX_LIST_URI`
 * and the cache item is set to expire after 24 hours. The text is parsed into
 * a `Rules` instance on every call; only the raw text is cached.
 *
 * NOTE(review): `$this->client` is not declared in the visible part of this
 * class - confirm it resolves to an HTTP client whose response object
 * exposes `getContent()`.
 *
 * @return Rules rule set parsed from the (possibly cached) list text
 */
public function getTldCollection(): Rules
{
    $rawSuffixList = $this->cache->get(
        'pdp_rules',
        function (ItemInterface $cacheItem) {
            // Only reached on a cache miss: keep the fetched list for one day.
            $cacheItem->expiresAfter(3600 * 24);

            return $this->client
                ->request('GET', PsrStorageFactory::PUBLIC_SUFFIX_LIST_URI)
                ->getContent();
        }
    );

    return Rules::fromString($rawSuffixList);
}

/**
* Get all external links on the page as absolute URLs
*
Expand Down

0 comments on commit 6de56a5

Please sign in to comment.