Skip to content

Commit

Permalink
Merge pull request #14 from brevia-ai/feat/import-sitemap-command
Browse files Browse the repository at this point in the history
New import sitemap command
  • Loading branch information
nikazzio authored May 10, 2024
2 parents 17877ba + 4d20cca commit 431d091
Show file tree
Hide file tree
Showing 3 changed files with 277 additions and 0 deletions.
173 changes: 173 additions & 0 deletions src/Command/ImportSitemapCommand.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
<?php
declare(strict_types=1);

/**
* BEdita Brevia plugin
*
* Copyright 2024 Atlas Srl
*/
namespace Brevia\BEdita\Command;

use BEdita\Core\Utility\LoggedUser;
use Brevia\BEdita\Client\BreviaClient;
use Brevia\BEdita\Utility\ReadCSVTrait;
use Cake\Command\Command;
use Cake\Console\Arguments;
use Cake\Console\ConsoleIo;
use Cake\Console\ConsoleOptionParser;
use Cake\Log\LogTrait;
use Cake\ORM\Table;
use Cake\Utility\Hash;

/**
* Import links from sitemap and create links
*
* @property \BEdita\Core\Model\Table\ObjectsTable $Collections
*/
class ImportSitemapCommand extends Command
{
    use LogTrait;
    use ReadCSVTrait;

    /**
     * Brevia API client
     *
     * @var \Brevia\BEdita\Client\BreviaClient
     */
    protected BreviaClient $client;

    /**
     * Links Table
     *
     * @var \Cake\ORM\Table
     */
    protected Table $Links;

    /**
     * @inheritDoc
     */
    public $defaultTable = 'Collections';

    /**
     * Declare the `sitemap`, `prefix` and `collection` command line options.
     *
     * @param \Cake\Console\ConsoleOptionParser $parser Option parser to configure
     * @return \Cake\Console\ConsoleOptionParser
     */
    protected function buildOptionParser(ConsoleOptionParser $parser): ConsoleOptionParser
    {
        return $parser->addOption('sitemap', [
                'help' => 'File path or URL of sitemap to import',
                'short' => 's',
                'required' => true,
            ])
            ->addOption('prefix', [
                'help' => 'Optional path prefix of URLs to import',
                'short' => 'p',
                'required' => false,
            ])
            ->addOption('collection', [
                'help' => 'Collection used to index (use the unique collection name)',
                'short' => 'c',
                'required' => true,
            ]);
    }

    /**
     * Create the Brevia API client and load the `Links` table.
     *
     * @return void
     */
    public function initialize(): void
    {
        $this->client = new BreviaClient();
        $this->Links = $this->fetchTable('Links');
    }

    /**
     * Read a sitemap, create a link object for every URL that is not already
     * attached to the collection and relate the new links to the collection
     * through the `has_documents` relation.
     *
     * Aborts when the sitemap cannot be read, when the collection is not found
     * via the Brevia API or when the sitemap contains no URLs.
     *
     * @param \Cake\Console\Arguments $args Command arguments
     * @param \Cake\Console\ConsoleIo $io Console I/O
     * @return int|null
     */
    public function execute(Arguments $args, ConsoleIo $io)
    {
        $content = $this->readSitemap((string)$args->getOption('sitemap'), $io);

        $name = $args->getOption('collection');
        $response = $this->client->get('/collections', compact('name'));
        $collectionId = Hash::get((array)$response->getJson(), '0.cmetadata.id');
        if (empty($collectionId)) {
            $io->abort(sprintf('Collection not found: %s', $name));
        }
        $collection = $this->Collections->get($collectionId, ['contain' => ['HasDocuments']]);
        // Reload each related document from its own table to read its `url`;
        // documents without a URL are dropped by array_filter().
        $currentUrls = array_filter(array_map(
            function ($link) {
                $link = $link->getTable()->get($link->id);

                return $link->get('url');
            },
            (array)$collection->get('has_documents')
        ));
        $prefix = (string)$args->getOption('prefix');

        $urls = $this->extractUrls($content);
        if (empty($urls)) {
            $io->abort('No URLs found in sitemap');
        }
        $linkLoadOptions = (array)$collection->get('link_load_options');
        $entities = [];
        LoggedUser::setUserAdmin();
        foreach ($urls as $url) {
            $alreadyLinked = in_array($url, $currentUrls, true)
                || in_array(urldecode($url), $currentUrls, true);
            $outsidePrefix = $prefix !== '' && strpos($url, $prefix) !== 0;
            if ($alreadyLinked || $outsidePrefix) {
                continue;
            }
            $io->info('Adding link: ' . $url);
            $linkData = [
                'status' => 'on',
                'title' => $url,
                'url' => $url,
                'extra' => [
                    'brevia' => [
                        'metadata' => [
                            'type' => 'links',
                            'url' => $url,
                        ],
                        'options' => $this->linkOptions($url, $linkLoadOptions),
                    ],
                ],
            ];
            $entity = $this->Links->newEntity($linkData);
            $entities[] = $this->Links->saveOrFail($entity);
        }
        // @phpstan-ignore-next-line
        $this->Collections->addRelated($collection, 'has_documents', $entities);

        $io->out('Done. Link added successfully: ' . count($entities));

        return null;
    }

    /**
     * Load the sitemap content from a local file path or an HTTP(S) URL.
     *
     * `file_exists()` cannot stat `http(s)://` resources, so the existence
     * check is applied to local paths only.
     *
     * @param string $sitemap File path or URL of the sitemap
     * @param \Cake\Console\ConsoleIo $io Console I/O, used to abort on failure
     * @return string Raw sitemap content
     */
    protected function readSitemap(string $sitemap, ConsoleIo $io): string
    {
        $isUrl = preg_match('#^https?://#i', $sitemap) === 1;
        if (!$isUrl && !file_exists($sitemap)) {
            $io->abort(sprintf('File not found: %s', $sitemap));
        }
        $content = file_get_contents($sitemap);
        if ($content === false) {
            $io->abort(sprintf('Unable to read sitemap: %s', $sitemap));
        }

        return (string)$content;
    }

    /**
     * Extract the unique, non-empty `<loc>` values of the `<url>` entries.
     *
     * The XML nodes are iterated directly: converting the document to an array
     * through JSON turns a single `<url>` entry into a map instead of a list,
     * which would make a one-entry sitemap look empty.
     *
     * @param string $content Raw sitemap XML
     * @return string[] URLs in document order; empty if the content is not valid XML
     */
    protected function extractUrls(string $content): array
    {
        $xml = simplexml_load_string($content);
        if ($xml === false) {
            return [];
        }
        $urls = [];
        foreach ($xml->url as $node) {
            $loc = trim((string)$node->loc);
            if ($loc !== '') {
                $urls[] = $loc;
            }
        }

        return array_values(array_unique($urls));
    }

    /**
     * Get link options
     *
     * An option entry whose `url` equals the link URL is preferred when it has
     * a non-empty `selector`; otherwise the first entry whose `url` is a prefix
     * of the link URL is used. Entries without a string `url` are ignored.
     *
     * @param string $url URL
     * @param array $linkLoadOptions Link load options
     * @return array Array with a single `selector` key (null when nothing matches)
     */
    protected function linkOptions(string $url, array $linkLoadOptions): array
    {
        // array_filter() preserves keys: reindex so that index 0 is the first match.
        $exact = array_values(array_filter($linkLoadOptions, function ($o) use ($url) {
            return ($o['url'] ?? null) === $url;
        }));
        $selector = Hash::get($exact, '0.selector');
        if (!empty($selector)) {
            return compact('selector');
        }
        $byPrefix = array_values(array_filter($linkLoadOptions, function ($o) use ($url) {
            $optionUrl = $o['url'] ?? null;

            return is_string($optionUrl) && strpos($url, $optionUrl) === 0;
        }));

        return ['selector' => Hash::get($byPrefix, '0.selector')];
    }
}
90 changes: 90 additions & 0 deletions tests/TestCase/Command/ImportSitemapCommandTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
<?php
declare(strict_types=1);

/**
* BEdita Brevia plugin
*
* Copyright 2023 Atlas Srl
*/
namespace Brevia\BEdita\Test\TestCase\Command;

use BEdita\Core\Model\Entity\ObjectEntity;
use Brevia\BEdita\Test\TestMockTrait;
use Cake\Console\TestSuite\ConsoleIntegrationTestTrait;
use Cake\Routing\Router;
use Cake\TestSuite\TestCase;

/**
* {@see \Brevia\BEdita\Command\ImportSitemapCommand} Test Case
*
* @coversDefaultClass \Brevia\BEdita\Command\ImportSitemapCommand
*/
class ImportSitemapCommandTest extends TestCase
{
    use ConsoleIntegrationTestTrait;
    use TestMockTrait;

    /**
     * @inheritDoc
     */
    public function setUp(): void
    {
        parent::setUp();
        $this->useCommandRunner();
        Router::reload();
    }

    /**
     * Test buildOptionParser method
     *
     * @return void
     * @covers ::buildOptionParser()
     */
    public function testBuildOptionParser(): void
    {
        $this->exec('import_sitemap --help');
        $this->assertOutputContains('File path or URL of sitemap to import');
        $this->assertOutputContains('Optional path prefix of URLs to import');
        $this->assertOutputContains('Collection used to index');
    }

    /**
     * Test options failure
     *
     * The string argument of `assertExitError()` is only the assertion failure
     * message, so the expected error text is verified separately with
     * `assertErrorContains()`.
     *
     * @return void
     * @covers ::initialize()
     * @covers ::execute()
     */
    public function testOptionFailure(): void
    {
        $this->exec('import_sitemap --sitemap /not/existing/path --collection gustavo');
        $this->assertExitError();
        $this->assertErrorContains('File not found: /not/existing/path');

        $this->mockClientResponse(json_encode([]));
        $xmlPath = sprintf('%s/tests/files/sitemap.xml', getcwd());
        $this->exec(sprintf('import_sitemap --sitemap %s --collection gustavo', $xmlPath));
        $this->assertExitError();
        $this->assertErrorContains('Collection not found: gustavo');

        $this->mockClientResponse('[{"cmetadata": {"id":"1"}}]');
        $this->mockTable('Collections', new ObjectEntity());
        $xmlPath = sprintf('%s/tests/files/empty.csv', getcwd());
        $this->exec(sprintf('import_sitemap --sitemap %s --collection gustavo', $xmlPath));
        $this->assertExitError();
        $this->assertErrorContains('No URLs found in sitemap');
    }

    /**
     * Test command success
     *
     * @return void
     * @covers ::execute()
     */
    public function testCommand(): void
    {
        $this->mockTable('Collections', new ObjectEntity());
        $this->mockTable('Links', new ObjectEntity());
        $this->mockClientResponse('[{"cmetadata": {"id":"1"}}]', 200, 3);
        $xmlPath = sprintf('%s/tests/files/sitemap.xml', getcwd());
        $this->exec(sprintf('import_sitemap --sitemap %s --collection gustavo', $xmlPath));
        $this->assertExitSuccess();
        // The success message is written to stdout; check it explicitly.
        $this->assertOutputContains('Done');
    }
}
14 changes: 14 additions & 0 deletions tests/files/sitemap.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url>
<loc>https://example.com/home</loc>
<lastmod>2024-04-28T08:02:38+00:00</lastmod>
<changefreq>always</changefreq>
<priority>1.0</priority>
</url>
<url>
<loc>https://example.com/page2</loc>
<lastmod>2024-04-28T08:02:38+00:00</lastmod>
<changefreq>always</changefreq>
<priority>1.0</priority>
</url>
</urlset>

0 comments on commit 431d091

Please sign in to comment.