<?php namespace Nen\Bundle\KennisbankPlatformBundle\Publication;
use Doctrine\ORM\EntityManagerInterface;
use DOMDocument;
use DOMElement;
use DOMNode;
use DOMXPath;
use Exception;
use Nen\Bundle\KennisbankPlatformBundle\Bolt\BoltConfiguration;
use Nen\Bundle\KennisbankPlatformBundle\Entity\BoltPublication;
use Nen\Bundle\KennisbankPlatformBundle\Entity\EsDocument;
use Nen\Bundle\KennisbankPlatformBundle\Entity\PublisherEntry;
use Nen\Bundle\KennisbankPlatformBundle\Exceptions\SectionNotFoundException;
use Nen\Bundle\KennisbankPlatformBundle\Publication\Event\PublisherFinishEvent;
use Nen\Bundle\KennisbankPlatformBundle\Repository\EsDocumentRepository;
use Psr\Log\LoggerInterface;
use Smalot\PdfParser\Parser;
use Symfony\Component\EventDispatcher\EventSubscriberInterface;
use Symfony\Component\Filesystem\Filesystem;
class Indexer implements EventSubscriberInterface
{
/**
 * Directory the per-section HTML fragments are dumped into. Assigned by
 * index() for publications that have a zip and read by indexSection().
 *
 * @var string
 */
protected $path;
/**
 * Repository used to look up EsDocument rows by document id and to remove
 * all documents that belong to a publication.
 *
 * @var EsDocumentRepository
 */
protected $repository;
/**
 * Publisher that supplies the publication folder, the published section
 * HTML, and (in the constructor) the logger and filesystem helper.
 *
 * @var Publisher
 */
protected $publisher;
/**
 * Entity manager used to persist and flush EsDocument entities.
 *
 * @var EntityManagerInterface
 */
protected $em;
/**
 * Filesystem helper obtained from the publisher; used to (re)create the
 * index directory and to dump the HTML fragments.
 *
 * @var Filesystem
 */
protected $fs;
/**
 * Logger obtained from the publisher.
 *
 * @var LoggerInterface
 */
private $logger;
/**
 * Bolt configuration; provides the category select values and the files path.
 *
 * @var BoltConfiguration
 */
private $configuration;
/**
 * Wires up the indexer's collaborators. The logger and the filesystem helper
 * are not injected on their own; both are borrowed from the given publisher.
 *
 * @param EntityManagerInterface $manager       entity manager used for persisting documents
 * @param Publisher              $publisher     source of section HTML, logger and filesystem
 * @param EsDocumentRepository   $repository    repository of indexed documents
 * @param BoltConfiguration      $configuration Bolt configuration accessor
 */
public function __construct(
    EntityManagerInterface $manager,
    Publisher $publisher,
    EsDocumentRepository $repository,
    BoltConfiguration $configuration
) {
    // Plain injected dependencies.
    $this->configuration = $configuration;
    $this->repository = $repository;
    $this->em = $manager;

    // The publisher and the helpers it exposes.
    $this->publisher = $publisher;
    $this->logger = $publisher->getLogger();
    $this->fs = $publisher->getFileSystem();
}
/**
 * Declares which events this subscriber listens to: the publisher finish
 * event is routed to onPublicationFinish().
 *
 * @return array map of event name to handler method name
 */
public static function getSubscribedEvents()
{
    $subscriptions = [];
    $subscriptions[PublisherFinishEvent::class] = 'onPublicationFinish';

    return $subscriptions;
}
/**
 * Event handler: (re)indexes the publication carried by the finish event.
 *
 * @param PublisherFinishEvent $event
 * @throws Exception
 */
public function onPublicationFinish(PublisherFinishEvent $event)
{
    $publication = $event->getPublication();
    $entry = $event->getEntry();

    $this->index($publication, $entry);
}
/**
 * Rebuilds the search index for a publication.
 *
 * Existing documents of the publication are always removed first. A
 * publication that has neither a zip nor a PDF is skipped after that removal.
 * Otherwise, when its status is 'published', either every section except the
 * table of contents is indexed (zip) or the PDF is indexed page by page.
 *
 * @param BoltPublication $publication
 * @param PublisherEntry|null $entry not used by this method
 * @throws Exception
 */
public function index(BoltPublication $publication, PublisherEntry $entry = null)
{
    if (!$publication->getZip() && !$publication->hasPdf()) {
        $this->logger->info('Skipping publication '.$publication->getTitle().' it is not indexable.');
        $this->repository->removeForPublication($publication);

        return;
    }

    $this->logger->info('Creating index for '.$publication->getTitle());
    $this->repository->removeForPublication($publication);

    if ($publication->getStatus() === 'published') {
        if ($publication->getZip()) {
            $this->path = $this->createAndReturnIndexPath($publication);
            foreach ($publication->getSections() as $section) {
                // Normalise before comparing so that a padded entry such as
                // " TOC" is recognised as the table of contents as well.
                $section = trim($section);
                if ($section === '' || $section === 'TOC') {
                    continue;
                }
                $this->indexSection($section, $publication);
            }
        } else {
            $this->indexPdf($publication);
        }
    }

    $this->logger->info('Finished creating index');
}
/**
 * Drops every indexed document that belongs to the given publication.
 *
 * @param BoltPublication $publication
 * @param PublisherEntry|null $entry not used by this method
 * @throws Exception
 */
public function remove(BoltPublication $publication, PublisherEntry $entry = null)
{
    $documents = $this->repository;
    $documents->removeForPublication($publication);
}
/**
 * Provides a fresh, empty "index" directory inside the publication folder:
 * an already existing directory is removed before it is created again.
 *
 * @param BoltPublication $publication
 * @return string path of the (re)created index directory
 */
private function createAndReturnIndexPath(BoltPublication $publication): string
{
    $indexDir = $this->publisher->getPublicationFolder($publication).'/index';

    $alreadyPresent = $this->fs->exists($indexDir);
    if ($alreadyPresent) {
        $this->fs->remove($indexDir);
    }

    $this->fs->mkdir($indexDir);

    return $indexDir;
}
/**
 * Indexes one published section of a publication.
 *
 * Loads the section's published HTML and selects every
 * <div class="sts-section"> or, when there are none, every
 * <div class="sts-app">. Each fragment is dumped to
 * "<index path>/<section>_<element id>.html" and stored in a new or existing
 * EsDocument; all documents are flushed together at the end.
 *
 * Returns silently when the section cannot be found or holds no such div.
 *
 * @param string $section
 * @param BoltPublication $publication
 */
public function indexSection(string $section, BoltPublication $publication)
{
    // set_time_limit() restarts the timeout counter, so each section gets
    // its own 30 seconds.
    set_time_limit(30);

    try {
        $sectionHtml = $this->publisher->getPublishedSection($section, $publication);
    } catch (SectionNotFoundException $e) {
        return;
    }

    $document = new DOMDocument('1.0', 'UTF-8');
    // Buffer parser warnings while loading, then discard them and hand
    // back whatever error mode the caller had configured.
    $previousErrorMode = libxml_use_internal_errors(true);
    $document->loadHTML('<?xml encoding="UTF-8">'.$sectionHtml);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $xpath = new DOMXPath($document);
    $nodes = $xpath->query('//div[@class="sts-section"]');
    if ($nodes->length === 0) {
        $nodes = $xpath->query('//div[@class="sts-app"]');
    }
    if ($nodes->length === 0) {
        return;
    }

    // index() prepares $this->path; when this public method is called on its
    // own, fall back to the publication's index folder instead of writing to
    // a path that starts with "/".
    $indexPath = $this->path ?? $this->publisher->getPublicationFolder($publication).'/index';

    // The publication part of the metadata is identical for every fragment.
    $category = $publication->getCategory();
    $categoryLabels = $this->configuration->getContentTypeSelectValues('publications', 'category');
    $publicationMeta = [
        'id' => $publication->getId(),
        'slug' => $publication->getSlug(),
        'title' => $publication->getTitle(),
        'short_title' => $publication->getShortTitle() ?: $publication->getTitle(),
        'category' => $category,
        // An unknown category yields null instead of an undefined-index notice.
        'category_label' => $categoryLabels[$category] ?? null,
        'status' => $publication->getPublicationStatus(),
    ];

    /** @var DOMElement $node */
    foreach ($nodes as $node) {
        $local = $node->getAttribute('id');
        // Serialise before parseSection(), which detaches the title element
        // from the node.
        $fragment = $document->saveHTML($node);
        $id = $publication->getId().'_'.$section.'_'.$local;
        $parsed = $this->parseSection($local, $node, $xpath);

        $this->fs->dumpFile($indexPath.'/'.$section.'_'.$local.'.html', $fragment);

        $esDocument = $this->repository->findOneBy(['documentId' => $id]);
        if (!$esDocument) {
            $esDocument = new EsDocument();
        }

        $meta = [
            'publication' => $publicationMeta,
            'section' => $section,
            'local' => $local,
            'number' => $parsed['number'],
        ];

        $esDocument
            ->setDocumentId($id)
            ->setHtml($fragment)
            ->setTitle($parsed['title'])
            ->setText($parsed['text'])
            ->setPublication($publication)
            ->setMetadata($meta);
        $this->em->persist($esDocument);
    }

    $this->em->flush();
}
/**
 * Extracts number, title and plain text from a section node.
 *
 * The number is $local without the "sec_" marker. The title is the value of
 * the first descendant whose class is "sts-sec-title" or "sts-section-label";
 * that element is removed from the DOM so it does not appear in the text
 * again. When no title is found, or it is empty, the number is used instead.
 * Line breaks and tabs are replaced by spaces and runs of spaces are
 * collapsed in both title and text.
 *
 * @param string $local id attribute of the section element
 * @param DOMNode $node section element; modified when a title element is found
 * @param DOMXPath $xpath xpath object bound to the node's document
 * @return array with the string keys 'number', 'title' and 'text'
 */
protected function parseSection(string $local, DOMNode $node, DOMXPath $xpath): array
{
    $number = str_replace('sec_', '', $local);
    $title = null;

    $titleNodes = $xpath->query('.//*[@class="sts-sec-title" or @class="sts-section-label"]', $node);
    if ($titleNodes->length > 0) {
        $titleNode = $titleNodes->item(0);
        $title = $titleNode->nodeValue;
        $titleNode->parentNode->removeChild($titleNode);
    }
    if (empty($title)) {
        $title = $number;
    }

    // The character class must hold the escapes \n, \r and \t. The previous
    // pattern '/\\[nrt]+/' reached PCRE as \[nrt]+ and therefore matched the
    // literal text "[nrt]" instead of line breaks and tabs.
    $text = preg_replace('/[\n\r\t]+/', ' ', strip_tags($node->nodeValue));
    $text = preg_replace('/[ ]{2,}/', ' ', $text);

    // Strip trailing non-breaking spaces, which trim() does not remove.
    $title = preg_replace('/[\x{00A0}]+$/u', '', $title);
    $title = preg_replace('/[\n\r\t]+/', ' ', $title);
    $title = preg_replace('/[ ]{2,}/', ' ', $title);

    return [
        'number' => (string)$number,
        'title' => trim($title),
        'text' => trim($text),
    ];
}
/**
 * Indexes the PDF of a publication, one EsDocument per page.
 *
 * Pages are numbered from 1. A parser failure is logged as an error and
 * nothing is indexed. All page documents are flushed together after the loop.
 *
 * @param BoltPublication $publication
 * @throws Exception
 */
private function indexPdf(BoltPublication $publication)
{
    $file = $publication->getPdfForIndex();
    $this->logger->info('Processing file ' . $file);

    // NOTE(review): the logged name comes from getPdfForIndex() while the
    // parsed path is built from getPdf() - confirm both refer to the same file.
    $parser = new Parser();
    try {
        $pdf = $parser->parseFile($this->configuration->getFilesPath().$publication->getPdf());
    } catch (Exception $e) {
        $this->logger->error($e->getMessage());

        return;
    }

    // The publication part of the metadata is identical for every page.
    $category = $publication->getCategory();
    $categoryLabels = $this->configuration->getContentTypeSelectValues('publications', 'category');
    $publicationMeta = [
        'id' => $publication->getId(),
        'slug' => $publication->getSlug(),
        'title' => $publication->getTitle(),
        'short_title' => $publication->getShortTitle() ?: $publication->getTitle(),
        'category' => $category,
        // An unknown category yields null instead of an undefined-index notice.
        'category_label' => $categoryLabels[$category] ?? null,
        'status' => $publication->getPublicationStatus(),
    ];

    $number = 1;
    foreach ($pdf->getPages() as $page) {
        // Keep a separator between publication id and page number: without
        // it publication 1 / page 11 and publication 11 / page 1 would both
        // produce "111" and overwrite each other's document.
        $id = $publication->getId().'_'.$number;

        $esDocument = $this->repository->findOneBy(['documentId' => $id]);
        if (!$esDocument) {
            $esDocument = new EsDocument();
        }

        $meta = [
            'publication' => $publicationMeta,
            'section' => $number,
            'local' => $number,
            'number' => $number,
        ];

        // Extract the page text once; it is stored as both html and text.
        $pageText = $page->getText();

        $esDocument
            ->setDocumentId($id)
            ->setHtml($pageText)
            ->setTitle($publication->getTitle())
            ->setText($pageText)
            ->setPublication($publication)
            ->setMetadata($meta);
        $this->em->persist($esDocument);

        $number++;
    }

    // One flush for all pages, matching indexSection().
    $this->em->flush();
}
}