vendor/nen/kennisbank-platform/src/Nen/Publication/Indexer.php line 103

Open in your IDE?
  1. <?php namespace Nen\Bundle\KennisbankPlatformBundle\Publication;
  2. use Doctrine\ORM\EntityManagerInterface;
  3. use DOMDocument;
  4. use DOMElement;
  5. use DOMNode;
  6. use DOMXPath;
  7. use Exception;
  8. use Nen\Bundle\KennisbankPlatformBundle\Bolt\BoltConfiguration;
  9. use Nen\Bundle\KennisbankPlatformBundle\Entity\BoltPublication;
  10. use Nen\Bundle\KennisbankPlatformBundle\Entity\EsDocument;
  11. use Nen\Bundle\KennisbankPlatformBundle\Entity\PublisherEntry;
  12. use Nen\Bundle\KennisbankPlatformBundle\Exceptions\SectionNotFoundException;
  13. use Nen\Bundle\KennisbankPlatformBundle\Publication\Event\PublisherFinishEvent;
  14. use Nen\Bundle\KennisbankPlatformBundle\Repository\EsDocumentRepository;
  15. use Psr\Log\LoggerInterface;
  16. use Smalot\PdfParser\Parser;
  17. use Symfony\Component\EventDispatcher\EventSubscriberInterface;
  18. use Symfony\Component\Filesystem\Filesystem;
  19. class Indexer implements EventSubscriberInterface
  20. {
  21.     /**
  22.      * @var string
  23.      */
  24.     protected $path;
  25.     /**
  26.      * @var EsDocumentRepository
  27.      */
  28.     protected $repository;
  29.     /**
  30.      * @var Publisher
  31.      */
  32.     protected $publisher;
  33.     /**
  34.      * @var EntityManagerInterface
  35.      */
  36.     protected $em;
  37.     /**
  38.      * @var Filesystem
  39.      */
  40.     protected $fs;
  41.     /**
  42.      * @var LoggerInterface
  43.      */
  44.     private $logger;
  45.     /**
  46.      * @var BoltConfiguration
  47.      */
  48.     private $configuration;
  49.     /**
  50.      * @param EntityManagerInterface $manager
  51.      * @param Publisher              $publisher
  52.      * @param EsDocumentRepository   $repository
  53.      * @param BoltConfiguration      $configuration
  54.      */
  55.     public function __construct(
  56.         EntityManagerInterface $manager,
  57.         Publisher $publisher,
  58.         EsDocumentRepository $repository,
  59.         BoltConfiguration $configuration
  60.     ) {
  61.         $this->em            $manager;
  62.         $this->publisher     $publisher;
  63.         $this->repository    $repository;
  64.         $this->logger        $publisher->getLogger();
  65.         $this->fs            $publisher->getFileSystem();
  66.         $this->configuration $configuration;
  67.     }
  68.     /**
  69.      * @return array
  70.      */
  71.     public static function getSubscribedEvents()
  72.     {
  73.         return [
  74.             PublisherFinishEvent::class => 'onPublicationFinish',
  75.         ];
  76.     }
  77.     /**
  78.      * @param PublisherFinishEvent $event
  79.      * @throws Exception
  80.      */
  81.     public function onPublicationFinish(PublisherFinishEvent $event)
  82.     {
  83.         $this->index($event->getPublication(), $event->getEntry());
  84.     }
  85.     /**
  86.      * @param BoltPublication     $publication
  87.      * @param PublisherEntry|null $entry
  88.      * @throws Exception
  89.      */
  90.     public function index(BoltPublication $publicationPublisherEntry $entry null)
  91.     {
  92.         if ($publication->getZip() || $publication->hasPdf()) {
  93.             $this->logger->info('Creating index for '.$publication->getTitle());
  94.         } else {
  95.             $this->logger->info('Skipping publication '.$publication->getTitle().' it is not indexable.');
  96.             $this->repository->removeForPublication($publication);
  97.             return;
  98.         }
  99.         $this->repository->removeForPublication($publication);
  100.         if ($publication->getStatus() === 'published') {
  101.             if ($publication->getZip()) {
  102.                 $this->path $this->createAndReturnIndexPath($publication);
  103.                 $sections $publication->getSections();
  104.                 foreach ($sections as $section) {
  105.                     if ($section === 'TOC') {
  106.                         continue;
  107.                     }
  108.                     $this->indexSection(trim($section), $publication);
  109.                 }
  110.             } else {
  111.                 $this->indexPdf($publication);
  112.             }
  113.         }
  114.         $this->logger->info('Finished creating index');
  115.     }
  116.     /**
  117.      * @param BoltPublication     $publication
  118.      * @param PublisherEntry|null $entry
  119.      * @throws Exception
  120.      */
  121.     public function remove(BoltPublication $publicationPublisherEntry $entry null)
  122.     {
  123.         $this->repository->removeForPublication($publication);
  124.     }
  125.     /**
  126.      * @param BoltPublication $publication
  127.      * @return string
  128.      */
  129.     private function createAndReturnIndexPath(BoltPublication $publication): string
  130.     {
  131.         $path $this->publisher->getPublicationFolder($publication).'/index';
  132.         if ($this->fs->exists($path)) {
  133.             $this->fs->remove($path);
  134.         }
  135.         $this->fs->mkdir($path);
  136.         return $path;
  137.     }
  138.     /**
  139.      * @param string          $section
  140.      * @param BoltPublication $publication
  141.      */
  142.     public function indexSection(string $sectionBoltPublication $publication)
  143.     {
  144.         set_time_limit(30);
  145.         try {
  146.             $html $this->publisher->getPublishedSection($section$publication);
  147.         } catch (SectionNotFoundException $e) {
  148.             return;
  149.         }
  150.         $document = new DOMDocument('1.0''UTF-8');
  151.         libxml_use_internal_errors(true);
  152.         $document->loadHTML('<?xml encoding="UTF-8">'.$html);
  153.         libxml_use_internal_errors(false);
  154.         $xpath = new DOMXPath($document);
  155.         $nodes $xpath->query('//div[@class="sts-section"]');
  156.         if ($nodes->length === 0) {
  157.             $nodes $xpath->query('//div[@class="sts-app"]');
  158.         }
  159.         if ($nodes->length === 0) {
  160.             return;
  161.         }
  162.         /** @var DOMElement $section */
  163.         foreach ($nodes as $node) {
  164.             $local  $node->getAttribute('id');
  165.             $html   $document->saveHTML($node);
  166.             $id     $publication->getId().'_'.$section.'_'.$local;
  167.             $parsed $this->parseSection($local$node$xpath);
  168.             $this->fs->dumpFile(
  169.                 $this->path.'/'.$section.'_'.$local.'.html',
  170.                 $html
  171.             );
  172.             if (!$esDocument $this->repository->findOneBy(['documentId' => $id])) {
  173.                 $esDocument = new EsDocument();
  174.             }
  175.             $meta = [
  176.                 'publication' => [
  177.                     'id'             => $publication->getId(),
  178.                     'slug'           => $publication->getSlug(),
  179.                     'title'          => $publication->getTitle(),
  180.                     'short_title'    => $publication->getShortTitle() ?: $publication->getTitle(),
  181.                     'category'       => $publication->getCategory(),
  182.                     'category_label' => $this->configuration->getContentTypeSelectValues('publications',
  183.                         'category')[$publication->getCategory()],
  184.                     'status'         => $publication->getPublicationStatus(),
  185.                 ],
  186.                 'section'     => $section,
  187.                 'local'       => $local,
  188.                 'number'      => $parsed['number'],
  189.             ];
  190.             $esDocument
  191.                 ->setDocumentId($id)
  192.                 ->setHtml($html)
  193.                 ->setTitle($parsed['title'])
  194.                 ->setText($parsed['text'])
  195.                 ->setPublication($publication)
  196.                 ->setMetadata($meta);
  197.             $this->em->persist($esDocument);
  198.         }
  199.         $this->em->flush();
  200.     }
  201.     /**
  202.      * @param string   $local
  203.      * @param DOMNode  $node
  204.      * @param DOMXPath $xpath
  205.      * @return array
  206.      */
  207.     protected function parseSection(string $localDOMNode $nodeDOMXPath $xpath): array
  208.     {
  209.         $number str_replace('sec_'''$local);
  210.         $title  null;
  211.         $text   null;
  212.         $nodes $xpath->query('.//*[@class="sts-sec-title" or @class="sts-section-label"]'$node);
  213.         if ($nodes->length 0) {
  214.             $title $nodes->item(0)->nodeValue;
  215.             $nodes->item(0)->parentNode->removeChild($nodes->item(0));
  216.         }
  217.         if (empty($title)) {
  218.             $title $number;
  219.         }
  220.         $text preg_replace('/\\[nrt]+/'' 'strip_tags($node->nodeValue));
  221.         $text preg_replace('/[ ]{2,}/'' '$text);
  222.         $title preg_replace('/[\x{00A0}]+$/u'''$title);
  223.         $title preg_replace('/\\[nrt]+/'' '$title);
  224.         $title preg_replace('/[ ]{2,}/'' '$title);
  225.         return [
  226.             'number' => (string)$number,
  227.             'title'  => trim($title),
  228.             'text'   => trim($text),
  229.         ];
  230.     }
  231.     /**
  232.      * @param BoltPublication $publication
  233.      * @throws Exception
  234.      */
  235.     private function indexPdf(BoltPublication $publication)
  236.     {
  237.         $file $publication->getPdfForIndex();
  238.         $this->logger->info('Processing file ' $file);
  239.         $parser = new Parser();
  240.         try {
  241.             $pdf $parser->parseFile($this->configuration->getFilesPath().$publication->getPdf());
  242.         } catch (Exception $e) {
  243.             $this->logger->error($e->getMessage());
  244.             return;
  245.         }
  246.         $pages  $pdf->getPages();
  247.         $number 1;
  248.         foreach ($pages as $page) {
  249.             $id $publication->getId().$number;
  250.             if (!$esDocument $this->repository->findOneBy(['documentId' => $id])) {
  251.                 $esDocument = new EsDocument();
  252.             }
  253.             $meta = [
  254.                 'publication' => [
  255.                     'id'             => $publication->getId(),
  256.                     'slug'           => $publication->getSlug(),
  257.                     'title'          => $publication->getTitle(),
  258.                     'short_title'    => $publication->getShortTitle() ?: $publication->getTitle(),
  259.                     'category'       => $publication->getCategory(),
  260.                     'category_label' => $this->configuration->getContentTypeSelectValues(
  261.                         'publications''category')[$publication->getCategory()],
  262.                     'status'         => $publication->getPublicationStatus(),
  263.                 ],
  264.                 'section'     => $number,
  265.                 'local'       => $number,
  266.                 'number'      => $number,
  267.             ];
  268.             $esDocument
  269.                 ->setDocumentId($id)
  270.                 ->setHtml($page->getText())
  271.                 ->setTitle($publication->getTitle())
  272.                 ->setText($page->getText())
  273.                 ->setPublication($publication)
  274.                 ->setMetadata($meta);
  275.             $this->em->persist($esDocument);
  276.             $this->em->flush();
  277.             $number++;
  278.         }
  279.     }
  280. }