summaryrefslogtreecommitdiff
path: root/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php')
-rw-r--r--vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php102
1 files changed, 102 insertions, 0 deletions
diff --git a/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php
new file mode 100644
index 00000000..9beb59c1
--- /dev/null
+++ b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php
@@ -0,0 +1,102 @@
+<?php
+
+namespace PicoFeed\Scraper;
+
+use DOMXPath;
+use PicoFeed\Parser\XmlParser;
+
+/**
+ * Rule Parser.
+ *
+ * @author Frederic Guillot
+ */
+class RuleParser implements ParserInterface
+{
+ private $dom;
+ private $xpath;
+ private $rules = array();
+
+ /**
+ * Constructor.
+ *
+ * @param string $html
+ * @param array $rules
+ */
+ public function __construct($html, array $rules)
+ {
+ $this->rules = $rules;
+ $this->dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);
+ $this->xpath = new DOMXPath($this->dom);
+ }
+
+ /**
+ * Get the relevant content with predefined rules.
+ *
+ * @return string
+ */
+ public function execute()
+ {
+ $this->stripTags();
+
+ return $this->findContent();
+ }
+
+ /**
+ * Remove HTML tags.
+ */
+ public function stripTags()
+ {
+ if (isset($this->rules['strip']) && is_array($this->rules['strip'])) {
+ foreach ($this->rules['strip'] as $pattern) {
+ $nodes = $this->xpath->query($pattern);
+
+ if ($nodes !== false && $nodes->length > 0) {
+ foreach ($nodes as $node) {
+ $node->parentNode->removeChild($node);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Fetch content based on Xpath rules.
+ */
+ public function findContent()
+ {
+ $content = '';
+ if (isset($this->rules['body']) && is_array($this->rules['body'])) {
+ foreach ($this->rules['body'] as $pattern) {
+ $nodes = $this->xpath->query($pattern);
+
+ if ($nodes !== false && $nodes->length > 0) {
+ foreach ($nodes as $node) {
+ $content .= $this->dom->saveXML($node);
+ }
+ }
+ }
+ }
+
+ return $content;
+ }
+
+ /**
+ * Fetch next link based on Xpath rules.
+ *
+ * @return string
+ */
+ public function findNextLink()
+ {
+ if (isset($this->rules['next_page']) && is_array($this->rules['next_page'])) {
+ foreach ($this->rules['next_page'] as $pattern) {
+ $nodes = $this->xpath->query($pattern);
+ if ($nodes !== false && $nodes->length > 0) {
+ foreach ($nodes as $node) {
+ return $node->getAttribute('href');
+ }
+ }
+ }
+ }
+ return null;
+ }
+}