diff options
author | Frederic Guillot <fred@kanboard.net> | 2017-10-25 16:22:10 -0700 |
---|---|---|
committer | Frederic Guillot <fred@kanboard.net> | 2017-10-25 16:22:10 -0700 |
commit | 9e2b2a32fd0e967ad3184e9a5d091a29953acb91 (patch) | |
tree | 00822e24aa1110c73ca455a8d096ef296c008cbc /vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php | |
parent | c507c5416251c505cb3e088a03c6664bed73c812 (diff) |
Include composer dependencies in repo
Diffstat (limited to 'vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php')
-rw-r--r-- | vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php | 102 |
1 files changed, 102 insertions, 0 deletions
diff --git a/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php new file mode 100644 index 00000000..9beb59c1 --- /dev/null +++ b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php @@ -0,0 +1,102 @@ +<?php + +namespace PicoFeed\Scraper; + +use DOMXPath; +use PicoFeed\Parser\XmlParser; + +/** + * Rule Parser. + * + * @author Frederic Guillot + */ +class RuleParser implements ParserInterface +{ + private $dom; + private $xpath; + private $rules = array(); + + /** + * Constructor. + * + * @param string $html + * @param array $rules + */ + public function __construct($html, array $rules) + { + $this->rules = $rules; + $this->dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html); + $this->xpath = new DOMXPath($this->dom); + } + + /** + * Get the relevant content with predefined rules. + * + * @return string + */ + public function execute() + { + $this->stripTags(); + + return $this->findContent(); + } + + /** + * Remove HTML tags. + */ + public function stripTags() + { + if (isset($this->rules['strip']) && is_array($this->rules['strip'])) { + foreach ($this->rules['strip'] as $pattern) { + $nodes = $this->xpath->query($pattern); + + if ($nodes !== false && $nodes->length > 0) { + foreach ($nodes as $node) { + $node->parentNode->removeChild($node); + } + } + } + } + } + + /** + * Fetch content based on Xpath rules. + */ + public function findContent() + { + $content = ''; + if (isset($this->rules['body']) && is_array($this->rules['body'])) { + foreach ($this->rules['body'] as $pattern) { + $nodes = $this->xpath->query($pattern); + + if ($nodes !== false && $nodes->length > 0) { + foreach ($nodes as $node) { + $content .= $this->dom->saveXML($node); + } + } + } + } + + return $content; + } + + /** + * Fetch next link based on Xpath rules. + * + * @return string + */ + public function findNextLink() + { + if (isset($this->rules['next_page']) && is_array($this->rules['next_page'])) { + foreach ($this->rules['next_page'] as $pattern) { + $nodes = $this->xpath->query($pattern); + if ($nodes !== false && $nodes->length > 0) { + foreach ($nodes as $node) { + return $node->getAttribute('href'); + } + } + } + } + return null; + } +} |