5 files changed, 791 insertions, 0 deletions
diff --git a/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/CandidateParser.php b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/CandidateParser.php
new file mode 100644
index 00000000..6c53a289
--- /dev/null
+++ b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/CandidateParser.php
@@ -0,0 +1,283 @@
+<?php
+
+namespace PicoFeed\Scraper;
+
+use DomDocument;
+use DOMXPath;
+use PicoFeed\Logging\Logger;
+use PicoFeed\Parser\XmlParser;
+
+/**
+ * Candidate Parser.
+ *
+ * @author  Frederic Guillot
+ */
+class CandidateParser implements ParserInterface
+{
+    private $dom;
+    private $xpath;
+
+    /**
+     * Class/id values tried in order to locate the content: specific names first, generic ones last.
+     *
+     * @var array
+     */
+    private $candidatesAttributes = array(
+        'articleBody',
+        'articlebody',
+        'article-body',
+        'articleContent',
+        'articlecontent',
+        'article-content',
+        'articlePage',
+        'post-content',
+        'post_content',
+        'entry-content',
+        'entry-body',
+        'main-content',
+        'story_content',
+        'storycontent',
+        'entryBox',
+        'entrytext',
+        'comic',
+        'post',
+        'article',
+        'content',
+        'main',
+    );
+
+    /**
+     * Class/id substrings identifying elements to strip.
+     *
+     * @var array
+     */
+    private $stripAttributes = array(
+        'comment',
+        'share',
+        'links',
+        'toolbar',
+        'fb',
+        'footer',
+        'credit',
+        'bottom',
+        'nav',
+        'header',
+        'social',
+        'tag',
+        'metadata',
+        'entry-utility',
+        'related-posts',
+        'tweet',
+        'categories',
+        'post_title',
+        'by_line',
+        'byline',
+        'sponsors',
+    );
+
+    /**
+     * Tags to remove.
+     *
+     * @var array
+     */
+    private $stripTags = array(
+        'nav',
+        'header',
+        'footer',
+        'aside',
+        'form',
+    );
+
+    /**
+     * Parse the HTML (prefixed with an XML prolog) and bind an XPath helper to it.
+     * @param string $html HTML markup to analyse
+     */
+    public function __construct($html)
+    {
+        $document = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);
+        $this->dom = $document;
+        $this->xpath = new DOMXPath($document);
+    }
+
+    /**
+     * Extract the most relevant markup, trying progressively broader sources:
+     * candidate attributes, then <article/> if under 200 bytes, then <body/>
+     * if under 50 bytes. The selected markup is passed through stripGarbage().
+     * @return string
+     */
+    public function execute()
+    {
+        $markup = $this->findContentWithCandidates();
+
+        if (strlen($markup) < 200) {
+            $markup = $this->findContentWithArticle();
+        }
+        if (strlen($markup) < 50) {
+            $markup = $this->findContentWithBody();
+        }
+
+        return $this->stripGarbage($markup);
+    }
+
+    /**
+     * Return the first element whose class contains (or id equals) a candidate
+     * name, tried in list order; elements with a "nav"/"page" class are skipped.
+     * @return string Serialized element, or '' when nothing matches
+     */
+    public function findContentWithCandidates()
+    {
+        foreach ($this->candidatesAttributes as $name) {
+            Logger::setMessage(get_called_class().': Try this candidate: "'.$name.'"');
+            $matches = $this->xpath->query('//*[(contains(@class, "'.$name.'") or @id="'.$name.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
+
+            if ($matches === false || $matches->length < 1) {
+                continue;
+            }
+
+            Logger::setMessage(get_called_class().': Find candidate "'.$name.'"');
+            return $this->dom->saveXML($matches->item(0));
+        }
+
+        return '';
+    }
+
+    /**
+     * Serialize the first <article/> element of the document.
+     *
+     * @return string Element markup, or '' when none is found
+     */
+    public function findContentWithArticle()
+    {
+        $articles = $this->xpath->query('//article');
+
+        if ($articles === false || $articles->length < 1) {
+            return '';
+        }
+
+        Logger::setMessage(get_called_class().': Find <article/> tag');
+
+        return $this->dom->saveXML($articles->item(0));
+    }
+
+    /**
+     * Serialize the first <body/> element of the document.
+     *
+     * @return string Element markup, or '' when none is found
+     */
+    public function findContentWithBody()
+    {
+        $bodies = $this->xpath->query('//body');
+
+        if ($bodies === false || $bodies->length < 1) {
+            return '';
+        }
+
+        Logger::setMessage(get_called_class().' Find <body/>');
+
+        return $this->dom->saveXML($bodies->item(0));
+    }
+
+    /**
+     * Remove blacklisted tags and blacklisted class/id elements from markup.
+     *
+     * @param string $content Markup to clean
+     *
+     * @return string Cleaned markup, or $content as-is when XmlParser::getDomDocument() returns false
+     */
+    public function stripGarbage($content)
+    {
+        $fragment = XmlParser::getDomDocument($content);
+
+        if ($fragment === false) {
+            return $content;
+        }
+
+        $query = new DOMXPath($fragment);
+
+        $this->stripTags($query);
+        $this->stripAttributes($fragment, $query);
+
+        return $fragment->saveXML($fragment->documentElement);
+    }
+
+    /**
+     * Detach every element whose tag name is listed in $stripTags.
+     *
+     * @param DOMXPath $xpath XPath helper bound to the document being cleaned
+     */
+    public function stripTags(DOMXPath $xpath)
+    {
+        foreach ($this->stripTags as $tagName) {
+            $found = $xpath->query('//'.$tagName);
+            if ($found === false || $found->length < 1) {
+                continue;
+            }
+
+            Logger::setMessage(get_called_class().': Strip tag: "'.$tagName.'"');
+            foreach ($found as $element) {
+                $element->parentNode->removeChild($element);
+            }
+        }
+    }
+
+    /**
+     * Detach elements whose class or id contains a $stripAttributes keyword,
+     * except those that shouldRemove() decides to keep.
+     * @param DomDocument $dom   Document being cleaned
+     * @param DOMXPath    $xpath XPath helper bound to $dom
+     */
+    public function stripAttributes(DomDocument $dom, DOMXPath $xpath)
+    {
+        foreach ($this->stripAttributes as $keyword) {
+            $found = $xpath->query('//*[contains(@class, "'.$keyword.'") or contains(@id, "'.$keyword.'")]');
+
+            if ($found === false || $found->length < 1) {
+                continue;
+            }
+            Logger::setMessage(get_called_class().': Strip attribute: "'.$keyword.'"');
+            foreach ($found as $element) {
+                if ($this->shouldRemove($dom, $element)) {
+                    $element->parentNode->removeChild($element);
+                }
+            }
+        }
+    }
+
+    /**
+     * Part of ParserInterface; this parser never reports a next page.
+     *
+     * @return null
+     */
+    public function findNextLink()
+    {
+        return null;
+    }
+
+    /**
+     * Decide whether a blacklisted node may be removed: a node holding 90% or
+     * more of the document's text (in bytes) is kept.
+     *
+     * @param DomDocument $dom  Document the node belongs to
+     * @param DomNode     $node Node considered for removal
+     * @return bool False to keep the node, true to remove it
+     */
+    public function shouldRemove(DomDocument $dom, $node)
+    {
+        $totalBytes = strlen($dom->textContent);
+        $nodeBytes = strlen($node->textContent);
+
+        // Nothing to compare against: allow the removal
+        if ($totalBytes === 0) {
+            return true;
+        }
+
+        $ratio = $nodeBytes * 100 / $totalBytes;
+        $isMostOfDocument = $ratio >= 90;
+
+        if ($isMostOfDocument) {
+            Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
+        }
+
+        return !$isMostOfDocument;
+    }
+}
diff --git a/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/ParserInterface.php b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/ParserInterface.php
new file mode 100644
index 00000000..3ded4b1c
--- /dev/null
+++ b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/ParserInterface.php
@@ -0,0 +1,20 @@
+<?php
+
+namespace PicoFeed\Scraper;
+
+interface ParserInterface
+{
+    /**
+     * Run the parser and return the extracted content.
+     *
+     * @return string
+     */
+    public function execute();
+
+    /**
+     * Find the link to the next page of the article.
+     *
+     * @return string|null Null when no next page is found
+     */
+    public function findNextLink();
+}
diff --git a/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleLoader.php b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleLoader.php
new file mode 100644
index 00000000..6650682d
--- /dev/null
+++ b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleLoader.php
@@ -0,0 +1,107 @@
+<?php
+
+namespace PicoFeed\Scraper;
+
+use PicoFeed\Base;
+use PicoFeed\Logging\Logger;
+
+/**
+ * RuleLoader class.
+ *
+ * @author  Frederic Guillot
+ * @author  Bernhard Posselt
+ */
+class RuleLoader extends Base
+{
+    /**
+     * Get the rules for an URL.
+     *
+     * @param string $url the URL that should be looked up
+     *
+     * @return array the first non-empty rule set found, or an empty array (also when the URL has no host)
+     */
+    public function getRules($url)
+    {
+        $hostname = parse_url($url, PHP_URL_HOST);
+        // parse_url() gives null (no host) or false (malformed URL): nothing to look up
+        if (!is_string($hostname) || $hostname === '') {
+            return array();
+        }
+
+        $files = $this->getRulesFileList($hostname);
+        foreach ($this->getRulesFolders() as $folder) {
+            $rule = $this->loadRuleFile($folder, $files);
+            if (!empty($rule)) {
+                return $rule;
+            }
+        }
+
+        return array();
+    }
+
+    /**
+     * Build the rule file names to try for a hostname, most specific first:
+     * the hostname itself, then (3+ labels) the parent domain, ".parent" and the
+     * first label, or (2 labels) ".hostname" and the first label.
+     * @param string $hostname Hostname
+     * @return array
+     */
+    public function getRulesFileList($hostname)
+    {
+        $labels = explode('.', $hostname);
+        $names = array($hostname);             // subdomain.domain.tld
+
+        if (count($labels) === 2) {
+            $names[] = '.'.$hostname;          // .domain.tld
+            $names[] = $labels[0];             // domain
+        } elseif (count($labels) > 2) {
+            $first = array_shift($labels);
+            $parent = implode('.', $labels);
+            $names[] = $parent;                // domain.tld
+            $names[] = '.'.$parent;            // .domain.tld
+            $names[] = $first;                 // subdomain
+        }
+
+        return $names;
+    }
+
+    /**
+     * Include the first existing "<folder>/<name>.php" rule file.
+     * @param string $folder Rule directory
+     * @param array  $files  Candidate names; any holding a path separator or NUL byte is skipped
+     * @return array Rules, or an empty array when no file is found or the file returns no array
+     */
+    public function loadRuleFile($folder, array $files)
+    {
+        foreach ($files as $file) {
+            $filename = $folder.'/'.$file.'.php';
+            // Names derive from URL hostnames: never let one point outside $folder
+            if (strpbrk((string) $file, "/\\\0") === false && file_exists($filename)) {
+                Logger::setMessage(get_called_class().' Load rule: '.$file);
+                $rules = include $filename;
+
+                return is_array($rules) ? $rules : array();
+            }
+        }
+
+        return array();
+    }
+
+    /**
+     * List the folders searched for rule files, in lookup order.
+     *
+     * @return array
+     */
+    public function getRulesFolders()
+    {
+        $bundled = __DIR__ . '/../Rules';
+
+        // Without a configured folder only the bundled rules are searched
+        if ($this->config === null || $this->config->getGrabberRulesFolder() === null) {
+            return array($bundled);
+        }
+
+        // The configured folder is listed first, so getRules() tries it first
+        return array($this->config->getGrabberRulesFolder(), $bundled);
+    }
+}
diff --git a/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php
new file mode 100644
index 00000000..9beb59c1
--- /dev/null
+++ b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php
@@ -0,0 +1,102 @@
+<?php
+
+namespace PicoFeed\Scraper;
+
+use DOMXPath;
+use PicoFeed\Parser\XmlParser;
+
+/**
+ * Rule Parser.
+ *
+ * @author  Frederic Guillot
+ */
+class RuleParser implements ParserInterface
+{
+    private $dom;
+    private $xpath;
+    private $rules = array();
+
+    /**
+     * Store the rules, then parse the HTML (prefixed with an XML prolog).
+     * @param string $html  HTML markup to analyse
+     * @param array  $rules Rules; the "strip", "body" and "next_page" keys are read
+     */
+    public function __construct($html, array $rules)
+    {
+        $this->rules = $rules;
+        $document = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);
+        $this->dom = $document;
+        $this->xpath = new DOMXPath($document);
+    }
+
+    /**
+     * Apply the "strip" rules, then collect the content matched by the "body" rules.
+     * @return string
+     */
+    public function execute()
+    {
+        $this->stripTags();
+        $content = $this->findContent();
+
+        return $content;
+    }
+
+    /**
+     * Detach every node selected by the XPath patterns listed under "strip".
+     */
+    public function stripTags()
+    {
+        if (!isset($this->rules['strip']) || !is_array($this->rules['strip'])) {
+            return;
+        }
+
+        foreach ($this->rules['strip'] as $pattern) {
+            $found = $this->xpath->query($pattern);
+
+            foreach (($found === false ? array() : $found) as $node) {
+                $node->parentNode->removeChild($node);
+            }
+        }
+    }
+
+    /**
+     * Concatenate the serialized nodes matched by each "body" XPath pattern.
+     *
+     * @return string Markup in pattern order, or '' without usable "body" rules
+     */
+    public function findContent()
+    {
+        if (!isset($this->rules['body']) || !is_array($this->rules['body'])) {
+            return '';
+        }
+        $markup = '';
+        foreach ($this->rules['body'] as $pattern) {
+            $found = $this->xpath->query($pattern);
+            foreach (($found === false ? array() : $found) as $node) {
+                $markup .= $this->dom->saveXML($node);
+            }
+        }
+
+        return $markup;
+    }
+
+    /**
+     * Return the first non-empty href among the nodes matched by "next_page".
+     * Scraper::execute() follows any non-null result, so '' is never returned.
+     * @return string|null Link target, or null when no rule yields one
+     */
+    public function findNextLink()
+    {
+        if (!isset($this->rules['next_page']) || !is_array($this->rules['next_page'])) {
+            return null;
+        }
+        foreach ($this->rules['next_page'] as $pattern) {
+            $nodes = $this->xpath->query($pattern);
+            foreach (($nodes === false ? array() : $nodes) as $node) {
+                if ($node instanceof \DOMElement && trim($node->getAttribute('href')) !== '') {
+                    return trim($node->getAttribute('href'));
+                }
+            }
+        }
+        return null;
+    }
diff --git a/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/Scraper.php b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/Scraper.php
new file mode 100644
index 00000000..e5b9817f
--- /dev/null
+++ b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/Scraper.php
@@ -0,0 +1,279 @@
+<?php
+
+namespace PicoFeed\Scraper;
+
+use PicoFeed\Base;
+use PicoFeed\Client\Client;
+use PicoFeed\Client\ClientException;
+use PicoFeed\Client\Url;
+use PicoFeed\Encoding\Encoding;
+use PicoFeed\Filter\Filter;
+use PicoFeed\Logging\Logger;
+use PicoFeed\Parser\XmlParser;
+
+/**
+ * Scraper class.
+ *
+ * @author  Frederic Guillot
+ */
+class Scraper extends Base
+{
+    /**
+     * URL.
+     *
+     * @var string
+     */
+    private $url = '';
+
+    /**
+     * Relevant content.
+     *
+     * @var string
+     */
+    private $content = '';
+
+    /**
+     * HTML content.
+     *
+     * @var string
+     */
+    private $html = '';
+
+    /**
+     * HTML content encoding.
+     *
+     * @var string
+     */
+    private $encoding = '';
+
+    /**
+     * Flag to enable candidates parsing.
+     *
+     * @var bool
+     */
+    private $enableCandidateParser = true;
+
+    /**
+     * Disable candidates parsing (clears the flag consulted by getParser()).
+     *
+     * @return Scraper
+     */
+    public function disableCandidateParser()
+    {
+        $this->enableCandidateParser = false;
+        return $this;
+    }
+
+    /**
+     * Get the encoding set by setEncoding() or reported by the client in download().
+     *
+     * @return string
+     */
+    public function getEncoding()
+    {
+        return $this->encoding;
+    }
+
+    /**
+     * Set the encoding prepareHtml() falls back to when the HTML declares none.
+     *
+     * @param string $encoding Encoding name
+     *
+     * @return Scraper
+     */
+    public function setEncoding($encoding)
+    {
+        $this->encoding = $encoding;
+
+        return $this;
+    }
+
+    /**
+     * Get the URL to download; download() replaces it with the URL reported by the client.
+     *
+     * @return string
+     */
+    public function getUrl()
+    {
+        return $this->url;
+    }
+
+    /**
+     * Set the URL that download() will fetch.
+     *
+     * @param string $url URL
+     *
+     * @return Scraper
+     */
+    public function setUrl($url)
+    {
+        $this->url = $url;
+
+        return $this;
+    }
+
+    /**
+     * Tell whether execute() left any content (PHP empty() semantics apply).
+     *
+     * @return bool
+     */
+    public function hasRelevantContent()
+    {
+        return empty($this->content) === false;
+    }
+
+    /**
+     * Get the content assembled by execute(), before any filtering.
+     *
+     * @return string
+     */
+    public function getRelevantContent()
+    {
+        return $this->content;
+    }
+
+    /**
+     * Get the page HTML; once prepareHtml() has run it is converted and head-stripped.
+     *
+     * @return string
+     */
+    public function getRawContent()
+    {
+        return $this->html;
+    }
+
+    /**
+     * Set the page HTML read by prepareHtml() and getParser().
+     *
+     * @param string $html HTML markup
+     *
+     * @return Scraper
+     */
+    public function setRawContent($html)
+    {
+        $this->html = $html;
+
+        return $this;
+    }
+
+    /**
+     * Run the relevant content through the HTML filter, configured with $this->config.
+     * @return string
+     */
+    public function getFilteredContent()
+    {
+        $htmlFilter = Filter::html($this->content, $this->url);
+        $htmlFilter->setConfig($this->config);
+        $filtered = $htmlFilter->execute();
+
+        return $filtered;
+    }
+
+    /**
+     * Download the HTML content of the current URL.
+     *
+     * @return bool True on success; false without URL or on a ClientException
+     */
+    public function download()
+    {
+        if (empty($this->url)) {
+            return false;
+        }
+
+        // Reset the state left by a previous download
+        $this->html = '';
+        $this->content = '';
+        $this->encoding = '';
+
+        try {
+            $client = Client::getInstance();
+            $client->setConfig($this->config);
+            $client->setTimeout($this->config->getGrabberTimeout());
+            $client->setUserAgent($this->config->getGrabberUserAgent());
+            $client->execute($this->url);
+
+            $this->url = $client->getUrl();
+            $this->html = $client->getContent();
+            $this->encoding = $client->getEncoding();
+        } catch (ClientException $e) {
+            Logger::setMessage(get_called_class().': '.$e->getMessage());
+            return false;
+        }
+
+        return true;
+    }
+
+    /**
+     * Fetch and parse the page, following next-page links; fills $this->content.
+     * @param string $pageContent    Content accumulated from previous pages
+     * @param int    $recursionDepth Number of next-page links already followed
+     */
+    public function execute($pageContent = '', $recursionDepth = 0)
+    {
+        $this->content = '';
+        // With no URL on the first page, keep markup injected via setRawContent()
+        if (!empty($this->url) || $recursionDepth > 0) {
+            $this->html = '';
+            $this->encoding = '';
+        }
+        $this->download();
+        $this->prepareHtml();
+
+        $parser = $this->getParser();
+        $nextLink = null;
+        if ($parser !== null) {
+            $pageContent .= $parser->execute();
+            $nextLink = $parser->findNextLink();
+        }
+        $maxRecursions = $this->config->getMaxRecursions();
+        if ($nextLink !== null && $nextLink !== '' && $recursionDepth < ($maxRecursions === null ? 25 : $maxRecursions)) {
+            $this->setUrl(Url::resolve($nextLink, $this->url));
+            $this->execute($pageContent, $recursionDepth + 1);
+        } else {
+            $this->content = $pageContent;
+        }
+        Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
+    }
+
+    /**
+     * Get the parser: a RuleParser when a "grabber" pattern matches the URL,
+     * else a CandidateParser unless disableCandidateParser() was called.
+     * @return ParserInterface|null Null when no rule matches and candidates are disabled
+     */
+    public function getParser()
+    {
+        $ruleLoader = new RuleLoader($this->config);
+        $rules = $ruleLoader->getRules($this->url);
+
+        if (!empty($rules['grabber'])) {
+            Logger::setMessage(get_called_class().': Parse content with rules');
+            $url = new Url($this->url);
+            $sub_url = $url->getFullPath();
+            foreach ($rules['grabber'] as $pattern => $rule) {
+                if (preg_match($pattern, $sub_url)) {
+                    Logger::setMessage(get_called_class().': Matched url '.$sub_url);
+                    return new RuleParser($this->html, $rule);
+                }
+            }
+        }
+        // Honour disableCandidateParser(): without a matching rule there is no parser
+        if (!$this->enableCandidateParser) {
+            return null;
+        }
+        Logger::setMessage(get_called_class().': Parse content with candidates');
+        return new CandidateParser($this->html);
+    }
+
+    /**
+     * Convert the HTML from its <meta> encoding (else the stored one) and strip head tags.
+     */
+    public function prepareHtml()
+    {
+        $metaEncoding = XmlParser::getEncodingFromMetaTag($this->html);
+        $sourceEncoding = $metaEncoding ?: $this->encoding;
+
+        $this->html = Encoding::convert($this->html, $sourceEncoding);
+        $this->html = Filter::stripHeadTags($this->html);
+        Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$metaEncoding.'"');
+    }
+}