diff options
Diffstat (limited to 'vendor/miniflux/picofeed/lib/PicoFeed/Scraper')
5 files changed, 791 insertions, 0 deletions
diff --git a/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/CandidateParser.php b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/CandidateParser.php new file mode 100644 index 00000000..6c53a289 --- /dev/null +++ b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/CandidateParser.php @@ -0,0 +1,283 @@ +<?php + +namespace PicoFeed\Scraper; + +use DomDocument; +use DOMXPath; +use PicoFeed\Logging\Logger; +use PicoFeed\Parser\XmlParser; + +/** + * Candidate Parser. + * + * @author Frederic Guillot + */ +class CandidateParser implements ParserInterface +{ + private $dom; + private $xpath; + + /** + * List of attributes to try to get the content, order is important, generic terms at the end. + * + * @var array + */ + private $candidatesAttributes = array( + 'articleBody', + 'articlebody', + 'article-body', + 'articleContent', + 'articlecontent', + 'article-content', + 'articlePage', + 'post-content', + 'post_content', + 'entry-content', + 'entry-body', + 'main-content', + 'story_content', + 'storycontent', + 'entryBox', + 'entrytext', + 'comic', + 'post', + 'article', + 'content', + 'main', + ); + + /** + * List of attributes to strip. + * + * @var array + */ + private $stripAttributes = array( + 'comment', + 'share', + 'links', + 'toolbar', + 'fb', + 'footer', + 'credit', + 'bottom', + 'nav', + 'header', + 'social', + 'tag', + 'metadata', + 'entry-utility', + 'related-posts', + 'tweet', + 'categories', + 'post_title', + 'by_line', + 'byline', + 'sponsors', + ); + + /** + * Tags to remove. + * + * @var array + */ + private $stripTags = array( + 'nav', + 'header', + 'footer', + 'aside', + 'form', + ); + + /** + * Constructor. + * + * @param string $html + */ + public function __construct($html) + { + $this->dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html); + $this->xpath = new DOMXPath($this->dom); + } + + /** + * Get the relevant content with the list of potential attributes. + * + * @return string + */ + public function execute() + { + $content = $this->findContentWithCandidates(); + + if (strlen($content) < 200) { + $content = $this->findContentWithArticle(); + } + + if (strlen($content) < 50) { + $content = $this->findContentWithBody(); + } + + return $this->stripGarbage($content); + } + + /** + * Find content based on the list of tag candidates. + * + * @return string + */ + public function findContentWithCandidates() + { + foreach ($this->candidatesAttributes as $candidate) { + Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"'); + + $nodes = $this->xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]'); + + if ($nodes !== false && $nodes->length > 0) { + Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'"'); + + return $this->dom->saveXML($nodes->item(0)); + } + } + + return ''; + } + + /** + * Find <article/> tag. + * + * @return string + */ + public function findContentWithArticle() + { + $nodes = $this->xpath->query('//article'); + + if ($nodes !== false && $nodes->length > 0) { + Logger::setMessage(get_called_class().': Find <article/> tag'); + + return $this->dom->saveXML($nodes->item(0)); + } + + return ''; + } + + /** + * Find <body/> tag. + * + * @return string + */ + public function findContentWithBody() + { + $nodes = $this->xpath->query('//body'); + + if ($nodes !== false && $nodes->length > 0) { + Logger::setMessage(get_called_class().' Find <body/>'); + + return $this->dom->saveXML($nodes->item(0)); + } + + return ''; + } + + /** + * Strip useless tags. + * + * @param string $content + * + * @return string + */ + public function stripGarbage($content) + { + $dom = XmlParser::getDomDocument($content); + + if ($dom !== false) { + $xpath = new DOMXPath($dom); + + $this->stripTags($xpath); + $this->stripAttributes($dom, $xpath); + + $content = $dom->saveXML($dom->documentElement); + } + + return $content; + } + + /** + * Remove blacklisted tags. + * + * @param DOMXPath $xpath + */ + public function stripTags(DOMXPath $xpath) + { + foreach ($this->stripTags as $tag) { + $nodes = $xpath->query('//'.$tag); + + if ($nodes !== false && $nodes->length > 0) { + Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"'); + + foreach ($nodes as $node) { + $node->parentNode->removeChild($node); + } + } + } + } + + /** + * Remove blacklisted attributes. + * + * @param DomDocument $dom + * @param DOMXPath $xpath + */ + public function stripAttributes(DomDocument $dom, DOMXPath $xpath) + { + foreach ($this->stripAttributes as $attribute) { + $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]'); + + if ($nodes !== false && $nodes->length > 0) { + Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"'); + + foreach ($nodes as $node) { + if ($this->shouldRemove($dom, $node)) { + $node->parentNode->removeChild($node); + } + } + } + } + } + + /** + * Find link for next page of the article. + * + * @return string + */ + public function findNextLink() + { + return null; + } + + /** + * Return false if the node should not be removed. + * + * @param DomDocument $dom + * @param DomNode $node + * + * @return bool + */ + public function shouldRemove(DomDocument $dom, $node) + { + $document_length = strlen($dom->textContent); + $node_length = strlen($node->textContent); + + if ($document_length === 0) { + return true; + } + + $ratio = $node_length * 100 / $document_length; + + if ($ratio >= 90) { + Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%'); + + return false; + } + + return true; + } +} diff --git a/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/ParserInterface.php b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/ParserInterface.php new file mode 100644 index 00000000..3ded4b1c --- /dev/null +++ b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/ParserInterface.php @@ -0,0 +1,20 @@ +<?php + +namespace PicoFeed\Scraper; + +interface ParserInterface +{ + /** + * Execute the parser and return the contents. + * + * @return string + */ + public function execute(); + + /** + * Find link for next page of the article. + * + * @return string + */ + public function findNextLink(); +} diff --git a/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleLoader.php b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleLoader.php new file mode 100644 index 00000000..6650682d --- /dev/null +++ b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleLoader.php @@ -0,0 +1,107 @@ +<?php + +namespace PicoFeed\Scraper; + +use PicoFeed\Base; +use PicoFeed\Logging\Logger; + +/** + * RuleLoader class. + * + * @author Frederic Guillot + * @author Bernhard Posselt + */ +class RuleLoader extends Base +{ + /** + * Get the rules for an URL. + * + * @param string $url the URL that should be looked up + * + * @return array the array containing the rules + */ + public function getRules($url) + { + $hostname = parse_url($url, PHP_URL_HOST); + + if ($hostname !== false) { + $files = $this->getRulesFileList($hostname); + + foreach ($this->getRulesFolders() as $folder) { + $rule = $this->loadRuleFile($folder, $files); + + if (!empty($rule)) { + return $rule; + } + } + } + + return array(); + } + + /** + * Get the list of possible rules file names for a given hostname. + * + * @param string $hostname Hostname + * + * @return array + */ + public function getRulesFileList($hostname) + { + $files = array($hostname); // subdomain.domain.tld + $parts = explode('.', $hostname); + $len = count($parts); + + if ($len > 2) { + $subdomain = array_shift($parts); + $files[] = implode('.', $parts); // domain.tld + $files[] = '.'.implode('.', $parts); // .domain.tld + $files[] = $subdomain; // subdomain + } elseif ($len === 2) { + $files[] = '.'.implode('.', $parts); // .domain.tld + $files[] = $parts[0]; // domain + } + + return $files; + } + + /** + * Load a rule file from the defined folder. + * + * @param string $folder Rule directory + * @param array $files List of possible file names + * + * @return array + */ + public function loadRuleFile($folder, array $files) + { + foreach ($files as $file) { + $filename = $folder.'/'.$file.'.php'; + if (file_exists($filename)) { + Logger::setMessage(get_called_class().' Load rule: '.$file); + + return include $filename; + } + } + + return array(); + } + + /** + * Get the list of folders that contains rules. + * + * @return array + */ + public function getRulesFolders() + { + $folders = array(); + + if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) { + $folders[] = $this->config->getGrabberRulesFolder(); + } + + $folders[] = __DIR__ . '/../Rules'; + + return $folders; + } +} diff --git a/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php new file mode 100644 index 00000000..9beb59c1 --- /dev/null +++ b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/RuleParser.php @@ -0,0 +1,102 @@ +<?php + +namespace PicoFeed\Scraper; + +use DOMXPath; +use PicoFeed\Parser\XmlParser; + +/** + * Rule Parser. + * + * @author Frederic Guillot + */ +class RuleParser implements ParserInterface +{ + private $dom; + private $xpath; + private $rules = array(); + + /** + * Constructor. + * + * @param string $html + * @param array $rules + */ + public function __construct($html, array $rules) + { + $this->rules = $rules; + $this->dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html); + $this->xpath = new DOMXPath($this->dom); + } + + /** + * Get the relevant content with predefined rules. + * + * @return string + */ + public function execute() + { + $this->stripTags(); + + return $this->findContent(); + } + + /** + * Remove HTML tags. + */ + public function stripTags() + { + if (isset($this->rules['strip']) && is_array($this->rules['strip'])) { + foreach ($this->rules['strip'] as $pattern) { + $nodes = $this->xpath->query($pattern); + + if ($nodes !== false && $nodes->length > 0) { + foreach ($nodes as $node) { + $node->parentNode->removeChild($node); + } + } + } + } + } + + /** + * Fetch content based on Xpath rules. + */ + public function findContent() + { + $content = ''; + if (isset($this->rules['body']) && is_array($this->rules['body'])) { + foreach ($this->rules['body'] as $pattern) { + $nodes = $this->xpath->query($pattern); + + if ($nodes !== false && $nodes->length > 0) { + foreach ($nodes as $node) { + $content .= $this->dom->saveXML($node); + } + } + } + } + + return $content; + } + + /** + * Fetch next link based on Xpath rules. + * + * @return string + */ + public function findNextLink() + { + if (isset($this->rules['next_page']) && is_array($this->rules['next_page'])) { + foreach ($this->rules['next_page'] as $pattern) { + $nodes = $this->xpath->query($pattern); + if ($nodes !== false && $nodes->length > 0) { + foreach ($nodes as $node) { + return $node->getAttribute('href'); + } + } + } + } + return null; + } +} diff --git a/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/Scraper.php b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/Scraper.php new file mode 100644 index 00000000..e5b9817f --- /dev/null +++ b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/Scraper.php @@ -0,0 +1,279 @@ +<?php + +namespace PicoFeed\Scraper; + +use PicoFeed\Base; +use PicoFeed\Client\Client; +use PicoFeed\Client\ClientException; +use PicoFeed\Client\Url; +use PicoFeed\Encoding\Encoding; +use PicoFeed\Filter\Filter; +use PicoFeed\Logging\Logger; +use PicoFeed\Parser\XmlParser; + +/** + * Scraper class. + * + * @author Frederic Guillot + */ +class Scraper extends Base +{ + /** + * URL. + * + * @var string + */ + private $url = ''; + + /** + * Relevant content. + * + * @var string + */ + private $content = ''; + + /** + * HTML content. + * + * @var string + */ + private $html = ''; + + /** + * HTML content encoding. + * + * @var string + */ + private $encoding = ''; + + /** + * Flag to enable candidates parsing. + * + * @var bool + */ + private $enableCandidateParser = true; + + /** + * Disable candidates parsing. + * + * @return Scraper + */ + public function disableCandidateParser() + { + $this->enableCandidateParser = false; + return $this; + } + + /** + * Get encoding. + * + * @return string + */ + public function getEncoding() + { + return $this->encoding; + } + + /** + * Set encoding. + * + * @param string $encoding + * + * @return Scraper + */ + public function setEncoding($encoding) + { + $this->encoding = $encoding; + + return $this; + } + + /** + * Get URL to download. + * + * @return string + */ + public function getUrl() + { + return $this->url; + } + + /** + * Set URL to download. + * + * @param string $url URL + * + * @return Scraper + */ + public function setUrl($url) + { + $this->url = $url; + + return $this; + } + + /** + * Return true if the scraper found relevant content. + * + * @return bool + */ + public function hasRelevantContent() + { + return !empty($this->content); + } + + /** + * Get relevant content. + * + * @return string + */ + public function getRelevantContent() + { + return $this->content; + } + + /** + * Get raw content (unfiltered). + * + * @return string + */ + public function getRawContent() + { + return $this->html; + } + + /** + * Set raw content (unfiltered). + * + * @param string $html + * + * @return Scraper + */ + public function setRawContent($html) + { + $this->html = $html; + + return $this; + } + + /** + * Get filtered relevant content. + * + * @return string + */ + public function getFilteredContent() + { + $filter = Filter::html($this->content, $this->url); + $filter->setConfig($this->config); + + return $filter->execute(); + } + + /** + * Download the HTML content. + * + * @return bool + */ + public function download() + { + if (!empty($this->url)) { + + // Clear everything + $this->html = ''; + $this->content = ''; + $this->encoding = ''; + + try { + $client = Client::getInstance(); + $client->setConfig($this->config); + $client->setTimeout($this->config->getGrabberTimeout()); + $client->setUserAgent($this->config->getGrabberUserAgent()); + $client->execute($this->url); + + $this->url = $client->getUrl(); + $this->html = $client->getContent(); + $this->encoding = $client->getEncoding(); + + return true; + } catch (ClientException $e) { + Logger::setMessage(get_called_class().': '.$e->getMessage()); + } + } + + return false; + } + + /** + * Execute the scraper. + */ + public function execute($pageContent = '', $recursionDepth = 0) + { + $this->html = ''; + $this->encoding = ''; + $this->content = ''; + $this->download(); + $this->prepareHtml(); + + $parser = $this->getParser(); + + if ($parser !== null) { + $maxRecursions = $this->config->getMaxRecursions(); + if(!isset($maxRecursions)){ + $maxRecursions = 25; + } + $pageContent .= $parser->execute(); + // check if there is a link to next page and recursively get content (max 25 pages) + if((($nextLink = $parser->findNextLink()) !== null) && $recursionDepth < $maxRecursions){ + $nextLink = Url::resolve($nextLink,$this->url); + $this->setUrl($nextLink); + $this->execute($pageContent,$recursionDepth+1); + } + else{ + $this->content = $pageContent; + } + Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes'); + } + } + + /** + * Get the parser. + * + * @return ParserInterface + */ + public function getParser() + { + $ruleLoader = new RuleLoader($this->config); + $rules = $ruleLoader->getRules($this->url); + + if (!empty($rules['grabber'])) { + Logger::setMessage(get_called_class().': Parse content with rules'); + + foreach ($rules['grabber'] as $pattern => $rule) { + $url = new Url($this->url); + $sub_url = $url->getFullPath(); + + if (preg_match($pattern, $sub_url)) { + Logger::setMessage(get_called_class().': Matched url '.$sub_url); + return new RuleParser($this->html, $rule); + } + } + } elseif ($this->enableCandidateParser) { + Logger::setMessage(get_called_class().': Parse content with candidates'); + } + + return new CandidateParser($this->html); + } + + /** + * Normalize encoding and strip head tag. + */ + public function prepareHtml() + { + $html_encoding = XmlParser::getEncodingFromMetaTag($this->html); + + $this->html = Encoding::convert($this->html, $html_encoding ?: $this->encoding); + $this->html = Filter::stripHeadTags($this->html); + + Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"'); + } +} |