diff options
Diffstat (limited to 'vendor/miniflux/picofeed/lib/PicoFeed/Scraper/CandidateParser.php')
-rw-r--r-- | vendor/miniflux/picofeed/lib/PicoFeed/Scraper/CandidateParser.php | 283 |
1 files changed, 283 insertions, 0 deletions
diff --git a/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/CandidateParser.php b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/CandidateParser.php new file mode 100644 index 00000000..6c53a289 --- /dev/null +++ b/vendor/miniflux/picofeed/lib/PicoFeed/Scraper/CandidateParser.php @@ -0,0 +1,283 @@ +<?php + +namespace PicoFeed\Scraper; + +use DomDocument; +use DOMXPath; +use PicoFeed\Logging\Logger; +use PicoFeed\Parser\XmlParser; + +/** + * Candidate Parser. + * + * @author Frederic Guillot + */ +class CandidateParser implements ParserInterface +{ + private $dom; + private $xpath; + + /** + * List of attributes to try to get the content, order is important, generic terms at the end. + * + * @var array + */ + private $candidatesAttributes = array( + 'articleBody', + 'articlebody', + 'article-body', + 'articleContent', + 'articlecontent', + 'article-content', + 'articlePage', + 'post-content', + 'post_content', + 'entry-content', + 'entry-body', + 'main-content', + 'story_content', + 'storycontent', + 'entryBox', + 'entrytext', + 'comic', + 'post', + 'article', + 'content', + 'main', + ); + + /** + * List of attributes to strip. + * + * @var array + */ + private $stripAttributes = array( + 'comment', + 'share', + 'links', + 'toolbar', + 'fb', + 'footer', + 'credit', + 'bottom', + 'nav', + 'header', + 'social', + 'tag', + 'metadata', + 'entry-utility', + 'related-posts', + 'tweet', + 'categories', + 'post_title', + 'by_line', + 'byline', + 'sponsors', + ); + + /** + * Tags to remove. + * + * @var array + */ + private $stripTags = array( + 'nav', + 'header', + 'footer', + 'aside', + 'form', + ); + + /** + * Constructor. + * + * @param string $html + */ + public function __construct($html) + { + $this->dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html); + $this->xpath = new DOMXPath($this->dom); + } + + /** + * Get the relevant content with the list of potential attributes. + * + * @return string + */ + public function execute() + { + $content = $this->findContentWithCandidates(); + + if (strlen($content) < 200) { + $content = $this->findContentWithArticle(); + } + + if (strlen($content) < 50) { + $content = $this->findContentWithBody(); + } + + return $this->stripGarbage($content); + } + + /** + * Find content based on the list of tag candidates. + * + * @return string + */ + public function findContentWithCandidates() + { + foreach ($this->candidatesAttributes as $candidate) { + Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"'); + + $nodes = $this->xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]'); + + if ($nodes !== false && $nodes->length > 0) { + Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'"'); + + return $this->dom->saveXML($nodes->item(0)); + } + } + + return ''; + } + + /** + * Find <article/> tag. + * + * @return string + */ + public function findContentWithArticle() + { + $nodes = $this->xpath->query('//article'); + + if ($nodes !== false && $nodes->length > 0) { + Logger::setMessage(get_called_class().': Find <article/> tag'); + + return $this->dom->saveXML($nodes->item(0)); + } + + return ''; + } + + /** + * Find <body/> tag. + * + * @return string + */ + public function findContentWithBody() + { + $nodes = $this->xpath->query('//body'); + + if ($nodes !== false && $nodes->length > 0) { + Logger::setMessage(get_called_class().' Find <body/>'); + + return $this->dom->saveXML($nodes->item(0)); + } + + return ''; + } + + /** + * Strip useless tags. + * + * @param string $content + * + * @return string + */ + public function stripGarbage($content) + { + $dom = XmlParser::getDomDocument($content); + + if ($dom !== false) { + $xpath = new DOMXPath($dom); + + $this->stripTags($xpath); + $this->stripAttributes($dom, $xpath); + + $content = $dom->saveXML($dom->documentElement); + } + + return $content; + } + + /** + * Remove blacklisted tags. + * + * @param DOMXPath $xpath + */ + public function stripTags(DOMXPath $xpath) + { + foreach ($this->stripTags as $tag) { + $nodes = $xpath->query('//'.$tag); + + if ($nodes !== false && $nodes->length > 0) { + Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"'); + + foreach ($nodes as $node) { + $node->parentNode->removeChild($node); + } + } + } + } + + /** + * Remove blacklisted attributes. + * + * @param DomDocument $dom + * @param DOMXPath $xpath + */ + public function stripAttributes(DomDocument $dom, DOMXPath $xpath) + { + foreach ($this->stripAttributes as $attribute) { + $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]'); + + if ($nodes !== false && $nodes->length > 0) { + Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"'); + + foreach ($nodes as $node) { + if ($this->shouldRemove($dom, $node)) { + $node->parentNode->removeChild($node); + } + } + } + } + } + + /** + * Find link for next page of the article. + * + * @return string + */ + public function findNextLink() + { + return null; + } + + /** + * Return false if the node should not be removed. + * + * @param DomDocument $dom + * @param DomNode $node + * + * @return bool + */ + public function shouldRemove(DomDocument $dom, $node) + { + $document_length = strlen($dom->textContent); + $node_length = strlen($node->textContent); + + if ($document_length === 0) { + return true; + } + + $ratio = $node_length * 100 / $document_length; + + if ($ratio >= 90) { + Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%'); + + return false; + } + + return true; + } +} |