From 5df7a81d952c9aed7c9adadadecdeb84f1e5c264 Mon Sep 17 00:00:00 2001
From: emkael
Date: Thu, 31 Jan 2019 17:43:10 +0100
Subject: Universal classes for XML/HTML scraping

---
 providers/HtmlFeed.php | 44 ++++++++++++++++++++++++++++++++++++++++++++
 providers/XmlFeed.php  | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+)
 create mode 100644 providers/HtmlFeed.php
 create mode 100644 providers/XmlFeed.php

(limited to 'providers')

diff --git a/providers/HtmlFeed.php b/providers/HtmlFeed.php
new file mode 100644
index 0000000..411cddf
--- /dev/null
+++ b/providers/HtmlFeed.php
@@ -0,0 +1,44 @@
+_encoding = mb_detect_encoding($page);
+        return $page;
+    }
+
+    protected function _fetchItems() {
+        $page = $this->__getHttpContent($this->_feedUrl);
+        $this->_feedXml = htmlqp($page, NULL, ['convert_from_encoding' => $this->_encoding, 'convert_to_encoding' => $this->_encoding]);
+        return $this->_parseFeedContent($this->_feedXml);
+    }
+
+}
+
+?>
diff --git a/providers/XmlFeed.php b/providers/XmlFeed.php
new file mode 100644
index 0000000..d5ecd1c
--- /dev/null
+++ b/providers/XmlFeed.php
@@ -0,0 +1,39 @@
+_feedUrl = $this->_getFeedUrl($feed);
+        if (!$this->_feedUrl) {
+            throw new Exception('XML feed "' . $feed . '" undefined');
+        }
+    }
+
+    abstract protected function _parseFeedContent($feed);
+
+    protected function _fetchItems() {
+        $this->_feedXml = new \SimpleXMLElement($this->_feedUrl, 0, TRUE);
+        return $this->_parseFeedContent($this->_feedXml);
+    }
+
+    protected function _spamFilter($content) {
+        return $content;
+    }
+
+    protected function _sortContent($content) {
+        return $content;
+    }
+
+}
+
+?>
--
cgit v1.2.3