diff options
-rw-r--r-- | _cron/emoji-list | 2 | ||||
-rw-r--r-- | _cron/fb-cache-files | 3 | ||||
-rw-r--r-- | bin/emoji-list.py | 12 | ||||
-rw-r--r-- | bin/fb-scrape/get-fb-content.py | 7 | ||||
-rw-r--r-- | bin/fb-scrape/requirements.txt | 2 | ||||
-rwxr-xr-x | bin/refresh-fb-cache.php | 8 | ||||
-rw-r--r-- | config/emoji.json | bin | 96527 -> 100941 bytes | |||
-rw-r--r-- | config/facebook.com_cookies.txt | bin | 875 -> 1107 bytes | |||
-rw-r--r-- | config/pagediff.json | bin | 0 -> 799 bytes | |||
-rw-r--r-- | config/rss.json | bin | 1369 -> 2371 bytes | |||
-rw-r--r-- | http/index.php | 2 | ||||
-rw-r--r-- | providers/Facebook.php | 18 | ||||
-rw-r--r-- | providers/Pagediff.php | 110 | ||||
-rw-r--r-- | providers/Provider.php | 9 | ||||
-rw-r--r-- | providers/Rss.php | 4 | ||||
-rw-r--r-- | providers/Twitter.php | 2 | ||||
-rw-r--r-- | providers/XmlFeed.php | 2 | ||||
-rw-r--r-- | providers/Youtube.php | 19 |
18 files changed, 180 insertions, 20 deletions
diff --git a/_cron/emoji-list b/_cron/emoji-list index f705d00..38abacf 100644 --- a/_cron/emoji-list +++ b/_cron/emoji-list @@ -1 +1 @@ -25 15 * * * python $SITEPATH/bin/emoji-list.py +25 15 * * 3 python3 $SITEPATH/bin/emoji-list.py diff --git a/_cron/fb-cache-files b/_cron/fb-cache-files index 4e116d3..cbcbba0 100644 --- a/_cron/fb-cache-files +++ b/_cron/fb-cache-files @@ -1,3 +1,2 @@ DIRENV_LOG_FORMAT="" -14 * * * * $SITEPATH/bin/refresh-fb-cache.php -10 * * * * find $SITEPATH/cache -size 6c +# 14 */3 * * * $SITEPATH/bin/refresh-fb-cache.php diff --git a/bin/emoji-list.py b/bin/emoji-list.py index 823347d..9e17c30 100644 --- a/bin/emoji-list.py +++ b/bin/emoji-list.py @@ -1,23 +1,23 @@ -import json, os, urllib +import json, os, urllib.request, urllib.parse, urllib.error import xml.etree.ElementTree as ET -emoji_list = json.load(urllib.urlopen( +emoji_list = json.load(urllib.request.urlopen( 'https://raw.githubusercontent.com/iamcal/emoji-data/master/emoji.json' )) dictionary = {} for emoji in emoji_list: - character = ''.join([unichr(int(nibble, 16)) for nibble in emoji['unified'].split('-')]) + character = ''.join([chr(int(nibble, 16)) for nibble in emoji['unified'].split('-')]) name = emoji['name'].replace(' ', '_') if emoji['name'] else emoji['short_name'].upper().replace('-', '_') dictionary[character] = ' [%s] ' % (name) -retardspeak_map = ET.fromstring(urllib.urlopen( +stupidspeak_map = ET.fromstring(urllib.request.urlopen( 'http://slothsoft.net/getResource.php/slothsoft/unicode-mapper' ).read()) -for letter in retardspeak_map.findall('.//letter'): +for letter in stupidspeak_map.findall('.//letter'): if letter.attrib['target'] != letter.attrib['source']: dictionary[letter.attrib['target']] = letter.attrib['source'] -json.dump(dictionary, file(os.path.join( +json.dump(dictionary, open(os.path.join( os.path.dirname(os.path.realpath(__file__)), '../config/emoji.json' ), 'w')) diff --git a/bin/fb-scrape/get-fb-content.py b/bin/fb-scrape/get-fb-content.py 
index 5b22f70..6d6b7ba 100644 --- a/bin/fb-scrape/get-fb-content.py +++ b/bin/fb-scrape/get-fb-content.py @@ -8,6 +8,7 @@ from pytz_deprecation_shim import PytzUsageWarning from random import randint from facebook_scraper import get_posts, enable_logging +from facebook_scraper.exceptions import TemporarilyBanned from requests.exceptions import RequestException debug = len(sys.argv) > 2 and sys.argv[2] == 'debug' @@ -21,6 +22,10 @@ warnings.filterwarnings( ) warnings.filterwarnings( action='ignore', + message=r"Facebook says 'Unsupported Browser'" +) +warnings.filterwarnings( + action='ignore', category=PytzUsageWarning ) @@ -36,7 +41,7 @@ try: 'images': post['images'] }) time.sleep(randint(10, 15)) -except RequestException: +except (RequestException, TemporarilyBanned): pass print(json.dumps(posts)) diff --git a/bin/fb-scrape/requirements.txt b/bin/fb-scrape/requirements.txt new file mode 100644 index 0000000..6632c5e --- /dev/null +++ b/bin/fb-scrape/requirements.txt @@ -0,0 +1,2 @@ +facebook-scraper +pytz-deprecation-shim diff --git a/bin/refresh-fb-cache.php b/bin/refresh-fb-cache.php index ca50146..6a841de 100755 --- a/bin/refresh-fb-cache.php +++ b/bin/refresh-fb-cache.php @@ -13,7 +13,7 @@ $cacheFiles = glob('../cache/facebook.*'); $fileToFetch = NULL; $oldestCache = PHP_INT_MAX; -foreach ($cacheFiles as $file) { +/*foreach ($cacheFiles as $file) { if (!preg_match('/\.Piwoteka$/', $file)) { $cacheTime = filemtime($file); if ($cacheTime < $oldestCache) { @@ -23,12 +23,12 @@ foreach ($cacheFiles as $file) { $fileToFetch = implode('.', $fileToFetch); } } -} + }*/ -$filesToFetch = ['Piwoteka', 'fermentlodz', $fileToFetch]; +$filesToFetch = ['Piwoteka', 'EtreGourmet']; //, $fileToFetch]; foreach ($filesToFetch as $feed) { - sleep(rand(60, 90)); + sleep(rand(90, 120)); $provider = new \Providers\Facebook($feed, ['force' => TRUE]); if (!$provider->get()) { throw new \Exception(sprintf('Feed %s failed to fetch!', $feed)); diff --git a/config/emoji.json 
b/config/emoji.json Binary files differ index 4ef174d..b4d7014 100644 --- a/config/emoji.json +++ b/config/emoji.json diff --git a/config/facebook.com_cookies.txt b/config/facebook.com_cookies.txt Binary files differ index 42160a8..8f01e56 100644 --- a/config/facebook.com_cookies.txt +++ b/config/facebook.com_cookies.txt diff --git a/config/pagediff.json b/config/pagediff.json Binary files differ new file mode 100644 index 0000000..d966516 --- /dev/null +++ b/config/pagediff.json diff --git a/config/rss.json b/config/rss.json Binary files differ index fcf0058..961ab73 100644 --- a/config/rss.json +++ b/config/rss.json diff --git a/http/index.php b/http/index.php index ca9b6ba..78e3901 100644 --- a/http/index.php +++ b/http/index.php @@ -26,7 +26,7 @@ $feed = array_shift($params); $parsedParams = []; foreach ($params as $param) { $splitParam = explode(':', $param, 2); - $parsedParams[$splitParam[0]] = count($splitParam) > 1 ? $plitParam[1] : TRUE; + $parsedParams[$splitParam[0]] = count($splitParam) > 1 ? $splitParam[1] : TRUE; } try { diff --git a/providers/Facebook.php b/providers/Facebook.php index 20f5028..384c013 100644 --- a/providers/Facebook.php +++ b/providers/Facebook.php @@ -26,7 +26,23 @@ class Facebook extends \Providers\Provider { implode(DIRECTORY_SEPARATOR, [dirname(__FILE__), '..', 'bin', 'fb-scrape']) . ' ' . 'python ' . implode(DIRECTORY_SEPARATOR, [dirname(__FILE__), '..', 'bin', 'fb-scrape', 'get-fb-content.py']) . ' ' . 
escapeshellarg($this->_feed), $jsonContent); - return json_decode(implode(PHP_EOL, $jsonContent), TRUE); + $cacheFile = sprintf($this->_getCachePath(), $this->_feed); + if (file_exists($cacheFile)) { + $cache = unserialize($this->_getCache($cacheFile)); + } + else { + $cache = []; + } + $fetched = json_decode(implode(PHP_EOL, $jsonContent), TRUE); + $cacheIDs = array_map(function($obj) { + return $obj['id']; + }, $cache); + foreach ($fetched as $fetchedItem) { + if (!in_array($fetchedItem['id'], $cacheIDs)) { + $cache[] = $fetchedItem; + } + } + return $cache; } protected function _mapItems($content) { diff --git a/providers/Pagediff.php b/providers/Pagediff.php new file mode 100644 index 0000000..8679532 --- /dev/null +++ b/providers/Pagediff.php @@ -0,0 +1,110 @@ +<?php + +namespace Providers; + +require_once('HtmlFeed.php'); +require_once('Item.php'); + +class Pagediff extends \Providers\HtmlFeed { + + protected $_cacheTimeout = '1 hour'; + + public function __construct($feed, $options=[]) { + $config = json_decode(file_get_contents('../config/pagediff.json'), TRUE); + if (!isset($config[$feed])) { + throw new \Exception(sprintf('Feed %s not configured', $feed)); + } + $this->_config = $config[$feed]; + parent::__construct($feed, $options); + } + + protected function _getCachePath() { + return '../cache/pagediff.%s'; + } + + protected function _getFeedUrl($feed) { + return $this->_config['url']; + } + + private function _getItemCachePath() { + return sprintf('../cache/pagediff.items.%s', $this->_feed); + } + + private function _getCachedContent() { + if (!file_exists($this->_getItemCachePath())) { + return []; + } + return unserialize( + file_get_contents( + $this->_getItemCachePath() + ) + ); + } + + private function _saveCachedContent($content) { + return file_put_contents( + $this->_getItemCachePath(), + serialize($content) + ); + } + + private function _getContentFromSelector($tree, $selector) { + $node = $tree->find($selector['node']); + if ($node->count() 
== 0) { + return NULL; + } + if ($node->count() != 1) { + if (isset($selector['index'])) { + $node = $node->eq($selector['index']); + } else { + $node = $node->first(); + } + } + if (isset($selector['html'])) { + return $node->innerHTML(); + } + if (isset($selector['attr'])) { + $text = $node->attr()[$selector['attr']]; + } else { + $text = $node->text(); + } + if (isset($selector['transform'])) { + $text = sprintf($selector['transform'], $text); + } + return $text; + } + + protected function _parseFeedContent($tree) { + $selectors = $this->_config['selectors']; + $items = $this->_getCachedContent(); + $currentItem = []; + foreach (['id', 'link', 'name', 'text'] as $type) { + $currentItem[$type] = $this->_getContentFromSelector($tree, $selectors[$type]); + } + $currentItem['time'] = date('Y-m-d H:i:s'); + if (!count($items) || $currentItem['id'] != $items[0]['id']) { + $items = array_merge([$currentItem], $items); + $this->_saveCachedContent($items); + } + return $items; + } + + protected function _mapItems($items) { + return array_map(function($item) { + $i = new Item(); + $i->ID = $item['id']; + $i->Title = $item['name']; + $i->Time = $item['time']; + $i->Text = $item['text']; + $i->Link = $item['link']; + return $i; + }, $items); + } + + public function title() { + return $this->_config['title']; + } + +} + +?> diff --git a/providers/Provider.php b/providers/Provider.php index 434b1d1..fe2195c 100644 --- a/providers/Provider.php +++ b/providers/Provider.php @@ -35,6 +35,9 @@ abstract class Provider { return unserialize($this->_getCache($cacheFile)); } else { $content = $this->_fetchItems(); + if (empty($content) && file_exists($cacheFile)) { + return unserialize($this->_getCache($cacheFile)); + } file_put_contents($cacheFile, serialize($content)); $this->_cacheTime = time(); return $content; @@ -51,6 +54,12 @@ abstract class Provider { } } } + if (array_key_exists('title', $this->_options)) { + $keyword = strtolower($this->_options['title']); + $items = 
array_filter($items, function($item) use($keyword) { + return str_contains(strtolower($item->Title), $keyword); + }); + } return $items; } diff --git a/providers/Rss.php b/providers/Rss.php index 8e20e3f..61e5321 100644 --- a/providers/Rss.php +++ b/providers/Rss.php @@ -31,13 +31,13 @@ class Rss extends \Providers\XmlFeed { protected function _mapItems($content) { $items = []; foreach ($content as $contentString) { - $itemString = str_replace(['content:encoded>', '<dc:', '</dc:', '<media:', '</media:', '<wfw:', '</wfw:'], ['content>', '<', '</', '<', '</', '<', '</'], $contentString); + $itemString = str_replace(['content:encoded>', '<yt:', '</yt:', '<dc:', '</dc:', '<media:', '</media:', '<wfw:', '</wfw:'], ['content>', '<', '</', '<', '</', '<', '</', '<', '</'], $contentString); $item = new \SimpleXMLElement($itemString); $itemObject = new Item(); $itemObject->ID = strval($item->id ?: $item->guid) ?: ltrim(parse_url(strval($item->link))['path'], '/'); $itemObject->Title = strval($item->title); $itemObject->Time = strval($item->published ?: $item->pubDate ?: $item->updated); - $itemObject->Text = strval($item->summary ?: $item->description ?: $item->content) ?: ($item->description ?: $item->content)->children()->asXML(); + $itemObject->Text = strval($item->summary ?: $item->description ?: $item->content ?: $item->group->description); $itemObject->Link = strval(isset($item->link['href']) ? $item->link->attributes()['href'] : $item->link); $itemObject->Author = strval($item->creator ? $item->creator : (is_string($item->author) ? 
$item->author : $item->author->name)); $items[] = $itemObject; diff --git a/providers/Twitter.php b/providers/Twitter.php index 9a98364..11532b5 100644 --- a/providers/Twitter.php +++ b/providers/Twitter.php @@ -45,7 +45,7 @@ class Twitter extends \Providers\Provider { }, $content->errors )); } - throw new Exception($errorString); + throw new \Exception($errorString); } unset($content->httpstatus); diff --git a/providers/XmlFeed.php b/providers/XmlFeed.php index d5ecd1c..0ac5e6f 100644 --- a/providers/XmlFeed.php +++ b/providers/XmlFeed.php @@ -15,7 +15,7 @@ abstract class XmlFeed extends \Providers\Provider { parent::__construct($feed, $options); $this->_feedUrl = $this->_getFeedUrl($feed); if (!$this->_feedUrl) { - throw new Exception('XML feed "' . $feed . '" undefined'); + throw new \Exception('XML feed "' . $feed . '" undefined'); } } diff --git a/providers/Youtube.php b/providers/Youtube.php new file mode 100644 index 0000000..bd38a52 --- /dev/null +++ b/providers/Youtube.php @@ -0,0 +1,19 @@ +<?php + +namespace Providers; + +require_once('Rss.php'); + +class Youtube extends Rss { + + protected function _getFeedUrl($feed) { + return 'https://www.youtube.com/feeds/videos.xml?channel_id=' . $feed; + } + + protected function _getCachePath() { + return '../cache/youtube.%s'; + } + +} + +?> |