From 24bd87bcaf315c761f85dc898fcaf9a715b630c4 Mon Sep 17 00:00:00 2001
From: emkael
Date: Fri, 2 Apr 2021 03:06:56 +0200
Subject: Using an external (Python...) library to fetch FB posts

Obligatory "Fuck you, Zuckerberg"
---
Review notes (not part of the commit message):
 * The line structure of this patch was rebuilt from a copy whose line
   breaks and indentation had been collapsed. Four-space indentation is
   assumed - verify the context lines against the tree before applying.
 * bin/get-fb-content.py was revised during review (None-safe text
   handling, blank-line filtering after stripping, usage check, main()
   guard), so its content no longer matches blob c7238e0 named below.

 bin/get-fb-content.py  | 54 ++++++++++++++++++++++++++++++++
 providers/Facebook.php | 84 ++++++++------------------------------------------
 2 files changed, 67 insertions(+), 71 deletions(-)
 create mode 100644 bin/get-fb-content.py

diff --git a/bin/get-fb-content.py b/bin/get-fb-content.py
new file mode 100644
index 0000000..c7238e0
--- /dev/null
+++ b/bin/get-fb-content.py
@@ -0,0 +1,54 @@
+"""Dump recent posts of a Facebook page as a JSON array on stdout.
+
+Usage: get-fb-content.py PAGE
+
+Each array element is an object with the keys ``id``, ``time`` and
+``texts`` (the post text split into stripped, non-empty lines).
+"""
+import json
+import logging
+import sys
+from os import path
+
+from facebook_scraper import get_posts
+
+
+BASEDIR = path.dirname(path.abspath(__file__))
+COOKIES_FILE = path.join(BASEDIR, '../config/facebook.com_cookies.txt')
+PAGES = 3
+
+
+def split_text(text):
+    """Return the stripped, non-empty lines of *text* (None gives [])."""
+    # Strip before filtering so that whitespace-only lines are dropped
+    # instead of being kept as empty strings.
+    lines = (line.strip() for line in (text or '').split('\n'))
+    return [line for line in lines if line]
+
+
+def fetch_posts(page):
+    """Return the posts of *page* as a list of JSON-serializable dicts."""
+    return [
+        {
+            'id': post['post_id'],
+            'time': str(post['time']),
+            # NOTE(review): text seems to be None for posts without a
+            # text body - confirm against facebook_scraper.
+            'texts': split_text(post.get('text')),
+        }
+        for post in get_posts(page, cookies=COOKIES_FILE, pages=PAGES)
+    ]
+
+
+def main(argv):
+    """Print the posts of the page named by argv[1]; return the exit code."""
+    if len(argv) < 2:
+        # Diagnostics go to stderr: stdout carries nothing but the JSON.
+        sys.stderr.write('Usage: %s PAGE\n' % argv[0])
+        return 2
+    print(json.dumps(fetch_posts(argv[1])))
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))
diff --git a/providers/Facebook.php b/providers/Facebook.php
index d5233c3..7757dff 100644
--- a/providers/Facebook.php
+++ b/providers/Facebook.php
@@ -5,91 +5,25 @@ namespace Providers;
 require_once('HtmlFeed.php');
 require_once('Item.php');
 
-class Facebook extends \Providers\HtmlFeed {
+class Facebook extends \Providers\Provider {
 
     protected $_cacheTimeout = '300 years';
 
     public function __construct($feed, $options) {
         parent::__construct($feed, $options);
-        if (isset($this->_options['dump'])) {
-            $this->_options['force'] = TRUE;
-        }
         if (isset($this->_options['force'])) {
             $this->_cacheTimeout = '1 second';
         }
     }
 
-    protected function __getUserAgent() {
-        return 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.51 Safari/537.36';
-    }
-
     protected function _getCachePath() {
         return '../cache/facebook.%s';
     }
 
-    protected function _getFeedUrl($feed) {
-        return sprintf('https://m.facebook.com/%s/posts', $feed);
-    }
-
-    private function _extractTimestamp($block) {
-        $origString = (string)$block->find('abbr')->eq(0)->text();
-        $string = str_replace("\xc2\xa0", ' ', $origString);
-        $string = str_replace(' godzinie ', ' ', $string);
-        $string = str_replace(' o ', ', ', $string);
-        $string = str_replace('Wczoraj', 'Yesterday', $string);
-        $string = str_replace('Dzisiaj', '', $string);
-        $string = str_replace('godz.', 'hours ago', $string);
-        $string = str_replace('min', 'minutes ago', $string);
-        $string = str_replace('Przed chwilą', 'now', $string);
-        $string = str_replace(['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca', 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'], ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'], $string);
-        $time = strtotime($string);
-        if (!$time) {
-            var_dump(bin2hex($origString));
-            throw new \Exception('Cannot parse date string: ' . $origString);
-        }
-        return $string;
-    }
-
-    protected function _parseFeedContent($tree) {
-        $items = [];
-        if (isset($this->_options['dump'])) {
-            print($tree->html());
-        }
-        foreach ($tree->find('#timelineBody div[data-ft]') as $header) {
-            $data = json_decode($header->attr()['data-ft'], TRUE);
-            if (isset($this->_options['dump'])) {
-                var_dump($data);
-            }
-            if (isset($data['mf_story_key'])) {
-                $key = $data['mf_story_key'];
-                $texts = [];
-                foreach ($header->find('p, h3') as $paragraph) {
-                    $text = $paragraph->text();
-                    if ($text != 'Więcej') {
-                        $texts[] = $text;
-                    }
-                }
-                if (isset($this->_options['dump'])) {
-                    print_r($data);
-                    print($key);
-                    print(PHP_EOL);
-                    print_r($texts);
-                    print(PHP_EOL);
-                }
-                if (count($texts)) {
-                    $items[$key] = [
-                        'id' => $key,
-                        'time' => $this->_extractTimestamp($header),
-                        'content' => $header->html(),
-                        'texts' => $texts
-                    ];
-                }
-            }
-        }
-        if (isset($this->_options['dump'])) {
-            die();
-        }
-        return array_values($items);
+    protected function _fetchItems() {
+        $jsonContent = [];
+        exec('python3 ' . implode(DIRECTORY_SEPARATOR, [dirname(__FILE__), '..', 'bin', 'get-fb-content.py']) . ' ' . escapeshellarg($this->_feed), $jsonContent);
+        return json_decode(implode(PHP_EOL, $jsonContent), TRUE);
     }
 
     protected function _mapItems($content) {
@@ -110,6 +44,14 @@ class Facebook extends \Providers\HtmlFeed {
         );
     }
 
+    protected function _sortContent($content) {
+        return $content;
+    }
+
+    protected function _spamFilter($items) {
+        return $items;
+    }
+
     public function title() {
         return sprintf("%s's Facebook page posts", $this->_feed);
     }
-- 
cgit v1.2.3