summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author emkael <emkael@tlen.pl> 2021-04-02 03:06:56 +0200
committer emkael <emkael@tlen.pl> 2021-04-02 03:06:56 +0200
commit 24bd87bcaf315c761f85dc898fcaf9a715b630c4 (patch)
tree 06d47e54410feeefb5c38cd95eb93d0ae3bb2ff0
parent 2bb2ae4c54beba67e7c63a02c7509182379589d9 (diff)
Using an external (Python...) library to fetch FB posts
Obligatory "Fuck you, Zuckerberg"
-rw-r--r-- bin/get-fb-content.py 19
-rw-r--r-- providers/Facebook.php 84
2 files changed, 32 insertions, 71 deletions
diff --git a/bin/get-fb-content.py b/bin/get-fb-content.py
new file mode 100644
index 0000000..c7238e0
--- /dev/null
+++ b/bin/get-fb-content.py
@@ -0,0 +1,19 @@
+import json
+import logging
+import sys
+from os import path
+
+from facebook_scraper import get_posts
+
+
+# Directory containing this script; the cookie file is resolved relative to it.
+BASEDIR = path.dirname(__file__)
+
+# Collect posts of the account named by the first command-line argument.
+posts = []
+for post in get_posts(sys.argv[1], cookies=path.join(BASEDIR, '../config/facebook.com_cookies.txt'), pages=3):
+    # 'text' may be None for posts without a text body; treat that as empty.
+    # Strip before filtering so whitespace-only lines are dropped instead of
+    # being kept as empty strings.
+    lines = [line.strip() for line in (post['text'] or '').split('\n')]
+    posts.append({
+        'id': post['post_id'],
+        'time': str(post['time']),
+        'texts': [line for line in lines if line]
+    })
+
+# Emit the collected posts as a single JSON array on stdout.
+print(json.dumps(posts))
diff --git a/providers/Facebook.php b/providers/Facebook.php
index d5233c3..7757dff 100644
--- a/providers/Facebook.php
+++ b/providers/Facebook.php
@@ -5,91 +5,25 @@ namespace Providers;
require_once('HtmlFeed.php');
require_once('Item.php');
-class Facebook extends \Providers\HtmlFeed {
+class Facebook extends \Providers\Provider {
protected $_cacheTimeout = '300 years';
public function __construct($feed, $options) {
parent::__construct($feed, $options);
- if (isset($this->_options['dump'])) {
- $this->_options['force'] = TRUE;
- }
if (isset($this->_options['force'])) {
$this->_cacheTimeout = '1 second';
}
}
- protected function __getUserAgent() {
- return 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.51 Safari/537.36';
- }
-
protected function _getCachePath() {
return '../cache/facebook.%s';
}
- protected function _getFeedUrl($feed) {
- return sprintf('https://m.facebook.com/%s/posts', $feed);
- }
-
- private function _extractTimestamp($block) {
- $origString = (string)$block->find('abbr')->eq(0)->text();
- $string = str_replace("\xc2\xa0", ' ', $origString);
- $string = str_replace(' godzinie ', ' ', $string);
- $string = str_replace(' o ', ', ', $string);
- $string = str_replace('Wczoraj', 'Yesterday', $string);
- $string = str_replace('Dzisiaj', '', $string);
- $string = str_replace('godz.', 'hours ago', $string);
- $string = str_replace('min', 'minutes ago', $string);
- $string = str_replace('Przed chwilą', 'now', $string);
- $string = str_replace(['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca', 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'], ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'], $string);
- $time = strtotime($string);
- if (!$time) {
- var_dump(bin2hex($origString));
- throw new \Exception('Cannot parse date string: ' . $origString);
- }
- return $string;
- }
-
- protected function _parseFeedContent($tree) {
- $items = [];
- if (isset($this->_options['dump'])) {
- print($tree->html());
- }
- foreach ($tree->find('#timelineBody div[data-ft]') as $header) {
- $data = json_decode($header->attr()['data-ft'], TRUE);
- if (isset($this->_options['dump'])) {
- var_dump($data);
- }
- if (isset($data['mf_story_key'])) {
- $key = $data['mf_story_key'];
- $texts = [];
- foreach ($header->find('p, h3') as $paragraph) {
- $text = $paragraph->text();
- if ($text != 'Więcej') {
- $texts[] = $text;
- }
- }
- if (isset($this->_options['dump'])) {
- print_r($data);
- print($key);
- print(PHP_EOL);
- print_r($texts);
- print(PHP_EOL);
- }
- if (count($texts)) {
- $items[$key] = [
- 'id' => $key,
- 'time' => $this->_extractTimestamp($header),
- 'content' => $header->html(),
- 'texts' => $texts
- ];
- }
- }
- }
- if (isset($this->_options['dump'])) {
- die();
- }
- return array_values($items);
+ /**
+  * Runs bin/get-fb-content.py for the current feed and decodes the JSON it prints.
+  *
+  * @return array Decoded JSON output of the script.
+  * @throws \RuntimeException When the script exits with a non-zero status or its
+  *                           output does not decode to a JSON array.
+  */
+ protected function _fetchItems() {
+     $script = implode(DIRECTORY_SEPARATOR, [dirname(__FILE__), '..', 'bin', 'get-fb-content.py']);
+     // Quote the script path as well as the feed name: the install directory may
+     // contain spaces or shell metacharacters.
+     $command = 'python3 ' . escapeshellarg($script) . ' ' . escapeshellarg($this->_feed);
+     $output = [];
+     $exitCode = 0;
+     exec($command, $output, $exitCode);
+     if ($exitCode !== 0) {
+         throw new \RuntimeException(sprintf('get-fb-content.py exited with status %d', $exitCode));
+     }
+     $items = json_decode(implode(PHP_EOL, $output), TRUE);
+     // json_decode() yields NULL for empty or malformed input; fail loudly instead
+     // of returning NULL to the caller.
+     if (!is_array($items)) {
+         throw new \RuntimeException('get-fb-content.py returned invalid JSON: ' . json_last_error_msg());
+     }
+     return $items;
+ }
protected function _mapItems($content) {
@@ -110,6 +44,14 @@ class Facebook extends \Providers\HtmlFeed {
);
}
+ /**
+  * Returns the given content unchanged; no reordering is applied here.
+  * NOTE(review): presumably overrides a sorting step of the parent provider - confirm.
+  */
+ protected function _sortContent($content) {
+ return $content;
+ }
+
+ /**
+  * Returns the given items unchanged; nothing is filtered out here.
+  * NOTE(review): presumably overrides a filtering step of the parent provider - confirm.
+  */
+ protected function _spamFilter($items) {
+ return $items;
+ }
+
public function title() {
return sprintf("%s's Facebook page posts", $this->_feed);
}