diff options
author | emkael <emkael@tlen.pl> | 2022-01-31 21:39:23 +0100 |
---|---|---|
committer | emkael <emkael@tlen.pl> | 2022-01-31 21:39:23 +0100 |
commit | 9601f3be1ee5be711671a922c3282339aeb1ddf4 (patch) | |
tree | 7b402675b2029e49b5756f59c2e5ef77db8186cc /bin/fb-scrape | |
parent | 489816e3573afff315f61292ba2e1afc2fe96a16 (diff) |
Another round of FB crap
Diffstat (limited to 'bin/fb-scrape')
-rw-r--r-- | bin/fb-scrape/.gitignore | 2 | ||||
-rw-r--r-- | bin/fb-scrape/get-fb-content.py | 42 |
2 files changed, 44 insertions, 0 deletions
diff --git a/bin/fb-scrape/.gitignore b/bin/fb-scrape/.gitignore
new file mode 100644
index 0000000..772944e
--- /dev/null
+++ b/bin/fb-scrape/.gitignore
@@ -0,0 +1,2 @@
+.envrc
+.direnv
diff --git a/bin/fb-scrape/get-fb-content.py b/bin/fb-scrape/get-fb-content.py
new file mode 100644
index 0000000..5b22f70
--- /dev/null
+++ b/bin/fb-scrape/get-fb-content.py
@@ -0,0 +1,42 @@
+import json
+import logging
+import sys
+import time
+import warnings
+from os import path
+from pytz_deprecation_shim import PytzUsageWarning
+from random import randint
+
+from facebook_scraper import get_posts, enable_logging
+from requests.exceptions import RequestException
+
+debug = len(sys.argv) > 2 and sys.argv[2] == 'debug'
+
+if debug:
+    enable_logging(logging.DEBUG)
+
+warnings.filterwarnings(
+    action='ignore',
+    message=r'A low page limit'
+)
+warnings.filterwarnings(
+    action='ignore',
+    category=PytzUsageWarning
+)
+
+BASEDIR = path.dirname(__file__)
+
+posts = []
+try:
+    for post in get_posts(sys.argv[1], pages=2, cookies=path.join(BASEDIR, '../../config/facebook.com_cookies.txt')):
+        posts.append({
+            'id': post['post_id'],
+            'time': str(post['time']),
+            'texts': [t.strip() for t in post['text'].split('\n') if t] if post['text'] else [],
+            'images': post['images']
+        })
+        time.sleep(randint(10, 15))
+except RequestException:
+    pass
+
+print(json.dumps(posts))