From 28169d1de9508f78309517859c6f918eccfe6dd0 Mon Sep 17 00:00:00 2001
From: emkael <emkael@tlen.pl>
Date: Thu, 14 Feb 2019 01:43:50 +0300
Subject: Rate-limiting downloads

---
 fetcher.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/fetcher.py b/fetcher.py
index 291a71b..60eb0fa 100644
--- a/fetcher.py
+++ b/fetcher.py
@@ -1,16 +1,27 @@
 import urllib2
 from bs4 import BeautifulSoup as bs
 from urlparse import urljoin
-import sys, os, hashlib, re
+import sys, os, hashlib, re, time, csv
+
+fetch_limit = int(sys.argv[2]) if len(sys.argv) > 2 else 50  # threshold of uncached fetches before fetch_url() pauses; optional 2nd CLI argument
+fetch_delay = int(sys.argv[3]) if len(sys.argv) > 3 else 120  # length of that pause in seconds; optional 3rd CLI argument
+currently_fetched = 0  # network fetches since the last pause; maintained by fetch_url()
+
+bye_string = 'BYE'  # NOTE(review): not referenced in the visible part of this patch -- confirm where it is used
 
 def fetch_url(url):
+    global currently_fetched  # module-level count of network fetches in the current batch
     round_hash = hashlib.sha224(url).hexdigest()
     cache_path = os.path.join('cache', round_hash)
     if not os.path.exists(cache_path):
         print 'Fetching: %s' % (url)
+        if currently_fetched >= fetch_limit:  # '>=' so a batch is exactly fetch_limit downloads, not fetch_limit + 1
+            print 'Fetch rate limit reached, delaying for %d seconds.' % (fetch_delay)
+            time.sleep(fetch_delay)
+            currently_fetched = 0  # start a new batch after the pause
         r_content = urllib2.urlopen(url).read()
         file(cache_path, 'w').write(r_content)
-        print 'Done.'
+        currently_fetched += 1  # only real downloads count; cache hits are not rate-limited
     else:
         r_content = file(cache_path).read()
     return r_content
-- 
cgit v1.2.3