summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author emkael <emkael@tlen.pl> 2019-02-14 01:43:50 +0300
committer emkael <emkael@tlen.pl> 2019-02-14 01:43:50 +0300
commit 28169d1de9508f78309517859c6f918eccfe6dd0 (patch)
tree 45eb16e4264b8cd5f8b56f6cb08c7cf50b8065d5
parent f5d19105dd57ce3b625cd19eeefc27118330ebb6 (diff)
Rate-limiting downloads
-rw-r--r-- fetcher.py 15
1 files changed, 13 insertions, 2 deletions
diff --git a/fetcher.py b/fetcher.py
index 291a71b..60eb0fa 100644
--- a/fetcher.py
+++ b/fetcher.py
@@ -1,16 +1,27 @@
import urllib2
from bs4 import BeautifulSoup as bs
from urlparse import urljoin
-import sys, os, hashlib, re
+import sys, os, hashlib, re, time, csv
+
+fetch_limit = int(sys.argv[2]) if len(sys.argv) > 2 else 50
+fetch_delay = int(sys.argv[3]) if len(sys.argv) > 3 else 120
+currently_fetched = 0
+
+bye_string = 'BYE'
def fetch_url(url):
    """Return the body of *url*, downloading it at most once.

    Responses are cached on disk as ``cache/<sha224 hex digest of url>``.
    When a cached copy exists it is returned without touching the network.

    Real downloads are rate-limited through the module-level settings:
    once ``fetch_limit`` downloads have been made, the next download first
    sleeps for ``fetch_delay`` seconds and the counter starts over.

    Errors from ``urllib2.urlopen`` and from reading or writing the cache
    file (for example when the ``cache`` directory is missing) are not
    caught here and propagate to the caller.
    """
    global currently_fetched
    round_hash = hashlib.sha224(url).hexdigest()
    cache_path = os.path.join('cache', round_hash)

    if os.path.exists(cache_path):
        # Binary mode: hand back exactly the bytes that were stored.
        with open(cache_path, 'rb') as cached:
            return cached.read()

    # A single parenthesised expression prints the same text whether
    # ``print`` is a statement or a function.
    print('Fetching: %s' % (url,))
    # ">=" rather than ">": with ">" the pause only kicked in after
    # fetch_limit + 1 downloads, one more than the configured limit.
    if currently_fetched >= fetch_limit:
        print('Fetch rate limit reached, delaying for %d seconds.'
              % (fetch_delay,))
        time.sleep(fetch_delay)
        currently_fetched = 0

    # urllib2 responses are not context managers, so close explicitly.
    response = urllib2.urlopen(url)
    try:
        r_content = response.read()
    finally:
        response.close()

    # Binary mode keeps the payload byte-for-byte on every platform, and
    # the with-block closes (and flushes) the file deterministically.
    with open(cache_path, 'wb') as cache_file:
        cache_file.write(r_content)
    currently_fetched += 1
    return r_content