From 28169d1de9508f78309517859c6f918eccfe6dd0 Mon Sep 17 00:00:00 2001 From: emkael Date: Thu, 14 Feb 2019 01:43:50 +0300 Subject: Rate-limiting downloads --- fetcher.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fetcher.py b/fetcher.py index 291a71b..60eb0fa 100644 --- a/fetcher.py +++ b/fetcher.py @@ -1,16 +1,27 @@ import urllib2 from bs4 import BeautifulSoup as bs from urlparse import urljoin -import sys, os, hashlib, re +import sys, os, hashlib, re, time, csv + +fetch_limit = int(sys.argv[2]) if len(sys.argv) > 2 else 50 +fetch_delay = int(sys.argv[3]) if len(sys.argv) > 3 else 120 +currently_fetched = 0 + +bye_string = 'BYE' def fetch_url(url): + global currently_fetched round_hash = hashlib.sha224(url).hexdigest() cache_path = os.path.join('cache', round_hash) if not os.path.exists(cache_path): print 'Fetching: %s' % (url) + if currently_fetched > fetch_limit: + print 'Fetch rate limit reached, delaying for %d seconds.' % (fetch_delay) + time.sleep(fetch_delay) + currently_fetched = 0 r_content = urllib2.urlopen(url).read() file(cache_path, 'w').write(r_content) - print 'Done.' + currently_fetched += 1 else: r_content = file(cache_path).read() return r_content -- cgit v1.2.3