diff options
author | emkael <emkael@tlen.pl> | 2019-02-14 01:43:50 +0300 |
---|---|---|
committer | emkael <emkael@tlen.pl> | 2019-02-14 01:43:50 +0300 |
commit | 28169d1de9508f78309517859c6f918eccfe6dd0 (patch) | |
tree | 45eb16e4264b8cd5f8b56f6cb08c7cf50b8065d5 | |
parent | f5d19105dd57ce3b625cd19eeefc27118330ebb6 (diff) |
Rate-limiting downloads
-rw-r--r-- | fetcher.py | 15 |
1 files changed, 13 insertions, 2 deletions
@@ -1,16 +1,27 @@
 import urllib2
 from bs4 import BeautifulSoup as bs
 from urlparse import urljoin
-import sys, os, hashlib, re
+import sys, os, hashlib, re, time, csv
+
+fetch_limit = int(sys.argv[2]) if len(sys.argv) > 2 else 50
+fetch_delay = int(sys.argv[3]) if len(sys.argv) > 3 else 120
+currently_fetched = 0
+
+bye_string = 'BYE'
 
 def fetch_url(url):
+    global currently_fetched
     round_hash = hashlib.sha224(url).hexdigest()
     cache_path = os.path.join('cache', round_hash)
     if not os.path.exists(cache_path):
         print 'Fetching: %s' % (url)
+        if currently_fetched > fetch_limit:
+            print 'Fetch rate limit reached, delaying for %d seconds.' % (fetch_delay)
+            time.sleep(fetch_delay)
+            currently_fetched = 0
         r_content = urllib2.urlopen(url).read()
         file(cache_path, 'w').write(r_content)
-        print 'Done.'
+        currently_fetched += 1
     else:
         r_content = file(cache_path).read()
     return r_content