summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author emkael <emkael@tlen.pl> 2016-08-12 00:47:54 +0200
committer emkael <emkael@tlen.pl> 2016-08-12 00:54:28 +0200
commit 35d695b78631d1c5757bbbd56a7ebdc6ef239a8f (patch)
tree 70aae52cfe92968a02e2f779128ae4fd9e86918c
parent a2ca758232575dd4f20609c02519ba535764d1b4 (diff)
* basic round data fetcher
-rw-r--r-- fetcher.py 49
1 files changed, 49 insertions, 0 deletions
diff --git a/fetcher.py b/fetcher.py
new file mode 100644
index 0000000..c42c5fc
--- /dev/null
+++ b/fetcher.py
@@ -0,0 +1,49 @@
+import urllib2
+from bs4 import BeautifulSoup as bs
+import sys, os, hashlib, re
+from urlparse import urljoin
+
# The single command-line argument is the URL of the results page whose
# round links are harvested below.  Exit with a usage line rather than an
# IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit('usage: %s <results_url>' % sys.argv[0])
results_url = sys.argv[1]

# Fetch and parse the results page, closing the HTTP response explicitly
# instead of leaving it to the garbage collector.
results_response = urllib2.urlopen(results_url)
try:
    results = bs(results_response.read(), 'lxml')
finally:
    results_response.close()

# Keep the href of every anchor that points at a RoundTeams.asp page.  The
# hrefs are stored exactly as written in the page, so they may be relative
# and may repeat.
round_links = [
    link['href']
    for link in results.select('a[href]')
    if '/RoundTeams.asp' in link['href']
]
+
class Round(object):
    """A single round of a tournament.

    Two rounds compare equal when both ``tournament`` and ``name`` match;
    ``content`` takes no part in the comparison.  ``repr()`` yields
    ``'<tournament>: <name>'``.
    """

    # Class-level defaults so a bare ``Round()`` starts with empty fields;
    # callers overwrite them per instance.
    tournament = ''  # tournament title text
    name = ''        # round label text
    content = ''     # payload attached to the round (ignored by ==)

    # Instances are mutable and compare by value, so an identity-based hash
    # would contradict __eq__; make them explicitly unhashable.
    __hash__ = None

    def __repr__(self):
        return '%s: %s' % (self.tournament, self.name)

    def __eq__(self, other):
        # Decline to compare with foreign types instead of failing with
        # AttributeError on a missing attribute.
        if not isinstance(other, Round):
            return NotImplemented
        return (self.tournament, self.name) == (other.tournament, other.name)

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so keep the two consistent
        # by defining it explicitly.
        outcome = self.__eq__(other)
        if outcome is NotImplemented:
            return outcome
        return not outcome
+
# Cells of a round page's first table row that start with "round " (any
# letter case) make up the round name; all other cells make up the
# tournament title.
round_regex = re.compile('^round ', flags=re.I)

# Downloaded round pages are cached on disk under a hash of the results URL
# plus the link's href, so repeated runs do not fetch them again.  Create the
# directory up front so the first run does not fail on a missing path.
cache_dir = 'cache'
if not os.path.isdir(cache_dir):
    os.makedirs(cache_dir)

rounds = []
seen_links = set()
for r in round_links:
    # Visit each distinct href once, in the order it appears on the page,
    # so the output order is stable between runs.
    if r in seen_links:
        continue
    seen_links.add(r)

    round_hash = hashlib.sha224(results_url + r).hexdigest()
    cache_path = os.path.join(cache_dir, round_hash)
    if not os.path.exists(cache_path):
        response = urllib2.urlopen(urljoin(results_url, r))
        try:
            r_content = response.read()
        finally:
            response.close()
        # Binary mode stores the downloaded bytes untouched on every
        # platform; the context manager closes the handle promptly.
        with open(cache_path, 'wb') as cache_file:
            cache_file.write(r_content)
    else:
        with open(cache_path, 'rb') as cache_file:
            r_content = cache_file.read()

    content = bs(r_content, 'lxml')
    rows = content.select('table tr')
    if not rows:
        # No table row to read the round header from: report the link and
        # carry on with the remaining rounds instead of raising IndexError.
        sys.stderr.write('no table rows found for %s\n' % r)
        continue

    # Split the header cells in one pass: round-name cells vs. the rest.
    round_cells = []
    other_cells = []
    for cell in rows[0].select('td'):
        text = cell.text.strip()
        if round_regex.match(text):
            round_cells.append(text)
        else:
            other_cells.append(text)

    new_round = Round()
    new_round.name = ' '.join(round_cells)
    new_round.tournament = ' - '.join(other_cells)
    new_round.content = content
    # Different links may lead to the same (tournament, name) pair; keep
    # only the first one encountered.
    if new_round not in rounds:
        rounds.append(new_round)

for r in rounds:
    print(r)