From 35d695b78631d1c5757bbbd56a7ebdc6ef239a8f Mon Sep 17 00:00:00 2001
From: emkael
Date: Fri, 12 Aug 2016 00:47:54 +0200
Subject: * basic round data fetcher

---
 fetcher.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 fetcher.py

diff --git a/fetcher.py b/fetcher.py
new file mode 100644
index 0000000..c42c5fc
--- /dev/null
+++ b/fetcher.py
@@ -0,0 +1,49 @@
+import urllib2
+from bs4 import BeautifulSoup as bs
+import sys, os, hashlib, re
+from urlparse import urljoin
+
+results_url = sys.argv[1]
+
+results = bs(urllib2.urlopen(results_url).read(), 'lxml')
+
+round_links = []
+for link in results.select('a[href]'):
+    if '/RoundTeams.asp' in link['href']:
+        round_links.append(link['href'])
+
+class Round:
+    tournament = ''
+    name = ''
+    content = ''
+
+    def __repr__(self):
+        return self.tournament + ': ' + self.name
+
+    def __eq__(self, other):
+        return self.tournament == other.tournament and self.name == other.name
+
+round_regex = re.compile('^round ', flags=re.I)
+rounds = []
+for r in set(round_links):
+    round_hash = hashlib.sha224(results_url + r).hexdigest()
+    cache_path = os.path.join('cache', round_hash)
+    if not os.path.exists(cache_path):
+        r_content = urllib2.urlopen(urljoin(results_url, r)).read()
+        file(cache_path, 'w').write(r_content)
+    else:
+        r_content = file(cache_path).read()
+    content = bs(r_content, 'lxml')
+    first_row = content.select('table tr')[0]
+    cells = [cell.text.strip() for cell in first_row.select('td')]
+    round_cells = [cell for cell in cells if round_regex.match(cell)]
+    other_cells = [cell for cell in cells if not round_regex.match(cell)]
+    new_round = Round()
+    new_round.name = ' '.join(round_cells)
+    new_round.tournament = ' - '.join(other_cells)
+    new_round.content = content
+    if new_round not in rounds:
+        rounds.append(new_round)
+
+for r in rounds:
+    print r
-- 
cgit v1.2.3