summaryrefslogtreecommitdiff
path: root/fetcher.py
diff options
context:
space:
mode:
Diffstat (limited to 'fetcher.py')
-rw-r--r--fetcher.py320
1 files changed, 308 insertions, 12 deletions
diff --git a/fetcher.py b/fetcher.py
index c42c5fc..0b110b4 100644
--- a/fetcher.py
+++ b/fetcher.py
@@ -2,10 +2,21 @@ import urllib2
from bs4 import BeautifulSoup as bs
import sys, os, hashlib, re
from urlparse import urljoin
+from math import floor
results_url = sys.argv[1]
-results = bs(urllib2.urlopen(results_url).read(), 'lxml')
+def fetch_url(url):
+ round_hash = hashlib.sha224(url).hexdigest()
+ cache_path = os.path.join('cache', round_hash)
+ if not os.path.exists(cache_path):
+ r_content = urllib2.urlopen(url).read()
+ file(cache_path, 'w').write(r_content)
+ else:
+ r_content = file(cache_path).read()
+ return r_content
+
+results = bs(fetch_url(results_url), 'lxml')
round_links = []
for link in results.select('a[href]'):
@@ -13,27 +24,34 @@ for link in results.select('a[href]'):
round_links.append(link['href'])
class Round:
- tournament = ''
+ tournament = None
+ url = ''
name = ''
content = ''
+ tables = None
+ boards = None
+
+ def __init__(self):
+ self.tables = []
+ self.boards = {}
+
+ def number(self):
+ return int(re.sub('\b*round\b*', '', self.name, flags=re.I))
def __repr__(self):
- return self.tournament + ': ' + self.name
+ return self.tournament.name + ': ' + self.name
def __eq__(self, other):
return self.tournament == other.tournament and self.name == other.name
+ def __gt__(self, other):
+ return self.number() > other.number()
+
round_regex = re.compile('^round ', flags=re.I)
rounds = []
for r in set(round_links):
- round_hash = hashlib.sha224(results_url + r).hexdigest()
- cache_path = os.path.join('cache', round_hash)
- if not os.path.exists(cache_path):
- r_content = urllib2.urlopen(urljoin(results_url, r)).read()
- file(cache_path, 'w').write(r_content)
- else:
- r_content = file(cache_path).read()
- content = bs(r_content, 'lxml')
+ url = urljoin(results_url, r)
+ content = bs(fetch_url(url), 'lxml')
first_row = content.select('table tr')[0]
cells = [cell.text.strip() for cell in first_row.select('td')]
round_cells = [cell for cell in cells if round_regex.match(cell)]
@@ -42,8 +60,286 @@ for r in set(round_links):
new_round.name = ' '.join(round_cells)
new_round.tournament = ' - '.join(other_cells)
new_round.content = content
+ new_round.url = url
if new_round not in rounds:
rounds.append(new_round)
+tournament_data = {}
for r in rounds:
- print r
+ if r.tournament not in tournament_data:
+ tournament_data[r.tournament] = []
+ tournament_data[r.tournament].append(r)
+
+class Tournament:
+ name = ''
+ rounds = None
+ lineup = None
+
+ def __init__(self, name, rounds):
+ self.lineup = []
+ self.name = name
+ self.rounds = rounds
+ for round in rounds:
+ round.tournament = self
+
+ def __repr__(self):
+ return '%s (%d rounds)' % (self.name, len(self.rounds))
+
+tournaments = []
+for tour in tournament_data:
+ tournaments.append(Tournament(tour, tournament_data[tour]))
+
+class Table:
+ results = None
+ content = ''
+
+ def __init__(self):
+ self.results = []
+
+class Pair:
+ first_name = ''
+ second_name = ''
+ nation = ''
+ results = None
+
+ def __init__(self, name1, name2, nation):
+ self.results = []
+ self.first_name = name1
+ self.second_name = name2
+ self.nation = nation
+
+ def __eq__(self, other):
+ return ' - '.join(sorted([self.first_name, self.second_name])) == \
+ ' - '.join(sorted([other.first_name, other.second_name]))
+
+ def __repr__(self):
+ return '%s - %s (%s)' % (self.first_name, self.second_name, self.nation)
+
+ def __hash__(self):
+ return int(hashlib.sha224(self.__repr__()).hexdigest(), 16)
+
+class Result:
+ ns_pair = None
+ ew_pair = None
+ tour_round = None
+ board_no = 0
+ score = 0
+ butler = 0
+ cutoff_butler = 0
+ cavendish = 0
+
+ def __init__(self, ns, ew, score, rnd, board):
+ self.ns_pair = ns
+ self.ew_pair = ew
+ self.score = score
+ self.tour_round = rnd
+ self.board_no = board
+ self.ns_pair.results.append(self)
+ self.ew_pair.results.append(self)
+
+ def __gt__(self, other):
+ return self.score > other.score
+
+ def __repr__(self):
+ return '%d-%d\t%d\t%d\t%d\t%f' % (self.tour_round.number(), self.board_no,
+ self.score,
+ self.butler, self.cutoff_butler,
+ self.cavendish)
+
+for tour in tournaments:
+ for r in tour.rounds:
+ table_urls = [urljoin(r.url, link['href']) for link in r.content.select('a[href]') if 'BoardDetails.asp' in link['href']]
+ for url in table_urls:
+ table = Table()
+ table.content = bs(fetch_url(url), 'lxml')
+ team_links = [link for link in table.content.select('div[align] a[href]') if 'TeamDetails.asp' in link['href']]
+ if len(team_links) == 2:
+ home_team = team_links[0].text
+ away_team = team_links[1].text
+ players = [link.text.strip() for link in table.content.select('a[href]') if 'people/person.asp' in link['href']]
+ if len(players) == 8:
+ pairs = [
+ Pair(players[0], players[6], home_team), # open
+ Pair(players[4], players[5], home_team), # closed
+ Pair(players[2], players[3], away_team), # open
+ Pair(players[1], players[7], away_team) # closed
+ ]
+ for i, pair in enumerate(pairs):
+ try:
+ pairs[i] = tour.lineup[tour.lineup.index(pair)]
+ except ValueError:
+ tour.lineup.append(pair)
+ result_cells = [int(cell.text.strip()) if len(cell.text.strip()) > 0 else 0 for cell in table.content.select('tr[nowrap] b')]
+ open_scores = []
+ closed_scores = []
+ for i in range(0, len(result_cells) / 6):
+ open_scores.append(result_cells[6*i] - result_cells[6*i + 1])
+ closed_scores.append(result_cells[6*i + 2] - result_cells[6*i + 3])
+ for board, score in enumerate(open_scores):
+ new_score = Result(pairs[0], pairs[2], score, r, board+1)
+ table.results.append(new_score)
+ for board, score in enumerate(closed_scores):
+ new_score = Result(pairs[3], pairs[1], score, r, board+1)
+ table.results.append(new_score)
+ r.tables.append(table)
+ for r in tour.rounds:
+ for table in r.tables:
+ for result in table.results:
+ if result.board_no not in r.boards:
+ r.boards[result.board_no] = []
+ r.boards[result.board_no].append(result)
+
+def imp(res1, res2):
+ diff = res1 - res2
+ ew = False
+ if diff < 0:
+ ew = True
+ diff = -diff
+ thresholds = [20, 50, 90, 130, 170, 220, 270, 320, 370, 430,
+ 500, 600, 750, 900, 1100, 1300, 1500, 1750,
+ 2000, 2250, 2500, 3000, 3500, 4000]
+ imps = len([t for t in thresholds if diff >= t])
+ return -imps if ew else imps
+
+def get_datum(board):
+ average = float(sum([r.score for r in board])) / len(board)
+ return int(round(average / 10)) * 10
+
+for tour in tournaments:
+
+ if len(tour.lineup) == 0:
+ continue
+
+ for rnd in tour.rounds:
+ for i, board in rnd.boards.iteritems():
+ datum = get_datum(board)
+ cutoff = int(floor(len(board) / 4))
+ cutoff_results = sorted(board)
+ cutoff_datum = get_datum(cutoff_results[cutoff:-cutoff])
+ for r in board:
+ r.butler = imp(r.score, datum)
+ r.cutoff_butler = imp(r.score, cutoff_datum)
+ r.cavendish = float(sum([imp(r.score, other.score) for other in board if r <> other])) / float((len(board) - 1))
+
+ print tour.name
+
+ for pair in tour.lineup:
+ print pair
+ result_table = []
+ for res in pair.results:
+ ew = -1 if res.ew_pair == pair else 1
+ result_table.append([
+ res.tour_round.number(),
+ res.board_no,
+ res.score,
+ ew * res.butler,
+ ew * res.cutoff_butler,
+ ew * res.cavendish
+ ])
+ for r in sorted(result_table, cmp=lambda x,y: cmp(x[0], y[0]) or cmp(x[1], y[1])):
+ print '\t'.join([str(s) for s in r[:3]] + [str(round(s, 2)) for s in r[3:]])
+ print '\t'.join([
+ str(len(result_table)),
+ '',
+ '',
+ str(round(float(sum([r[3] for r in result_table])) / float(len(result_table)), 2)),
+ str(round(float(sum([r[4] for r in result_table])) / float(len(result_table)), 2)),
+ str(round(float(sum([r[5] for r in result_table])) / float(len(result_table)), 2))
+ ])
+ print
+ print
+
+ head_to_head = {}
+ for r in tour.rounds:
+ for table in r.tables:
+ for result in table.results:
+ if result.ns_pair not in head_to_head:
+ head_to_head[result.ns_pair] = {}
+ if result.ew_pair not in head_to_head[result.ns_pair]:
+ head_to_head[result.ns_pair][result.ew_pair] = []
+ if result.ew_pair not in head_to_head:
+ head_to_head[result.ew_pair] = {}
+ if result.ns_pair not in head_to_head[result.ew_pair]:
+ head_to_head[result.ew_pair][result.ns_pair] = []
+ head_to_head[result.ns_pair][result.ew_pair].append([
+ result.butler,
+ result.cutoff_butler,
+ result.cavendish
+ ])
+ head_to_head[result.ew_pair][result.ns_pair].append([
+ -result.butler,
+ -result.cutoff_butler,
+ -result.cavendish
+ ])
+ for ns in head_to_head:
+ for ew in head_to_head[ns]:
+ count = float(len(head_to_head[ns][ew]))
+ head_to_head[ns][ew] = {
+ 'butler': float(sum([r[0] for r in head_to_head[ns][ew]])) / count,
+ 'cutoff_butler': float(sum([r[1] for r in head_to_head[ns][ew]])) / count,
+ 'cavendish': float(sum([r[2] for r in head_to_head[ns][ew]])) / count,
+ 'count': count
+ }
+ normalized = {}
+ for ns in head_to_head:
+ print ns
+ if ns not in normalized:
+ normalized[ns] = {
+ 'butler': 0,
+ 'cutoff_butler': 0,
+ 'cavendish': 0,
+ 'count': 0
+ }
+ for ew in head_to_head[ns]:
+ print 'Against %s' % ew
+ head_to_head[ns][ew]['opposition'] = {
+ 'butler': 0,
+ 'cutoff_butler': 0,
+ 'cavendish': 0,
+ 'count': 0
+ }
+ for opposition in head_to_head[ew]:
+ if opposition != ns:
+ head_to_head[ns][ew]['opposition']['butler'] += head_to_head[ew][opposition]['butler'] * head_to_head[ew][opposition]['count']
+ head_to_head[ns][ew]['opposition']['cutoff_butler'] += head_to_head[ew][opposition]['cutoff_butler'] * head_to_head[ew][opposition]['count']
+ head_to_head[ns][ew]['opposition']['cavendish'] += head_to_head[ew][opposition]['cavendish'] * head_to_head[ew][opposition]['count']
+ head_to_head[ns][ew]['opposition']['count'] += head_to_head[ew][opposition]['count']
+ if head_to_head[ns][ew]['opposition']['count'] > 0:
+ head_to_head[ns][ew]['opposition']['butler'] /= head_to_head[ns][ew]['opposition']['count']
+ head_to_head[ns][ew]['opposition']['cutoff_butler'] /= head_to_head[ns][ew]['opposition']['count']
+ head_to_head[ns][ew]['opposition']['cavendish'] /= head_to_head[ns][ew]['opposition']['count']
+ print '%.2f\t%.2f\t%.2f' % (
+ head_to_head[ns][ew]['butler'],
+ head_to_head[ns][ew]['cutoff_butler'],
+ head_to_head[ns][ew]['cavendish']
+ )
+ print 'Opposition average (%d boards)' % head_to_head[ns][ew]['opposition']['count']
+ print '%.2f\t%.2f\t%.2f' % (
+ head_to_head[ns][ew]['opposition']['butler'],
+ head_to_head[ns][ew]['opposition']['cutoff_butler'],
+ head_to_head[ns][ew]['opposition']['cavendish']
+ )
+ print 'Normalized'
+ normalized_butler = head_to_head[ns][ew]['butler'] + head_to_head[ns][ew]['opposition']['butler']
+ normalized_cutoff = head_to_head[ns][ew]['cutoff_butler'] + head_to_head[ns][ew]['opposition']['cutoff_butler']
+ normalized_cavendish = head_to_head[ns][ew]['cavendish'] + head_to_head[ns][ew]['opposition']['cavendish']
+ print '%.2f\t%.2f\t%.2f' % (
+ normalized_butler,
+ normalized_cutoff,
+ normalized_cavendish
+ )
+ normalized[ns]['butler'] += normalized_butler * head_to_head[ns][ew]['count']
+ normalized[ns]['cutoff_butler'] += normalized_cutoff * head_to_head[ns][ew]['count']
+ normalized[ns]['cavendish'] += normalized_cavendish * head_to_head[ns][ew]['count']
+ normalized[ns]['count'] += head_to_head[ns][ew]['count']
+ print
+ print 'Normalized totals'
+ for pair in normalized:
+ print pair
+ print '%.2f\t%.2f\t%.2f\t%d' % (
+ normalized[pair]['butler'] / normalized[pair]['count'],
+ normalized[pair]['cutoff_butler'] / normalized[pair]['count'],
+ normalized[pair]['cavendish'] / normalized[pair]['count'],
+ normalized[pair]['count']
+ )
+ print