diff options
author | emkael <emkael@tlen.pl> | 2015-08-28 13:26:05 +0200 |
---|---|---|
committer | emkael <emkael@tlen.pl> | 2015-08-28 13:26:05 +0200 |
commit | 1c7b15ca899e418a95cfd241e83dff18138e78d3 (patch) | |
tree | c29eb725fc806985d92005639333a40dd397d663 | |
parent | be1719eb6e3c82b5f770c3784f98dd69e987d47a (diff) |
* code comments and formatting
-rw-r--r-- | virtual_table.py | 163 |
1 files changed, 98 insertions, 65 deletions
diff --git a/virtual_table.py b/virtual_table.py index e186e96..a300734 100644 --- a/virtual_table.py +++ b/virtual_table.py @@ -1,9 +1,8 @@ -# coding=utf-8 - import sys import glob import re import math +import copy from os import path from bs4 import BeautifulSoup as bs4 @@ -16,62 +15,69 @@ class JFRVirtualTable: file_path = path.realpath(prefix + '.html') tournament_path = path.dirname(file_path) tournament_prefix = path.splitext(path.basename(file_path))[0] + + # RegEx matching traveller files for each board traveller_files_match = re.compile( re.escape(tournament_prefix) + '([0-9]{3})\.txt' ) + # converts {prefix}{anything}.{ext} filename to full path def get_path(relative_path): return path.join(tournament_path, relative_path) - self.__traveller_files = [ - f for f - in glob.glob( - get_path(tournament_prefix + '*.txt') - ) - if re.search(traveller_files_match, f) - ] + # filtering out traveller files from all TXT files + self.__traveller_files = [f for f + in glob.glob( + get_path(tournament_prefix + '*.txt')) + if re.search(traveller_files_match, f)] + + # RegEx for matching pair record files records_files_match = re.compile( - 'H-' + tournament_prefix + '-([0-9]{1,3})\.html' - ) + 'H-' + tournament_prefix + '-([0-9]{1,3})\.html') self.__pair_records_files = [ f for f - in glob.glob( - get_path('H-' + tournament_prefix + '*.html') - ) + in glob.glob(get_path('H-' + tournament_prefix + '*.html')) if re.search(records_files_match, f) ] + + # short rersult list, from side frame self.__results_file = get_path(tournament_prefix + 'WYN.txt') + # full results page self.__full_results_file = get_path('W-' + tournament_prefix + '.html') + # list of pair records links page self.__pair_records_list_file = get_path( - 'H-' + tournament_prefix + '-lista.html' - ) + 'H-' + tournament_prefix + '-lista.html') + # collected scores page self.__collected_scores_file = get_path( - tournament_prefix + 'zbior.html' - ) + tournament_prefix + 'zbior.html') + # auto-detect virtual pairs by their record file header def __detect_virtual_pairs(self): virtual_pairs = [] + # RegEx for matching pair number and names in pair record header pair_header_match = re.compile('([0-9]{1,}): (.*) - (.*), .*') for record_file_path in self.__pair_records_files: with file(record_file_path) as record_file: record = bs4(record_file) - header = [ - con for con in record.select('td.o1')[0].contents - if type(con) is NavigableString and re.match( - pair_header_match, con - ) - ] + # first <td class="o1"> with content matching + # pair header is what we're after + header = [con for con + in record.select('td.o1')[0].contents + if type(con) is NavigableString and re.match( + pair_header_match, con)] if len(header): header_match = re.match(pair_header_match, header[0]) pair_number = int(header_match.group(1)) - names = filter( - len, - [header_match.group(2), header_match.group(3)] - ) + names = filter(len, + [header_match.group(2), + header_match.group(3)]) + # virtual pair does not have any names filled if len(names) == 0: virtual_pairs.append(pair_number) return sorted(virtual_pairs) + # wrapper for DOM manipulation + # wraps the inner function into BS4 invokation and file overwrite def __fix_file(worker): def file_wrapper(self, file_path, encoding='utf-8'): with file(file_path, 'r+') as content_file: @@ -84,130 +90,156 @@ class JFRVirtualTable: content_file.truncate() return file_wrapper + # fix simple results list by removing virtual pair rows @__fix_file def __fix_results(self, content): rows = content.select('tr') for row in rows: cells = row.select('td') + # 6 or more cells in a "proper" result row + # (may contain carry over or penalties) if len(cells) >= 6: try: + # third cell in the row is pair number if int(cells[2].contents[0]) in self.__virtual_pairs: row.extract() except ValueError: pass return content.table + # fix full results file by removing virtual pair rows @__fix_file def __fix_full_results(self, content): rows = content.select('tr') for row in rows: - cell_links = [ - link for link - in row.select('td a') - if link.has_attr( - 'href' - ) and link['href'].startswith( - 'H-' - ) and not link['href'].endswith( - 'lista.html' - ) - ] + # select rows by cells containing pair records links + cell_links = [link for link + in row.select('td a') + if link.has_attr('href') and + link['href'].startswith('H-') and + not link['href'].endswith('lista.html')] + # remove these containing links to virtual pairs if len(cell_links): if int(cell_links[0].contents[0]) in self.__virtual_pairs: row.extract() return content + # fix the page with pair records links list @__fix_file def __fix_records_list(self, content): + # read the original column count row_cell_count = int(content.table.select('tr td.o')[0]['colspan']) rows = content.select('tr') + # gather rows which containted any links link_rows = [] + # gather cells which should stay link_cells = [] for row in rows: cells = row.select('td.u') cells_found = False for cell in cells: - cell_links = [ - link for link - in cell.select('a.pa') - if link.has_attr( - 'href' - ) and link['href'].startswith( - 'H-' - ) and not link['href'].endswith( - 'lista.html' - ) - ] + # select cells by pair records links inside + cell_links = [link for link + in cell.select('a.pa') + if link.has_attr('href') and + link['href'].startswith('H-') and + not link['href'].endswith('lista.html')] if len(cell_links): + # delete virtual pair cells if int(cell_links[0].contents[0]) in self.__virtual_pairs: cell.extract() + # store actual pair cells else: link_cells.append(cell) cells_found = True + # gather processed rows if cells_found: link_rows.append(row) + # detach actual pair cells from the tree cells = map(lambda cell: cell.extract(), link_cells) for row in link_rows: row.extract() - while len(cells) >= 20: + # first filler cell of each new row + first_cell = content.new_tag('td', **{'class': 'n'}) + first_cell.string = u'\xa0' + # arrange cells into rows, full rows first + while len(cells) >= row_cell_count: new_row = content.new_tag('tr') - first_cell = content.new_tag('td', **{'class': 'n'}) - first_cell.string = u'\xa0' - new_row.append(first_cell) - for cell in cells[0:20]: + new_row.append(copy.copy(first_cell)) + for cell in cells[0:row_cell_count]: new_row.append(cell) content.table.append(new_row) - del cells[0:20] + del cells[0:row_cell_count] + # last row may or may not be full last_row = content.new_tag('tr') - first_cell = content.new_tag('td', **{'class': 'n'}) - first_cell.string = u'\xa0' - last_row.append(first_cell) + last_row.append(copy.copy(first_cell)) for cell in cells: last_row.append(cell) - if len(cells) < 20: - last_cell = content.new_tag('td', colspan=20-len(cells)) + # if it wasn't full, fill it with a col-spanned last cell + if len(cells) < row_cell_count: + last_cell = content.new_tag('td', + colspan=row_cell_count-len(cells)) last_cell.string = u'\xa0' last_row.append(last_cell) content.table.append(last_row) return content + # fix collected scores tables by removing virtual pair rows @__fix_file def __fix_collected(self, content): rows = content.select('tr') for row in rows: cells = row.select('td') + # "proper" rows should have 7 cells if len(cells) == 7: + # ignore cells without proper pair numbers try: if int(cells[1].contents[0]) in self.__virtual_pairs: if int(cells[2].contents[0]) in self.__virtual_pairs: row.extract() except ValueError: pass + # there are some clearly broken table cells, just throw them away if len(cells) == 1 and cells[0]['colspan'] == '7': if cells[0].contents[0] == ' ': row.extract() return content + # fix board travellers, removing virtual tables and leaving one, annotated @__fix_file def __fix_traveller(self, content): + # this should only happen if the traveller wasn't already processed + # as it's the only operaton that may yield any results on second run + # and it might break stuff if not len(content.select('tr.virtualTable')): - rows = [ - row for row + # looking for all the rows with more than 2 cells + rows = [row for row in content.select('tr') - if len(row.select('td')) >= 3 - ] + if len(row.select('td')) >= 3] + # only the first "virtual" row needs to be prefixed with a header header_added = False for row in rows: cells = row.select('td') + # if we're already added a header, meaning we're below the first + # virtual table, we need to move the row above it + # or remove it entirely if header_added: row_below = row.extract() + # only move it if it has meaningful information (10 cells) if len(cells) >= 10: virtual_row.insert_before(row_below) + continue + # we're looking for a "proper" row, with at least 10 cells if len(cells) >= 10: + # and with both pair numbers virtual if int(cells[1].contents[0]) in self.__virtual_pairs: if int(cells[2].contents[0]) in self.__virtual_pairs: + # if we're already processed the first one, + # just drop subsequent virtual tables if header_added: row.extract() + # it's the first virtual table + # prefix it with a header else: virtual_row = content.new_tag( 'tr', @@ -223,6 +255,7 @@ class JFRVirtualTable: virtual_row_header.string = 'Wirtualny stolik:' virtual_row.append(virtual_row_header) row.insert_before(virtual_row) + # clear pair numbers for cell in cells[1:3]: cell.contents = '' header_added = True |