# coding=utf-8
#
# Provenance (from the patch this text was extracted from):
#   commit 373a37d35e8326ad6e3a6732053bc50a5c0f8517
#   From: emkael, Date: Mon, 28 Sep 2015 13:20:34 +0200
#   Subject: * initial commit
#   rrb2txt.py (318 lines, new file), rrb2txt.spec (24 lines, new file)
"""Convert a directory of HTML result pages into plain-text reports.

The script takes an optional directory argument (default: the current
directory) and

1. patches ``pary.txt`` in place so that its results table always has the
   ``+/-``, ``PKL``, ``PDF`` and ``nagroda`` columns and one cell per
   ``|``-separated PDF value (done once; a ``fixpdf.py`` HTML comment marks
   an already patched file);
2. parses ``d*.txt`` (boards), ``p*.txt`` (protocols), ``h*.txt``
   (histories), ``pary.txt`` (results) and ``index.html`` (header);
3. writes ``<dirname>P.txt``, ``<dirname>H.txt`` and ``<dirname>W.txt`` into
   the current working directory, encoded as windows-1250.

This is Python 2 code (``urlparse``, ``unicode``, ``dict.iteritems``, byte
``str`` file writes).

NOTE(review): the text under review had its whitespace collapsed.  Block
nesting below was re-derived from the syntax, and string literals are
reproduced exactly as they appeared; several column-header literals no
longer line up with the widths of the row formats they sit above, so their
spacing should be confirmed against the upstream file.
"""

import math
import os
import re
import sys
import urlparse
from glob import glob
from itertools import chain, cycle

from bs4 import BeautifulSoup, Comment

# NOTE(review): in the reviewed text the pattern passed to re.sub() in
# get_content() was an empty string, which has no group 1, so the callback
# raised IndexError on every call.  The callback names its match ``img`` and
# uses the first letter of group 1, so the pattern is reconstructed here as
# an <img> tag whose .gif file stem is captured.  Confirm against upstream.
_IMAGE_TAG = re.compile(r'<img\b[^>]*?\bsrc="(?:[^"]*/)?([^"/]+)\.gif"[^>]*>')


def fix_pdf_columns(path):
    """Normalise the results table stored in *path* (``pary.txt``) in place.

    Does nothing when the document already contains the HTML comment
    ``fixpdf.py``.  Otherwise it adds hidden ``+/-``, ``PKL``, ``PDF`` and
    ``nagroda`` cells where the header lacks them, splits the
    ``|``-separated cell at column 10 into one cell per value, appends the
    ``fixpdf.py`` marker comment and rewrites the file with a 12-digit
    length line in front of the markup.
    """
    with open(path, 'r') as source:
        content = BeautifulSoup(source.read())

    markers = content.findAll(text=lambda t: isinstance(t, Comment))
    if any(marker == 'fixpdf.py' for marker in markers):
        return

    header = content.select('thead tr')[0]
    body = content.select('tbody tr')

    if not header.find_all(text='+/-'):
        tag = content.new_tag('td', style='display:none')
        tag.string = '+/-'
        header.find(lambda n: n.text[0:5] == 'wynik').insert_before(tag)
        for row in body:
            score_cell = row.select('td.right')
            if score_cell:
                filler = content.new_tag('td', style='display:none', rowspan=2)
                score_cell[0].insert_before(filler)

    extra_headers = ['PKL', 'PDF', 'nagroda']
    # Presence is sampled before any column is inserted, as in the original.
    extra_headers_present = [bool(header.find_all(text=h)) for h in extra_headers]

    extra_headers_offset = 8
    for name, present in zip(extra_headers, extra_headers_present):
        if not present:
            tag = content.new_tag('td', style='display:none')
            tag.string = name
            header.select('td')[extra_headers_offset].insert_after(tag)
            for row in body:
                cells = row.find_all('td')
                # Fixed off-by-one: cells[offset] needs len(cells) > offset
                # (the original tested >= and could raise IndexError).
                if len(cells) > extra_headers_offset:
                    filler = content.new_tag('td', style='display:none', rowspan=2)
                    cells[extra_headers_offset].insert_after(filler)
        extra_headers_offset += 1

    points_index = 10

    def get_points_count(row):
        """Return the number of '|' separators in the row's points cell."""
        try:
            return row.find_all('td')[points_index].text.count('|')
        except IndexError:
            return 0

    # ``or [0]`` keeps max() from raising on a table without body rows.
    max_points_count = max([get_points_count(row) for row in body] or [0]) + 1

    header.find_all('td')[points_index]['colspan'] = max_points_count

    for row in body:
        cells = row.find_all('td')
        if len(cells) > points_index:
            points = cells[points_index].text.split('|')
            new_cells = []
            for point in points:
                tag = content.new_tag('td', rowspan=2)
                tag.string = point
                new_cells.append(tag)
            # Rows with fewer values than the widest row stretch their last
            # cell over the remaining columns.
            span = max_points_count - len(points)
            if span > 0:
                new_cells[-1]['colspan'] = span + 1
            # Insert next to the points cell itself instead of before
            # cells[11], which the guard above does not guarantee to exist.
            for new_cell in new_cells:
                cells[points_index].insert_before(new_cell)
            cells[points_index].extract()

    content.body.append(content.new_string('fixpdf.py', Comment))

    content.body.p.extract()
    new_content = content.find('body').decode_contents()
    # NOTE(review): this counts characters of the decoded markup, not bytes
    # of the UTF-8 data written below -- confirm which one the consumer of
    # the length line expects.
    new_length = len(new_content) + 1

    # The original left this handle open (and unflushed) while the same file
    # was read again later in the run; the context manager closes it first.
    with open(path, 'wb') as target:
        target.write('%012d' % new_length)
        target.write('\n')
        target.write(new_content.encode('utf-8'))
        target.write('\n')


def format_boards(rows):
    """Format one board page (rows as returned by get_rows) as text lines.

    Uses rows 1..3: the top and bottom rows carry one hand in their middle
    cell, the middle row carries a hand in its first and last cell.  The
    first cell of the top row holds the board number on its first line and
    a ' / '-separated description on its second line.
    Note: the inner row lists of *rows* are modified in place.
    """
    rows = rows[1:4]
    header = rows[0][0].split(os.linesep)
    description = (header[1]
                   .replace('obie przed', 'NIKT')
                   .replace('obie po', 'OBIE'))
    rows[0][0] = '/'.join(reversed(description.split(' / ')))
    rows[1][1] = ''

    def split_hand(hand):
        return hand.split(os.linesep)

    rows[0][1] = split_hand(rows[0][1])
    rows[1][0] = split_hand(rows[1][0])
    rows[1][2] = split_hand(rows[1][2])
    rows[2][1] = split_hand(rows[2][1])

    def suit(line):
        # Drop the two-character prefix of a suit line; an empty holding is
        # shown as '==='.
        return line[2:] or '==='

    def side_rows(row):
        # Fixed: the first suit line now gets the same '===' fallback as
        # every other suit line (the original left it blank).
        ret = [[row[0], suit(row[1][0]), row[2]]]
        for i in range(1, 4):
            ret.append(['', suit(row[1][i]), ''])
        return ret

    def middle_rows(row):
        return [[suit(row[0][i]), row[1], suit(row[2][i])]
                for i in range(0, 4)]

    rows = side_rows(rows[0]) + middle_rows(rows[1]) + side_rows(rows[2])
    output = ['ROZDANIE NR ' + header[0], '']
    output.append('{:10s}{:6s}{:10s}'.format(*rows.pop(0)))
    for row in rows:
        output.append(' {:8s}{:6s}{:10s}'.format(*row))
    output.append('')
    return output


def format_protocols(rows):
    """Format one protocol page (rows as returned by get_rows) as text lines.

    Rows with 10 or 9 cells become score lines; rows with 4 or 8 cells are
    skipped silently; any other length is reported on stdout and skipped.
    """
    output = [' ZAPIS WYNIK',
              ' NS EW KONTRAKT WIST NS EW NS EW']
    for row in rows:
        content = []
        if len(row) == 10:
            content = [
                row[0],
                row[1],
                ' ' + row[2] + ' ' + row[3] + ' ' + row[5],
                row[4] or '',
                row[6] or '',
                '-' + row[7] if row[7] else '',
                '{:.1f}'.format(float(row[8])),
                '{:.1f}'.format(float(row[9])),
            ]
        elif len(row) == 9:
            content = [
                row[0],
                row[1],
                ' ' + row[2] + ' ' + row[3] + ' ' + row[5],
                row[4],
                '0',
                '',
                '{:.1f}'.format(float(row[7])),
                '{:.1f}'.format(float(row[8])),
            ]
        if content:
            output.append(u'{:>3s} {:>3s} {:11s}{:^4s}{:>4s}{:>5s} {:>5s} {:>5s}'.format(*content))
        elif len(row) != 4 and len(row) != 8:
            print('protocols: row of unexpected length')
            print(row)
    output.append('')
    return output


def format_results(rows):
    """Format the results page (rows as returned by get_rows) as text lines.

    The first row is discarded.  A trailing http://www.msc.com.pl link in a
    row is removed and its ``r`` query parameter becomes a zero-padded
    5-digit id.  Short rows (3-5 cells) are padded, partly with values from
    the previous output row; rows with fewer than 3 cells are dropped.
    Note: *rows* and its row lists are modified in place.
    """
    rows.pop(0)
    content = []
    link_regex = re.compile('^http://www.msc.com.pl')
    cezar_ids = ['{:05d}'.format(int(dict(urlparse.parse_qsl(urlparse.urlparse(row.pop()).query))['r']))
                 if re.match(link_regex, row[-1])
                 else ''
                 for row in rows]
    pdf_columns = max([len(row) for row in rows]) - 11
    # Fixed: ids are paired with their rows.  The original popped an id only
    # for rows of handled lengths, so one unhandled row shifted every id
    # after it.
    for row, cezar_id in zip(rows, cezar_ids):
        length = len(row)
        if length > 5:
            content.append(row[0:3] + [cezar_id] + row[3:])
        elif length == 5:
            content.append([''] * 2 + row[0:1] + [cezar_id] + row[1:] + [''] * (3 + pdf_columns))
        elif length == 4:
            if len(row[3]) != 2:
                content.append([''] * 2 + row[0:1] + [cezar_id] + row[1:3] + content[-1][6:7] + row[3:4] + [''] * (3 + pdf_columns))
            else:
                content.append([''] * 2 + row[0:1] + [cezar_id] + row[1:4] + [''] * (4 + pdf_columns))
        elif length == 3:
            content.append([''] * 2 + row[0:1] + [cezar_id] + row[1:3] + content[-1][6:8] + [''] * (3 + pdf_columns))
    wk_sum = sum([float(c[5]) if len(c[5]) else 0.0 for c in content])
    output = []
    name_column = max([len(r[2]) for r in content])
    output.append('M-CE NR ' + ' ' * name_column + ' WK CEZAR +/- WYNIK PKL ' + ('{:^' + str(3 * pdf_columns) + 's}').format('PDF') + ' NAGRODA')
    output.append('-' * len(output[-1]))
    line_format = (u'{:>3s} {:>3s} {:' + unicode(name_column) + u's} {:>4s} {:2s} {:5s} {:2s} {:>5s} {:>6s} {:>3s}')
    pdf_format = u' {:' + unicode(3 * pdf_columns) + u's}'
    for c in content:
        line = line_format.format(*(c[0:3] + c[5:7] + c[3:5] + c[8:11]))
        line += pdf_format.format(''.join([u'{:>3s}'.format(cc) for cc in c[11:-1]]))
        line += u' {:>6s}'.format(c[-1])
        output.append(line)
    output.append(' ' * (8 + name_column) + '-----')
    output.append(('{:>' + str(13 + name_column) + 's}').format('Suma WK = {:.1f}'.format(wk_sum)))
    return output


def format_histories(rows):
    """Format one history page (rows as returned by get_rows) as text lines.

    The first row supplies the title, the second row is discarded.  Returns
    an empty list when the title contains ' pauza '.  Rows with 11, 10 or 9
    cells are mapped onto nine output columns; any other length is reported
    on stdout and skipped.
    Note: *rows* is modified in place.
    """
    header = rows.pop(0)[0]
    rows.pop(0)
    if ' pauza ' in header:
        return []
    output = ['WYNIKI PARY NR ' + header,
              '']
    content_rows = []
    add_separator = False
    for row in rows:
        content = []
        if len(row) == 11:
            # The flag is only recomputed on 11-cell rows and keeps its value
            # for the shorter rows that follow.
            add_separator = (len(''.join(row[0:9])) == 0) and ((add_separator is False) or (row[-2] == 'miejsce'))
            content = row[0:4] + [row[4] + ' ' + row[5] + ' ' + row[7]] + [row[6]] + row[8:11]
        elif len(row) == 10:
            content = [''] + row[0:3] + [row[3] + ' ' + row[4] + ' ' + row[6]] + [row[5]] + row[7:10]
        elif len(row) == 9:
            content = ['', ''] + row[0:2] + [row[2] + ' ' + row[3] + ' ' + row[5]] + [row[4]] + row[6:9]
        if content:
            if add_separator:
                content_rows.append(['', '', '', '', '', '', '', '-------', '--------'])
            content_rows.append(content)
        else:
            print('histories: unexpected row length')
            print(row)
    # ``or [0]`` keeps max() from raising when no row was usable.
    column_width = max([len(r[1]) for r in content_rows] or [0])
    content_rows = [['RND', 'PRZECIWNIK', 'RZD', ' ', 'KONTRAKT', 'WIST', 'ZAPIS', 'WYNIK ', u'/ BIEŻĄCY']] + content_rows
    for content in content_rows:
        # Scores starting with '-' are right-aligned, with '+' use the
        # default alignment, anything else is centred.
        if content[6]:
            score_align = u'>' if content[6][0] == u'-' else (u'' if content[6][0] == u'+' else u'^')
        else:
            score_align = u''
        output.append((u'{:>3s} {:' + unicode(column_width) + u's} {:>3s} {:2s} {:9s}{:^4s} {:' + score_align + u'7s} {:>7s}{:>8s}').format(*[c or ' ' for c in content]))
    output.insert(3, '-' * len(output[2]))
    output.append('')
    return output


def format_rows(rows, type):
    """Dispatch *rows* to the module-level function ``format_<type>``."""
    return globals()['format_' + type](rows)


def get_rows(content):
    """Parse HTML text into a list of rows of cell strings.

    The text of the first <h2>, if any, comes first as a one-element row.
    Every <tr> then yields the stripped strings of each <td> (joined with
    os.linesep), followed by the href of every link in that row pointing at
    http://www.msc.com.pl.
    """
    soup = BeautifulSoup(content)
    output = []
    link_regex = re.compile('^http://www.msc.com.pl')
    header = soup.find('h2')
    if header:
        output.append([header.text])
    for table_row in soup.find_all('tr'):
        cells = [os.linesep.join(cell.stripped_strings)
                 for cell in table_row.find_all('td')]
        links = [link['href']
                 for link in table_row.find_all('a', {'href': link_regex})]
        output.append(cells + links)
    return output


def get_content(file):
    """Return the text of *file* with image tags replaced by a letter.

    Each matched <img> tag is replaced by the capitalised first character of
    its file stem (see the note on _IMAGE_TAG; presumably these are suit
    symbols -- confirm).
    """
    with open(file, 'r') as source:
        text = source.read()
    return _IMAGE_TAG.sub(lambda img: img.group(1)[0].capitalize(), text)


def get_header(directory):
    """Return the text of every element inside ``#header`` of index.html."""
    with open(os.path.join(directory, 'index.html'), 'r') as source:
        soup = BeautifulSoup(source.read())
    return [node.text for node in soup.select('#header *')]


def get_files(directory):
    """Map each page kind to its files in *directory*.

    Files are sorted within each glob pattern; single-character names
    (e.g. d1.txt) come before two-character names (e.g. d10.txt).
    """
    patterns = {
        'boards': ['d?.txt', 'd??.txt'],
        'protocols': ['p?.txt', 'p??.txt'],
        'histories': ['h?.txt', 'h??.txt'],
        'results': ['pary.txt'],
    }
    return dict(
        (kind, [path
                for mask in masks
                for path in sorted(glob(os.path.join(directory, mask)))])
        for kind, masks in patterns.items())


def _round_robin(iterators):
    """Yield one item from each iterator in turn until any one runs out."""
    for iterator in cycle(iterators):
        try:
            item = next(iterator)
        except StopIteration:
            return
        yield item


def compile(directory):
    """Build the lines of every output file for *directory*.

    Returns a dict keyed by output suffix: 'P' interleaves boards and
    protocols file by file, 'H' holds the histories and 'W' the results.
    Every input file is parsed and formatted even if the interleaving stops
    before its turn.
    """
    files = get_files(directory)
    sections = {
        'P': ['boards', 'protocols'],
        'H': ['histories'],
        'W': ['results'],
    }
    compiled = {}
    for suffix, kinds in sections.items():
        formatted = [
            iter([format_rows(get_rows(get_content(path)), kind)
                  for path in files[kind]])
            for kind in kinds
        ]
        compiled[suffix] = list(chain.from_iterable(_round_robin(formatted)))
    return compiled


directory = sys.argv[1] if len(sys.argv) > 1 else os.path.abspath('.')

# The results page has to be patched before compile() reads it.
fix_pdf_columns(os.path.join(directory, 'pary.txt'))

header = get_header(directory) + ['']
output = compile(directory)
file_prefix = os.path.basename(directory)

for suffix, rows in output.iteritems():
    with open(file_prefix + suffix + '.txt', 'w') as output_file:
        for line in chain(header, rows):
            output_file.write(line.encode('windows-1250') + '\n')

# ---------------------------------------------------------------------------
# Companion file created by the same commit: rrb2txt.spec (PyInstaller build
# specification).  It belongs in its own file; it is kept here as a comment
# only so that nothing from the reviewed text is lost.
#
#   # -*- mode: python -*-
#
#   block_cipher = None
#
#
#   a = Analysis(['rrb2txt.py'],
#                pathex=['f:\\Brydz\\RRBridge'],
#                hiddenimports=[],
#                hookspath=None,
#                runtime_hooks=None,
#                excludes=None,
#                cipher=block_cipher)
#   pyz = PYZ(a.pure,
#                cipher=block_cipher)
#   exe = EXE(pyz,
#             a.scripts,
#             a.binaries,
#             a.zipfiles,
#             a.datas,
#             name='rrb2txt.exe',
#             debug=False,
#             strip=None,
#             upx=True,
#             console=True )
# ---------------------------------------------------------------------------