diff options
author | emkael <emkael@tlen.pl> | 2022-04-04 01:24:36 +0200 |
---|---|---|
committer | emkael <emkael@tlen.pl> | 2022-04-04 01:27:46 +0200 |
commit | cd418f19e42c946c7216005d3dce97c545d120fc (patch) | |
tree | 23a6c01bca7b96e406844fdd848ae10300781a75 | |
parent | 0ebeac6121c10b41ac6060f06c6de50e49f929dd (diff) |
JFR Teamy board scraper
-rw-r--r-- | boards/scrapers/jfrteamy/.gitignore | 3 | ||||
-rw-r--r-- | boards/scrapers/jfrteamy/requirements.txt | 2 | ||||
-rw-r--r-- | boards/scrapers/jfrteamy/scrape-boards.py | 97 | ||||
-rwxr-xr-x | boards/scrapers/jfrteamy/scrape.sh | 66 |
4 files changed, 168 insertions, 0 deletions
diff --git a/boards/scrapers/jfrteamy/.gitignore b/boards/scrapers/jfrteamy/.gitignore
new file mode 100644
index 0000000..b983d81
--- /dev/null
+++ b/boards/scrapers/jfrteamy/.gitignore
@@ -0,0 +1,3 @@
+*.html
+*.htm
+*.pbn
diff --git a/boards/scrapers/jfrteamy/requirements.txt b/boards/scrapers/jfrteamy/requirements.txt
new file mode 100644
index 0000000..83780ba
--- /dev/null
+++ b/boards/scrapers/jfrteamy/requirements.txt
@@ -0,0 +1,2 @@
+beautifulsoup4==4.6.0
+lxml
diff --git a/boards/scrapers/jfrteamy/scrape-boards.py b/boards/scrapers/jfrteamy/scrape-boards.py
new file mode 100644
index 0000000..b3f806a
--- /dev/null
+++ b/boards/scrapers/jfrteamy/scrape-boards.py
@@ -0,0 +1,97 @@
+"""Rebuild a PBN hand record from saved JFR Teamy result pages.
+
+Usage: scrape-boards.py SEGMENT_FILE
+
+SEGMENT_FILE is a saved segment page. Every board page it links to is read
+from the same directory. The PBN text is written to standard output and
+problems with single boards are reported on standard error.
+"""
+from bs4 import BeautifulSoup as bs
+import bs4
+import os
+import sys
+
+# Indexed by board number modulo 4: board 1 -> N, 2 -> E, 3 -> S, 4 -> W.
+DEALERS = ['W', 'N', 'E', 'S']
+
+
+def parse_html(path):
+    """Parse the HTML file at path, closing it as soon as it is read."""
+    with open(path, encoding='utf8') as html_file:
+        return bs(html_file, 'lxml')
+
+
+def find_hand_cells(board):
+    """Return the table cells of a board page that hold the hands."""
+    cells = board.select('td.w')  # ordinary JFR
+    if len(cells) != 4:  # ukrywacz'ed JFR (TDD with a single table)
+        for br in board.select('td br'):
+            cell = br.parent
+            if cell.name == 'td' and cell not in cells:
+                cells.append(cell)
+    return cells
+
+
+def hand_suits(cell):
+    """Return the suit holdings listed in one hand cell."""
+    # Exact type match on purpose: comments and other NavigableString
+    # subclasses must not be mistaken for card text.
+    strings = [
+        node.replace('10', 'T').replace(' ', '').strip()
+        for node in cell
+        if type(node) is bs4.element.NavigableString
+    ]
+    # ordinary JFR has 8 strings per cell, TDD makes it 4
+    return strings if len(strings) == 4 else strings[1::2]
+
+
+def main(argv):
+    """Print the PBN for the segment page in argv[1]; return the exit status."""
+    if len(argv) < 2:
+        print('usage: %s SEGMENT_FILE' % argv[0], file=sys.stderr)
+        return 2
+    traveller_path = argv[1]
+    traveller = parse_html(traveller_path)
+
+    print('% PBN 1.0')
+    print('[Generator "JFRTeamy-restorerer"]')
+    print('[Event "%s"]' % traveller_path)
+
+    status = 0
+    for board_link in traveller.select('td.bdcc a.zb'):
+        if not board_link.has_attr('href'):
+            continue
+        board_number = board_link.text.strip()
+        board_path = os.path.join(
+            os.path.dirname(traveller_path), board_link['href'])
+        try:
+            dealer = DEALERS[int(board_number) % 4]
+            cells = find_hand_cells(parse_html(board_path))
+        except (ValueError, OSError) as error:
+            # One unreadable board must not cost the rest of the segment.
+            print('board %r skipped: %s' % (board_number, error),
+                  file=sys.stderr)
+            status = 1
+            continue
+        if len(cells) < 4:
+            print('board %r skipped: only %d hand cells in %s' % (
+                board_number, len(cells), board_path), file=sys.stderr)
+            status = 1
+            continue
+        cards = [hand_suits(cell) for cell in cells]
+        print('[Board "%s"]' % board_number)
+        print('[Dealer "%s"]' % dealer)
+        # PBN lists hands clockwise from North (N E S W); the page cells
+        # are taken to be in N, W, E, S order.
+        print('[Deal "N:%s %s %s %s"]' % (
+            '.'.join(cards[0]),
+            '.'.join(cards[2]),
+            '.'.join(cards[3]),
+            '.'.join(cards[1]),
+        ))
+        print()
+    return status
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))
diff --git a/boards/scrapers/jfrteamy/scrape.sh b/boards/scrapers/jfrteamy/scrape.sh
new file mode 100755
index 0000000..1ab16e1
--- /dev/null
+++ b/boards/scrapers/jfrteamy/scrape.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# usage: ./scrape.sh URL ROUND_FROM ROUND_TO SEGMENT_FROM SEGMENT_TO [BOARDS_PER_SEGMENT]
+# URL should be *rundaX.html or *leaderb.html full URL
+# BOARDS_PER_SEGMENT is the number of boards in each segment (default: 12)
+# script scrapes only single-hand-record protocols: non-TDD files or TDD-files in ukrywacz-only mode (with a single board layout)
+
+set -u
+shopt -s extglob
+
+if [ $# -lt 5 ]
+then
+    echo "usage: $0 URL ROUND_FROM ROUND_TO SEGMENT_FROM SEGMENT_TO [BOARDS_PER_SEGMENT]" >&2
+    exit 2
+fi
+
+CURRDIR=$(pwd)
+
+cd "$(dirname "$0")" || exit 1
+
+URL=$1
+URLDIR=${URL%/*}
+URLPATH=${URL##*/}
+BOARDS=${6:-12}
+
+# reduce the page name to the prefix shared by all pages of the tournament
+PREFIX=${URLPATH%.html*}
+PREFIX=${PREFIX%%+([[:digit:]])}
+PREFIX=${PREFIX%runda*}
+PREFIX=${PREFIX%leaderb*}
+
+mkdir -p tmp
+cd tmp || exit 1
+
+# download page $1 into the cache unless it is already there; -f (--fail) and
+# the temporary name keep HTTP error pages and broken transfers out of the cache
+fetch() {
+    [ -f "$1" ] && return 0
+    if curl -sf "${URLDIR}/$1" > "$1.part"
+    then
+        mv "$1.part" "$1"
+    else
+        rm -f "$1.part"
+        return 1
+    fi
+}
+
+for RND in $(seq "$2" "$3")
+do
+    for SEGMENT in $(seq "$4" "$5")
+    do
+        SEGMENTPATH="${PREFIX}${RND}t1-${SEGMENT}.html"
+        if ! fetch "$SEGMENTPATH"
+        then
+            echo "cannot download ${URLDIR}/${SEGMENTPATH}, segment skipped" >&2
+            continue
+        fi
+        for BOARD in $(seq 1 "$BOARDS")
+        do
+            # a board page that cannot be fetched is reported by scrape-boards.py
+            # if the segment page actually links to it
+            fetch "${PREFIX}${RND}b-$(( (SEGMENT-1) * BOARDS + BOARD )).html"
+        done
+        python3 ../scrape-boards.py "$SEGMENTPATH" > "${CURRDIR}/${PREFIX}_${RND}-${SEGMENT}.pbn"
+    done
+done