From cd418f19e42c946c7216005d3dce97c545d120fc Mon Sep 17 00:00:00 2001
From: emkael <emkael@tlen.pl>
Date: Mon, 4 Apr 2022 01:24:36 +0200
Subject: JFR Teamy board scraper

---
 boards/scrapers/jfrteamy/.gitignore       |  3 ++
 boards/scrapers/jfrteamy/requirements.txt |  2 ++
 boards/scrapers/jfrteamy/scrape-boards.py | 51 +++++++++++++++++++++++++++++++
 boards/scrapers/jfrteamy/scrape.sh        | 45 +++++++++++++++++++++++++++
 4 files changed, 101 insertions(+)
 create mode 100644 boards/scrapers/jfrteamy/.gitignore
 create mode 100644 boards/scrapers/jfrteamy/requirements.txt
 create mode 100644 boards/scrapers/jfrteamy/scrape-boards.py
 create mode 100755 boards/scrapers/jfrteamy/scrape.sh

diff --git a/boards/scrapers/jfrteamy/.gitignore b/boards/scrapers/jfrteamy/.gitignore
new file mode 100644
index 0000000..b983d81
--- /dev/null
+++ b/boards/scrapers/jfrteamy/.gitignore
@@ -0,0 +1,3 @@
+*.html
+*.htm
+*.pbn
diff --git a/boards/scrapers/jfrteamy/requirements.txt b/boards/scrapers/jfrteamy/requirements.txt
new file mode 100644
index 0000000..83780ba
--- /dev/null
+++ b/boards/scrapers/jfrteamy/requirements.txt
@@ -0,0 +1,2 @@
+beautifulsoup4==4.6.0
+lxml
diff --git a/boards/scrapers/jfrteamy/scrape-boards.py b/boards/scrapers/jfrteamy/scrape-boards.py
new file mode 100644
index 0000000..b3f806a
--- /dev/null
+++ b/boards/scrapers/jfrteamy/scrape-boards.py
@@ -0,0 +1,51 @@
+from bs4 import BeautifulSoup as bs
+import bs4
+import os
+import sys
+
+# Convert a JFR Teamy segment page (argv[1]) and its board pages to PBN on stdout.
+# The page is parsed completely inside the with-block, so it can be closed early.
+with open(sys.argv[1], encoding='utf8') as traveller_file:
+    traveller = bs(traveller_file, 'lxml')
+# hrefs of the board links are resolved against the directory of the segment page
+base_dir = os.path.dirname(traveller_file.name)
+
+print('% PBN 1.0')
+print('[Generator "JFRTeamy-restorerer"]')
+print('[Event "%s"]' % (traveller_file.name))
+
+board_links = traveller.select('td.bdcc a.zb')
+for board_link in board_links:
+    if board_link.has_attr('href'):
+        board_number = board_link.text.strip()
+        # boards 1, 2, 3, 4 are dealt by N, E, S, W and then the cycle repeats
+        dealer = ['W', 'N', 'E', 'S'][int(board_number) % 4]
+        board_path = os.path.join(base_dir, board_link['href'])
+        with open(board_path, encoding='utf8') as board_file:
+            board = bs(board_file, 'lxml')
+        card_cells = board.select('td.w') # ordinary JFR
+        if len(card_cells) != 4: # ukrywacz'ed JFR (TDD with a single table)
+            cell_brs = board.select('td br')
+            for br in cell_brs:
+                cell = br.parent
+                if cell.name == 'td' and cell not in card_cells:
+                    card_cells.append(cell)
+        card_strings = [
+            [
+                line.replace('10', 'T').replace(' ', '').strip()
+                for line in c
+                if type(line) == bs4.element.NavigableString
+            ] for c in card_cells
+        ]
+        # ordinary JFR has 8 strings per cell, TDD makes it 4
+        cards = [c if len(c) == 4 else c[1::2] for c in card_strings]
+        print('[Board "%s"]' % board_number)
+        print('[Dealer "%s"]' % dealer)
+        # cells are used as N, W, E, S; the Deal tag lists hands as N, E, S, W
+        print('[Deal "N:%s %s %s %s"]' % (
+            '.'.join(cards[0]),
+            '.'.join(cards[2]),
+            '.'.join(cards[3]),
+            '.'.join(cards[1])
+        ))
+        print()
diff --git a/boards/scrapers/jfrteamy/scrape.sh b/boards/scrapers/jfrteamy/scrape.sh
new file mode 100755
index 0000000..1ab16e1
--- /dev/null
+++ b/boards/scrapers/jfrteamy/scrape.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# usage: ./scrape.sh URL ROUND_FROM ROUND_TO SEGMENT_FROM SEGMENT_TO
+# URL should be the full URL of a *rundaX.html or *leaderb.html page
+# script scrapes only single-hand-record protocols: non-TDD files or TDD-files in ukrywacz-only mode (with a single board layout)
+
+set -u
+shopt -s extglob
+
+# remember where the caller is: the .pbn files are written there
+CURRDIR=$(pwd)
+
+# downloaded pages are cached in tmp/ next to this script
+cd "$(dirname "$0")" || exit 1
+
+URL=$1
+URLDIR=${URL%/*}
+URLPATH=${URL##*/}
+
+# reduce e.g. "PREFIXrunda3.html" or "PREFIXleaderb.html" to the bare prefix
+PREFIX=${URLPATH%.html*}
+PREFIX=${PREFIX%%+([[:digit:]])}
+PREFIX=${PREFIX%runda*}
+PREFIX=${PREFIX%leaderb*}
+
+mkdir -p tmp
+cd tmp || exit 1
+
+for RND in $(seq "$2" "$3"); do
+    for SEGMENT in $(seq "$4" "$5"); do
+        SEGMENTPATH="${PREFIX}${RND}t1-${SEGMENT}.html"
+        # pages already present in tmp/ are not downloaded again
+        if [ ! -f "$SEGMENTPATH" ]; then
+            curl -s "${URLDIR}/${SEGMENTPATH}" > "$SEGMENTPATH"
+        fi
+        # board pages are numbered continuously within a round, 12 per segment
+        for BOARD in {1..12}; do
+            BOARDPATH="${PREFIX}${RND}b-$(( (SEGMENT-1) * 12 + BOARD)).html"
+            if [ ! -f "$BOARDPATH" ]; then
+                curl -s "${URLDIR}/${BOARDPATH}" > "$BOARDPATH"
+            fi
+        done
+        python3 ../scrape-boards.py "$SEGMENTPATH" > "${CURRDIR}/${PREFIX}_${RND}-${SEGMENT}.pbn"
+    done
+done
-- 
cgit v1.2.3