summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author emkael <emkael@tlen.pl> 2020-01-31 20:16:18 +0100
committer emkael <emkael@tlen.pl> 2020-01-31 20:16:18 +0100
commit d0c0d03f2a8ec7609f1533a9868b20120da04ef6 (patch)
tree 1ddbbac95f6c9e3c85e0d65979120ec084d346b1
parent f98d9bb01bc99689dabbf412b61e5e7d4deba55c (diff)
MSC markup cleanup and proper list/player detection
-rw-r--r-- http/index.py 6
1 files changed, 4 insertions, 2 deletions
diff --git a/http/index.py b/http/index.py
index cc21bfe..7b672b6 100644
--- a/http/index.py
+++ b/http/index.py
@@ -81,7 +81,8 @@ def handler(req):
return apache.OK
# we need to sanitize line breaks and double-closed anchors manually
- remote_content = bs4('<div>' + ' '.join(remote_content[delimiters[0]+1:delimiters[1]]).replace('<BR>', '<br />').replace('</a></a>', '</a>') + '</div>', 'html.parser')
+ remote_content = bs4('<div>' + ' '.join(remote_content[delimiters[0]+1:delimiters[1]]).replace('</SPAN><BR></SPAN>', '</SPAN><BR>').replace('<BR>', '<br />').replace('</a></a>', '</a>') + '</div>', 'html.parser')
+
# stub template for output page
page_content = bs4('''
<html><head>
@@ -108,7 +109,8 @@ def handler(req):
# unnecessary tables removed
for table in page_content.select('table > tr > td > table')[4:]:
table.extract()
- page_content.select('table > tr > td')[-1].extract()
+ if u'Lista' not in [b.text for b in page_content.select('b')]:
+ page_content.select('table > tr > td')[-1].extract()
# internal link targets (team, WK breakdown, etc.) removed
for internal_link in page_content.select('table > tr > td > table a'):