From d0c0d03f2a8ec7609f1533a9868b20120da04ef6 Mon Sep 17 00:00:00 2001 From: emkael Date: Fri, 31 Jan 2020 20:16:18 +0100 Subject: MSC markup cleanup and proper list/player detection --- http/index.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/http/index.py b/http/index.py index cc21bfe..7b672b6 100644 --- a/http/index.py +++ b/http/index.py @@ -81,7 +81,8 @@ def handler(req): return apache.OK # we need to sanitize line breaks and double-closed anchors manually - remote_content = bs4('
' + ' '.join(remote_content[delimiters[0]+1:delimiters[1]]).replace('
', '
').replace('', '') + '
', 'html.parser') + remote_content = bs4('
' + ' '.join(remote_content[delimiters[0]+1:delimiters[1]]).replace('
', '
').replace('
', '
').replace('', '') + '
', 'html.parser') + # stub template for output page page_content = bs4(''' @@ -108,7 +109,8 @@ def handler(req): # unnecessary tables removed for table in page_content.select('table > tr > td > table')[4:]: table.extract() - page_content.select('table > tr > td')[-1].extract() + if u'Lista' not in [b.text for b in page_content.select('b')]: + page_content.select('table > tr > td')[-1].extract() # internal link targets (team, WK breakdown, etc.) removed for internal_link in page_content.select('table > tr > td > table a'): -- cgit v1.2.3