diff options
author | emkael <emkael@tlen.pl> | 2020-01-31 20:16:18 +0100 |
---|---|---|
committer | emkael <emkael@tlen.pl> | 2020-01-31 20:16:18 +0100 |
commit | d0c0d03f2a8ec7609f1533a9868b20120da04ef6 (patch) | |
tree | 1ddbbac95f6c9e3c85e0d65979120ec084d346b1 | |
parent | f98d9bb01bc99689dabbf412b61e5e7d4deba55c (diff) |
MSC markup cleanup and proper list/player detection
-rw-r--r-- | http/index.py | 6 |
1 files changed, 4 insertions, 2 deletions
diff --git a/http/index.py b/http/index.py
index cc21bfe..7b672b6 100644
--- a/http/index.py
+++ b/http/index.py
@@ -81,7 +81,8 @@ def handler(req):
         return apache.OK
 
     # we need to sanitize line breaks and double-closed anchors manually
-    remote_content = bs4('<div>' + ' '.join(remote_content[delimiters[0]+1:delimiters[1]]).replace('<BR>', '<br />').replace('</a></a>', '</a>') + '</div>', 'html.parser')
+    remote_content = bs4('<div>' + ' '.join(remote_content[delimiters[0]+1:delimiters[1]]).replace('</SPAN><BR></SPAN>', '</SPAN><BR>').replace('<BR>', '<br />').replace('</a></a>', '</a>') + '</div>', 'html.parser')
+
     # stub template for output page
     page_content = bs4('''
     <html><head>
@@ -108,7 +109,8 @@ def handler(req):
     # unnecessary tables removed
     for table in page_content.select('table > tr > td > table')[4:]:
         table.extract()
-    page_content.select('table > tr > td')[-1].extract()
+    if u'Lista' not in [b.text for b in page_content.select('b')]:
+        page_content.select('table > tr > td')[-1].extract()
 
     # internal link targets (team, WK breakdown, etc.) removed
     for internal_link in page_content.select('table > tr > td > table a'):