diff options
Diffstat (limited to 'http')
-rw-r--r-- | http/index.py | 6 |
1 files changed, 4 insertions, 2 deletions
diff --git a/http/index.py b/http/index.py index cc21bfe..7b672b6 100644 --- a/http/index.py +++ b/http/index.py @@ -81,7 +81,8 @@ def handler(req): return apache.OK # we need to sanitize line breaks and double-closed anchors manually - remote_content = bs4('<div>' + ' '.join(remote_content[delimiters[0]+1:delimiters[1]]).replace('<BR>', '<br />').replace('</a></a>', '</a>') + '</div>', 'html.parser') + remote_content = bs4('<div>' + ' '.join(remote_content[delimiters[0]+1:delimiters[1]]).replace('</SPAN><BR></SPAN>', '</SPAN><BR>').replace('<BR>', '<br />').replace('</a></a>', '</a>') + '</div>', 'html.parser') + # stub template for output page page_content = bs4(''' <html><head> @@ -108,7 +109,8 @@ def handler(req): # unnecessary tables removed for table in page_content.select('table > tr > td > table')[4:]: table.extract() - page_content.select('table > tr > td')[-1].extract() + if u'Lista' not in [b.text for b in page_content.select('b')]: + page_content.select('table > tr > td')[-1].extract() # internal link targets (team, WK breakdown, etc.) removed for internal_link in page_content.select('table > tr > td > table a'): |