summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author emkael <emkael@tlen.pl> 2018-02-24 11:01:05 +0100
committer emkael <emkael@tlen.pl> 2018-02-24 11:01:05 +0100
commit 9febaf58536bb32a361d6126b8f1327f6a7dcde2 (patch)
tree aea067914c99900e568c700b6e235d13d560858b
parent 4706d04936c2b775a7718b3e3f6b18435b5ff221 (diff)
Fixing remote HTML encoding according to HTML content meta tags
-rw-r--r-- jfr_playoff/remote.py 13
1 files changed, 12 insertions, 1 deletions
diff --git a/jfr_playoff/remote.py b/jfr_playoff/remote.py
index f02a8d4..3a519f6 100644
--- a/jfr_playoff/remote.py
+++ b/jfr_playoff/remote.py
@@ -1,3 +1,5 @@
+import re
+
import requests
from bs4 import BeautifulSoup as bs
@@ -9,5 +11,14 @@ class RemoteUrl:
@classmethod
def fetch(cls, url):
if url not in cls.url_cache:
- cls.url_cache[url] = requests.get(url).text
+ request = requests.get(url)
+ encoding_match = re.search(
+ 'content=".*;( )?charset=(.*)"',
+ request.content, re.IGNORECASE)
+ if encoding_match:
+ request.encoding = encoding_match.group(2)
+ cls.url_cache[url] = request.text
+ PlayoffLogger.get('remote').info(
+ 'content for %s not in cache: retrieved %d bytes',
+ url, len(cls.url_cache[url]))
return bs(cls.url_cache[url], 'lxml')