diff options
author | emkael <emkael@tlen.pl> | 2018-02-24 11:01:05 +0100 |
---|---|---|
committer | emkael <emkael@tlen.pl> | 2018-02-24 11:01:05 +0100 |
commit | 9febaf58536bb32a361d6126b8f1327f6a7dcde2 (patch) | |
tree | aea067914c99900e568c700b6e235d13d560858b /jfr_playoff/remote.py | |
parent | 4706d04936c2b775a7718b3e3f6b18435b5ff221 (diff) |
Fixing remote HTML encoding according to HTML content meta tags
Diffstat (limited to 'jfr_playoff/remote.py')
-rw-r--r-- | jfr_playoff/remote.py | 13 |
1 files changed, 12 insertions, 1 deletions
diff --git a/jfr_playoff/remote.py b/jfr_playoff/remote.py index f02a8d4..3a519f6 100644 --- a/jfr_playoff/remote.py +++ b/jfr_playoff/remote.py @@ -1,3 +1,5 @@ +import re + import requests from bs4 import BeautifulSoup as bs @@ -9,5 +11,14 @@ class RemoteUrl: @classmethod def fetch(cls, url): if url not in cls.url_cache: - cls.url_cache[url] = requests.get(url).text + request = requests.get(url) + encoding_match = re.search( + 'content=".*;( )?charset=(.*)"', + request.content, re.IGNORECASE) + if encoding_match: + request.encoding = encoding_match.group(2) + cls.url_cache[url] = request.text + PlayoffLogger.get('remote').info( + 'content for %s not in cache: retrieved %d bytes', + url, len(cls.url_cache[url])) return bs(cls.url_cache[url], 'lxml') |