From 9febaf58536bb32a361d6126b8f1327f6a7dcde2 Mon Sep 17 00:00:00 2001 From: emkael Date: Sat, 24 Feb 2018 11:01:05 +0100 Subject: Fixing remote HTML encoding according to HTML content meta tags --- jfr_playoff/remote.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'jfr_playoff') diff --git a/jfr_playoff/remote.py b/jfr_playoff/remote.py index f02a8d4..3a519f6 100644 --- a/jfr_playoff/remote.py +++ b/jfr_playoff/remote.py @@ -1,3 +1,5 @@ +import re + import requests from bs4 import BeautifulSoup as bs @@ -9,5 +11,14 @@ class RemoteUrl: @classmethod def fetch(cls, url): if url not in cls.url_cache: - cls.url_cache[url] = requests.get(url).text + request = requests.get(url) + encoding_match = re.search( + 'content=".*;( )?charset=(.*)"', + request.content, re.IGNORECASE) + if encoding_match: + request.encoding = encoding_match.group(2) + cls.url_cache[url] = request.text + PlayoffLogger.get('remote').info( + 'content for %s not in cache: retrieved %d bytes', + url, len(cls.url_cache[url])) return bs(cls.url_cache[url], 'lxml') -- cgit v1.2.3