From 6f11a6a00cdc7f96b91ded3ff313eabc57cc9f83 Mon Sep 17 00:00:00 2001 From: emkael Date: Sun, 13 Jun 2021 12:50:35 +0200 Subject: Correctly detecting remote content encoding Remember kids, never parse HTML with regex. --- jfr_playoff/remote.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'jfr_playoff/remote.py') diff --git a/jfr_playoff/remote.py b/jfr_playoff/remote.py index 74d75b5..b918ea1 100644 --- a/jfr_playoff/remote.py +++ b/jfr_playoff/remote.py @@ -16,8 +16,11 @@ class RemoteUrl: if url not in cls.url_cache: request = requests.get(url) encoding_match = re.search( - 'content=".*;( )?charset=(.*)"', + 'content=".*;( )?charset=(.*?)"', request.content, re.IGNORECASE) + PlayoffLogger.get('remote').debug( + 'Content encoding: %s', + encoding_match.group(2)) if encoding_match: request.encoding = encoding_match.group(2) cls.url_cache[url] = request.text -- cgit v1.2.3