author    emkael <emkael@tlen.pl>  2015-02-09 20:14:59 +0100
committer emkael <emkael@tlen.pl>  2015-02-09 20:14:59 +0100
commit    17167b1c0e088000164b87ef0a02237a3ffc107c (patch)
tree      79c61deeea11e57b3c8e99dc39ca76cee491a745 /http/index.py
parent    dd3d2509f6048e11f9b2b127f6c7acb80a506d8d (diff)
* mod_python port
Diffstat (limited to 'http/index.py')
-rw-r--r--  http/index.py  124
1 file changed, 124 insertions(+), 0 deletions(-)
diff --git a/http/index.py b/http/index.py
new file mode 100644
index 0000000..2e46c8f
--- /dev/null
+++ b/http/index.py
@@ -0,0 +1,124 @@
+# coding=utf-8
+
+from mod_python import apache
+import hashlib, json, os, re, time, urllib, urllib2, urlparse
+from bs4 import BeautifulSoup as bs4
+
+# value for <base> element, path of application relative to server root
+BASE_PATH = '/'
+# relative path to cache folder (relative to this file)
+CACHE_PATH = '../cache'
+# Cezar base URL
+CEZAR_URL = 'http://msc.com.pl/cezar/'
+# cache expiry (oldest valid cache timestamp)
+CACHE_EXPIRY_LIMIT = int(time.time()) - 24*60*60
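+# NB: evaluated once, at module import - in a long-lived mod_python
+# interpreter the expiry threshold therefore stays frozen at load time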
+
+__dir__ = os.path.dirname(__file__)
+
+# retrieves remote URL content, forwarding browser's UAS
+def fetch_with_user_agent_spoof(cache_path, remote_url, user_agent):
+    opener = urllib2.build_opener()
+    opener.addheaders = [('User-Agent', user_agent)]
+    # write through a with-block so the handle is closed even on errors
+    with open(cache_path, 'w') as cache_file:
+        cache_file.write(opener.open(remote_url).read())
+
+# returns content of cached file, refreshing cache if necessary
+def get_cache_content(cache_key, remote_url, force_refresh=False, user_agent=''):
+    cache_path = os.path.join(__dir__, CACHE_PATH, cache_key)
+    if force_refresh or not (os.path.exists(cache_path) and (int(os.path.getmtime(cache_path)) > CACHE_EXPIRY_LIMIT)):
+        fetch_with_user_agent_spoof(cache_path, remote_url, user_agent)
+    with open(cache_path, 'r') as cache_file:
+        return cache_file.read()
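+# example (hypothetical player id '12345'):
+#     get_cache_content('12345', CEZAR_URL + '?pid_search=12345&p=21')
+# returns the cached copy if it is younger than a day,
+# re-downloading it first otherwise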
+
+def handler(req):
+    # we need to recover the original request path from before the rewrite;
+    # in mod_python, req.prev points at the request preceding an internal
+    # redirect, so walking the chain back yields the URI the client sent
+    orig_req = req
+    while orig_req.prev:
+        orig_req = orig_req.prev
+    path = filter(None, re.sub(r'index\.py$', '', re.sub('^' + BASE_PATH, '', orig_req.uri)).split('/'))
+
+    if path:
+        # /[ANYTHING]/refresh forces cache refresh
+        no_cache = len(path) > 1 and path[1] == 'refresh'
+        user_agent = orig_req.headers_in['User-Agent']
+        # compiling the original remote URI
+        search_url = urlparse.urljoin(CEZAR_URL,
+                                      '?' + urllib.urlencode(
+                                          {
+                                              'pid_search': path[0],
+                                              'p': 21
+                                          }
+                                      ))
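+        # e.g. for path[0] == '12345' (hypothetical id) this yields
+        # http://msc.com.pl/cezar/?pid_search=12345&p=21
+        # (query parameter order may vary with dict iteration order)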
+        # fetching content
+        remote_content = get_cache_content(cache_key=path[0],
+                                           remote_url=search_url,
+                                           force_refresh=no_cache,
+                                           user_agent=user_agent).split('\n')
+
+        # slicing out only the interesting part of the remote content
+        delimiter_regex = re.compile('---- page content ')
+        delimiters = [i for i, line in enumerate(remote_content) if re.search(delimiter_regex, line)]
+        if len(delimiters) < 2:
+            req.write('Malformed (even more than usual) content :(')
+            return apache.OK
+
+        # we need to sanitize line breaks and double-closed anchors manually
+        remote_content = bs4('<body>' + ' '.join(remote_content[delimiters[0]+1:delimiters[1]]).replace('<BR>', '<br />').replace('</a></a>', '</a>') + '</body>', 'html.parser')
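+        # (html.parser copes poorly with the doubled </a> and the bare
+        # <BR> tags the upstream page emits, hence the manual cleanup)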
+        # stub template for the output page
+        page_content = bs4('''
+            <html><head>
+                <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+                <base href="''' + BASE_PATH + '''" />
+                <style>
+                    body{width:580px;font-family:Tahoma,Geneva,Arial,Helvetica,sans-serif;}
+                    a{text-decoration:none;color:black}
+                </style>
+            </head><body></body></html>
+        ''', 'html.parser')
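+        # the <base> element makes the rewritten pid links and the cached
+        # foto/ image paths resolve against the application root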
+        page_content.html.body.append(remote_content.body)
+
+        # internal links being rewritten to application links
+        for link in page_content.select('a[href^="?p=21&pid="]'):
+            # parse_qs wants a bare query string and returns value lists,
+            # hence the lstrip and the [0]
+            link['href'] = urlparse.parse_qs(link['href'].lstrip('?'))['pid'][0]
+
+        # garbage elements removed
+        garbage_selectors = ['script', 'table.msc_noprint', 'center > p']
+        for selector in garbage_selectors:
+            for garbage in page_content.select(selector):
+                garbage.extract()
+
+        # unnecessary tables removed
+        page_content.select('table > tr > td')[1].extract()
+        for table in page_content.select('table > tr > td > table')[2:4]:
+            table.extract()
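+        # (the [1] index and [2:4] slice are tied to Cezar's current page
+        # layout; a markup change will make them miss or raise IndexError)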
+
+        # internal link targets (team, WK breakdown, etc.) removed
+        for internal_link in page_content.select('table > tr > td > table a'):
+            # delete the attribute outright - assigning None would still
+            # serialize a bare href in the output
+            del internal_link['href']
+
+        # internal link icons removed
+        for link_icon in page_content.select('img[src*="ico_link_8.gif"]'):
+            link_icon.extract()
+
+        # fetching all external pictures (not pic/* images) into the local cache
+        for external_image in [image for image in page_content.select('img') if not image['src'].startswith('pic/')]:
+            image_url = urlparse.urljoin(CEZAR_URL, external_image['src'])
+            image_cache_path = 'foto/' + hashlib.md5(image_url).hexdigest() + '.' + image_url.split('.')[-1]
+            # test the same absolute path the file gets written to
+            if not os.path.exists(os.path.join(__dir__, image_cache_path)) or no_cache:
+                fetch_with_user_agent_spoof(os.path.join(__dir__, image_cache_path), image_url, user_agent)
+            external_image['src'] = image_cache_path
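+        # e.g. a http://msc.com.pl/cezar/foto/12345.jpg picture (hypothetical
+        # URL) lands in foto/<md5-of-url>.jpg and is served locally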
+
+        # linking to the original page
+        original_link_selectors = ['img[src^="foto/"]', 'span[style*=":28px"]']
+        for selector in original_link_selectors:
+            for element in page_content.select(selector):
+                link = element.wrap(page_content.new_tag('a'))
+                link['href'] = search_url
+
+        req.content_type = 'text/html'
+        req.write(page_content.prettify('utf-8'))
+    else:
+        req.write('Nothing to see here, move along.')
+    return apache.OK
+
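
For completeness, a minimal smoke-test sketch for driving handler() outside
Apache (Python 2, to match the module). It assumes mod_python is importable
so the module loads, plus network access to msc.com.pl; StubRequest and the
'12345' player id are hypothetical names used only for illustration:

# stub_test.py - exercise handler() with a fake request object
import imp

index = imp.load_source('index', 'http/index.py')

class StubRequest(object):
    def __init__(self, uri, user_agent='Mozilla/5.0'):
        self.uri = uri
        self.prev = None                # no rewrite chain in the stub
        self.headers_in = {'User-Agent': user_agent}
        self.content_type = None
        self.chunks = []

    def write(self, data):
        self.chunks.append(data)

if __name__ == '__main__':
    req = StubRequest('/12345/')        # hypothetical player id
    status = index.handler(req)
    print status, req.content_type      # expect apache.OK and 'text/html'
    print ''.join(req.chunks)[:200]     # peek at the rendered page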