# coding=utf-8
from mod_python import apache
import hashlib, json, os, re, time, urllib, urllib2, urlparse
from bs4 import BeautifulSoup as bs4
# value for the <base> element: path of the application relative to the server root
BASE_PATH = '/'
# relative path to cache folder (relative to this file)
CACHE_PATH = '../cache'
# Cezar base URL
CEZAR_URL = 'http://msc.com.pl/cezar/'
# cache expiry (oldest valid cache timestamp)
CACHE_EXPIRY_LIMIT = int(time.time()) - 24*60*60
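# NOTE: the expiry threshold is computed once, at module load time; since
# mod_python keeps the module loaded between requests, cache entries are judged
# stale relative to the interpreter start, not to the current request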
__dir__ = os.path.dirname(__file__)
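# Example Apache setup assumed for this handler (the directive names are real
# mod_python/mod_rewrite directives; the paths and the rewrite pattern are
# placeholders, not the actual deployment config):
#
#   <Directory /path/to/this/application>
#       SetHandler mod_python
#       PythonHandler index
#       PythonDebug Off
#   </Directory>
#   # pretty URLs such as /12345/refresh get rewritten to index.py, which is
#   # why handler() walks req.prev to recover the original request URI
#   RewriteEngine On
#   RewriteRule ^/(\d+)(/refresh)?$ /index.py [PT]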
# retrieves remote URL content, forwarding the browser's User-Agent string
def fetch_with_user_agent_spoof(cache_path, remote_url, user_agent):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-Agent', user_agent)]
    egg = None
    for trigger, egg_actions in eggs.iteritems():
        if cache_path.split('/')[-1].decode('utf-8').lower() == trigger.decode('utf-8'):
            egg = egg_actions
            break
    if egg is not None:
        remote_url = urlparse.urljoin(
            CEZAR_URL,
            '?' + urllib.urlencode({'pid_search': egg['pid'], 'p': 21}))
    content = opener.open(remote_url).read()
    if egg is not None and 'replacements' in egg:
        for replacement in egg['replacements']:
            content = re.sub(replacement['from'], replacement['to'], content)
    open(cache_path, 'w+').write(content)
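# NOTE: `eggs` is defined at the bottom of this module; it is only read when
# the function above runs (at request time), so the forward reference is safe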
# returns content of cached file, refreshing the cache if necessary
def get_cache_content(cache_key, remote_url, force_refresh=False, user_agent=''):
    cache_path = os.path.join(__dir__, CACHE_PATH, cache_key)
    if force_refresh or not (os.path.exists(cache_path)
                             and int(os.path.getmtime(cache_path)) > CACHE_EXPIRY_LIMIT):
        fetch_with_user_agent_spoof(cache_path, remote_url, user_agent)
    return open(cache_path, 'r').read()
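# example (hypothetical player id): get_cache_content('12345',
# CEZAR_URL + '?pid_search=12345&p=21') returns the cached search page HTML,
# re-fetching it when the cached copy is missing or older than the expiry limit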
def handler(req):
    # MIME type fix for error messages
    req.content_type = 'text/plain'
    # we need to recover the original request path, from before the rewrite
    orig_req = req
    while orig_req.prev:
        orig_req = orig_req.prev
    path = filter(None,
                  re.sub(r'index\.py$', '',
                         re.sub('^' + BASE_PATH, '', orig_req.uri)).split('/'))
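    # e.g. a rewritten request for '/12345/refresh' (hypothetical pid) yields
    # path == ['12345', 'refresh']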
    if path:
        # /[ANYTHING]/refresh forces a cache refresh
        no_cache = len(path) > 1 and path[1] == 'refresh'
        user_agent = orig_req.headers_in['User-Agent']
        # building the original remote URI
        search_url = urlparse.urljoin(
            CEZAR_URL,
            '?' + urllib.urlencode({
                'pid_search': path[0],
                'p': 21
            }))
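        # e.g. for path[0] == '12345' (hypothetical pid) this yields something
        # like http://msc.com.pl/cezar/?pid_search=12345&p=21
        # (query parameter order depends on dict iteration order)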
        # fetching content
        remote_content = get_cache_content(cache_key=path[0],
                                           remote_url=search_url,
                                           force_refresh=no_cache,
                                           user_agent=user_agent).split('\n')
        # slicing out only the interesting part of the remote content
        delimiter_regex = re.compile('---- page content ')
        delimiters = [i for i, line in enumerate(remote_content)
                      if delimiter_regex.search(line)]
        if len(delimiters) < 2:
            req.write('Malformed (even more than usual) content :(')
            return apache.OK
        # we need to sanitize line breaks and double-closed anchors manually
        remote_content = bs4(
            '<div>'
            + ' '.join(remote_content[delimiters[0]+1:delimiters[1]])
                  .replace('<br>', '<br/>')
                  .replace('<BR>', '<br/>')
                  .replace('</a></a>', '</a>')
            + '</div>',
            'html.parser')
        # stub template for the output page
        page_content = bs4('''<!DOCTYPE html>
<html>
    <head>
        <meta charset="utf-8"/>
        <base href="/"/>
    </head>
    <body></body>
</html>''', 'html.parser')
        page_content.html.body.append(remote_content.div)
        # internal links are rewritten to application links
        for link in page_content.select('a[href^="?p=21&pid="]'):
            link['href'] = urlparse.parse_qs(link['href'].lstrip('?'))['pid'][0]
        # garbage elements removed
        garbage_selectors = ['script', 'table.msc_noprint', 'center > p']
        for selector in garbage_selectors:
            for garbage in page_content.select(selector):
                garbage.extract()
        # unnecessary tables removed
        for table in page_content.select('table > tr > td > table')[4:]:
            table.extract()
        if u'Lista' not in [b.text for b in page_content.select('b')] and \
           u'Błąd' not in [b.text for b in page_content.select('b')]:
            page_content.select('table > tr > td')[-1].extract()
        # internal link targets (team, WK breakdown, etc.) removed
        for internal_link in page_content.select('table > tr > td > table a'):
            internal_link.attrs = {}
        # internal link icons removed
        for link_icon in page_content.select('img[src*="ico_link_8.gif"]'):
            link_icon.extract()
        # fetching all external pictures (everything outside pic/*) to the local cache
        for external_image in [image for image in page_content.select('img')
                               if not image['src'].startswith('pic/')]:
            image_url = urlparse.urljoin(CEZAR_URL, external_image['src'])
            image_cache_path = 'foto/' + hashlib.md5(image_url).hexdigest() + '.' + image_url.split('.')[-1]
            if not os.path.exists(os.path.join(__dir__, image_cache_path)) or no_cache:
                fetch_with_user_agent_spoof(os.path.join(__dir__, image_cache_path), image_url, user_agent)
            external_image['src'] = image_cache_path
        # linking to the original page
        original_link_selectors = ['img[src^="foto/"]', 'span[style*=":28px"]']
        for selector in original_link_selectors:
            for element in page_content.select(selector):
                link = element.wrap(page_content.new_tag('a'))
                link['href'] = search_url
        # credits info
        credits_div = bs4('<div>Pomysł Ivana,<br/>Ivan jest zajebisty.</div>',
                          'html.parser', from_encoding='utf-8')
        page_content.html.body.div.append(credits_div.div)
        req.content_type = 'text/html'
        req.write(page_content.prettify('utf-8'))
    else:
        req.write('Nothing to see here, move along.')
    return apache.OK
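# easter eggs: maps a lower-cased trigger (the requested cache key, i.e. the
# first path segment) to a substitute Cezar pid and optional regex/text
# replacements applied to the fetched page before it is cached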
eggs = {
    'wąsłowicz': {
        'pid': 13650,
        'replacements': [
            { 'from': re.compile(r'src="\.\./cezar1/fots.*?"'), 'to': 'src="pic/egg.jpg"' },
            { 'from': 'asłowicz', 'to': 'ąsłowicz' },
            { 'from': 'ważną licencją', 'to': 'gwiazdką' }
        ]
    },
    'bubu': {
        'pid': 1318
    }
}