1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
# coding=utf-8
from mod_python import apache
import hashlib, json, os, re, time, urllib, urllib2, urlparse
from bs4 import BeautifulSoup as bs4
# value for <base> element, path of application relative to server root
BASE_PATH = '/'
# relative path to cache folder (relative to this file)
CACHE_PATH = '../cache'
# Cezar base URL
CEZAR_URL = 'http://msc.com.pl/cezar/'
# cache expiry (oldest valid cache timestamp)
# NOTE: computed once at import time, so under a long-lived mod_python
# interpreter the threshold is "24h before module load", not a rolling window
CACHE_EXPIRY_LIMIT = int(time.time()) - 24*60*60
# absolute directory of this source file; cache paths are resolved against it
__dir__ = os.path.dirname(__file__)
# retrieves remote URL content, forwarding browser's UAS
# retrieves remote URL content, forwarding browser's UAS
def fetch_with_user_agent_spoof(cache_path, remote_url, user_agent):
    """Download remote_url into the file cache_path, sending the caller's
    User-Agent header so the remote site serves the same content it would
    serve to the browser.

    :param cache_path: filesystem path the response body is written to
    :param remote_url: absolute URL to fetch
    :param user_agent: User-Agent header value to forward
    """
    opener = urllib2.build_opener()
    opener.addheaders = [('User-Agent', user_agent)]
    response = opener.open(remote_url)
    try:
        payload = response.read()
    finally:
        # fix: the response object was never closed, leaking the socket
        response.close()
    # fix: original used open(...,'w+').write(...), leaking the file handle;
    # 'wb' also keeps binary payloads (images fetched via this helper) intact
    with open(cache_path, 'wb') as cache_file:
        cache_file.write(payload)
# returns content of cached file, refreshing cache if necessary
# returns content of cached file, refreshing cache if necessary
def get_cache_content(cache_key, remote_url, force_refresh=False, user_agent=''):
    """Return the body cached under cache_key, re-fetching remote_url first
    when the cache entry is missing, older than CACHE_EXPIRY_LIMIT, or when
    force_refresh is set.

    :param cache_key: filename of the cache entry inside CACHE_PATH
    :param remote_url: URL to (re-)fetch the content from
    :param force_refresh: skip the freshness check and always re-fetch
    :param user_agent: User-Agent header forwarded to the remote site
    :return: full cached file content as a string
    """
    cache_path = os.path.join(__dir__, CACHE_PATH, cache_key)
    # refresh when forced, when the file is absent, or when its mtime is at
    # or before the expiry threshold computed at module import
    if force_refresh or not (os.path.exists(cache_path) and (int(os.path.getmtime(cache_path)) > CACHE_EXPIRY_LIMIT)):
        fetch_with_user_agent_spoof(cache_path, remote_url, user_agent)
    # fix: original leaked the file handle via open(...).read(); the context
    # manager closes it deterministically
    with open(cache_path, 'r') as cache_file:
        return cache_file.read()
def handler(req):
    """mod_python handler: proxies a Cezar player-search results page,
    strips it down to the interesting fragment, rewrites links/images to
    local equivalents and serves the sanitized HTML.

    :param req: mod_python request object
    :return: apache.OK in all handled cases
    """
    # we need to recover original request path, from before rewrite
    orig_req = req
    while True:
        if orig_req.prev:
            orig_req = orig_req.prev
        else:
            break
    # strip BASE_PATH prefix and a trailing "index.py", then keep non-empty
    # path segments (Python 2: filter() yields a plain list here)
    path = filter(None, re.sub('index\.py$', '', re.sub('^' + BASE_PATH, '', orig_req.uri)).split('/'))
    if path:
        # /[ANYTHING]/refresh forces cache refresh
        no_cache = len(path) > 1 and path[1] == 'refresh'
        # forward the client's UA so Cezar serves the same markup it would
        # serve to the browser
        user_agent = orig_req.headers_in['User-Agent']
        # compiling remote original URI
        # path[0] is the player id searched for; p=21 selects the search page
        search_url = urlparse.urljoin(CEZAR_URL,
            '?' + urllib.urlencode(
                {
                    'pid_search': path[0],
                    'p': 21
                }
            ))
        # fetching content
        remote_content = get_cache_content(cache_key=path[0],
            remote_url=search_url,
            force_refresh=no_cache,
            user_agent=user_agent).split('\n')
        # slicing only interesting part of remote content
        # the remote page marks the payload with "---- page content" comment
        # lines; we need at least an opening and a closing marker
        delimiter_regex = re.compile('---- page content ')
        delimiters = [i for i, line in enumerate(remote_content) if re.search(delimiter_regex, line)]
        if len(delimiters) < 2:
            req.write('Malformed (even more than usually) content :(')
            return apache.OK
        # we need to sanitize line breaks and double-closed anchors manually
        remote_content = bs4('<body>' + ' '.join(remote_content[delimiters[0]+1:delimiters[1]]).replace('<BR>', '<br />').replace('</a></a>', '</a>') + '</body>', 'html.parser')
        # stub template for output page
        page_content = bs4('''
        <html><head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <base href="''' + BASE_PATH + '''" />
        <style>
        body{width:580px;font-family:Tahoma,Geneva,Arial,Helvetica,"sans-serif";}
        a{text-decoration:none;color:black}
        </style>
        </head><body></body></html>
        ''', 'html.parser')
        page_content.html.body.append(remote_content.body)
        # internal links being rewritten to application links
        for link in page_content.select('a[href^="?p=21&pid="]'):
            # NOTE(review): parse_qs returns a dict of lists, so this assigns
            # a list (e.g. ['123']) to href; also the href still carries the
            # leading '?' so the first key is '?p' — confirm rendered links
            # are actually usable
            link['href'] = urlparse.parse_qs(link['href'])['pid']
        # garbage elements removed
        garbage_selectors = ['script', 'table.msc_noprint', 'center > p']
        for selector in garbage_selectors:
            for garbage in page_content.select(selector):
                garbage.extract()
        # unnecessary tables removed
        # NOTE(review): positional selects ([1], [2:4]) depend on the exact
        # layout of the remote page and will silently break if it changes
        page_content.select('table > tr > td')[1].extract()
        for table in page_content.select('table > tr > td > table')[2:4]:
            table.extract()
        # internal link targets (team, WK breakdown, etc.) removed
        for internal_link in page_content.select('table > tr > td > table a'):
            internal_link['href'] = None
        # internal link icons removed
        for link_icon in page_content.select('img[src*="ico_link_8.gif"]'):
            link_icon.extract()
        # fetching all external pictures (not pic/* images) to local cache
        for external_image in [image for image in page_content.select('img') if not image['src'].startswith('pic/')]:
            image_url = urlparse.urljoin(CEZAR_URL, external_image['src'])
            # cache filename is the md5 of the absolute URL plus the original
            # file extension, stored under foto/
            image_cache_path = 'foto/' + hashlib.md5(image_url).hexdigest() + '.' + image_url.split('.')[-1]
            # NOTE(review): existence is checked against the relative path but
            # the file is written under __dir__ — confirm the working directory
            # makes these equivalent, otherwise images are re-fetched every hit
            if not os.path.exists(image_cache_path) or no_cache:
                fetch_with_user_agent_spoof(os.path.join(__dir__, image_cache_path), image_url, user_agent)
            external_image['src'] = image_cache_path
        # linking to original page
        # player photo and the big header span become links back to Cezar
        original_link_selectors = ['img[src^="foto/"]', 'span[style*=":28px"]']
        for selector in original_link_selectors:
            for element in page_content.select(selector):
                link = element.wrap(page_content.new_tag('a'))
                link['href'] = search_url
        req.content_type = 'text/html'
        req.write(page_content.prettify('utf-8'))
    else:
        # no path segments: nothing was requested
        req.write('Nothing to see here, move along.')
    return apache.OK
|