# coding=utf-8

from mod_python import apache

import hashlib, json, os, re, time, urllib, urllib2, urlparse

from bs4 import BeautifulSoup as bs4

# value for <base> element, path of application relative to server root
BASE_PATH = '/'
# relative path to cache folder (relative to this file)
CACHE_PATH = '../cache'
# Cezar base URL
CEZAR_URL = 'http://msc.com.pl/cezar/'
# cache expiry (oldest valid cache timestamp)
CACHE_EXPIRY_LIMIT = int(time.time()) - 24*60*60

__dir__ = os.path.dirname(__file__)


# retrieves remote URL content, forwarding browser's UAS
def fetch_with_user_agent_spoof(cache_path, remote_url, user_agent):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-Agent', user_agent)]
    egg = None
    for trigger, egg_actions in eggs.iteritems():
        if cache_path.split('/')[-1].decode('utf-8').lower() == trigger.decode('utf-8'):
            egg = egg_actions
            break
    if egg is not None:
        remote_url = urlparse.urljoin(
            CEZAR_URL,
            '?' + urllib.urlencode({'pid_search': egg['pid'], 'p': 21}))
    content = opener.open(remote_url).read()
    if egg is not None:
        if 'replacements' in egg:
            for replacement in egg['replacements']:
                content = re.sub(replacement['from'], replacement['to'], content)
    open(cache_path, 'w+').write(content)


# returns content of cached file, refreshing cache if necessary
def get_cache_content(cache_key, remote_url, force_refresh=False, user_agent=''):
    cache_path = os.path.join(__dir__, CACHE_PATH, cache_key)
    if force_refresh or not (os.path.exists(cache_path)
                             and (int(os.path.getmtime(cache_path)) > CACHE_EXPIRY_LIMIT)):
        fetch_with_user_agent_spoof(cache_path, remote_url, user_agent)
    return open(cache_path, 'r').read()


def handler(req):
    # MIME type fix for error messages
    req.content_type = 'text/plain'
    # we need to recover original request path, from before rewrite
    orig_req = req
    while True:
        if orig_req.prev:
            orig_req = orig_req.prev
        else:
            break
    path = filter(None, re.sub(r'index\.py$', '',
                               re.sub('^' + BASE_PATH, '', orig_req.uri)).split('/'))
    if path:
        # /[ANYTHING]/refresh forces cache refresh
        no_cache = len(path) > 1 and path[1] == 'refresh'
        user_agent = orig_req.headers_in['User-Agent']
        # compiling remote original URI
        search_url = urlparse.urljoin(CEZAR_URL, '?' + urllib.urlencode(
            {
                'pid_search': path[0],
                'p': 21
            }
        ))
        # fetching content
        remote_content = get_cache_content(cache_key=path[0],
                                           remote_url=search_url,
                                           force_refresh=no_cache,
                                           user_agent=user_agent).split('\n')
        # slicing only interesting part of remote content
        delimiter_regex = re.compile('---- page content ')
        delimiters = [i for i, line in enumerate(remote_content)
                      if re.search(delimiter_regex, line)]
        if len(delimiters) < 2:
            req.write('Malformed (even more than usually) content :(')
            return apache.OK
        # we need to sanitize line breaks and double-closed anchors manually
        remote_content = bs4(
            '<div>' +
            ' '.join(remote_content[delimiters[0]+1:delimiters[1]])
                .replace('<br>', '<br/>')
                .replace('</br>', '<br/>')
                .replace('</a></a>', '</a>') +
            '</div>',
            'html.parser')
        # stub template for output page: a bare skeleton carrying the charset
        # declaration and the <base> element that BASE_PATH is meant for
        page_content = bs4('''<html><head><meta charset="utf-8" /><base href="'''
                           + BASE_PATH +
                           '''" /></head><body></body></html>''', 'html.parser')
        page_content.html.body.append(remote_content.div)
        # internal links being rewritten to application links
        for link in page_content.select('a[href^="?p=21&pid="]'):
            link['href'] = urlparse.parse_qs(link['href'])['pid'][0]
        # garbage elements removed
        garbage_selectors = ['script', 'table.msc_noprint', 'center > p']
        for selector in garbage_selectors:
            for garbage in page_content.select(selector):
                garbage.extract()
        # unnecessary tables removed
        for table in page_content.select('table > tr > td > table')[4:]:
            table.extract()
        if u'Lista' not in [b.text for b in page_content.select('b')] and \
           u'Błąd' not in [b.text for b in page_content.select('b')]:
            page_content.select('table > tr > td')[-1].extract()
        # internal link targets (team, WK breakdown, etc.) removed
        for internal_link in page_content.select('table > tr > td > table a'):
            internal_link.attrs = {}
        # internal link icons removed
        for link_icon in page_content.select('img[src*="ico_link_8.gif"]'):
            link_icon.extract()
        # fetching all external pictures (not pic/* images) to local cache
        for external_image in [image for image in page_content.select('img')
                               if not image['src'].startswith('pic/')]:
            image_url = urlparse.urljoin(CEZAR_URL, external_image['src'])
            image_cache_path = 'foto/' + hashlib.md5(image_url).hexdigest() \
                + '.' + image_url.split('.')[-1]
            if not os.path.exists(os.path.join(__dir__, image_cache_path)) or no_cache:
                fetch_with_user_agent_spoof(os.path.join(__dir__, image_cache_path),
                                            image_url, user_agent)
            external_image['src'] = image_cache_path
        # linking to original page
        original_link_selectors = ['img[src^="foto/"]', 'span[style*=":28px"]']
        for selector in original_link_selectors:
            for element in page_content.select(selector):
                link = element.wrap(page_content.new_tag('a'))
                link['href'] = search_url
        # credits info
        credits_div = bs4('<div>Pomysł Ivana,<br/>Ivan jest zajebisty.</div>',
                          'html.parser', from_encoding='utf-8')
        page_content.html.body.div.append(credits_div.div)
        req.content_type = 'text/html'
        req.write(page_content.prettify('utf-8'))
    else:
        req.write('Nothing to see here, move along.')
    return apache.OK


eggs = {
    'wąsłowicz': {
        'pid': 13650,
        'replacements': [
            {
                'from': re.compile(r'src="\.\./cezar1/fots.*?"'),
                'to': 'src="pic/egg.jpg"'
            },
            {
                'from': 'asłowicz',
                'to': 'ąsłowicz'
            },
            {
                'from': 'ważną licencją',
                'to': 'gwiazdką'
            }
        ]
    },
    'bubu': {
        'pid': 1318
    }
}
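
# The handler relies on Apache rewriting every request to this file (hence the
# orig_req.prev walk in handler()). A minimal sketch of such a setup, assuming
# mod_python and mod_rewrite are loaded and this file sits at the web root;
# the directives are standard, but the exact paths/flags here are illustrative:
#
#   RewriteEngine On
#   RewriteCond %{REQUEST_FILENAME} !-f
#   RewriteRule ^(.*)$ index.py [PT,L]
#
#   <Files index.py>
#       SetHandler mod_python
#       PythonHandler index
#   </Files>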