author    emkael <emkael@tlen.pl>    2015-02-09 20:14:59 +0100
committer emkael <emkael@tlen.pl>    2015-02-09 20:14:59 +0100
commit    17167b1c0e088000164b87ef0a02237a3ffc107c (patch)
tree      79c61deeea11e57b3c8e99dc39ca76cee491a745
parent    dd3d2509f6048e11f9b2b127f6c7acb80a506d8d (diff)
* mod_python port
-rw-r--r--  http/.htaccess      |   6
-rw-r--r--  http/index.php      | 139
-rw-r--r--  http/index.py       | 124
-rw-r--r--  http/pic/.htaccess  |   7
-rw-r--r--  http/pic/fetch.php  |  13
-rw-r--r--  http/pic/fetch.py   |  27

6 files changed, 161 insertions, 155 deletions
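The commit swaps the PHP entry points for mod_python handlers: the AddHandler/PythonHandler directives added to the .htaccess files below make Apache route .py requests to a module-level handler(req) function. For orientation, a minimal handler of that kind — a sketch for illustration, not part of the commit — looks like:

    # minimal mod_python handler: Apache calls handler(req) for every
    # request mapped to this module via AddHandler/PythonHandler
    from mod_python import apache

    def handler(req):
        req.content_type = 'text/plain'
        req.write('hello from mod_python')
        return apache.OK  # report success back to Apache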
diff --git a/http/.htaccess b/http/.htaccess
index 92130fb..5ad514f 100644
--- a/http/.htaccess
+++ b/http/.htaccess
@@ -2,5 +2,9 @@
 RewriteEngine On
 RewriteCond %{SCRIPT_FILENAME} !-f
 RewriteCond %{SCRIPT_FILENAME} !-d
-RewriteRule ^[^\.]*$ index.php [QSA,L]
+RewriteRule ^[^\.]*$ index.py [QSA,L]
+AddHandler mod_python .py
+PythonHandler index
+PythonDebug On
+DirectoryIndex index.html index.htm index.py index.php
 
diff --git a/http/index.php b/http/index.php
deleted file mode 100644
index a7034b6..0000000
--- a/http/index.php
+++ /dev/null
@@ -1,139 +0,0 @@
-<?php
-
-define('BASE_PATH', '/');
-define('CACHE_PATH', '../cache');
-define('CEZAR_URL', 'http://msc.com.pl/cezar/');
-define('QUERYPATH_PATH', '/usr/share/php/QueryPath/QueryPath.php');
-
-// fetch item from content cache
-function get_cache_content($cacheKey, $url, $force = FALSE) {
-    $cacheFile = realpath(dirname(__FILE__) . '/' . CACHE_PATH) . '/' . $cacheKey;
-    if ($force || !(file_exists($cacheFile) && filemtime($cacheFile) > strtotime('-1 day'))) {
-        fetch_with_user_agent_spoof($cacheFile, $url);
-    }
-    return file_get_contents($cacheFile);
-}
-
-// save remote URL locally, forwarding browser's User Agent String
-function fetch_with_user_agent_spoof($targetFile, $sourceUrl) {
-    file_put_contents(
-        $targetFile,
-        file_get_contents($sourceUrl,
-                          FALSE,
-                          stream_context_create(['http' => [
-                              'user_agent' => $_SERVER['HTTP_USER_AGENT']
-                          ]
-                          ]))
-    );
-}
-
-// parse requested path (after rewrite)
-$url = parse_url(preg_replace('#^' . preg_quote(BASE_PATH) . '#', '', $_SERVER['REQUEST_URI']));
-$path = array_values(array_filter(explode('/', $url['path'])));
-
-if ($path) {
-    // /[ANYTHING]/refresh disables cache (forces cache refresh)
-    $cache = !(count($path) > 1 && $path[1] == 'refresh');
-
-    // build Cezar URL for requested path
-    $path[0] = urldecode($path[0]);
-    $searchUrl = new http\Url(CEZAR_URL,
-                              ['query' => http_build_query(
-                                  ['pid_search' => $path[0],
-                                   'p' => '21']
-                              )]);
-
-    $mscUrl = $searchUrl->toString();
-
-    $content = get_cache_content($path[0], $mscUrl, !$cache); // requested content
-    $contentLines = explode(PHP_EOL, $content);
-
-    // if the comment delimiters are present, we're possibly dealing with the content we want, slice it and wrap it
-    $delimiters = array_keys(preg_grep('/---- page content /', $contentLines));
-    if ($delimiters) {
-        $content = '<html><head>'
-                 . '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'
-                 . '<base href="' . BASE_PATH . '" />'
-                 . '<style>'
-                 . 'body{width:580px;font-family:Tahoma,Geneva,Arial,Helvetica,"sans-serif";}'
-                 . 'a{text-decoration:none;color:black;}'
-                 . '</style>'
-                 . '</head><body>'
-                 . implode(PHP_EOL, array_slice($contentLines, $delimiters[0]+1, $delimiters[1]-$delimiters[0]-1))
-                 . '</body></html>';
-    }
-    else {
-        die('Malformed (even more than usually) content :(');
-    }
-
-    require_once(QUERYPATH_PATH);
-
-    $html = htmlqp($content, NULL, ['convert_to_encoding' => 'utf-8']);
-
-    // for search list pages, replace links with internal links to player IDs
-    $links = $html->find('a[href^="?p=21&pid="]');
-    if ($links->size()) {
-        foreach ($links as $link) {
-            $href = [];
-            parse_str($link->attr('href'), $href);
-            $link->attr('href', $href['pid']);
-        }
-    }
-
-    $html->top();
-
-    // remove general crap
-    $html->find('script, table.msc_noprint, center>p')->remove();
-    $html->top();
-
-    // leave only first-ish table of the content
-    $html->find('table > tr > td')->eq(1)->remove();
-    $html->top();
-    $html->find('table > tr > td > table')->eq(2)->remove();
-    $html->top();
-    $html->find('table > tr > td > table')->eq(2)->remove();
-    $html->top();
-
-    // remove internal Cezar links
-    $innerLinks = $html->find('table > tr > td > table a');
-    foreach ($innerLinks as $innerLink) {
-        $innerLink->removeAttr('href');
-    }
-    // get rid of Cezar link icons (right green arrows)
-    $html->find('img[src*="ico_link_8.gif"]')->remove();
-    $html->top();
-
-    // proxy all external images, by resolving them relatively to the original server
-    // and cache them locally
-    // internal images are left untouched in the markup and are proxied through pic/fetch.php handler
-    // (if they're not present/overwritten locally)
-    $images = $html->find('img')->not('[src^="pic/"]');
-    foreach ($images as $image) {
-        $src = $image->attr('src');
-        $url = new http\Url(CEZAR_URL, $src, http\Url::FROM_ENV | http\Url::SANITIZE_PATH | http\Url::JOIN_PATH | http\Url::REPLACE);
-        $imageUrl = $url->toString();
-        $cachedImageUrl = 'foto/' . md5($imageUrl) . '.' . array_pop(explode('.', $imageUrl));
-        if (!file_exists($cachedImageUrl) || !$cache) {
-            fetch_with_user_agent_spoof($cachedImageUrl, $imageUrl);
-        }
-        $image->attr('src', $cachedImageUrl);
-    }
-    $html->top();
-
-    // link to the original URL on the image from foto/ directory and on the name+surname
-    // (actually, on every text with font-size:28px set)
-    $linking = $html->find('img[src^="foto/"],span[style*=":28px"]');
-    foreach ($linking as $link) {
-        $link->wrap('<a></a>');
-        $link->parent()->attr('href', $mscUrl);
-    }
-    $html->top();
-
-    // all done
-    print $html->html();
-}
-else {
-    die('Nothing to see here, move along.');
-}
-
-?>
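The Python port below keeps the URL scheme the deleted PHP script handled: the first path segment is the Cezar search term (a player id), and a second segment of refresh bypasses the one-day cache. A hypothetical usage sketch (host and player id are made up, not from the commit):

    # exercising the handler's path scheme (Python 2, matching the port)
    import urllib2

    base = 'http://example.org/'  # assumed deployment root (BASE_PATH = '/')
    print urllib2.urlopen(base + '12345').read()          # served from cache if fresh
    print urllib2.urlopen(base + '12345/refresh').read()  # forces a cache re-fetch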
'</body></html>'; - } - else { - die('Malformed (even more than usually) content :('); - } - - require_once(QUERYPATH_PATH); - - $html = htmlqp($content, NULL, ['convert_to_encoding' => 'utf-8']); - - // for search list pages, replace links with internal links to player IDs - $links = $html->find('a[href^="?p=21&pid="]'); - if ($links->size()) { - foreach ($links as $link) { - $href = []; - parse_str($link->attr('href'), $href); - $link->attr('href', $href['pid']); - } - } - - $html->top(); - - // remove general crap - $html->find('script, table.msc_noprint, center>p')->remove(); - $html->top(); - - // leave only first-ish table of the content - $html->find('table > tr > td')->eq(1)->remove(); - $html->top(); - $html->find('table > tr > td > table')->eq(2)->remove(); - $html->top(); - $html->find('table > tr > td > table')->eq(2)->remove(); - $html->top(); - - // remove internal Cezar links - $innerLinks = $html->find('table > tr > td > table a'); - foreach ($innerLinks as $innerLink) { - $innerLink->removeAttr('href'); - } - // get rid of Cezar link icons (right green arrows) - $html->find('img[src*="ico_link_8.gif"]')->remove(); - $html->top(); - - // proxy all external images, by resolving them relatively to the original server - // and cache them locally - // internal images are left untouched in the markup and are proxied through pic/fetch.php handler - // (if they're not present/overwritten locally) - $images = $html->find('img')->not('[src^="pic/"]'); - foreach ($images as $image) { - $src = $image->attr('src'); - $url = new http\Url(CEZAR_URL, $src, http\Url::FROM_ENV | http\Url::SANITIZE_PATH | http\Url::JOIN_PATH | http\Url::REPLACE); - $imageUrl = $url->toString(); - $cachedImageUrl = 'foto/' . md5($imageUrl) . '.' . array_pop(explode('.', $imageUrl)); - if (!file_exists($cachedImageUrl) || !$cache) { - fetch_with_user_agent_spoof($cachedImageUrl, $imageUrl); - } - $image->attr('src', $cachedImageUrl); - } - $html->top(); - - // link to the original URL on the image from foto/ directory and on the name+surname - // (actually, on every text with font-size:28px set) - $linking = $html->find('img[src^="foto/"],span[style*=":28px"]'); - foreach ($linking as $link) { - $link->wrap('<a></a>'); - $link->parent()->attr('href', $mscUrl); - } - $html->top(); - - // all done - print $html->html(); -} -else { - die('Nothing to see here, move along.'); -} - -?> diff --git a/http/index.py b/http/index.py new file mode 100644 index 0000000..2e46c8f --- /dev/null +++ b/http/index.py @@ -0,0 +1,124 @@ +# coding=utf-8 + +from mod_python import apache +import hashlib, json, os, re, time, urllib, urllib2, urlparse +from bs4 import BeautifulSoup as bs4 + +# value for <base> element, path of application relative to server root +BASE_PATH = '/' +# relative path to cache folder (relative to this file) +CACHE_PATH = '../cache' +# Cezar base URL +CEZAR_URL = 'http://msc.com.pl/cezar/' +# cache expiry (oldest valid cache timestamp) +CACHE_EXPIRY_LIMIT = int(time.time()) - 24*60*60 + +__dir__ = os.path.dirname(__file__) + +# retrieves remote URL content, forwarding browser's UAS +def fetch_with_user_agent_spoof(cache_path, remote_url, user_agent): + opener = urllib2.build_opener() + opener.addheaders = [('User-Agent', user_agent)] + open(cache_path, 'w+').write(opener.open(remote_url).read()) + +# returns content of cached file, refreshing cache if necessary +def get_cache_content(cache_key, remote_url, force_refresh=False, user_agent=''): + cache_path = os.path.join(__dir__, CACHE_PATH, cache_key) + if 
diff --git a/http/pic/.htaccess b/http/pic/.htaccess
index 615bc19..65ceb47 100644
--- a/http/pic/.htaccess
+++ b/http/pic/.htaccess
@@ -1,6 +1,9 @@
+AddHandler mod_python .py
+PythonHandler fetch
+PythonDebug On
+
 RewriteEngine On
 RewriteCond %{SCRIPT_FILENAME} !-f
 RewriteCond %{SCRIPT_FILENAME} !-d
-RewriteRule .* fetch.php [QSA,L]
-
+RewriteRule .* fetch.py [QSA,L]
 
diff --git a/http/pic/fetch.php b/http/pic/fetch.php
deleted file mode 100644
index 1fb94cd..0000000
--- a/http/pic/fetch.php
+++ /dev/null
@@ -1,13 +0,0 @@
-<?php
-
-$resource = @file_get_contents('http://msc.com.pl/cezar' . $_SERVER['REQUEST_URI']);
-if ($resource) {
-    $filename = array_pop(explode('/', $_SERVER['REQUEST_URI']));
-    file_put_contents($filename, $resource);
-    foreach ($http_response_header as $header) {
-        header($header);
-    }
-    readfile($filename);
-}
-
-?>
diff --git a/http/pic/fetch.py b/http/pic/fetch.py
new file mode 100644
index 0000000..0cc3556
--- /dev/null
+++ b/http/pic/fetch.py
@@ -0,0 +1,27 @@
+# coding=utf-8
+
+from mod_python import apache
+import os, urllib2
+
+CEZAR_URL = 'http://msc.com.pl/cezar'
+
+def handler(req):
+    orig_req = req
+    while True:
+        if orig_req.prev:
+            orig_req = orig_req.prev
+        else:
+            break
+
+    remote_resource = CEZAR_URL + orig_req.uri
+    request = urllib2.Request(remote_resource)
+    request.add_header('User-Agent', orig_req.headers_in['User-Agent'])
+    file_name = os.path.join(os.path.dirname(__file__), remote_resource.split('/')[-1])
+    try:
+        response = urllib2.urlopen(request)
+        open(file_name, 'w+').write(response.read())
+        req.content_type = response.headers['Content-Type']
+        req.write(open(file_name, 'r').read())
+        return apache.OK
+    except urllib2.URLError:
+        return apache.HTTP_NOT_FOUND
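Both new handlers open with the same loop walking req.prev, because the mod_rewrite rules trigger an internal redirect and req.uri would otherwise point at index.py or fetch.py rather than the path the browser requested. An equivalent, more compact form of that idiom (a sketch, not the committed code):

    def original_request(req):
        # follow the internal-redirect chain back to the browser's request
        while req.prev:
            req = req.prev
        return req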