author    emkael <emkael@tlen.pl>    2015-02-09 20:14:59 +0100
committer emkael <emkael@tlen.pl>    2015-02-09 20:14:59 +0100
commit    17167b1c0e088000164b87ef0a02237a3ffc107c (patch)
tree      79c61deeea11e57b3c8e99dc39ca76cee491a745
parent    dd3d2509f6048e11f9b2b127f6c7acb80a506d8d (diff)
* mod_python port
-rw-r--r--  http/.htaccess      |   6
-rw-r--r--  http/index.php      | 139
-rw-r--r--  http/index.py       | 124
-rw-r--r--  http/pic/.htaccess  |   7
-rw-r--r--  http/pic/fetch.php  |  13
-rw-r--r--  http/pic/fetch.py   |  27

6 files changed, 161 insertions, 155 deletions
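The commit swaps the PHP entry points for mod_python handlers: the AddHandler/PythonHandler directives added to the .htaccess files below make Apache route .py requests to a module-level handler(req) function. For orientation, a minimal handler of that kind — a sketch for illustration, not part of the commit — looks like:

    # minimal mod_python handler: Apache calls handler(req) for every
    # request mapped to this module via AddHandler/PythonHandler
    from mod_python import apache

    def handler(req):
        req.content_type = 'text/plain'
        req.write('hello from mod_python')
        return apache.OK  # report success back to Apache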
diff --git a/http/.htaccess b/http/.htaccess
index 92130fb..5ad514f 100644
--- a/http/.htaccess
+++ b/http/.htaccess
@@ -2,5 +2,9 @@
 RewriteEngine On
 RewriteCond %{SCRIPT_FILENAME} !-f
 RewriteCond %{SCRIPT_FILENAME} !-d
-RewriteRule ^[^\.]*$ index.php [QSA,L]
+RewriteRule ^[^\.]*$ index.py [QSA,L]
+AddHandler mod_python .py
+PythonHandler index
+PythonDebug On
+DirectoryIndex index.html index.htm index.py index.php
 
diff --git a/http/index.php b/http/index.php
deleted file mode 100644
index a7034b6..0000000
--- a/http/index.php
+++ /dev/null
@@ -1,139 +0,0 @@
-<?php
-
-define('BASE_PATH', '/');
-define('CACHE_PATH', '../cache');
-define('CEZAR_URL', 'http://msc.com.pl/cezar/');
-define('QUERYPATH_PATH', '/usr/share/php/QueryPath/QueryPath.php');
-
-// fetch item from content cache
-function get_cache_content($cacheKey, $url, $force = FALSE) {
-    $cacheFile = realpath(dirname(__FILE__) . '/' . CACHE_PATH) . '/' . $cacheKey;
-    if ($force || !(file_exists($cacheFile) && filemtime($cacheFile) > strtotime('-1 day'))) {
-        fetch_with_user_agent_spoof($cacheFile, $url);
-    }
-    return file_get_contents($cacheFile);
-}
-
-// save remote URL locally, forwarding browser's User Agent String
-function fetch_with_user_agent_spoof($targetFile, $sourceUrl) {
-    file_put_contents(
-        $targetFile,
-        file_get_contents($sourceUrl,
-                          FALSE,
-                          stream_context_create(['http' => [
-                              'user_agent' => $_SERVER['HTTP_USER_AGENT']
-                          ]
-                          ]))
-    );
-}
-
-// parse requested path (after rewrite)
-$url = parse_url(preg_replace('#^' . preg_quote(BASE_PATH) . '#', '', $_SERVER['REQUEST_URI']));
-$path = array_values(array_filter(explode('/', $url['path'])));
-
-if ($path) {
-    // /[ANYTHING]/refresh disables cache (forces cache refresh)
-    $cache = !(count($path) > 1 && $path[1] == 'refresh');
-
-    // build Cezar URL for requested path
-    $path[0] = urldecode($path[0]);
-    $searchUrl = new http\Url(CEZAR_URL,
-                              ['query' => http_build_query(
-                                  ['pid_search' => $path[0],
-                                   'p' => '21']
-                              )]);
-
-    $mscUrl = $searchUrl->toString();
-
-    $content = get_cache_content($path[0], $mscUrl, !$cache); // requested content
-    $contentLines = explode(PHP_EOL, $content);
-
-    // if the comment delimiters are present, we're possibly dealing with the content we want, slice it and wrap it
-    $delimiters = array_keys(preg_grep('/---- page content /', $contentLines));
-    if ($delimiters) {
-        $content = '<html><head>'
-                 . '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'
-                 . '<base href="' . BASE_PATH . '" />'
-                 . '<style>'
-                 . 'body{width:580px;font-family:Tahoma,Geneva,Arial,Helvetica,"sans-serif";}'
-                 . 'a{text-decoration:none;color:black;}'
-                 . '</style>'
-                 . '</head><body>'
-                 . implode(PHP_EOL, array_slice($contentLines, $delimiters[0]+1, $delimiters[1]-$delimiters[0]-1))
-                 . '</body></html>';
-    }
-    else {
-        die('Malformed (even more than usually) content :(');
-    }
-
-    require_once(QUERYPATH_PATH);
-
-    $html = htmlqp($content, NULL, ['convert_to_encoding' => 'utf-8']);
-
-    // for search list pages, replace links with internal links to player IDs
-    $links = $html->find('a[href^="?p=21&pid="]');
-    if ($links->size()) {
-        foreach ($links as $link) {
-            $href = [];
-            parse_str($link->attr('href'), $href);
-            $link->attr('href', $href['pid']);
-        }
-    }
-
-    $html->top();
-
-    // remove general crap
-    $html->find('script, table.msc_noprint, center>p')->remove();
-    $html->top();
-
-    // leave only first-ish table of the content
-    $html->find('table > tr > td')->eq(1)->remove();
-    $html->top();
-    $html->find('table > tr > td > table')->eq(2)->remove();
-    $html->top();
-    $html->find('table > tr > td > table')->eq(2)->remove();
-    $html->top();
-
-    // remove internal Cezar links
-    $innerLinks = $html->find('table > tr > td > table a');
-    foreach ($innerLinks as $innerLink) {
-        $innerLink->removeAttr('href');
-    }
-    // get rid of Cezar link icons (right green arrows)
-    $html->find('img[src*="ico_link_8.gif"]')->remove();
-    $html->top();
-
-    // proxy all external images, by resolving them relatively to the original server
-    // and cache them locally
-    // internal images are left untouched in the markup and are proxied through pic/fetch.php handler
-    // (if they're not present/overwritten locally)
-    $images = $html->find('img')->not('[src^="pic/"]');
-    foreach ($images as $image) {
-        $src = $image->attr('src');
-        $url = new http\Url(CEZAR_URL, $src, http\Url::FROM_ENV | http\Url::SANITIZE_PATH | http\Url::JOIN_PATH | http\Url::REPLACE);
-        $imageUrl = $url->toString();
-        $cachedImageUrl = 'foto/' . md5($imageUrl) . '.' . array_pop(explode('.', $imageUrl));
-        if (!file_exists($cachedImageUrl) || !$cache) {
-            fetch_with_user_agent_spoof($cachedImageUrl, $imageUrl);
-        }
-        $image->attr('src', $cachedImageUrl);
-    }
-    $html->top();
-
-    // link to the original URL on the image from foto/ directory and on the name+surname
-    // (actually, on every text with font-size:28px set)
-    $linking = $html->find('img[src^="foto/"],span[style*=":28px"]');
-    foreach ($linking as $link) {
-        $link->wrap('<a></a>');
-        $link->parent()->attr('href', $mscUrl);
-    }
-    $html->top();
-
-    // all done
-    print $html->html();
-}
-else {
-    die('Nothing to see here, move along.');
-}
-
-?>
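The Python port below keeps the URL scheme the deleted PHP script handled: the first path segment is the Cezar search term (a player id), and a second segment of refresh bypasses the one-day cache. A hypothetical usage sketch (host and player id are made up, not from the commit):

    # exercising the handler's path scheme (Python 2, matching the port)
    import urllib2

    base = 'http://example.org/'  # assumed deployment root (BASE_PATH = '/')
    print urllib2.urlopen(base + '12345').read()          # served from cache if fresh
    print urllib2.urlopen(base + '12345/refresh').read()  # forces a cache re-fetch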
'</body></html>'; - } - else { - die('Malformed (even more than usually) content :('); - } - - require_once(QUERYPATH_PATH); - - $html = htmlqp($content, NULL, ['convert_to_encoding' => 'utf-8']); - - // for search list pages, replace links with internal links to player IDs - $links = $html->find('a[href^="?p=21&pid="]'); - if ($links->size()) { - foreach ($links as $link) { - $href = []; - parse_str($link->attr('href'), $href); - $link->attr('href', $href['pid']); - } - } - - $html->top(); - - // remove general crap - $html->find('script, table.msc_noprint, center>p')->remove(); - $html->top(); - - // leave only first-ish table of the content - $html->find('table > tr > td')->eq(1)->remove(); - $html->top(); - $html->find('table > tr > td > table')->eq(2)->remove(); - $html->top(); - $html->find('table > tr > td > table')->eq(2)->remove(); - $html->top(); - - // remove internal Cezar links - $innerLinks = $html->find('table > tr > td > table a'); - foreach ($innerLinks as $innerLink) { - $innerLink->removeAttr('href'); - } - // get rid of Cezar link icons (right green arrows) - $html->find('img[src*="ico_link_8.gif"]')->remove(); - $html->top(); - - // proxy all external images, by resolving them relatively to the original server - // and cache them locally - // internal images are left untouched in the markup and are proxied through pic/fetch.php handler - // (if they're not present/overwritten locally) - $images = $html->find('img')->not('[src^="pic/"]'); - foreach ($images as $image) { - $src = $image->attr('src'); - $url = new http\Url(CEZAR_URL, $src, http\Url::FROM_ENV | http\Url::SANITIZE_PATH | http\Url::JOIN_PATH | http\Url::REPLACE); - $imageUrl = $url->toString(); - $cachedImageUrl = 'foto/' . md5($imageUrl) . '.' . array_pop(explode('.', $imageUrl)); - if (!file_exists($cachedImageUrl) || !$cache) { - fetch_with_user_agent_spoof($cachedImageUrl, $imageUrl); - } - $image->attr('src', $cachedImageUrl); - } - $html->top(); - - // link to the original URL on the image from foto/ directory and on the name+surname - // (actually, on every text with font-size:28px set) - $linking = $html->find('img[src^="foto/"],span[style*=":28px"]'); - foreach ($linking as $link) { - $link->wrap('<a></a>'); - $link->parent()->attr('href', $mscUrl); - } - $html->top(); - - // all done - print $html->html(); -} -else { - die('Nothing to see here, move along.'); -} - -?> diff --git a/http/index.py b/http/index.py new file mode 100644 index 0000000..2e46c8f --- /dev/null +++ b/http/index.py @@ -0,0 +1,124 @@ +# coding=utf-8 + +from mod_python import apache +import hashlib, json, os, re, time, urllib, urllib2, urlparse +from bs4 import BeautifulSoup as bs4 + +# value for <base> element, path of application relative to server root +BASE_PATH = '/' +# relative path to cache folder (relative to this file) +CACHE_PATH = '../cache' +# Cezar base URL +CEZAR_URL = 'http://msc.com.pl/cezar/' +# cache expiry (oldest valid cache timestamp) +CACHE_EXPIRY_LIMIT = int(time.time()) - 24*60*60 + +__dir__ = os.path.dirname(__file__) + +# retrieves remote URL content, forwarding browser's UAS +def fetch_with_user_agent_spoof(cache_path, remote_url, user_agent): + opener = urllib2.build_opener() + opener.addheaders = [('User-Agent', user_agent)] + open(cache_path, 'w+').write(opener.open(remote_url).read()) + +# returns content of cached file, refreshing cache if necessary +def get_cache_content(cache_key, remote_url, force_refresh=False, user_agent=''): + cache_path = os.path.join(__dir__, CACHE_PATH, cache_key) + if 
diff --git a/http/pic/.htaccess b/http/pic/.htaccess
index 615bc19..65ceb47 100644
--- a/http/pic/.htaccess
+++ b/http/pic/.htaccess
@@ -1,6 +1,9 @@
+AddHandler mod_python .py
+PythonHandler fetch
+PythonDebug On
+
 RewriteEngine On
 RewriteCond %{SCRIPT_FILENAME} !-f
 RewriteCond %{SCRIPT_FILENAME} !-d
-RewriteRule .* fetch.php [QSA,L]
-
+RewriteRule .* fetch.py [QSA,L]
 
diff --git a/http/pic/fetch.php b/http/pic/fetch.php
deleted file mode 100644
index 1fb94cd..0000000
--- a/http/pic/fetch.php
+++ /dev/null
@@ -1,13 +0,0 @@
-<?php
-
-$resource = @file_get_contents('http://msc.com.pl/cezar' . $_SERVER['REQUEST_URI']);
-if ($resource) {
-    $filename = array_pop(explode('/', $_SERVER['REQUEST_URI']));
-    file_put_contents($filename, $resource);
-    foreach ($http_response_header as $header) {
-        header($header);
-    }
-    readfile($filename);
-}
-
-?>
diff --git a/http/pic/fetch.py b/http/pic/fetch.py
new file mode 100644
index 0000000..0cc3556
--- /dev/null
+++ b/http/pic/fetch.py
@@ -0,0 +1,27 @@
+# coding=utf-8
+
+from mod_python import apache
+import os, urllib2
+
+CEZAR_URL = 'http://msc.com.pl/cezar'
+
+def handler(req):
+    orig_req = req
+    while True:
+        if orig_req.prev:
+            orig_req = orig_req.prev
+        else:
+            break
+
+    remote_resource = CEZAR_URL + orig_req.uri
+    request = urllib2.Request(remote_resource)
+    request.add_header('User-Agent', orig_req.headers_in['User-Agent'])
+    file_name = os.path.join(os.path.dirname(__file__), remote_resource.split('/')[-1])
+    try:
+        response = urllib2.urlopen(request)
+        open(file_name, 'w+').write(response.read())
+        req.content_type = response.headers['Content-Type']
+        req.write(open(file_name, 'r').read())
+        return apache.OK
+    except urllib2.URLError:
+        return apache.HTTP_NOT_FOUND
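Both new handlers open with the same loop walking req.prev, because the mod_rewrite rules trigger an internal redirect and req.uri would otherwise point at index.py or fetch.py rather than the path the browser requested. An equivalent, more compact form of that idiom (a sketch, not the committed code):

    def original_request(req):
        # follow the internal-redirect chain back to the browser's request
        while req.prev:
            req = req.prev
        return req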