summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoremkael <emkael@tlen.pl>2015-01-27 16:31:12 +0100
committeremkael <emkael@tlen.pl>2015-01-27 16:31:12 +0100
commitcab5dcbafb0f82ba742ac014741dfc4208be9fbd (patch)
tree3081fc9150ac7d15280c099a4699085a2a27087f
* initial commit
-rw-r--r--.gitignore2
-rw-r--r--cache/.gitignore1
-rw-r--r--http/.htaccess8
-rw-r--r--http/foto/.gitignore1
-rw-r--r--http/index.php121
-rw-r--r--http/pic/.gitignore3
-rw-r--r--http/pic/.htaccess6
-rw-r--r--http/pic/fetch.php13
8 files changed, 155 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bb0d1d4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+_log
+_stats
diff --git a/cache/.gitignore b/cache/.gitignore
new file mode 100644
index 0000000..72e8ffc
--- /dev/null
+++ b/cache/.gitignore
@@ -0,0 +1 @@
+*
diff --git a/http/.htaccess b/http/.htaccess
new file mode 100644
index 0000000..28a620e
--- /dev/null
+++ b/http/.htaccess
@@ -0,0 +1,8 @@
+RewriteEngine On
+
+RewriteBase /
+
+RewriteCond %{SCRIPT_FILENAME} !-f
+RewriteCond %{SCRIPT_FILENAME} !-d
+RewriteRule ^[^\.]*$ index.php [QSA,L]
+
diff --git a/http/foto/.gitignore b/http/foto/.gitignore
new file mode 100644
index 0000000..72e8ffc
--- /dev/null
+++ b/http/foto/.gitignore
@@ -0,0 +1 @@
+*
diff --git a/http/index.php b/http/index.php
new file mode 100644
index 0000000..42a2731
--- /dev/null
+++ b/http/index.php
@@ -0,0 +1,150 @@
+<?php
+
+define('CACHE_PATH', '../cache');
+define('CEZAR_URL', 'http://msc.com.pl/cezar/');
+
+// Return the content cached under $cacheKey (a file name inside CACHE_PATH),
+// downloading it from $url first when the cached copy is missing, older than
+// one day, or $force is TRUE. Returns FALSE when the key is not a plain file
+// name, the cache directory is missing, or no copy could be obtained.
+function get_cache_content($cacheKey, $url, $force = FALSE) {
+    // the key is taken from the request path, so it must never address anything
+    // but a plain file placed directly in the cache directory
+    if ($cacheKey === '' || $cacheKey === '.' || $cacheKey === '..'
+        || strpbrk($cacheKey, "/\\\0") !== FALSE) {
+        return FALSE;
+    }
+    $cacheDir = realpath(dirname(__FILE__) . '/' . CACHE_PATH);
+    if ($cacheDir === FALSE) {
+        return FALSE;
+    }
+    $cacheFile = $cacheDir . '/' . $cacheKey;
+    $isFresh = is_file($cacheFile) && filemtime($cacheFile) > strtotime('-1 day');
+    if ($force || !$isFresh) {
+        // a failed download keeps the previous copy, which is then served below
+        fetch_with_user_agent_spoof($cacheFile, $url);
+    }
+    return is_file($cacheFile) ? file_get_contents($cacheFile) : FALSE;
+}
+
+// Download $sourceUrl and store it in $targetFile, forwarding the browser's
+// User-Agent header to the remote server when the current request has one.
+// Returns TRUE when the file was written, FALSE when the download or the
+// write failed; on a failed download an existing $targetFile is left untouched.
+function fetch_with_user_agent_spoof($targetFile, $sourceUrl) {
+    $httpOptions = [];
+    if (isset($_SERVER['HTTP_USER_AGENT'])) {
+        $httpOptions['user_agent'] = $_SERVER['HTTP_USER_AGENT'];
+    }
+    $body = file_get_contents($sourceUrl, FALSE, stream_context_create(['http' => $httpOptions]));
+    if ($body === FALSE) {
+        // writing FALSE would truncate the target and leave an empty file in the cache
+        return FALSE;
+    }
+    return file_put_contents($targetFile, $body) !== FALSE;
+}
+
+// parse requested path (after rewrite); the 'strlen' callback drops only empty
+// segments, so a literal "0" segment is kept
+$requestPath = (string) parse_url($_SERVER['REQUEST_URI'], PHP_URL_PATH);
+$path = array_values(array_filter(explode('/', $requestPath), 'strlen'));
+
+if (!$path) {
+    die('Nothing to see here, move along.');
+}
+
+// /[ANYTHING]/refresh disables cache (forces cache refresh)
+$cache = !(count($path) > 1 && $path[1] === 'refresh');
+
+// build Cezar URL for requested path
+$path[0] = urldecode($path[0]);
+$searchUrl = new http\Url(CEZAR_URL, [
+    'query' => http_build_query(['pid_search' => $path[0], 'p' => '21'])
+]);
+$mscUrl = $searchUrl->toString();
+
+// requested content; a failed lookup yields FALSE, which becomes '' here and is
+// rejected by the delimiter check below
+$content = get_cache_content($path[0], $mscUrl, !$cache);
+$contentLines = explode(PHP_EOL, (string) $content);
+
+// the wanted markup sits between the first two "page content" delimiter lines;
+// with fewer than two of them there is nothing that can be sliced out
+$delimiters = array_keys(preg_grep('/---- page content /', $contentLines));
+if (count($delimiters) < 2) {
+    die('Malformed (even more than usually) content :(');
+}
+$content = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><base href="/" /><style>body {width: 580px; font-family: Tahoma, Geneva, Arial, Helvetica, "sans-serif";}</style></head><body>'
+    . implode(PHP_EOL, array_slice($contentLines, $delimiters[0] + 1, $delimiters[1] - $delimiters[0] - 1))
+    . '</body></html>';
+
+require_once('/usr/share/php/QueryPath/QueryPath.php');
+
+$html = htmlqp($content, NULL, ['convert_to_encoding' => 'utf-8']);
+
+// for search list pages, replace links with internal links to player IDs
+$links = $html->find('a[href^="?p=21&pid="]');
+if ($links->size()) {
+    foreach ($links as $link) {
+        $href = [];
+        parse_str($link->attr('href'), $href);
+        $link->attr('href', $href['pid']);
+    }
+}
+
+$html->top();
+
+// remove general crap
+$html->find('script, table.msc_noprint, center>p')->remove();
+$html->top();
+
+// leave only first-ish table of the content
+$html->find('table > tr > td')->eq(1)->remove();
+$html->top();
+$html->find('table > tr > td > table')->eq(2)->remove();
+$html->top();
+$html->find('table > tr > td > table')->eq(2)->remove();
+$html->top();
+
+// remove internal Cezar links
+$innerLinks = $html->find('table > tr > td > table a');
+foreach ($innerLinks as $innerLink) {
+    $innerLink->removeAttr('href');
+}
+// get rid of Cezar link icons (right green arrows)
+$html->find('img[src*="ico_link_8.gif"]')->remove();
+$html->top();
+
+// proxy all external images, by resolving them relatively to the original server
+// and cache them locally
+// internal images are left untouched in the markup and are proxied through pic/fetch.php handler
+// (if they're not present/overwritten locally)
+$images = $html->find('img')->not('[src^="pic/"]');
+foreach ($images as $image) {
+    $src = $image->attr('src');
+    $resolved = new http\Url(CEZAR_URL, $src, http\Url::FROM_ENV | http\Url::SANITIZE_PATH | http\Url::JOIN_PATH | http\Url::REPLACE);
+    $imageUrl = $resolved->toString();
+
+    // the local name is the URL hash plus the extension of the URL path; only known
+    // image extensions are accepted, so remote content is never stored under a
+    // script extension or a name containing a query string or path separators
+    $extension = strtolower(pathinfo((string) parse_url($imageUrl, PHP_URL_PATH), PATHINFO_EXTENSION));
+    if (!in_array($extension, ['jpg', 'jpeg', 'png', 'gif'], TRUE)) {
+        // not cacheable: reference the image on the original server instead
+        $image->attr('src', $imageUrl);
+        continue;
+    }
+
+    $cachedImageUrl = 'foto/' . md5($imageUrl) . '.' . $extension;
+    // filesystem checks use the script directory, not the current working directory
+    $cachedImageFile = dirname(__FILE__) . '/' . $cachedImageUrl;
+    if (!is_file($cachedImageFile) || !$cache) {
+        fetch_with_user_agent_spoof($cachedImageFile, $imageUrl);
+    }
+    // fall back to the remote address when no local copy could be stored
+    $image->attr('src', is_file($cachedImageFile) ? $cachedImageUrl : $imageUrl);
+}
+$html->top();
+
+// all done
+print $html->html();
diff --git a/http/pic/.gitignore b/http/pic/.gitignore
new file mode 100644
index 0000000..ef5c65e
--- /dev/null
+++ b/http/pic/.gitignore
@@ -0,0 +1,3 @@
+*.jpg
+*.png
+*.gif
diff --git a/http/pic/.htaccess b/http/pic/.htaccess
new file mode 100644
index 0000000..615bc19
--- /dev/null
+++ b/http/pic/.htaccess
@@ -0,0 +1,6 @@
+RewriteEngine On
+
+RewriteCond %{SCRIPT_FILENAME} !-f
+RewriteCond %{SCRIPT_FILENAME} !-d
+RewriteRule .* fetch.php [QSA,L]
+
diff --git a/http/pic/fetch.php b/http/pic/fetch.php
new file mode 100644
index 0000000..1fb94cd
--- /dev/null
+++ b/http/pic/fetch.php
@@ -0,0 +1,30 @@
+<?php
+
+// Fallback handler for pictures missing under pic/: pic/.htaccess routes every
+// request for a non-existing file here. The picture is downloaded from the
+// Cezar server, stored next to this script and sent to the client.
+
+$filename = basename((string) parse_url($_SERVER['REQUEST_URI'], PHP_URL_PATH));
+
+// the local copy is named after the request, so accept picture names only;
+// anything else could overwrite this script or store remote content as code
+$resource = FALSE;
+if (preg_match('/\.(jpe?g|png|gif)\z/i', $filename)) {
+    $resource = @file_get_contents('http://msc.com.pl/cezar' . $_SERVER['REQUEST_URI']);
+}
+if ($resource === FALSE || $resource === '') {
+    header('HTTP/1.0 404 Not Found');
+    exit;
+}
+
+// best-effort local copy; the response below does not depend on it
+file_put_contents(dirname(__FILE__) . '/' . $filename, $resource);
+
+// forward only the headers that describe the picture itself; the status line,
+// redirects and transfer headers of the upstream exchange do not apply here
+foreach ($http_response_header as $header) {
+    if (preg_match('/^(Content-Type|Last-Modified|ETag|Expires|Cache-Control):/i', $header)) {
+        header($header);
+    }
+}
+echo $resource;