diff options
Diffstat (limited to 'http')
-rw-r--r-- | http/.htaccess | 8 | ||||
-rw-r--r-- | http/foto/.gitignore | 1 | ||||
-rw-r--r-- | http/index.php | 146 | ||||
-rw-r--r-- | http/pic/.gitignore | 3 | ||||
-rw-r--r-- | http/pic/.htaccess | 6 | ||||
-rw-r--r-- | http/pic/fetch.php | 33 |
6 files changed, 197 insertions, 0 deletions
diff --git a/http/.htaccess b/http/.htaccess
new file mode 100644
index 0000000..28a620e
--- /dev/null
+++ b/http/.htaccess
@@ -0,0 +1,8 @@
+RewriteEngine On
+
+RewriteBase /
+
+RewriteCond %{SCRIPT_FILENAME} !-f
+RewriteCond %{SCRIPT_FILENAME} !-d
+RewriteRule ^[^\.]*$ index.php [QSA,L]
+
diff --git a/http/foto/.gitignore b/http/foto/.gitignore
new file mode 100644
index 0000000..72e8ffc
--- /dev/null
+++ b/http/foto/.gitignore
@@ -0,0 +1 @@
+*
diff --git a/http/index.php b/http/index.php
new file mode 100644
--- /dev/null
+++ b/http/index.php
@@ -0,0 +1,146 @@
+<?php
+
+define('CACHE_PATH', '../cache');
+define('CEZAR_URL', 'http://msc.com.pl/cezar/');
+
+/**
+ * Returns the body of $url, going through the on-disk content cache.
+ *
+ * The cached copy is refreshed when it is missing, older than one day, or
+ * when $force is set. When a refresh fails, a stale copy (if any) is kept
+ * and returned instead.
+ *
+ * @param string $cacheKey Cache entry name. It is derived from the request
+ *                         path, so it is hashed before it becomes a file name.
+ * @param string $url      Remote URL to fetch on a cache miss.
+ * @param bool   $force    Refresh the entry even if it is still fresh.
+ * @return string|false Cached content, or FALSE when there is none.
+ */
+function get_cache_content($cacheKey, $url, $force = FALSE) {
+    // md5() keeps "../" sequences from the request out of the file system path
+    $cacheFile = realpath(dirname(__FILE__) . '/' . CACHE_PATH) . '/' . md5($cacheKey);
+    $fresh = file_exists($cacheFile) && filemtime($cacheFile) > strtotime('-1 day');
+    if ($force || !$fresh) {
+        fetch_with_user_agent_spoof($cacheFile, $url);
+    }
+    return file_exists($cacheFile) ? file_get_contents($cacheFile) : FALSE;
+}
+
+/**
+ * Saves a remote URL locally, forwarding the browser's User-Agent string.
+ *
+ * The target file is only written once the download succeeded, so a failed
+ * request never replaces existing content with an empty file.
+ *
+ * @param string $targetFile Path of the local file to write.
+ * @param string $sourceUrl  Remote URL to download.
+ * @return bool TRUE when the content was downloaded and stored.
+ */
+function fetch_with_user_agent_spoof($targetFile, $sourceUrl) {
+    $httpOptions = [];
+    if (!empty($_SERVER['HTTP_USER_AGENT'])) {
+        $httpOptions['user_agent'] = $_SERVER['HTTP_USER_AGENT'];
+    }
+    $content = file_get_contents($sourceUrl, FALSE, stream_context_create(['http' => $httpOptions]));
+    if ($content === FALSE) {
+        return FALSE;
+    }
+    return file_put_contents($targetFile, $content, LOCK_EX) !== FALSE;
+}
+
+// parse requested path (after rewrite); 'strlen' keeps a literal "0" segment
+$requestPath = (string) parse_url($_SERVER['REQUEST_URI'], PHP_URL_PATH);
+$path = array_values(array_filter(explode('/', $requestPath), 'strlen'));
+
+if (!$path) {
+    die('Nothing to see here, move along.');
+}
+
+// /[ANYTHING]/refresh disables cache (forces cache refresh)
+$cache = !(count($path) > 1 && $path[1] === 'refresh');
+
+// build Cezar URL for requested path
+$path[0] = urldecode($path[0]);
+$searchUrl = new http\Url(CEZAR_URL, [
+    'query' => http_build_query(['pid_search' => $path[0], 'p' => '21']),
+]);
+$mscUrl = $searchUrl->toString();
+
+$content = get_cache_content($path[0], $mscUrl, !$cache); // requested content
+if ($content === FALSE) {
+    die('Malformed (even more than usually) content :(');
+}
+$contentLines = explode(PHP_EOL, $content);
+
+// the wanted markup sits between two "page content" comment delimiters;
+// without both of them there is nothing to slice out
+$delimiters = array_keys(preg_grep('/---- page content /', $contentLines));
+if (count($delimiters) < 2) {
+    die('Malformed (even more than usually) content :(');
+}
+$content = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><base href="/" /><style>body {width: 580px; font-family: Tahoma, Geneva, Arial, Helvetica, "sans-serif";}</style></head><body>'
+    . implode(PHP_EOL, array_slice($contentLines, $delimiters[0] + 1, $delimiters[1] - $delimiters[0] - 1))
+    . '</body></html>';
+
+require_once('/usr/share/php/QueryPath/QueryPath.php');
+
+$html = htmlqp($content, NULL, ['convert_to_encoding' => 'utf-8']);
+
+// for search list pages, replace links with internal links to player IDs
+foreach ($html->find('a[href^="?p=21&pid="]') as $link) {
+    $href = [];
+    parse_str($link->attr('href'), $href);
+    if (isset($href['pid'])) {
+        $link->attr('href', $href['pid']);
+    }
+}
+$html->top();
+
+// remove general crap
+$html->find('script, table.msc_noprint, center>p')->remove();
+$html->top();
+
+// leave only first-ish table of the content
+$html->find('table > tr > td')->eq(1)->remove();
+$html->top();
+$html->find('table > tr > td > table')->eq(2)->remove();
+$html->top();
+$html->find('table > tr > td > table')->eq(2)->remove();
+$html->top();
+
+// remove internal Cezar links
+foreach ($html->find('table > tr > td > table a') as $innerLink) {
+    $innerLink->removeAttr('href');
+}
+// get rid of Cezar link icons (right green arrows)
+// NOTE(review): there is no top() before this find(), so the icons may only be
+// looked up inside the links selected above - confirm this is intended
+$html->find('img[src*="ico_link_8.gif"]')->remove();
+$html->top();
+
+// proxy all external images, by resolving them relatively to the original server
+// and cache them locally
+// internal images are left untouched in the markup and are proxied through pic/fetch.php handler
+// (if they're not present/overwritten locally)
+$imageExtensions = ['jpg', 'jpeg', 'png', 'gif'];
+$images = $html->find('img')->not('[src^="pic/"]');
+foreach ($images as $image) {
+    $src = $image->attr('src');
+    $resolved = new http\Url(CEZAR_URL, $src, http\Url::FROM_ENV | http\Url::SANITIZE_PATH | http\Url::JOIN_PATH | http\Url::REPLACE);
+    $imageUrl = $resolved->toString();
+    $imageSrc = $imageUrl; // fall back to the remote picture if it cannot be cached
+    // the remote markup controls the file name, so only well-known picture
+    // extensions may be written below the document root (never e.g. ".php")
+    $extension = strtolower(pathinfo((string) parse_url($imageUrl, PHP_URL_PATH), PATHINFO_EXTENSION));
+    if (in_array($extension, $imageExtensions, TRUE)) {
+        $cachedImageUrl = 'foto/' . md5($imageUrl) . '.' . $extension;
+        if (($cache && file_exists($cachedImageUrl)) || fetch_with_user_agent_spoof($cachedImageUrl, $imageUrl)) {
+            $imageSrc = $cachedImageUrl;
+        }
+    }
+    $image->attr('src', $imageSrc);
+}
+$html->top();
+
+// all done
+print $html->html();
diff --git a/http/pic/.gitignore b/http/pic/.gitignore
new file mode 100644
index 0000000..ef5c65e
--- /dev/null
+++ b/http/pic/.gitignore
@@ -0,0 +1,3 @@
+*.jpg
+*.png
+*.gif
diff --git a/http/pic/.htaccess b/http/pic/.htaccess
new file mode 100644
index 0000000..615bc19
--- /dev/null
+++ b/http/pic/.htaccess
@@ -0,0 +1,6 @@
+RewriteEngine On
+
+RewriteCond %{SCRIPT_FILENAME} !-f
+RewriteCond %{SCRIPT_FILENAME} !-d
+RewriteRule .* fetch.php [QSA,L]
+
diff --git a/http/pic/fetch.php b/http/pic/fetch.php
new file mode 100644
--- /dev/null
+++ b/http/pic/fetch.php
@@ -0,0 +1,33 @@
+<?php
+
+// Fallback handler for requests below pic/ that match no local file (see
+// .htaccess): downloads the picture from the original server, keeps a local
+// copy next to this script and serves it.
+
+// the stored file is named after the last path segment only, without query string
+$filename = basename((string) parse_url($_SERVER['REQUEST_URI'], PHP_URL_PATH));
+
+// the remote server controls the content, so nothing but pictures may be
+// written here (never e.g. ".php" or ".htaccess")
+if (!preg_match('/\.(jpe?g|png|gif)$/i', $filename)) {
+    http_response_code(404);
+    exit;
+}
+
+$resource = @file_get_contents('http://msc.com.pl/cezar' . $_SERVER['REQUEST_URI']);
+if ($resource === FALSE || $resource === '') {
+    http_response_code(404);
+    exit;
+}
+
+file_put_contents(dirname(__FILE__) . '/' . $filename, $resource, LOCK_EX);
+
+// forward the content type only: the remaining upstream headers (status line,
+// Content-Length, Transfer-Encoding, cookies) describe the upstream response,
+// not this one
+foreach ($http_response_header as $header) {
+    if (stripos($header, 'Content-Type:') === 0) {
+        header($header);
+    }
+}
+print $resource;