diff options
author | emkael <emkael@tlen.pl> | 2015-01-27 16:31:12 +0100 |
---|---|---|
committer | emkael <emkael@tlen.pl> | 2015-01-27 16:31:12 +0100 |
commit | cab5dcbafb0f82ba742ac014741dfc4208be9fbd (patch) | |
tree | 3081fc9150ac7d15280c099a4699085a2a27087f /http/index.php |
* initial commit
Diffstat (limited to 'http/index.php')
-rw-r--r-- | http/index.php | 121 |
1 files changed, 121 insertions, 0 deletions
<?php

/**
 * Front controller: proxies a Cezar search/player page for the requested path.
 *
 * The first path segment is sent to Cezar as the `pid_search` query value; a
 * second segment equal to "refresh" forces the cached copies to be re-fetched.
 * The fetched page is cut down to the part between the "page content" marker
 * lines, cleaned up with QueryPath and printed.
 */

define('CACHE_PATH', '../cache');
define('CEZAR_URL', 'http://msc.com.pl/cezar/');

/**
 * Fetch an item from the content cache, downloading it first when necessary.
 *
 * The cached copy is re-downloaded when $force is set, when it does not exist,
 * or when it is older than one day. If the download fails, a previously cached
 * copy (if any) is left untouched and returned.
 *
 * @param string $cacheKey file name of the item inside the cache directory
 * @param string $url      remote URL the item is downloaded from
 * @param bool   $force    TRUE to refresh the item regardless of its age
 * @return string|false cached content, or FALSE when nothing could be read
 */
function get_cache_content($cacheKey, $url, $force = FALSE) {
    $cacheDir = realpath(dirname(__FILE__) . '/' . CACHE_PATH);
    if ($cacheDir === FALSE) {
        // realpath() fails for a missing directory; without this guard the
        // cache file would end up as "/<key>" in the filesystem root
        return FALSE;
    }

    $cacheFile = $cacheDir . '/' . $cacheKey;
    $isFresh = is_file($cacheFile) && filemtime($cacheFile) > strtotime('-1 day');
    if ($force || !$isFresh) {
        fetch_with_user_agent_spoof($cacheFile, $url);
    }

    return is_file($cacheFile) ? file_get_contents($cacheFile) : FALSE;
}

/**
 * Save a remote URL locally, forwarding the browser's User-Agent string.
 *
 * When the request carries no User-Agent header, none is set on the stream
 * context. The target file is only written after a successful download, so a
 * failed request never replaces an existing file with empty content.
 *
 * @param string $targetFile local path the response body is written to
 * @param string $sourceUrl  remote URL to download
 * @return bool TRUE when the body was downloaded and written, FALSE otherwise
 */
function fetch_with_user_agent_spoof($targetFile, $sourceUrl) {
    $httpOptions = [];
    if (!empty($_SERVER['HTTP_USER_AGENT'])) {
        $httpOptions['user_agent'] = $_SERVER['HTTP_USER_AGENT'];
    }

    $body = file_get_contents(
        $sourceUrl,
        FALSE,
        stream_context_create(['http' => $httpOptions])
    );
    if ($body === FALSE) {
        return FALSE;
    }

    return file_put_contents($targetFile, $body) !== FALSE;
}

// parse requested path (after rewrite); parse_url() yields FALSE/NULL for
// malformed or path-less URIs, which the string cast turns into an empty path
$requestPath = isset($_SERVER['REQUEST_URI'])
    ? parse_url($_SERVER['REQUEST_URI'], PHP_URL_PATH)
    : '';
// 'strlen' drops only empty segments; a bare array_filter() would also drop "0"
$path = array_values(array_filter(explode('/', (string)$requestPath), 'strlen'));

if (!$path) {
    die('Nothing to see here, move along.');
}

// /[ANYTHING]/refresh disables cache (forces cache refresh)
$cache = !(count($path) > 1 && $path[1] === 'refresh');

// the decoded segment is used as a file name inside the cache directory, so it
// must not be able to point outside of it
$path[0] = urldecode($path[0]);
if ($path[0] === '.' || $path[0] === '..' || strpbrk($path[0], "/\\\0") !== FALSE) {
    http_response_code(400);
    die('Invalid request.');
}

// build Cezar URL for requested path
$searchUrl = new http\Url(
    CEZAR_URL,
    ['query' => http_build_query([
        'pid_search' => $path[0],
        'p' => '21'
    ])]
);
$mscUrl = $searchUrl->toString();

// requested content; a failed fetch yields FALSE, which is handled below as malformed content
$content = get_cache_content($path[0], $mscUrl, !$cache);
$contentLines = explode(PHP_EOL, (string)$content);

// if the comment delimiters are present, we're possibly dealing with the content we want, slice it and wrap it
// (both an opening and a closing delimiter are needed to know where to cut)
$delimiters = array_keys(preg_grep('/---- page content /', $contentLines));
if (count($delimiters) < 2) {
    die('Malformed (even more than usually) content :(');
}

// the line break inside the font list is part of the emitted markup
$content = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><base href="/" /><style>body {width: 580px; font-family: Tahoma, Geneva, Arial, Helvetica, ' . "\n" . '"sans-serif";}</style></head><body>'
    . implode(PHP_EOL, array_slice($contentLines, $delimiters[0] + 1, $delimiters[1] - $delimiters[0] - 1))
    . '</body></html>';

require_once('/usr/share/php/QueryPath/QueryPath.php');

$html = htmlqp($content, NULL, ['convert_to_encoding' => 'utf-8']);

// for search list pages, replace links with internal links to player IDs
$links = $html->find('a[href^="?p=21&pid="]');
if ($links->size()) {
    foreach ($links as $link) {
        $href = [];
        parse_str($link->attr('href'), $href);
        $link->attr('href', $href['pid']);
    }
}
$html->top();

// remove general crap
$html->find('script, table.msc_noprint, center>p')->remove();
$html->top();

// leave only first-ish table of the content
$html->find('table > tr > td')->eq(1)->remove();
$html->top();
$html->find('table > tr > td > table')->eq(2)->remove();
$html->top();
$html->find('table > tr > td > table')->eq(2)->remove();
$html->top();

// remove internal Cezar links
$innerLinks = $html->find('table > tr > td > table a');
foreach ($innerLinks as $innerLink) {
    $innerLink->removeAttr('href');
}
// get rid of Cezar link icons (right green arrows)
$html->find('img[src*="ico_link_8.gif"]')->remove();
$html->top();

// proxy all external images, by resolving them relatively to the original server
// and cache them locally
// internal images are left untouched in the markup and are proxied through pic/fetch.php handler
// (if they're not present/overwritten locally)
$images = $html->find('img')->not('[src^="pic/"]');
foreach ($images as $image) {
    $src = $image->attr('src');
    $resolved = new http\Url(
        CEZAR_URL,
        $src,
        http\Url::FROM_ENV | http\Url::SANITIZE_PATH | http\Url::JOIN_PATH | http\Url::REPLACE
    );
    $imageUrl = $resolved->toString();

    // take the extension from the URL path only, so that a query string or a
    // dot in the host name cannot leak into the local file name
    $extension = pathinfo((string)parse_url($imageUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
    $cachedImageUrl = 'foto/' . md5($imageUrl) . '.' . $extension;

    if (!file_exists($cachedImageUrl) || !$cache) {
        fetch_with_user_agent_spoof($cachedImageUrl, $imageUrl);
    }
    $image->attr('src', $cachedImageUrl);
}
$html->top();

// all done
print $html->html();