From cab5dcbafb0f82ba742ac014741dfc4208be9fbd Mon Sep 17 00:00:00 2001 From: emkael Date: Tue, 27 Jan 2015 16:31:12 +0100 Subject: * initial commit --- .gitignore | 2 + cache/.gitignore | 1 + http/.htaccess | 8 ++++ http/foto/.gitignore | 1 + http/index.php | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++ http/pic/.gitignore | 3 ++ http/pic/.htaccess | 6 +++ http/pic/fetch.php | 13 ++++++ 8 files changed, 155 insertions(+) create mode 100644 .gitignore create mode 100644 cache/.gitignore create mode 100644 http/.htaccess create mode 100644 http/foto/.gitignore create mode 100644 http/index.php create mode 100644 http/pic/.gitignore create mode 100644 http/pic/.htaccess create mode 100644 http/pic/fetch.php diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bb0d1d4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +_log +_stats diff --git a/cache/.gitignore b/cache/.gitignore new file mode 100644 index 0000000..72e8ffc --- /dev/null +++ b/cache/.gitignore @@ -0,0 +1 @@ +* diff --git a/http/.htaccess b/http/.htaccess new file mode 100644 index 0000000..28a620e --- /dev/null +++ b/http/.htaccess @@ -0,0 +1,8 @@ +RewriteEngine On + +RewriteBase / + +RewriteCond %{SCRIPT_FILENAME} !-f +RewriteCond %{SCRIPT_FILENAME} !-d +RewriteRule ^[^\.]*$ index.php [QSA,L] + diff --git a/http/foto/.gitignore b/http/foto/.gitignore new file mode 100644 index 0000000..72e8ffc --- /dev/null +++ b/http/foto/.gitignore @@ -0,0 +1 @@ +* diff --git a/http/index.php b/http/index.php new file mode 100644 index 0000000..42a2731 --- /dev/null +++ b/http/index.php @@ -0,0 +1,121 @@ + strtotime('-1 day'))) { + fetch_with_user_agent_spoof($cacheFile, $url); + } + return file_get_contents($cacheFile); +} + +// save remote URL locally, forwarding browser's User Agent String +function fetch_with_user_agent_spoof($targetFile, $sourceUrl) { + file_put_contents( + $targetFile, + file_get_contents($sourceUrl, + FALSE, + stream_context_create(['http' => [ + 'user_agent' => $_SERVER['HTTP_USER_AGENT'] + ] + ])) + ); +} + +// parse requested path (after rewrite) +$url = parse_url($_SERVER['REQUEST_URI']); +$path = array_values(array_filter(explode('/', $url['path']))); + +if ($path) { + // /[ANYTHING]/refresh disables cache (forces cache refresh) + $cache = !(count($path) > 1 && $path[1] == 'refresh'); + + // build Cezar URL for requested path + $path[0] = urldecode($path[0]); + $searchUrl = new http\Url(CEZAR_URL, + ['query' => http_build_query( + ['pid_search' => $path[0], + 'p' => '21'] + )]); + + $mscUrl = $searchUrl->toString(); + + $content = get_cache_content($path[0], $mscUrl, !$cache); // requested content + $contentLines = explode(PHP_EOL, $content); + + // if the comment delimiters are present, we're possibly dealing with the content we want, slice it and wrap it + $delimiters = array_keys(preg_grep('/---- page content /', $contentLines)); + if ($delimiters) { + $content = '' + . implode(PHP_EOL, array_slice($contentLines, $delimiters[0]+1, $delimiters[1]-$delimiters[0]-1)) + . ''; + } + else { + die('Malformed (even more than usually) content :('); + } + + require_once('/usr/share/php/QueryPath/QueryPath.php'); + + $html = htmlqp($content, NULL, ['convert_to_encoding' => 'utf-8']); + + // for search list pages, replace links with internal links to player IDs + $links = $html->find('a[href^="?p=21&pid="]'); + if ($links->size()) { + foreach ($links as $link) { + $href = []; + parse_str($link->attr('href'), $href); + $link->attr('href', $href['pid']); + } + } + + $html->top(); + + // remove general crap + $html->find('script, table.msc_noprint, center>p')->remove(); + $html->top(); + + // leave only first-ish table of the content + $html->find('table > tr > td')->eq(1)->remove(); + $html->top(); + $html->find('table > tr > td > table')->eq(2)->remove(); + $html->top(); + $html->find('table > tr > td > table')->eq(2)->remove(); + $html->top(); + + // remove internal Cezar links + $innerLinks = $html->find('table > tr > td > table a'); + foreach ($innerLinks as $innerLink) { + $innerLink->removeAttr('href'); + } + // get rid of Cezar link icons (right green arrows) + $html->find('img[src*="ico_link_8.gif"]')->remove(); + $html->top(); + + // proxy all external images, by resolving them relatively to the original server + // and cache them locally + // internal images are left untouched in the markup and are proxied through pic/fetch.php handler + // (if they're not present/overwritten locally) + $images = $html->find('img')->not('[src^="pic/"]'); + foreach ($images as $image) { + $src = $image->attr('src'); + $url = new http\Url(CEZAR_URL, $src, http\Url::FROM_ENV | http\Url::SANITIZE_PATH | http\Url::JOIN_PATH | http\Url::REPLACE); + $imageUrl = $url->toString(); + $cachedImageUrl = 'foto/' . md5($imageUrl) . '.' . array_pop(explode('.', $imageUrl)); + if (!file_exists($cachedImageUrl) || !$cache) { + fetch_with_user_agent_spoof($cachedImageUrl, $imageUrl); + } + $image->attr('src', $cachedImageUrl); + } + $html->top(); + + // all done + print $html->html(); +} +else { + die('Nothing to see here, move along.'); +} + +?> diff --git a/http/pic/.gitignore b/http/pic/.gitignore new file mode 100644 index 0000000..ef5c65e --- /dev/null +++ b/http/pic/.gitignore @@ -0,0 +1,3 @@ +*.jpg +*.png +*.gif diff --git a/http/pic/.htaccess b/http/pic/.htaccess new file mode 100644 index 0000000..615bc19 --- /dev/null +++ b/http/pic/.htaccess @@ -0,0 +1,6 @@ +RewriteEngine On + +RewriteCond %{SCRIPT_FILENAME} !-f +RewriteCond %{SCRIPT_FILENAME} !-d +RewriteRule .* fetch.php [QSA,L] + diff --git a/http/pic/fetch.php b/http/pic/fetch.php new file mode 100644 index 0000000..1fb94cd --- /dev/null +++ b/http/pic/fetch.php @@ -0,0 +1,13 @@ + -- cgit v1.2.3