strtotime('-1 day'))) {
        fetch_with_user_agent_spoof($cacheFile, $url);
    }
    return file_get_contents($cacheFile);
}

/**
 * Save a remote URL to a local file, forwarding the browser's User-Agent string.
 *
 * The target file is written only when the download succeeds, so a failed
 * request never replaces (or creates) a cached copy with an empty file.
 *
 * @param string $targetFile local path the downloaded body is written to
 * @param string $sourceUrl  remote URL to download
 * @return bool TRUE when the body was downloaded and stored, FALSE otherwise
 */
function fetch_with_user_agent_spoof($targetFile, $sourceUrl) {
    $httpOptions = [];
    // the User-Agent header is optional; forward it only when the client sent one
    if (!empty($_SERVER['HTTP_USER_AGENT'])) {
        $httpOptions['user_agent'] = $_SERVER['HTTP_USER_AGENT'];
    }

    $body = file_get_contents($sourceUrl, FALSE, stream_context_create(['http' => $httpOptions]));
    if ($body === FALSE) {
        // download failed: leave whatever is (or is not) in the target file alone
        return FALSE;
    }

    return file_put_contents($targetFile, $body) !== FALSE;
}

// parse requested path (after rewrite)
// parse_url() yields FALSE/NULL for a malformed or path-less URI; treat that as "no path"
$requestPath = parse_url($_SERVER['REQUEST_URI'], PHP_URL_PATH);
$path = is_string($requestPath)
    ? array_values(array_filter(explode('/', $requestPath)))
    : [];

if ($path) {
    // /[ANYTHING]/refresh disables cache (forces cache refresh)
    $cache = !(count($path) > 1 && $path[1] === 'refresh');

    // build Cezar URL for requested path
    $path[0] = urldecode($path[0]);
    $searchUrl = new http\Url(CEZAR_URL, ['query' => http_build_query(
        ['pid_search' => $path[0], 'p' => '21']
    )]);
    $mscUrl = $searchUrl->toString();

    // requested content
    // NOTE(review): $path[0] comes straight from the request and is used as the cache key;
    // confirm that get_cache_content() maps it to a safe file name.
    $content = get_cache_content($path[0], $mscUrl, !$cache);
    $contentLines = explode(PHP_EOL, $content);

    // if the comment delimiters are present, we're possibly dealing with the content we want,
    // slice it and wrap it; both an opening and a closing delimiter line are required
    $delimiters = array_keys(preg_grep('/---- page content /', $contentLines));
    if (count($delimiters) >= 2) {
        // NOTE(review): both wrapper literals are empty in this copy of the file;
        // confirm whether wrapping markup was intended here.
        $content = '' . implode(
            PHP_EOL,
            array_slice($contentLines, $delimiters[0] + 1, $delimiters[1] - $delimiters[0] - 1)
        ) . '';
    } else {
        die('Malformed (even more than usually) content :(');
    }

    require_once('/usr/share/php/QueryPath/QueryPath.php');
    $html = htmlqp($content, NULL, ['convert_to_encoding' => 'utf-8']);

    // for search list pages, replace links with internal links to player IDs
    $links = $html->find('a[href^="?p=21&pid="]');
    if ($links->size()) {
        foreach ($links as $link) {
            $href = [];
            parse_str($link->attr('href'), $href);
            $link->attr('href', $href['pid']);
        }
    }
    $html->top();

    // remove general crap
    $html->find('script, table.msc_noprint, center>p')->remove();
    $html->top();

    // leave only first-ish table of the content
    $html->find('table > tr > td')->eq(1)->remove();
    $html->top();
    $html->find('table > tr > td > table')->eq(2)->remove();
    $html->top();
    $html->find('table > tr > td > table')->eq(2)->remove();
    $html->top();

    // remove internal Cezar links
    $innerLinks = $html->find('table > tr > td > table a');
    foreach ($innerLinks as $innerLink) {
        $innerLink->removeAttr('href');
    }

    // get rid of Cezar link icons (right green arrows)
    $html->find('img[src*="ico_link_8.gif"]')->remove();
    $html->top();

    // proxy all external images, by resolving them relatively to the original server
    // and cache them locally
    // internal images are left untouched in the markup and are proxied through pic/fetch.php handler
    // (if they're not present/overwritten locally)
    $images = $html->find('img')->not('[src^="pic/"]');
    foreach ($images as $image) {
        $src = $image->attr('src');
        $resolvedUrl = new http\Url(
            CEZAR_URL,
            $src,
            http\Url::FROM_ENV | http\Url::SANITIZE_PATH | http\Url::JOIN_PATH | http\Url::REPLACE
        );
        $imageUrl = $resolvedUrl->toString();

        // array_pop() takes its argument by reference, so the exploded URL needs a variable
        // NOTE(review): the extension is whatever follows the last dot of the remote URL and
        // ends up in a locally stored file name; consider whitelisting image extensions.
        $urlParts = explode('.', $imageUrl);
        $cachedImageUrl = 'foto/' . md5($imageUrl) . '.' . array_pop($urlParts);

        if (!file_exists($cachedImageUrl) || !$cache) {
            fetch_with_user_agent_spoof($cachedImageUrl, $imageUrl);
        }
        $image->attr('src', $cachedImageUrl);
    }
    $html->top();

    // all done
    print $html->html();
} else {
    die('Nothing to see here, move along.');
}
?>