summaryrefslogtreecommitdiff
path: root/http/index.php
blob: eac2162e67d783b28f2fca2931a0192272aa4dd9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
<?php

// URL prefix this script is served under: it is stripped from REQUEST_URI before
// routing and emitted as the <base href> of the generated page
define('BASE_PATH', '/');
// directory holding cached pages, resolved relative to the directory of this file
define('CACHE_PATH', '../cache');
// base URL of the upstream Cezar site: search URLs are built from it and image
// sources are resolved against it
define('CEZAR_URL', 'http://msc.com.pl/cezar/');

/**
 * Fetch an item from the content cache, downloading it first when needed.
 *
 * The item is (re)downloaded when $force is set, when no cached copy exists,
 * or when the cached copy is older than one day.
 *
 * @param string $cacheKey file name of the item inside the cache directory;
 *                         must be a plain file name (no path separators,
 *                         not '.' or '..')
 * @param string $url      remote URL to download the item from
 * @param bool   $force    TRUE to refresh the item regardless of its age
 * @return string|false cached content, or FALSE when the key is not a plain
 *                      file name, the cache directory cannot be resolved,
 *                      or the cached file cannot be read
 */
function get_cache_content($cacheKey, $url, $force = FALSE) {
  // the key comes from the request URL and becomes a file name, so it must
  // not be able to point outside of the cache directory
  $cacheKey = (string) $cacheKey;
  if ($cacheKey === '' || $cacheKey === '.' || $cacheKey === '..'
      || strpbrk($cacheKey, "/\\\0") !== FALSE) {
    return FALSE;
  }

  // realpath() yields FALSE for a missing directory; concatenating that would
  // silently place the cache file directly under the filesystem root
  $cacheDir = realpath(dirname(__FILE__) . '/' . CACHE_PATH);
  if ($cacheDir === FALSE) {
    return FALSE;
  }

  $cacheFile = $cacheDir . '/' . $cacheKey;
  $isFresh = file_exists($cacheFile) && filemtime($cacheFile) > strtotime('-1 day');
  if ($force || !$isFresh) {
    fetch_with_user_agent_spoof($cacheFile, $url);
  }
  return file_get_contents($cacheFile);
}

/**
 * Save a remote URL to a local file, forwarding the browser's User-Agent string.
 *
 * The target file is written only after the download has succeeded, so a
 * failed request never replaces a previously saved copy with an empty file.
 *
 * @param string $targetFile local path the downloaded body is written to
 * @param string $sourceUrl  URL to download
 * @return bool TRUE when the body was downloaded and written, FALSE otherwise
 */
function fetch_with_user_agent_spoof($targetFile, $sourceUrl) {
  $httpOptions = [];
  // the header is missing for CLI runs and for clients that do not send it;
  // in that case PHP's default user agent setting applies
  if (isset($_SERVER['HTTP_USER_AGENT'])) {
    $httpOptions['user_agent'] = $_SERVER['HTTP_USER_AGENT'];
  }
  $context = stream_context_create(['http' => $httpOptions]);

  $body = file_get_contents($sourceUrl, FALSE, $context);
  if ($body === FALSE) {
    // keep whatever is already stored under $targetFile
    return FALSE;
  }
  return file_put_contents($targetFile, $body) !== FALSE;
}

// Request handler: /PLAYER_ID[/refresh] fetches the matching Cezar page (through
// the local cache), strips it down to the main content table, rewrites links and
// images, and prints the result.

// parse requested path (after rewrite)
$url = parse_url(preg_replace('#^' . preg_quote(BASE_PATH, '#') . '#', '', $_SERVER['REQUEST_URI']));
// parse_url() yields FALSE for seriously malformed URLs and no 'path' key for
// query-only requests; both are treated as an empty path
$requestPath = (is_array($url) && isset($url['path'])) ? $url['path'] : '';
$path = array_values(array_filter(explode('/', $requestPath)));

if ($path) {
  // /[ANYTHING]/refresh disables cache (forces cache refresh)
  $cache = !(count($path) > 1 && $path[1] === 'refresh');

  // build Cezar URL for requested path
  $path[0] = urldecode($path[0]);
  $searchUrl = new http\Url(CEZAR_URL,
                            ['query' => http_build_query(
                                                         ['pid_search' => $path[0],
                                                          'p' => '21']
                                                         )]);

  $mscUrl = $searchUrl->toString();

  $content = get_cache_content($path[0], $mscUrl, !$cache); // requested content
  $contentLines = explode(PHP_EOL, $content);

  // if both comment delimiters are present, we're possibly dealing with the content we want, slice it and wrap it
  // (a single delimiter is not enough: the slice needs an opening and a closing line)
  $delimiters = array_keys(preg_grep('/---- page content /', $contentLines));
  if (count($delimiters) >= 2) {
    $content = '<html><head>'
       . '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'
       . '<base href="' . BASE_PATH . '" />'
       . '<style>body {width: 580px; font-family: Tahoma, Geneva, Arial, Helvetica, "sans-serif";}</style>'
       . '</head><body>'
       . implode(PHP_EOL, array_slice($contentLines, $delimiters[0]+1, $delimiters[1]-$delimiters[0]-1))
       . '</body></html>';
  }
  else {
    die('Malformed (even more than usually) content :(');
  }

  require_once('/usr/share/php/QueryPath/QueryPath.php');

  $html = htmlqp($content, NULL, ['convert_to_encoding' => 'utf-8']);

  // for search list pages, replace links with internal links to player IDs
  $links = $html->find('a[href^="?p=21&pid="]');
  if ($links->size()) {
    foreach ($links as $link) {
      $href = [];
      parse_str($link->attr('href'), $href);
      $link->attr('href', $href['pid']);
    }
  }

  $html->top();

  // remove general crap
  $html->find('script, table.msc_noprint, center>p')->remove();
  $html->top();

  // leave only first-ish table of the content
  $html->find('table > tr > td')->eq(1)->remove();
  $html->top();
  $html->find('table > tr > td > table')->eq(2)->remove();
  $html->top();
  $html->find('table > tr > td > table')->eq(2)->remove();
  $html->top();

  // remove internal Cezar links
  $innerLinks = $html->find('table > tr > td > table a');
  foreach ($innerLinks as $innerLink) {
    $innerLink->removeAttr('href');
  }
  // get rid of Cezar link icons (right green arrows)
  $html->find('img[src*="ico_link_8.gif"]')->remove();
  $html->top();

  // proxy all external images, by resolving them relatively to the original server
  // and cache them locally
  // internal images are left untouched in the markup and are proxied through pic/fetch.php handler
  // (if they're not present/overwritten locally)
  $images = $html->find('img')->not('[src^="pic/"]');
  foreach ($images as $image) {
    $src = $image->attr('src');
    $url = new http\Url(CEZAR_URL, $src, http\Url::FROM_ENV | http\Url::SANITIZE_PATH | http\Url::JOIN_PATH | http\Url::REPLACE);
    $imageUrl = $url->toString();
    // local file name: hash of the full URL plus whatever follows its last dot;
    // the pieces go through a variable because end() takes its argument by reference
    // NOTE(review): the suffix is taken from the remote markup as-is - confirm the
    // web server never executes files stored under foto/
    $imageUrlParts = explode('.', $imageUrl);
    $cachedImageUrl = 'foto/' . md5($imageUrl) . '.' . end($imageUrlParts);
    if (!file_exists($cachedImageUrl) || !$cache) {
      fetch_with_user_agent_spoof($cachedImageUrl, $imageUrl);
    }
    $image->attr('src', $cachedImageUrl);
  }
  $html->top();

  // all done
  print $html->html();
}
else {
  die('Nothing to see here, move along.');
}

?>