summaryrefslogtreecommitdiff
path: root/http/index.php
blob: a7034b651d99a6a695608ea87543af6654122518 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
<?php

define('BASE_PATH', '/');
define('CACHE_PATH', '../cache');
define('CEZAR_URL', 'http://msc.com.pl/cezar/');
define('QUERYPATH_PATH', '/usr/share/php/QueryPath/QueryPath.php');

/**
 * Fetch an item from the content cache, downloading it first when necessary.
 *
 * The entry is (re)downloaded when $force is set, when no cached file exists,
 * or when the cached file is older than one day.
 *
 * @param string $cacheKey name of the cache entry; it is encoded before use so that
 *                         it always maps to a single file name inside the cache directory
 * @param string $url      remote URL to download when the cache has to be (re)filled
 * @param bool   $force    TRUE to refresh the cache entry unconditionally
 * @return string|false    cached content, or FALSE when the cache directory or the
 *                         cache file is not available
 */
function get_cache_content($cacheKey, $url, $force = FALSE) {
  $cacheDir = realpath(dirname(__FILE__) . '/' . CACHE_PATH);
  if ($cacheDir === FALSE) {
    // realpath() returns FALSE for a missing directory; concatenating that would
    // silently turn the cache path into "/<key>" in the filesystem root
    trigger_error('Cache directory is not available: ' . CACHE_PATH, E_USER_WARNING);
    return FALSE;
  }

  // the key is built from the (url-decoded) request path, so it may contain "/", "\",
  // NUL bytes or ".." - percent-encode it (dots included, rawurlencode() leaves them
  // alone) so it can never point outside of the cache directory;
  // plain alphanumeric keys keep their original file names
  $cacheFile = $cacheDir . '/' . str_replace('.', '%2E', rawurlencode($cacheKey));

  $isFresh = is_file($cacheFile) && filemtime($cacheFile) > strtotime('-1 day');
  if ($force || !$isFresh) {
    fetch_with_user_agent_spoof($cacheFile, $url);
  }

  // the download may have failed without leaving any file behind
  return is_file($cacheFile) ? file_get_contents($cacheFile) : FALSE;
}

/**
 * Save a remote URL to a local file, forwarding the browser's User-Agent string.
 *
 * Nothing is written when the download fails, so a previously stored copy of
 * $targetFile is left untouched instead of being truncated to an empty file.
 *
 * @param string $targetFile local path the downloaded content is written to
 * @param string $sourceUrl  URL to download
 * @return bool TRUE when the content was downloaded and written, FALSE otherwise
 */
function fetch_with_user_agent_spoof($targetFile, $sourceUrl) {
    // forward the User-Agent only if the client actually sent one;
    // otherwise PHP falls back to its configured default
    $httpOptions = [];
    if (isset($_SERVER['HTTP_USER_AGENT'])) {
        $httpOptions['user_agent'] = $_SERVER['HTTP_USER_AGENT'];
    }

    $content = file_get_contents($sourceUrl,
                                 FALSE,
                                 stream_context_create(['http' => $httpOptions]));
    if ($content === FALSE) {
        // writing FALSE would create an empty file that later looks like valid cache
        return FALSE;
    }

    return file_put_contents($targetFile, $content) !== FALSE;
}

// parse requested path (after rewrite)
$requestUri = isset($_SERVER['REQUEST_URI']) ? $_SERVER['REQUEST_URI'] : '';
// '#' is the pattern delimiter here, so preg_quote() has to be told to escape it as well
$url = parse_url(preg_replace('#^' . preg_quote(BASE_PATH, '#') . '#', '', $requestUri));
// parse_url() returns FALSE for seriously malformed URLs and omits 'path' for query-only ones
$requestPath = (is_array($url) && isset($url['path'])) ? $url['path'] : '';
// drop empty segments only ('strlen' keeps a literal "0" segment, which plain array_filter() would remove)
$path = array_values(array_filter(explode('/', $requestPath), 'strlen'));

if ($path) {
  // /[ANYTHING]/refresh disables cache (forces cache refresh)
  $cache = !(count($path) > 1 && $path[1] === 'refresh');

  // build Cezar URL for requested path
  $path[0] = urldecode($path[0]);
  $searchUrl = new http\Url(CEZAR_URL,
                            ['query' => http_build_query(
                                                         ['pid_search' => $path[0],
                                                          'p' => '21']
                                                         )]);

  $mscUrl = $searchUrl->toString();

  $content = get_cache_content($path[0], $mscUrl, !$cache); // requested content
  $contentLines = explode(PHP_EOL, $content);

  // if the comment delimiters are present, we're possibly dealing with the content we want, slice it and wrap it
  // (both the opening and the closing delimiter line are needed to know what to slice)
  $delimiters = array_keys(preg_grep('/---- page content /', $contentLines));
  if (count($delimiters) >= 2) {
    $content = '<html><head>'
       . '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'
       . '<base href="' . BASE_PATH . '" />'
       . '<style>'
       . 'body{width:580px;font-family:Tahoma,Geneva,Arial,Helvetica,"sans-serif";}'
       . 'a{text-decoration:none;color:black;}'
       . '</style>'
       . '</head><body>'
       . implode(PHP_EOL, array_slice($contentLines, $delimiters[0]+1, $delimiters[1]-$delimiters[0]-1))
       . '</body></html>';
  }
  else {
    die('Malformed (even more than usually) content :(');
  }

  require_once(QUERYPATH_PATH);

  $html = htmlqp($content, NULL, ['convert_to_encoding' => 'utf-8']);

  // for search list pages, replace links with internal links to player IDs
  $links = $html->find('a[href^="?p=21&pid="]');
  if ($links->size()) {
    foreach ($links as $link) {
      $href = [];
      parse_str($link->attr('href'), $href);
      // the selector only guarantees the prefix of the attribute, so double-check the parsed result
      if (isset($href['pid'])) {
        $link->attr('href', $href['pid']);
      }
    }
  }

  $html->top();

  // remove general crap
  $html->find('script, table.msc_noprint, center>p')->remove();
  $html->top();

  // leave only first-ish table of the content
  $html->find('table > tr > td')->eq(1)->remove();
  $html->top();
  $html->find('table > tr > td > table')->eq(2)->remove();
  $html->top();
  $html->find('table > tr > td > table')->eq(2)->remove();
  $html->top();

  // remove internal Cezar links
  $innerLinks = $html->find('table > tr > td > table a');
  foreach ($innerLinks as $innerLink) {
    $innerLink->removeAttr('href');
  }
  // get rid of Cezar link icons (right green arrows)
  $html->find('img[src*="ico_link_8.gif"]')->remove();
  $html->top();

  // proxy all external images, by resolving them relatively to the original server
  // and cache them locally
  // internal images are left untouched in the markup and are proxied through pic/fetch.php handler
  // (if they're not present/overwritten locally)
  $images = $html->find('img')->not('[src^="pic/"]');
  foreach ($images as $image) {
    $src = $image->attr('src');
    if ($src === NULL || $src === '') {
      // nothing to resolve or download for an image without a source
      continue;
    }
    $resolvedUrl = new http\Url(CEZAR_URL, $src, http\Url::FROM_ENV | http\Url::SANITIZE_PATH | http\Url::JOIN_PATH | http\Url::REPLACE);
    $imageUrl = $resolvedUrl->toString();
    // take the extension from the path component only (not from the query string or host name)
    // and keep just alphanumerics, as it becomes a part of a local file name
    $extension = preg_replace('/[^A-Za-z0-9]/', '',
                              pathinfo((string) parse_url($imageUrl, PHP_URL_PATH), PATHINFO_EXTENSION));
    $cachedImageUrl = 'foto/' . md5($imageUrl) . ($extension !== '' ? '.' . $extension : '');
    if (!file_exists($cachedImageUrl) || !$cache) {
      fetch_with_user_agent_spoof($cachedImageUrl, $imageUrl);
    }
    $image->attr('src', $cachedImageUrl);
  }
  $html->top();

  // link to the original URL on the image from foto/ directory and on the name+surname
  // (actually, on every text with font-size:28px set)
  $linking = $html->find('img[src^="foto/"],span[style*=":28px"]');
  foreach ($linking as $link) {
     $link->wrap('<a></a>');
     $link->parent()->attr('href', $mscUrl);
  }
  $html->top();

  // all done
  print $html->html();
}
else {
  die('Nothing to see here, move along.');
}

?>