summaryrefslogtreecommitdiff
path: root/buildscripts/index/api_index.php
blob: 1de8fb56fd88a0f774eb0dbff3eb18a776daf934 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
<?php
/*
 * Created on 10/05/2006
 */
 
/**
 * Builds a Zend_Search_Lucene index from a directory of generated API
 * documentation pages (files whose names start with "class-").
 *
 * Each page becomes one document with stored 'link' and 'title' fields plus
 * searchable 'page' and 'contents' fields.
 */
class api_index 
{
	/** Prefix prepended to a page's file path to form the stored 'link' field. */
	const API_URL = '';
	
	/** Zend_Search_Lucene instance that documents are added to. */
	private $_index;	
	/** Directory that is scanned for "class-*" documentation pages. */
	private $_api;
	
	/**
	 * @param string $index_file index location handed to Zend_Search_Lucene
	 * @param string $api directory containing the API documentation pages
	 */
	public function __construct($index_file, $api)
	{
		$this->_api = $api;
		// The second argument (true) requests creation of a new index.
		$this->_index = new Zend_Search_Lucene($index_file, true);
	}
	
	/**
	 * Indexes every documentation page found in the API directory, echoing
	 * progress to standard output, and commits the index when done.
	 *
	 * @throws RuntimeException if the API directory or a page cannot be read
	 */
	public function create_index()
	{
		echo "Building search index...\n";
		$files = $this->get_files($this->_api);
		$count = 0;
		foreach($files as $file)
		{
			$content = $this->get_details($file, $this->_api);
			$title = $content['class'];
			
			echo "  Adding ".$title."\n";
			
			$doc = new Zend_Search_Lucene_Document();
			
			//unsearchable text
			$doc->addField(Zend_Search_Lucene_Field::UnIndexed('link', $content['link']));
			$doc->addField(Zend_Search_Lucene_Field::UnIndexed('title', $title));
			
			//searchable: lower-cased page text followed by the class name
			$body = strtolower($this->sanitize($content['content']).' '.$title);
			// Dots in the class name are turned into spaces for the 'page' keyword.
			$doc->addField(Zend_Search_Lucene_Field::Keyword('page', strtolower(str_replace('.',' ',$title))));
			$doc->addField(Zend_Search_Lucene_Field::Unstored('contents',$body));
			$this->_index->addDocument($doc);
			$count++;
		}
		$this->_index->commit();
		echo "\n {$count} files indexed.\n";
	}

	/**
	 * Strips any markup from the input and entity-encodes what remains.
	 *
	 * @param string $input raw text
	 * @return string tag-free, entity-encoded text
	 */
	public function sanitize($input) 
	{
		return htmlentities(strip_tags( $input ));
	}	

	/**
	 * Lists the regular files directly inside $path whose names start with
	 * "class-". The scan is not recursive.
	 *
	 * @param string $path directory to scan
	 * @return array resolved (realpath) file names, sorted so that the index
	 *               is built in the same order on every filesystem
	 * @throws RuntimeException if $path is not a directory or cannot be opened
	 */
	public function get_files($path)
	{
		// Check up front: dir() returns false on failure and calling read()
		// on that result would be a fatal error.
		if(!is_dir($path))
			throw new RuntimeException("API directory not found: ".$path);
		
		$d = dir($path);
		if($d === false)
			throw new RuntimeException("Unable to open API directory: ".$path);
		
		$files = array();
		while (false !== ($entry = $d->read()))
		{
			$filepath = $path.'/'.$entry;
			if(is_file($filepath) && strpos($entry, 'class-')===0)
				$files[] = realpath($filepath);
		}
		$d->close();
		
		// Directory read order is filesystem dependent; sort for determinism.
		sort($files);
		return $files;
	}
	
	/**
	 * Reads an HTML documentation page and reduces it to plain text.
	 *
	 * @param string $file path of the HTML page
	 * @return string page text with tags removed and whitespace collapsed
	 * @throws RuntimeException if the file cannot be read
	 */
	public function get_doc_content($file)
	{
		if(!is_readable($file))
			throw new RuntimeException("Unable to read documentation file: ".$file);
		
		$content = file_get_contents($file);
		if($content === false)
			throw new RuntimeException("Unable to read documentation file: ".$file);
		
		// Mark each <h1> with '~~~' so the next pattern has a stop character.
		$html = preg_replace('/<h1>/','~~~', $content);
		// Drop everything from a '<!' (doctype/comment) up to the next '~'.
		$html = preg_replace('/<![^~]+/m', '', $html);
		// Drop the credit block and everything that follows it.
		$html = preg_replace('/<div class="credit">[\s\w\W\S]+/m', '', $html);
		$html = preg_replace('/&nbsp;|~+|\s{2,}/',' ',$html);
		// The replacements above can leave adjacent spaces; collapse them again.
		$html = preg_replace('/\s{2,}/',' ',$html);
		$text = strip_tags($html);
		$text = str_replace(' , ',', ',$text);
		return $text;
	}
	
	/**
	 * Collects the data needed to index one documentation page.
	 *
	 * @param string $file path of the page, as returned by get_files()
	 * @param string $base directory the page was found in
	 * @return array with keys 'content' (plain page text), 'class' (page name
	 *               without the base directory, "class-" and ".html") and
	 *               'link' (API_URL followed by $file)
	 * @throws RuntimeException if the page cannot be read
	 */
	public function get_details($file, $base)
	{
		$result = array();
		$result['content'] = $this->get_doc_content($file);
		
		// $file has been through realpath() while $base may be relative,
		// symlinked or use other separators, so compare against the resolved
		// base and strip it as a prefix rather than as an arbitrary substring.
		$realBase = realpath($base);
		$prefix = ($realBase === false) ? '' : $realBase.DIRECTORY_SEPARATOR;
		if($prefix !== '' && strpos($file, $prefix) === 0)
			$relative = ltrim(substr($file, strlen($prefix)), '/\\');
		else
			$relative = str_replace($base, '', $file); // fallback: original behaviour
		
		$result['class'] = str_replace(array('.html', 'class-'), '', $relative);
		// NOTE(review): with API_URL empty this stores the filesystem path as
		// the link; confirm that is what the search front-end expects.
		$result['link'] = self::API_URL.$file;
		return $result;
	}
}