1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
|
<?php
/*
* Created on 10/05/2006
*/
/**
 * Builds a Zend_Search_Lucene index from a tree of generated API HTML pages.
 *
 * The documentation root is expected to contain one directory per namespace
 * (only directories whose name contains "System" are scanned) and, inside
 * each of them, one HTML file per class.
 */
class api_index
{
    /** Prefix prepended to every document link stored in the index. */
    const API_URL = '';

    /** Zend_Search_Lucene instance created in the constructor. */
    private $_index;

    /** Documentation root directory, as passed to the constructor. */
    private $_api;

    /**
     * @param string $index_file Location handed to Zend_Search_Lucene.
     * @param string $api        Root directory of the API documentation.
     */
    public function __construct($index_file, $api)
    {
        $this->_api = $api;
        // NOTE(review): the second argument presumably asks Zend to create a
        // fresh index -- confirm against the Zend_Search_Lucene documentation.
        $this->_index = new Zend_Search_Lucene($index_file, true);
    }

    /**
     * Indexes every documentation file found below the API root, commits the
     * index and prints progress information to standard output.
     *
     * @throws RuntimeException When a documentation directory cannot be read.
     */
    public function create_index()
    {
        echo "Building search index...\n";
        $files = $this->get_file_list($this->_api);
        $count = 0;
        foreach ($files as $file) {
            echo " processing $file...\n";
            $content = $this->get_details($file, $this->_api);
            $doc = new Zend_Search_Lucene_Document();
            $title = $content['namespace'].'.'.$content['class'];
            echo " Adding ".$title."\n";

            //unsearchable text
            $doc->addField(Zend_Search_Lucene_Field::UnIndexed('link', $content['link']));
            $doc->addField(Zend_Search_Lucene_Field::UnIndexed('title', $title));
            //$doc->addField(Zend_Search_Lucene_Field::UnIndexed('text', $content['content']));

            //searchable: lower-cased page text followed by the lower-cased title
            $body = strtolower($this->sanitize($content['content'])).' '.strtolower($title);
            $doc->addField(Zend_Search_Lucene_Field::Keyword('page', strtolower(str_replace('.', ' ', $title))));
            $doc->addField(Zend_Search_Lucene_Field::Unstored('contents', $body));

            $this->_index->addDocument($doc);
            $count++;
        }
        $this->_index->commit();
        echo "\n {$count} files indexed.\n";
    }

    /**
     * Strips tags from the input and converts the remainder to HTML entities.
     *
     * @param string $input
     * @return string
     */
    public function sanitize($input)
    {
        return htmlentities(strip_tags($input));
    }

    /**
     * Collects the documentation files of every sub-directory of $path whose
     * name contains "System".
     *
     * @param string $path Documentation root directory.
     * @return array Absolute paths, as returned by get_files().
     * @throws RuntimeException When $path or a matching sub-directory cannot be read.
     */
    public function get_file_list($path)
    {
        $files = array();
        foreach ($this->read_entries($path) as $entry) {
            $filepath = $path.'/'.$entry;
            if (is_dir($filepath) && strpos($entry, 'System') !== false) {
                $files = array_merge($files, $this->get_files($filepath));
            }
        }
        return $files;
    }

    /**
     * Lists the regular files directly inside $path, skipping those whose
     * name starts with an underscore.
     *
     * @param string $path Directory to scan (not recursive).
     * @return array realpath()-resolved file paths.
     * @throws RuntimeException When $path cannot be read.
     */
    public function get_files($path)
    {
        $files = array();
        foreach ($this->read_entries($path) as $entry) {
            $filepath = $path.'/'.$entry;
            if (is_file($filepath) && $entry[0] !== '_') {
                $files[] = realpath($filepath);
            }
        }
        return $files;
    }

    /**
     * Reduces an HTML documentation page to plain text for indexing.
     *
     * @param string $file Path of the HTML file.
     * @return string Extracted text; an empty string when the file cannot be read.
     */
    public function get_doc_content($file)
    {
        $content = file_get_contents($file);
        if ($content === false) {
            // file_get_contents() has already raised a warning; return the same
            // empty text the pipeline below used to produce for a failed read.
            return '';
        }

        // Mark every <h1> so the next step knows where the preamble ends.
        $html = preg_replace('/<h1>/', '~~~', $content);
        // Drop everything from each "<!" up to the next marker (or to the end
        // of the document when no marker follows).
        $html = preg_replace('/<![^~]+/m', '', $html);
        // Drop the credit block and everything after it.
        $html = preg_replace('/<div class="credit">[\s\w\W\S]+/m', '', $html);
        // Turn non-breaking spaces (entity or raw UTF-8 bytes), heading markers
        // and whitespace runs into single spaces.
        $html = preg_replace('/&nbsp;|\xC2\xA0|~+|\s{2,}/', ' ', $html);
        // The replacements above can leave adjacent spaces behind; collapse them.
        $html = preg_replace('/\s{2,}/', ' ', $html);

        $text = strip_tags($html);
        return str_replace(' , ', ', ', $text);
    }

    /**
     * Extracts the indexable details of one documentation file.
     *
     * The first two path segments below $base are taken as namespace and
     * class name, after removing ".html" and turning "-" into ".".
     *
     * @param string $file Path of the HTML file.
     * @param string $base Documentation root that $file lives under.
     * @return array Keys: 'content', 'namespace', 'class', 'link'.
     */
    public function get_details($file, $base)
    {
        $result = array();
        $result['content'] = $this->get_doc_content($file);

        $relative = str_replace(
            array('.html', '-'),
            array('', '.'),
            $this->relative_path($file, $base)
        );
        // Leading separators are trimmed so the indices below do not depend on
        // whether $base was given with a trailing slash.
        $segments = preg_split('#[/\\\\]#', ltrim($relative, '/\\'));
        $namespace = isset($segments[0]) ? $segments[0] : '';
        $class = isset($segments[1]) ? $segments[1] : '';

        $result['namespace'] = $namespace;
        $result['class'] = $class;
        $result['link'] = self::API_URL.$namespace.'/'.$class.'.html';
        return $result;
    }

    /**
     * Returns the names of all entries of a directory.
     *
     * @param string $path
     * @return array Entry names in the order the directory handle yields them.
     * @throws RuntimeException When the directory cannot be opened.
     */
    private function read_entries($path)
    {
        $handle = is_dir($path) ? dir($path) : false;
        if (!$handle) {
            throw new RuntimeException("Unable to read directory: {$path}");
        }

        $entries = array();
        while (false !== ($entry = $handle->read())) {
            $entries[] = $entry;
        }
        $handle->close();
        return $entries;
    }

    /**
     * Returns the part of $file that follows $base.
     *
     * Both paths are resolved with realpath() when possible, because
     * get_files() returns resolved paths while the base may be relative,
     * symlinked or written with a trailing slash.
     *
     * @param string $file
     * @param string $base
     * @return string
     */
    private function relative_path($file, $base)
    {
        if ($base === '') {
            return $file;
        }

        $resolvedFile = realpath($file);
        $resolvedBase = realpath($base);
        if ($resolvedFile !== false && $resolvedBase !== false) {
            $file = $resolvedFile;
            $base = $resolvedBase;
        }

        $base = rtrim($base, '/\\');
        if ($base === '') {
            // The base is the filesystem root: nothing to strip.
            return $file;
        }

        $length = strlen($base);
        if (strncmp($file, $base, $length) === 0) {
            $next = substr($file, $length, 1);
            if ($next === '/' || $next === '\\') {
                return substr($file, $length);
            }
        }

        // The file is not below the base: fall back to plain substring removal.
        return str_replace($base, '', $file);
    }
}
?>
|