From f4de82bcdafba51e4eed9cae6b2d3e5375ffd115 Mon Sep 17 00:00:00 2001 From: xue <> Date: Tue, 9 May 2006 12:11:38 +0000 Subject: --- buildscripts/texbuilder/create_index.php | 90 ++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 buildscripts/texbuilder/create_index.php (limited to 'buildscripts/texbuilder/create_index.php') diff --git a/buildscripts/texbuilder/create_index.php b/buildscripts/texbuilder/create_index.php new file mode 100644 index 00000000..b451473d --- /dev/null +++ b/buildscripts/texbuilder/create_index.php @@ -0,0 +1,90 @@ +_index = new Zend_Search_Lucene($index_file, true); + $this->_dir = $index_file; + echo "Building search index...\n"; + } + + public function add($content, $section, $mtime) + { + foreach($this->split_headings($content) as $headers) + { + $doc = new Zend_Search_Lucene_Document(); + $link = "index.php?page=".preg_replace('/\/|\\\/', '.', $section); + $link = str_replace('.page', '', $link).'#'.$headers['section']; + + //unsearchable text + $doc->addField(Zend_Search_Lucene_Field::UnIndexed('link', $link)); + $doc->addField(Zend_Search_Lucene_Field::UnIndexed('mtime', $mtime)); + $doc->addField(Zend_Search_Lucene_Field::UnIndexed('title', $headers['title'])); + $doc->addField(Zend_Search_Lucene_Field::UnIndexed('text', $headers['content'])); + + //searchable text + $doc->addField(Zend_Search_Lucene_Field::Keyword('page', strtolower($headers['title']))); + $body = strtolower($this->sanitize($headers['content'])).' '.strtolower($headers['title']); + $doc->addField(Zend_Search_Lucene_Field::Unstored('contents',$body)); + $this->_index->addDocument($doc); + } + } + + function sanitize($input) + { + return htmlentities(strip_tags( $input )); + } + + public function index() + { + return $this->_index; + } + + protected function split_headings($html) + { + $html = preg_replace('/<\/?com:TContent[^<]*>/', '', $html); + + $html = preg_replace('/([^<]*)<\/b>/', '$1', $html); + $html = preg_replace('/([^<]*)<\/i>/', '$1', $html); + $html = preg_replace('/([^<]*)<\/tt>/', '$1', $html); + + $html = preg_replace('/]*)>([^<]*)<\/h1>/', '$2', $html); + $html = preg_replace('/]*)>([^<]*)<\/h2>/', '$2', $html); + $html = preg_replace('/]*)>([^<]*)<\/h3>/', '$2', $html); + + + $sections = preg_split('/]*>([^<]+)<\/hh>/', $html,-1); + $headers = array(); + preg_match_all('/]*)>([^<]+)<\/hh>/', $html, $headers); + $contents = array(); + for($i = 1, $t = count($sections); $i < $t; $i++) + { + $content['title'] = trim($this->sanitize($headers[2][$i-1])); + $sec = array(); + preg_match('/"([^"]*)"/', $headers[1][$i-1], $sec); + $content['section'] = str_replace('"', '',$sec[0]); + $content['content'] = trim($this->sanitize($sections[$i])); + $contents[] = $content; + } + + return $contents; + } + + public function commit() + { + $this->_index->commit(); + $count = $this->_index->count(); + echo "\nSaving search index ({$count}) to {$this->_dir}\n\n"; + } +} +?> \ No newline at end of file -- cgit v1.2.3