diff options
author | wei <> | 2006-05-07 03:34:25 +0000 |
---|---|---|
committer | wei <> | 2006-05-07 03:34:25 +0000 |
commit | 30eddf57c8de433e8ea02b9e552c8e1744a505a7 (patch) | |
tree | 9e81f3a15f9a695cb96c5cc4dd80de5a3a0bb7b2 /buildscripts | |
parent | 0bb2822f68dfe3cf568affd4acf0d8120d9d53c7 (diff) |
Add search to quickstart demo.
Diffstat (limited to 'buildscripts')
42 files changed, 6729 insertions, 1 deletions
diff --git a/buildscripts/texbuilder/Zend/Exception.php b/buildscripts/texbuilder/Zend/Exception.php new file mode 100644 index 00000000..ab5e4e95 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Exception.php @@ -0,0 +1,28 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * @package Zend + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Exception extends Exception +{} + diff --git a/buildscripts/texbuilder/Zend/LICENSE.txt b/buildscripts/texbuilder/Zend/LICENSE.txt new file mode 100644 index 00000000..bfd3ff11 --- /dev/null +++ b/buildscripts/texbuilder/Zend/LICENSE.txt @@ -0,0 +1,52 @@ +----------------------------------------------------------------------- + The Zend Framework License, Version 1.0 + Copyright (c) 2005 Zend Technologies USA, Inc. All rights reserved. +----------------------------------------------------------------------- + +Redistribution and use in source and binary forms, with or without +modification, is permitted provided that the following conditions +are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + 3. The names "Zend" and "Zend Framework" must not be used to endorse + or promote products derived from this software without prior + permission from Zend Technologies USA, Inc. For written + permission, please contact license@zend.com. + + 4. Zend Technologies USA, Inc. may publish revised and/or new + versions of the license from time to time. Each version will + be given a distinguishing version number. + Once covered code has been published under a particular version + of the license, you may always continue to use it under the + terms of that version. You may also choose to use such covered + code under the terms of any subsequent version of the license + published by Zend Technologies USA, Inc. No one other than Zend + Technologies USA, Inc. has the right to modify the terms + applicable to covered code created under this License. + + 5. Redistributions of any form whatsoever must retain the following + acknowledgment: + "This product includes the Zend Framework, freely available at + http://www.zend.com" + +THIS SOFTWARE IS PROVIDED BY ZEND TECHNOLOGIES USA, INC. ``AS IS'' AND +ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ZEND +TECHNOLOGIES USA, INC. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. + +----------------------------------------------------------------------- diff --git a/buildscripts/texbuilder/Zend/Search/Exception.php b/buildscripts/texbuilder/Zend/Search/Exception.php new file mode 100644 index 00000000..e0aa2221 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Exception.php @@ -0,0 +1,34 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * Framework base exception + */ +require_once 'Zend/Exception.php'; + + +/** + * @package Zend_Search_Lucene + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Exception extends Zend_Exception +{} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene.php b/buildscripts/texbuilder/Zend/Search/Lucene.php new file mode 100644 index 00000000..700a8b8a --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene.php @@ -0,0 +1,569 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Exception */ +require_once 'Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene_Document */ +require_once 'Zend/Search/Lucene/Document.php'; + +/** Zend_Search_Lucene_Storage_Directory */ +require_once 'Zend/Search/Lucene/Storage/Directory/Filesystem.php'; + +/** Zend_Search_Lucene_Index_Term */ +require_once 'Zend/Search/Lucene/Index/Term.php'; + +/** Zend_Search_Lucene_Index_TermInfo */ +require_once 'Zend/Search/Lucene/Index/TermInfo.php'; + +/** Zend_Search_Lucene_Index_SegmentInfo */ +require_once 'Zend/Search/Lucene/Index/SegmentInfo.php'; + +/** Zend_Search_Lucene_Index_FieldInfo */ +require_once 'Zend/Search/Lucene/Index/FieldInfo.php'; + +/** Zend_Search_Lucene_Index_Writer */ +require_once 'Zend/Search/Lucene/Index/Writer.php'; + +/** Zend_Search_Lucene_Search_QueryParser */ +require_once 'Zend/Search/Lucene/Search/QueryParser.php'; + +/** Zend_Search_Lucene_Search_QueryHit */ +require_once 'Zend/Search/Lucene/Search/QueryHit.php'; + +/** Zend_Search_Lucene_Search_Similarity */ +require_once 'Zend/Search/Lucene/Search/Similarity.php'; + + +/** + * @package Zend_Search_Lucene + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene +{ + /** + * File system adapter. + * + * @var Zend_Search_Lucene_Storage_Directory + */ + private $_directory = null; + + /** + * File system adapter closing option + * + * @var boolean + */ + private $_closeDirOnExit = true; + + /** + * Writer for this index, not instantiated unless required. + * + * @var Zend_Search_Lucene_Index_Writer + */ + private $_writer = null; + + /** + * Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index. + * + * @var array Zend_Search_Lucene_Index_SegmentInfo + */ + private $_segmentInfos = array(); + + /** + * Number of documents in this index. + * + * @var integer + */ + private $_docCount = 0; + + + /** + * Opens the index. + * + * IndexReader constructor needs Directory as a parameter. It should be + * a string with a path to the index folder or a Directory object. + * + * @param mixed $directory + * @throws Zend_Search_Lucene_Exception + */ + public function __construct($directory = null, $create = false) + { + if ($directory === null) { + throw new Zend_Search_Exception('No index directory specified'); + } + + if ($directory instanceof Zend_Search_Lucene_Storage_Directory_Filesystem) { + $this->_directory = $directory; + $this->_closeDirOnExit = false; + } else { + $this->_directory = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory); + $this->_closeDirOnExit = true; + } + + if ($create) { + $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, true); + } else { + $this->_writer = null; + } + + $this->_segmentInfos = array(); + + $segmentsFile = $this->_directory->getFileObject('segments'); + + $format = $segmentsFile->readInt(); + + if ($format != (int)0xFFFFFFFF) { + throw new Zend_Search_Lucene_Exception('Wrong segments file format'); + } + + // read version + $segmentsFile->readLong(); + + // read counter + $segmentsFile->readInt(); + + $segments = $segmentsFile->readInt(); + + $this->_docCount = 0; + + // read segmentInfos + for ($count = 0; $count < $segments; $count++) { + $segName = $segmentsFile->readString(); + $segSize = $segmentsFile->readInt(); + $this->_docCount += $segSize; + + $this->_segmentInfos[$count] = + new Zend_Search_Lucene_Index_SegmentInfo($segName, + $segSize, + $this->_directory); + } + } + + + /** + * Object destructor + */ + public function __destruct() + { + $this->commit(); + + if ($this->_closeDirOnExit) { + $this->_directory->close(); + } + } + + /** + * Returns an instance of Zend_Search_Lucene_Index_Writer for the index + * + * @return Zend_Search_Lucene_Index_Writer + */ + public function getIndexWriter() + { + if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) { + $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory); + } + + return $this->_writer; + } + + + /** + * Returns the Zend_Search_Lucene_Storage_Directory instance for this index. + * + * @return Zend_Search_Lucene_Storage_Directory + */ + public function getDirectory() + { + return $this->_directory; + } + + + /** + * Returns the total number of documents in this index. + * + * @return integer + */ + public function count() + { + return $this->_docCount; + } + + + /** + * Performs a query against the index and returns an array + * of Zend_Search_Lucene_Search_QueryHit objects. + * Input is a string or Zend_Search_Lucene_Search_Query. + * + * @param mixed $query + * @return array ZSearchHit + */ + public function find($query) + { + if (is_string($query)) { + $query = Zend_Search_Lucene_Search_QueryParser::parse($query); + } + + if (!$query instanceof Zend_Search_Lucene_Search_Query) { + throw new Zend_Search_Lucene_Exception('Query must be a string or Zend_Search_Lucene_Search_Query object'); + } + + $this->commit(); + + $hits = array(); + $scores = array(); + + $docNum = $this->count(); + for( $count=0; $count < $docNum; $count++ ) { + $docScore = $query->score( $count, $this); + if( $docScore != 0 ) { + $hit = new Zend_Search_Lucene_Search_QueryHit($this); + $hit->id = $count; + $hit->score = $docScore; + + $hits[] = $hit; + $scores[] = $docScore; + } + } + array_multisort($scores, SORT_DESC, SORT_REGULAR, $hits); + + return $hits; + } + + + /** + * Returns a list of all unique field names that exist in this index. + * + * @param boolean $indexed + * @return array + */ + public function getFieldNames($indexed = false) + { + $result = array(); + foreach( $this->_segmentInfos as $segmentInfo ) { + $result = array_merge($result, $segmentInfo->getFields($indexed)); + } + return $result; + } + + + /** + * Returns a Zend_Search_Lucene_Document object for the document + * number $id in this index. + * + * @param integer|Zend_Search_Lucene_Search_QueryHit $id + * @return Zend_Search_Lucene_Document + */ + public function getDocument($id) + { + if ($id instanceof Zend_Search_Lucene_Search_QueryHit) { + /* @var $id Zend_Search_Lucene_Search_QueryHit */ + $id = $id->id; + } + + if ($id >= $this->_docCount) { + /** + * @todo exception here? + */ + return null; + } + + $segCount = 0; + $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count(); + while( $nextSegmentStartId <= $id ) { + $segCount++; + $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count(); + } + $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count(); + + $fdxFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdx'); + $fdxFile->seek( ($id-$segmentStartId)*8, SEEK_CUR ); + $fieldValuesPosition = $fdxFile->readLong(); + + $fdtFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdt'); + $fdtFile->seek( $fieldValuesPosition, SEEK_CUR ); + $fieldCount = $fdtFile->readVInt(); + + $doc = new Zend_Search_Lucene_Document(); + for( $count = 0; $count < $fieldCount; $count++ ) { + $fieldNum = $fdtFile->readVInt(); + $bits = $fdtFile->readByte(); + + $fieldInfo = $this->_segmentInfos[ $segCount ]->getField($fieldNum); + + if( !($bits & 2) ) { // Text data + $field = new Zend_Search_Lucene_Field($fieldInfo->name, + $fdtFile->readString(), + true, + $fieldInfo->isIndexed, + $bits & 1 ); + } else { + $field = new Zend_Search_Lucene_Field($fieldInfo->name, + $fdtFile->readBinary(), + true, + $fieldInfo->isIndexed, + $bits & 1 ); + } + + $doc->addField($field); + } + + return $doc; + } + + + /** + * Returns an array of all the documents which contain term. + * + * @param Zend_Search_Lucene_Index_Term $term + * @return array + */ + public function termDocs(Zend_Search_Lucene_Index_Term $term) + { + $result = array(); + $segmentStartDocId = 0; + + foreach ($this->_segmentInfos as $segInfo) { + $termInfo = $segInfo->getTermInfo($term); + + if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) { + $segmentStartDocId += $segInfo->count(); + continue; + } + + $frqFile = $segInfo->openCompoundFile('.frq'); + $frqFile->seek($termInfo->freqPointer,SEEK_CUR); + $docId = 0; + for( $count=0; $count < $termInfo->docFreq; $count++ ) { + $docDelta = $frqFile->readVInt(); + if( $docDelta % 2 == 1 ) { + $docId += ($docDelta-1)/2; + } else { + $docId += $docDelta/2; + // read freq + $frqFile->readVInt(); + } + $result[] = $segmentStartDocId + $docId; + } + + $segmentStartDocId += $segInfo->count(); + } + + return $result; + } + + + /** + * Returns an array of all term positions in the documents. + * Return array structure: array( docId => array( pos1, pos2, ...), ...) + * + * @param Zend_Search_Lucene_Index_Term $term + * @return array + */ + public function termPositions(Zend_Search_Lucene_Index_Term $term) + { + $result = array(); + $segmentStartDocId = 0; + foreach( $this->_segmentInfos as $segInfo ) { + $termInfo = $segInfo->getTermInfo($term); + + if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) { + $segmentStartDocId += $segInfo->count(); + continue; + } + + $frqFile = $segInfo->openCompoundFile('.frq'); + $frqFile->seek($termInfo->freqPointer,SEEK_CUR); + $freqs = array(); + $docId = 0; + + for( $count = 0; $count < $termInfo->docFreq; $count++ ) { + $docDelta = $frqFile->readVInt(); + if( $docDelta % 2 == 1 ) { + $docId += ($docDelta-1)/2; + $freqs[ $docId ] = 1; + } else { + $docId += $docDelta/2; + $freqs[ $docId ] = $frqFile->readVInt(); + } + } + + $prxFile = $segInfo->openCompoundFile('.prx'); + $prxFile->seek($termInfo->proxPointer,SEEK_CUR); + foreach ($freqs as $docId => $freq) { + $termPosition = 0; + $positions = array(); + + for ($count = 0; $count < $freq; $count++ ) { + $termPosition += $prxFile->readVInt(); + $positions[] = $termPosition; + } + $result[ $segmentStartDocId + $docId ] = $positions; + } + + $segmentStartDocId += $segInfo->count(); + } + + return $result; + } + + + /** + * Returns the number of documents in this index containing the $term. + * + * @param Zend_Search_Lucene_Index_Term $term + * @return integer + */ + public function docFreq(Zend_Search_Lucene_Index_Term $term) + { + $result = 0; + foreach ($this->_segmentInfos as $segInfo) { + $termInfo = $segInfo->getTermInfo($term); + if ($termInfo !== null) { + $result += $termInfo->docFreq; + } + } + + return $result; + } + + + /** + * Retrive similarity used by index reader + * + * @return Zend_Search_Lucene_Search_Similarity + */ + public function getSimilarity() + { + return Zend_Search_Lucene_Search_Similarity::getDefault(); + } + + + /** + * Returns a normalization factor for "field, document" pair. + * + * @param integer $id + * @param string $fieldName + * @return Zend_Search_Lucene_Document + */ + public function norm( $id, $fieldName ) + { + if( $id >= $this->_docCount ) + return null; + + $segCount = 0; + $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count(); + while( $nextSegmentStartId <= $id ) { + $segCount++; + $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count(); + } + + $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count(); + + return $this->_segmentInfos[ $segCount ]->norm($id - $segmentStartId, $fieldName); + } + + + /** + * Adds a document to this index. + * + * @param Zend_Search_Lucene_Document $document + */ + public function addDocument(Zend_Search_Lucene_Document $document) + { + if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) { + $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory); + } + + $this->_writer->addDocument($document); + } + + + /** + * Commit changes resulting from delete() or undeleteAll() operations. + * + * @todo delete() and undeleteAll processing. + */ + public function commit() + { + if ($this->_writer !== null) { + foreach ($this->_writer->commit() as $segmentName => $segmentInfo) { + if ($segmentInfo !== null) { + $this->_segmentInfos[] = $segmentInfo; + $this->_docCount += $segmentInfo->count(); + } else { + foreach ($this->_segmentInfos as $segId => $segInfo) { + if ($segInfo->getName() == $segmentName) { + unset($this->_segmentInfos[$segId]); + } + } + } + } + } + } + + + /************************************************************************* + @todo UNIMPLEMENTED + *************************************************************************/ + + /** + * Returns an array of all terms in this index. + * + * @todo Implementation + * @return array + */ + public function terms() + { + return array(); + } + + + /** + * Returns true if any documents have been deleted from this index. + * + * @todo Implementation + * @return boolean + */ + public function hasDeletions() + { + return false; + } + + + /** + * Deletes a document from the index. $doc may contain a Zend_Search_Lucene_Document + * or the number of the document to delete. + * + * @todo Implementation + * @param mixed $item_to_del + */ + public function delete($doc) + {} + + + /** + * Undeletes all documents currently marked as deleted in this index. + * + * @todo Implementation + */ + public function undeleteAll() + {} +}
\ No newline at end of file diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer.php new file mode 100644 index 00000000..8e234c16 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer.php @@ -0,0 +1,94 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Analysis_Token */ +require_once 'Zend/Search/Lucene/Analysis/Token.php'; + +/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */ +require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php'; + +/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */ +require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php'; + + + +/** + * An Analyzer is used to analyze text. + * It thus represents a policy for extracting index terms from text. + * + * Note: + * Lucene Java implementation is oriented to streams. It provides effective work + * with a huge documents (more then 20Mb). + * But engine itself is not oriented such documents. + * Thus Zend_Search_Lucene analysis API works with data strings and sets (arrays). + * + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + +abstract class Zend_Search_Lucene_Analysis_Analyzer +{ + /** + * The Analyzer implementation used by default. + * + * @var Zend_Search_Lucene_Analysis_Analyzer + */ + static private $_defaultImpl; + + /** + * Tokenize text to a terms + * Returns array of Zend_Search_Lucene_Analysis_Token objects + * + * @param string $data + * @return array + */ + abstract public function tokenize($data); + + + /** + * Set the default Analyzer implementation used by indexing code. + * + * @param Zend_Search_Lucene_Analysis_Analyzer $similarity + */ + static public function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer) + { + self::$_defaultImpl = $analyzer; + } + + + /** + * Return the default Analyzer implementation used by indexing code. + * + * @return Zend_Search_Lucene_Analysis_Analyzer + */ + static public function getDefault() + { + if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) { + self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive(); + } + + return self::$_defaultImpl; + } + +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common.php new file mode 100644 index 00000000..5c61e5b5 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common.php @@ -0,0 +1,73 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Analysis_Analyzer */ +require_once 'Zend/Search/Lucene/Analysis/Analyzer.php'; + + +/** + * Common implementation of the Zend_Search_Lucene_Analysis_Analyzer interface. + * There are several standard standard subclasses provided by Zend_Search_Lucene/Analysis + * subpackage: Zend_Search_Lucene_Analysis_Analyzer_Common_Text, ZSearchHTMLAnalyzer, ZSearchXMLAnalyzer. + * + * @todo ZSearchHTMLAnalyzer and ZSearchXMLAnalyzer implementation + * + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +abstract class Zend_Search_Lucene_Analysis_Analyzer_Common extends Zend_Search_Lucene_Analysis_Analyzer +{ + /** + * The set of Token filters applied to the Token stream. + * Array of Zend_Search_Lucene_Analysis_TokenFilter objects. + * + * @var array + */ + private $_filters = array(); + + /** + * Add Token filter to the Analyzer + * + * @param Zend_Search_Lucene_Analysis_TokenFilter $filter + */ + public function addFilter(Zend_Search_Lucene_Analysis_TokenFilter $filter) + { + $this->_filters[] = $filter; + } + + /** + * Apply filters to the token. + * + * @param Zend_Search_Lucene_Analysis_Token $token + * @return Zend_Search_Lucene_Analysis_Token + */ + public function normalize(Zend_Search_Lucene_Analysis_Token $token) + { + foreach ($this->_filters as $filter) { + $token = $filter->normalize($token); + } + + return $token; + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php new file mode 100644 index 00000000..2a80c1f8 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php @@ -0,0 +1,76 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Analysis_Analyzer_Common */ +require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + +class Zend_Search_Lucene_Analysis_Analyzer_Common_Text extends Zend_Search_Lucene_Analysis_Analyzer_Common +{ + /** + * Tokenize text to a terms + * Returns array of Zend_Search_Lucene_Analysis_Token objects + * + * @param string $data + * @return array + */ + public function tokenize($data) + { + $tokenStream = array(); + + $position = 0; + while ($position < strlen($data)) { + // skip white space + while ($position < strlen($data) && !ctype_alpha( $data{$position} )) { + $position++; + } + + $termStartPosition = $position; + + // read token + while ($position < strlen($data) && ctype_alpha( $data{$position} )) { + $position++; + } + + // Empty token, end of stream. + if ($position == $termStartPosition) { + break; + } + + $token = new Zend_Search_Lucene_Analysis_Token(substr($data, + $termStartPosition, + $position-$termStartPosition), + $termStartPosition, + $position); + $tokenStream[] = $this->normalize($token); + } + + return $tokenStream; + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php new file mode 100644 index 00000000..d77e38d5 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php @@ -0,0 +1,43 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */ +require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php'; + +/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCase */ +require_once 'Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + +class Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_Text +{ + public function __construct() + { + $this->addFilter(new Zend_Search_Lucene_Analysis_TokenFilter_LowerCase()); + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Token.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Token.php new file mode 100644 index 00000000..a60d5d96 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Token.php @@ -0,0 +1,170 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage document + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Analysis_Token +{ + /** + * The text of the term. + * + * @var string + */ + private $_termText; + + /** + * Start in source text. + * + * @var integer + */ + private $_startOffset; + + /** + * End in source text + * + * @var integer + */ + private $_endOffset; + + /** + * Lexical type. + * + * @var string + */ + private $_type; + + /** + * The position of this token relative to the previous Token. + * + * The default value is one. + * + * Some common uses for this are: + * Set it to zero to put multiple terms in the same position. This is + * useful if, e.g., a word has multiple stems. Searches for phrases + * including either stem will match. In this case, all but the first stem's + * increment should be set to zero: the increment of the first instance + * should be one. Repeating a token with an increment of zero can also be + * used to boost the scores of matches on that token. + * + * Set it to values greater than one to inhibit exact phrase matches. + * If, for example, one does not want phrases to match across removed stop + * words, then one could build a stop word filter that removes stop words and + * also sets the increment to the number of stop words removed before each + * non-stop word. Then exact phrase queries will only match when the terms + * occur with no intervening stop words. + * + * @var integer + */ + private $_positionIncrement; + + + /** + * Object constructor + * + * @param string $text + * @param integer $start + * @param integer $end + * @param string $type + */ + public function __construct($text, $start, $end, $type = 'word' ) + { + $this->_termText = $text; + $this->_startOffset = $start; + $this->_endOffset = $end; + $this->_type = $type; + + $this->_positionIncrement = 1; + } + + + /** + * positionIncrement setter + * + * @param integer $positionIncrement + */ + public function setPositionIncrement($positionIncrement) + { + $this->_positionIncrement = $positionIncrement; + } + + /** + * Returns the position increment of this Token. + * + * @return integer + */ + public function getPositionIncrement() + { + return $this->_positionIncrement; + } + + /** + * Returns the Token's term text. + * + * @return string + */ + public function getTermText() + { + return $this->_termText; + } + + /** + * Returns this Token's starting offset, the position of the first character + * corresponding to this token in the source text. + * + * Note: + * The difference between getEndOffset() and getStartOffset() may not be equal + * to strlen(Zend_Search_Lucene_Analysis_Token::getTermText()), as the term text may have been altered + * by a stemmer or some other filter. + * + * @return integer + */ + public function getStartOffset() + { + return $this->_startOffset; + } + + /** + * Returns this Token's ending offset, one greater than the position of the + * last character corresponding to this token in the source text. + * + * @return integer + */ + public function getEndOffset() + { + return $this->_endOffset; + } + + /** + * Returns this Token's lexical type. Defaults to 'word'. + * + * @return string + */ + public function getType() + { + return $this->_type; + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/TokenFilter.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/TokenFilter.php new file mode 100644 index 00000000..9ea5125f --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/TokenFilter.php @@ -0,0 +1,45 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Analysis_Token */ +require_once 'Zend/Search/Lucene/Analysis/Token.php'; + + +/** + * Token filter converts (normalizes) Token ore removes it from a token stream. + * + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + +abstract class Zend_Search_Lucene_Analysis_TokenFilter +{ + /** + * Normalize Token or remove it (if null is returned) + * + * @param Zend_Search_Lucene_Analysis_Token $srcToken + * @return Zend_Search_Lucene_Analysis_Token + */ + abstract public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken); +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php new file mode 100644 index 00000000..53585e21 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php @@ -0,0 +1,55 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Analysis_TokenFilter */ +require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php'; + + +/** + * Lower case Token filter. + * + * @package Zend_Search_Lucene + * @subpackage Analysis + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + +class Zend_Search_Lucene_Analysis_TokenFilter_LowerCase extends Zend_Search_Lucene_Analysis_TokenFilter +{ + /** + * Normalize Token or remove it (if null is returned) + * + * @param Zend_Search_Lucene_Analysis_Token $srcToken + * @return Zend_Search_Lucene_Analysis_Token + */ + public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) + { + $newToken = new Zend_Search_Lucene_Analysis_Token(strtolower( $srcToken->getTermText() ), + $srcToken->getStartOffset(), + $srcToken->getEndOffset(), + $srcToken->getType()); + + $newToken->setPositionIncrement($srcToken->getPositionIncrement()); + + return $newToken; + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Document.php b/buildscripts/texbuilder/Zend/Search/Lucene/Document.php new file mode 100644 index 00000000..29c0c2d9 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Document.php @@ -0,0 +1,109 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage document + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Field */ +require_once 'Zend/Search/Lucene/Field.php'; + + +/** + * A Document is a set of fields. Each field has a name and a textual value. + * + * @package Zend_Search_Lucene + * @subpackage document + * @copyright Copyright (c) 2005-2006 Zend Technologies Inc. (http://www.zend.com) + * @license Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Document +{ + + /** + * Associative array Zend_Search_Lucene_Field objects where the keys to the + * array are the names of the fields. + * + * @var array + */ + protected $_fields = array(); + + public $boost = 1.0; + + + /** + * Proxy method for getFieldValue(), provides more convenient access to + * the string value of a field. + * + * @param $offset + * @return string + */ + public function __get($offset) + { + return $this->getFieldValue($offset); + } + + + /** + * Add a field object to this document. + * + * @param Zend_Search_Lucene_Field $field + */ + public function addField(Zend_Search_Lucene_Field $field) + { + $this->_fields[$field->name] = $field; + } + + + /** + * Return an array with the names of the fields in this document. + * + * @return array + */ + public function getFieldNames() + { + return array_keys($this->_fields); + } + + + /** + * Returns Zend_Search_Lucene_Field object for a named field in this document. + * + * @param string $fieldName + * @return Zend_Search_Lucene_Field + */ + public function getField($fieldName) + { + if (!array_key_exists($fieldName, $this->_fields)) { + throw new Zend_Search_Lucene_Exception("Field name \"$fieldName\" not found in document."); + } + return $this->_fields[$fieldName]; + } + + + /** + * Returns the string value of a named field in this document. + * + * @see __get() + * @return string + */ + public function getFieldValue($fieldName) + { + return $this->getField($fieldName)->stringValue; + } + +} diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Exception.php b/buildscripts/texbuilder/Zend/Search/Lucene/Exception.php new file mode 100644 index 00000000..5f12c5f6 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Exception.php @@ -0,0 +1,34 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * Framework base exception + */ +require_once 'Zend/Search/Exception.php'; + + +/** + * @package Zend_Search_Lucene + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Exception extends Zend_Search_Exception +{} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Field.php b/buildscripts/texbuilder/Zend/Search/Lucene/Field.php new file mode 100644 index 00000000..cce6bfce --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Field.php @@ -0,0 +1,134 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage document + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * A field is a section of a Document. Each field has two parts, + * a name and a value. Values may be free text or they may be atomic + * keywords, which are not further processed. Such keywords may + * be used to represent dates, urls, etc. Fields are optionally + * stored in the index, so that they may be returned with hits + * on the document. + * + * @package Zend_Search_Lucene + * @subpackage document + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Field +{ + public $kind; + + public $name = 'body'; + public $stringValue = null; + public $isStored = false; + public $isIndexed = true; + public $isTokenized = true; + public $isBinary = false; + + public $storeTermVector = false; + + public $boost = 1.0; + + public function __construct($name, $stringValue, $isStored, $isIndexed, $isTokenized, $isBinary = false) + { + $this->name = $name; + $this->stringValue = $stringValue; + $this->isStored = $isStored; + $this->isIndexed = $isIndexed; + $this->isTokenized = $isTokenized; + $this->isBinary = $isBinary; + + $this->storeTermVector = false; + $this->boost = 1.0; + } + + + /** + * Constructs a String-valued Field that is not tokenized, but is indexed + * and stored. Useful for non-text fields, e.g. date or url. + * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function Keyword($name, $value) + { + return new self($name, $value, true, true, false); + } + + + /** + * Constructs a String-valued Field that is not tokenized nor indexed, + * but is stored in the index, for return with hits. + * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function UnIndexed($name, $value) + { + return new self($name, $value, true, false, false); + } + + + /** + * Constructs a Binary String valued Field that is not tokenized nor indexed, + * but is stored in the index, for return with hits. + * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function Binary($name, $value) + { + return new self($name, $value, true, false, false, true); + } + + /** + * Constructs a String-valued Field that is tokenized and indexed, + * and is stored in the index, for return with hits. Useful for short text + * fields, like "title" or "subject". Term vector will not be stored for this field. + * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function Text($name, $value) + { + return new self($name, $value, true, true, true); + } + + + /** + * Constructs a String-valued Field that is tokenized and indexed, + * but that is not stored in the index. + * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function UnStored($name, $value) + { + return new self($name, $value, false, true, true); + } + +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/FieldInfo.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/FieldInfo.php new file mode 100644 index 00000000..eaca4ecf --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/FieldInfo.php @@ -0,0 +1,43 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Index_FieldInfo +{ + public $name; + public $isIndexed; + public $number; + public $storeTermVector; + + public function __construct( $name, $isIndexed, $number, $storeTermVector ) + { + $this->name = $name; + $this->isIndexed = $isIndexed; + $this->number = $number; + $this->storeTermVector = $storeTermVector; + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentInfo.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentInfo.php new file mode 100644 index 00000000..f5c596a0 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentInfo.php @@ -0,0 +1,412 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Exception */ +require_once 'Zend/Search/Lucene/Exception.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Index_SegmentInfo +{ + /** + * Number of docs in a segment + * + * @var integer + */ + private $_docCount; + + /** + * Segment name + * + * @var string + */ + private $_name; + + /** + * Term Dictionary Index + * Array of the Zend_Search_Lucene_Index_Term objects + * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos + * + * @var array + */ + private $_termDictionary; + + /** + * Term Dictionary Index TermInfos + * Array of the Zend_Search_Lucene_Index_TermInfo objects + * + * @var array + */ + private $_termDictionaryInfos; + + /** + * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment + * + * @var array + */ + private $_fields; + + /** + * Field positions in a dictionary. + * (Term dictionary contains filelds ordered by names) + * + * @var array + */ + private $_fieldsDicPositions; + + + /** + * Associative array where the key is the file name and the value is data offset + * in a compound segment file (.csf). + * + * @var array + */ + private $_segFiles; + + /** + * File system adapter. + * + * @var Zend_Search_Lucene_Storage_Directory_Filesystem + */ + private $_directory; + + /** + * Normalization factors. + * An array fieldName => normVector + * normVector is a binary string. + * Each byte corresponds to an indexed document in a segment and + * encodes normalization factor (float value, encoded by + * Zend_Search_Lucene_Search_Similarity::encodeNorm()) + * + * @var array + */ + private $_norms = array(); + + /** + * Zend_Search_Lucene_Index_SegmentInfo constructor needs Segmentname, + * Documents count and Directory as a parameter. + * + * @param string $name + * @param integer $docCount + * @param Zend_Search_Lucene_Storage_Directory $directory + */ + public function __construct($name, $docCount, $directory) + { + $this->_name = $name; + $this->_docCount = $docCount; + $this->_directory = $directory; + $this->_termDictionary = null; + + $this->_segFiles = array(); + $cfsFile = $this->_directory->getFileObject($name . '.cfs'); + $segFilesCount = $cfsFile->readVInt(); + + for ($count = 0; $count < $segFilesCount; $count++) { + $dataOffset = $cfsFile->readLong(); + $fileName = $cfsFile->readString(); + $this->_segFiles[$fileName] = $dataOffset; + } + + $fnmFile = $this->openCompoundFile('.fnm'); + $fieldsCount = $fnmFile->readVInt(); + $fieldNames = array(); + $fieldNums = array(); + $this->_fields = array(); + for ($count=0; $count < $fieldsCount; $count++) { + $fieldName = $fnmFile->readString(); + $fieldBits = $fnmFile->readByte(); + $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName, + $fieldBits & 1, + $count, + $fieldBits & 2 ); + if ($fieldBits & 0x10) { + // norms are omitted for the indexed field + $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount); + } + + $fieldNums[$count] = $count; + $fieldNames[$count] = $fieldName; + } + array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums); + $this->_fieldsDicPositions = array_flip($fieldNums); + } + + /** + * Opens index file stoted within compound index file + * + * @param string $extension + * @throws Zend_Search_Lucene_Exception + * @return Zend_Search_Lucene_Storage_File + */ + public function openCompoundFile($extension) + { + $filename = $this->_name . $extension; + + if( !isset($this->_segFiles[ $filename ]) ) { + throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain ' + . $filename . ' file.' ); + } + + $file = $this->_directory->getFileObject( $this->_name.".cfs" ); + $file->seek( $this->_segFiles[ $filename ] ); + return $file; + } + + /** + * Returns field index or -1 if field is not found + * + * @param string $fieldName + * @return integer + */ + public function getFieldNum($fieldName) + { + foreach( $this->_fields as $field ) { + if( $field->name == $fieldName ) { + return $field->number; + } + } + + return -1; + } + + /** + * Returns field info for specified field + * + * @param integer $fieldNum + * @return ZSearchFieldInfo + */ + public function getField($fieldNum) + { + return $this->_fields[$fieldNum]; + } + + /** + * Returns array of fields. + * if $indexed parameter is true, then returns only indexed fields. + * + * @param boolean $indexed + * @return array + */ + public function getFields($indexed = false) + { + $result = array(); + foreach( $this->_fields as $field ) { + if( (!$indexed) || $field->isIndexed ) { + $result[ $field->name ] = $field->name; + } + } + return $result; + } + + /** + * Returns the total number of documents in this segment. + * + * @return integer + */ + public function count() + { + return $this->_docCount; + } + + + /** + * Loads Term dictionary from TermInfoIndex file + */ + protected function _loadDictionary() + { + if ($this->_termDictionary !== null) { + return; + } + + $this->_termDictionary = array(); + $this->_termDictionaryInfos = array(); + + $tiiFile = $this->openCompoundFile('.tii'); + $tiVersion = $tiiFile->readInt(); + if ($tiVersion != (int)0xFFFFFFFE) { + throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format'); + } + + $indexTermCount = $tiiFile->readLong(); + $tiiFile->readInt(); // IndexInterval + $skipInterval = $tiiFile->readInt(); + + $prevTerm = ''; + $freqPointer = 0; + $proxPointer = 0; + $indexPointer = 0; + for ($count = 0; $count < $indexTermCount; $count++) { + $termPrefixLength = $tiiFile->readVInt(); + $termSuffix = $tiiFile->readString(); + $termValue = substr( $prevTerm, 0, $termPrefixLength ) . $termSuffix; + + $termFieldNum = $tiiFile->readVInt(); + $docFreq = $tiiFile->readVInt(); + $freqPointer += $tiiFile->readVInt(); + $proxPointer += $tiiFile->readVInt(); + if( $docFreq >= $skipInterval ) { + $skipDelta = $tiiFile->readVInt(); + } else { + $skipDelta = 0; + } + + $indexPointer += $tiiFile->readVInt(); + + $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue,$termFieldNum); + $this->_termDictionaryInfos[] = + new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer); + $prevTerm = $termValue; + } + } + + + /** + * Return segment name + * + * @return string + */ + public function getName() + { + return $this->_name; + } + + + /** + * Scans terms dictionary and returns term info + * + * @param Zend_Search_Lucene_Index_Term $term + * @return Zend_Search_Lucene_Index_TermInfo + */ + public function getTermInfo($term) + { + $this->_loadDictionary(); + + $searchField = $this->getFieldNum($term->field); + + if ($searchField == -1) { + return null; + } + $searchDicField = $this->_fieldsDicPositions[$searchField]; + + // search for appropriate value in dictionary + $lowIndex = 0; + $highIndex = count($this->_termDictionary)-1; + while ($highIndex >= $lowIndex) { + // $mid = ($highIndex - $lowIndex)/2; + $mid = ($highIndex + $lowIndex) >> 1; + $midTerm = $this->_termDictionary[$mid]; + + $delta = $searchDicField - $this->_fieldsDicPositions[$midTerm->field]; + if ($delta == 0) { + $delta = strcmp($term->text, $midTerm->text); + } + + if ($delta < 0) { + $highIndex = $mid-1; + } elseif ($delta > 0) { + $lowIndex = $mid+1; + } else { + return $this->_termDictionaryInfos[$mid]; // We got it! + } + } + + if ($highIndex == -1) { + // Term is out of the dictionary range + return null; + } + + $prevPosition = $highIndex; + $prevTerm = $this->_termDictionary[$prevPosition]; + $prevTermInfo = $this->_termDictionaryInfos[ $prevPosition ]; + + $tisFile = $this->openCompoundFile('.tis'); + $tiVersion = $tisFile->readInt(); + if ($tiVersion != (int)0xFFFFFFFE) { + throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format'); + } + + $termCount = $tisFile->readLong(); + $indexInterval = $tisFile->readInt(); + $skipInterval = $tisFile->readInt(); + + $tisFile->seek($prevTermInfo->indexPointer - 20 /* header size*/, SEEK_CUR); + + $termValue = $prevTerm->text; + $termFieldNum = $prevTerm->field; + $freqPointer = $prevTermInfo->freqPointer; + $proxPointer = $prevTermInfo->proxPointer; + for ($count = $prevPosition*$indexInterval + 1; + $count < $termCount && + ( $this->_fieldsDicPositions[ $termFieldNum ] < $searchDicField || + ($this->_fieldsDicPositions[ $termFieldNum ] == $searchDicField && + strcmp($termValue, $term->text) < 0) ); + $count++) { + $termPrefixLength = $tisFile->readVInt(); + $termSuffix = $tisFile->readString(); + $termFieldNum = $tisFile->readVInt(); + $termValue = substr( $termValue, 0, $termPrefixLength ) . $termSuffix; + + $docFreq = $tisFile->readVInt(); + $freqPointer += $tisFile->readVInt(); + $proxPointer += $tisFile->readVInt(); + if( $docFreq >= $skipInterval ) { + $skipOffset = $tisFile->readVInt(); + } else { + $skipOffset = 0; + } + } + + if ($termFieldNum == $searchField && $termValue == $term->text) { + return new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset); + } else { + return null; + } + } + + /** + * Returns normalization factor for specified documents + * + * @param integer $id + * @param string $fieldName + * @return string + */ + public function norm($id, $fieldName) + { + $fieldNum = $this->getFieldNum($fieldName); + + if ( !($this->_fields[$fieldNum]->isIndexed) ) { + return null; + } + + if ( !isset( $this->_norms[$fieldNum] )) { + $fFile = $this->openCompoundFile('.f' . $fieldNum); + $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount); + } + + return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum]{$id}) ); + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php new file mode 100644 index 00000000..f90d6ed3 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php @@ -0,0 +1,491 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Exception */ +require_once 'Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene_Analysis_Analyzer */ +require_once 'Zend/Search/Lucene/Analysis/Analyzer.php'; + +/** Zend_Search_Lucene_Index_SegmentInfo */ +require_once 'Zend/Search/Lucene/Index/SegmentInfo.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Index_SegmentWriter +{ + /** + * Expert: The fraction of terms in the "dictionary" which should be stored + * in RAM. Smaller values use more memory, but make searching slightly + * faster, while larger values use less memory and make searching slightly + * slower. Searching is typically not dominated by dictionary lookup, so + * tweaking this is rarely useful. + * + * @var integer + */ + static public $indexInterval = 128; + + /** Expert: The fraction of TermDocs entries stored in skip tables. + * Larger values result in smaller indexes, greater acceleration, but fewer + * accelerable cases, while smaller values result in bigger indexes, + * less acceleration and more + * accelerable cases. More detailed experiments would be useful here. + * + * 0x0x7FFFFFFF indicates that we don't use skip data + * Default value is 16 + * + * @var integer + */ + static public $skipInterval = 0x7FFFFFFF; + + /** + * Number of docs in a segment + * + * @var integer + */ + private $_docCount; + + /** + * Segment name + * + * @var string + */ + private $_name; + + /** + * File system adapter. + * + * @var Zend_Search_Lucene_Storage_Directory + */ + private $_directory; + + /** + * List of the index files. + * Used for automatic compound file generation + * + * @var unknown_type + */ + private $_files; + + /** + * Term Dictionary + * Array of the Zend_Search_Lucene_Index_Term objects + * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos + * + * @var array + */ + private $_termDictionary; + + /** + * Documents, which contain the term + * + * @var array + */ + private $_termDocs; + + /** + * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment + * + * @var array + */ + private $_fields; + + /** + * Normalization factors. + * An array fieldName => normVector + * normVector is a binary string. + * Each byte corresponds to an indexed document in a segment and + * encodes normalization factor (float value, encoded by + * Zend_Search_Lucene_Search_Similarity::encodeNorm()) + * + * @var array + */ + private $_norms; + + + /** + * '.fdx' file - Stored Fields, the field index. + * + * @var Zend_Search_Lucene_Storage_File + */ + private $_fdxFile; + + /** + * '.fdx' file - Stored Fields, the field data. + * + * @var Zend_Search_Lucene_Storage_File + */ + private $_fdtFile; + + + /** + * Object constructor. + * + * @param Zend_Search_Lucene_Storage_Directory $directory + * @param string $name + */ + public function __construct($directory, $name) + { + $this->_directory = $directory; + $this->_name = $name; + $this->_docCount = 0; + + $this->_fields = array(); + $this->_termDocs = array(); + $this->_files = array(); + $this->_norms = array(); + + $this->_fdxFile = null; + $this->_fdtFile = null; + } + + + /** + * Add field to the segment + * + * @param Zend_Search_Lucene_Field $field + */ + private function _addFieldInfo(Zend_Search_Lucene_Field $field) + { + if (!isset($this->_fields[$field->name])) { + $this->_fields[$field->name] = + new Zend_Search_Lucene_Index_FieldInfo($field->name, + $field->isIndexed, + count($this->_fields), + $field->storeTermVector); + } else { + $this->_fields[$field->name]->isIndexed |= $field->isIndexed; + $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector; + } + } + + + /** + * Adds a document to this segment. + * + * @param Zend_Search_Lucene_Document $document + * @throws Zend_Search_Lucene_Exception + */ + public function addDocument(Zend_Search_Lucene_Document $document) + { + $storedFields = array(); + + foreach ($document->getFieldNames() as $fieldName) { + $field = $document->getField($fieldName); + $this->_addFieldInfo($field); + + if ($field->storeTermVector) { + /** + * @todo term vector storing support + */ + throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.'); + } + + if ($field->isIndexed) { + if ($field->isTokenized) { + $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue); + } else { + $tokenList = array(); + $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue)); + } + + $position = 0; + foreach ($tokenList as $token) { + $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name); + $termKey = $term->key(); + + if (!isset($this->_termDictionary[$termKey])) { + // New term + $this->_termDictionary[$termKey] = $term; + $this->_termDocs[$termKey] = array(); + $this->_termDocs[$termKey][$this->_docCount] = array(); + } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) { + // Existing term, but new term entry + $this->_termDocs[$termKey][$this->_docCount] = array(); + } + $position += $token->getPositionIncrement(); + $this->_termDocs[$termKey][$this->_docCount][] = $position; + } + } + + if ($field->isStored) { + $storedFields[] = $field; + } + } + + if (count($storedFields) != 0) { + if (!isset($this->_fdxFile)) { + $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx'); + $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt'); + + $this->_files[] = $this->_name . '.fdx'; + $this->_files[] = $this->_name . '.fdt'; + } + + $this->_fdxFile->writeLong($this->_fdtFile->tell()); + + $this->_fdtFile->writeVInt(count($storedFields)); + foreach ($storedFields as $field) { + $this->_fdtFile->writeVInt($this->_fields[$field->name]->number); + $this->_fdtFile->writeByte($field->isTokenized ? 0x01 : 0x00 | + $field->isBinary ? 0x02 : 0x00 | + 0x00 /* 0x04 - third bit, compressed (ZLIB) */ ); + if ($field->isBinary) { + $this->_fdtFile->writeVInt(strlen($field->stringValue)); + $this->_fdtFile->writeBytes($field->stringValue); + } else { + $this->_fdtFile->writeString($field->stringValue); + } + } + } + + $this->_docCount++; + } + + + /** + * Dump Field Info (.fnm) segment file + */ + private function _dumpFNM() + { + $fnmFile = $this->_directory->createFile($this->_name . '.fnm'); + $fnmFile->writeVInt(count($this->_fields)); + + foreach ($this->_fields as $field) { + $fnmFile->writeString($field->name); + $fnmFile->writeByte(($field->isIndexed ? 0x01 : 0x00) | + ($field->storeTermVector ? 0x02 : 0x00) | +// not supported yet 0x04 /* term positions are stored with the term vectors */ | +// not supported yet 0x08 /* term offsets are stored with the term vectors */ | +/* not supported yet */ 0x10 /* norms are omitted for the indexed field */ + ); + } + + $this->_files[] = $this->_name . '.fnm'; + } + + + /** + * Dump Term Dictionary segment file entry. + * Used to write entry to .tis or .tii files + * + * @param Zend_Search_Lucene_Storage_File $dicFile + * @param Zend_Search_Lucene_Index_Term $prevTerm + * @param Zend_Search_Lucene_Index_Term $term + * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo + * @param Zend_Search_Lucene_Index_TermInfo $termInfo + */ + private function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile, + &$prevTerm, Zend_Search_Lucene_Index_Term $term, + &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo) + { + if (isset($prevTerm) && $prevTerm->field == $term->field) { + $prefixLength = 0; + while ($prefixLength < strlen($prevTerm->text) && + $prefixLength < strlen($term->text) && + $prevTerm->text{$prefixLength} == $term->text{$prefixLength} + ) { + $prefixLength++; + } + // Write preffix length + $dicFile->writeVInt($prefixLength); + // Write suffix + $dicFile->writeString( substr($term->text, $prefixLength) ); + } else { + // Write preffix length + $dicFile->writeVInt(0); + // Write suffix + $dicFile->writeString($term->text); + } + // Write field number + $dicFile->writeVInt($term->field); + // DocFreq (the count of documents which contain the term) + $dicFile->writeVInt($termInfo->docFreq); + + $prevTerm = $term; + + if (!isset($prevTermInfo)) { + // Write FreqDelta + $dicFile->writeVInt($termInfo->freqPointer); + // Write ProxDelta + $dicFile->writeVInt($termInfo->proxPointer); + } else { + // Write FreqDelta + $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer); + // Write ProxDelta + $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer); + } + // Write SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval + if ($termInfo->skipOffset != 0) { + $dicFile->writeVInt($termInfo->skipOffset); + } + + $prevTermInfo = $termInfo; + } + + /** + * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files + */ + private function _dumpDictionary() + { + $tisFile = $this->_directory->createFile($this->_name . '.tis'); + $tisFile->writeInt((int)0xFFFFFFFE); + $tisFile->writeLong(count($this->_termDictionary)); + $tisFile->writeInt(self::$indexInterval); + $tisFile->writeInt(self::$skipInterval); + + $tiiFile = $this->_directory->createFile($this->_name . '.tii'); + $tiiFile->writeInt((int)0xFFFFFFFE); + $tiiFile->writeLong((int)((count($this->_termDictionary) - 1)/self::$indexInterval) + 1); + $tiiFile->writeInt(self::$indexInterval); + $tiiFile->writeInt(self::$skipInterval); + + $frqFile = $this->_directory->createFile($this->_name . '.frq'); + $prxFile = $this->_directory->createFile($this->_name . '.prx'); + + $termKeys = array_keys($this->_termDictionary); + sort($termKeys, SORT_STRING); + + $termCount = 0; + + $prevTerm = null; + $prevTermInfo = null; + $prevIndexTerm = null; + $prevIndexTermInfo = null; + $prevIndexPosition = 0; + + foreach ($termKeys as $termId) { + $freqPointer = $frqFile->tell(); + $proxPointer = $prxFile->tell(); + + $prevDoc = 0; + foreach ($this->_termDocs[$termId] as $docId => $termPositions) { + $docDelta = ($docId - $prevDoc)*2; + $prevDoc = $docId; + if (count($termPositions) > 1) { + $frqFile->writeVInt($docDelta); + $frqFile->writeVInt(count($termPositions)); + } else { + $frqFile->writeVInt($docDelta + 1); + } + + $prevPosition = 0; + foreach ($termPositions as $position) { + $prxFile->writeVInt($position - $prevPosition); + $prevPosition = $position; + } + } + + if (count($this->_termDocs[$termId]) >= self::$skipInterval) { + /** + * @todo Write Skip Data to a freq file. + * It's not used now, but must be implemented to be compatible with Lucene + */ + $skipOffset = $frqFile->tell() - $freqPointer; + } else { + $skipOffset = 0; + } + + $term = new Zend_Search_Lucene_Index_Term($this->_termDictionary[$termId]->text, + $this->_fields[$this->_termDictionary[$termId]->field]->number); + $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($this->_termDocs[$termId]), + $freqPointer, $proxPointer, $skipOffset); + + $this->_dumpTermDictEntry($tisFile, $prevTerm, $term, $prevTermInfo, $termInfo); + + if ($termCount % self::$indexInterval == 0) { + $this->_dumpTermDictEntry($tiiFile, $prevIndexTerm, $term, $prevIndexTermInfo, $termInfo); + + $indexPosition = $tisFile->tell(); + $tiiFile->writeVInt($indexPosition - $prevIndexPosition); + $prevIndexPosition = $indexPosition; + } + $termCount++; + } + + $this->_files[] = $this->_name . '.tis'; + $this->_files[] = $this->_name . '.tii'; + $this->_files[] = $this->_name . '.frq'; + $this->_files[] = $this->_name . '.prx'; + } + + + /** + * Generate compound index file + */ + private function _generateCFS() + { + $cfsFile = $this->_directory->createFile($this->_name . '.cfs'); + $cfsFile->writeVInt(count($this->_files)); + + $dataOffsetPointers = array(); + foreach ($this->_files as $fileName) { + $dataOffsetPointers[$fileName] = $cfsFile->tell(); + $cfsFile->writeLong(0); // write dummy data + $cfsFile->writeString($fileName); + } + + foreach ($this->_files as $fileName) { + // Get actual data offset + $dataOffset = $cfsFile->tell(); + // Seek to the data offset pointer + $cfsFile->seek($dataOffsetPointers[$fileName]); + // Write actual data offset value + $cfsFile->writeLong($dataOffset); + // Seek back to the end of file + $cfsFile->seek($dataOffset); + + $dataFile = $this->_directory->getFileObject($fileName); + $cfsFile->writeBytes($dataFile->readBytes($this->_directory->fileLength($fileName))); + + $this->_directory->deleteFile($fileName); + } + } + + + /** + * Close segment, write it to disk and return segment info + * + * @return Zend_Search_Lucene_Index_SegmentInfo + */ + public function close() + { + if ($this->_docCount == 0) { + return null; + } + + $this->_dumpFNM(); + $this->_dumpDictionary(); + + $this->_generateCFS(); + + return new Zend_Search_Lucene_Index_SegmentInfo($this->_name, + $this->_docCount, + $this->_directory); + } + +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/Term.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/Term.php new file mode 100644 index 00000000..e30ce587 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/Term.php @@ -0,0 +1,70 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * A Term represents a word from text. This is the unit of search. It is + * composed of two elements, the text of the word, as a string, and the name of + * the field that the text occured in, an interned string. + * + * Note that terms may represent more than words from text fields, but also + * things like dates, email addresses, urls, etc. + * + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Index_Term +{ + /** + * Field name or field number (depending from context) + * + * @var mixed + */ + public $field; + + /** + * Term value + * + * @var string + */ + public $text; + + + /** + * @todo docblock + */ + public function __construct( $text, $field = 'contents' ) + { + $this->field = $field; + $this->text = $text; + } + + + /** + * @todo docblock + */ + public function key() + { + return $this->field . chr(0) . $this->text; + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/TermInfo.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/TermInfo.php new file mode 100644 index 00000000..ddef721d --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/TermInfo.php @@ -0,0 +1,77 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * A Zend_Search_Lucene_Index_TermInfo represents a record of information stored for a term. + * + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Index_TermInfo +{ + /** + * The number of documents which contain the term. + * + * @var integer + */ + public $docFreq; + + /** + * Data offset in a Frequencies file. + * + * @var integer + */ + public $freqPointer; + + /** + * Data offset in a Positions file. + * + * @var integer + */ + public $proxPointer; + + /** + * ScipData offset in a Frequencies file. + * + * @var integer + */ + public $skipOffset; + + /** + * Term offset of the _next_ term in a TermDictionary file. + * Used only for Term Index + * + * @var integer + */ + public $indexPointer; + + public function __construct($docFreq, $freqPointer, $proxPointer, $skipOffset, $indexPointer = null) + { + $this->docFreq = $docFreq; + $this->freqPointer = $freqPointer; + $this->proxPointer = $proxPointer; + $this->skipOffset = $skipOffset; + $this->indexPointer = $indexPointer; + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/Writer.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/Writer.php new file mode 100644 index 00000000..da4af000 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/Writer.php @@ -0,0 +1,308 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Index_SegmentWriter */ +require_once 'Zend/Search/Lucene/Index/SegmentWriter.php'; + +/** Zend_Search_Lucene_Index_SegmentInfo */ +require_once 'Zend/Search/Lucene/Index/SegmentInfo.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Index + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Index_Writer +{ + /** + * @todo Implement segment merger + * @todo Implement mergeFactor, minMergeDocs, maxMergeDocs usage. + * @todo Implement Analyzer substitution + * @todo Implement Zend_Search_Lucene_Storage_DirectoryRAM and Zend_Search_Lucene_Storage_FileRAM to use it for + * temporary index files + * @todo Directory lock processing + */ + + /** + * File system adapter. + * + * @var Zend_Search_Lucene_Storage_Directory + */ + private $_directory = null; + + + /** + * Index version + * Counts how often the index has been changed by adding or deleting docs + * + * @var integer + */ + private $_version; + + /** + * Segment name counter. + * Used to name new segments . + * + * @var integer + */ + private $_segmentNameCounter; + + /** + * Number of the segments in the index + * + * @var inteher + */ + private $_segments; + + /** + * Determines how often segment indices + * are merged by addDocument(). + * + * @var integer + */ + public $mergeFactor; + + /** + * Determines the minimal number of documents required before + * the buffered in-memory documents are merging and a new Segment + * is created. + * + * @var integer + */ + public $minMergeDocs; + + /** + * Determines the largest number of documents ever merged by addDocument(). + * + * @var integer + */ + public $maxMergeDocs; + + /** + * List of the segments, created by index writer + * Array of Zend_Search_Lucene_Index_SegmentInfo objects + * + * @var array + */ + private $_newSegments; + + /** + * Current segment to add documents + * + * @var Zend_Search_Lucene_Index_SegmentWriter + */ + private $_currentSegment; + + /** + * Opens the index for writing + * + * IndexWriter constructor needs Directory as a parameter. It should be + * a string with a path to the index folder or a Directory object. + * Second constructor parameter create is optional - true to create the + * index or overwrite the existing one. + * + * @param Zend_Search_Lucene_Storage_Directory $directory + * @param boolean $create + */ + public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $create = false) + { + $this->_directory = $directory; + + if ($create) { + foreach ($this->_directory->fileList() as $file) { + if ($file == 'deletable' || + $file == 'segments' || + substr($file, strlen($file)-4) == '.cfs') { + $this->_directory->deleteFile($file); + } + } + $segmentsFile = $this->_directory->createFile('segments'); + $segmentsFile->writeInt((int)0xFFFFFFFF); + // write version + $segmentsFile->writeLong(0); + // write name counter + $segmentsFile->writeInt(0); + // write segment counter + $segmentsFile->writeInt(0); + + $deletableFile = $this->_directory->createFile('deletable'); + // write counter + $deletableFile->writeInt(0); + + $this->_version = 0; + $this->_segmentNameCounter = 0; + $this->_segments = 0; + } else { + $segmentsFile = $this->_directory->getFileObject('segments'); + $format = $segmentsFile->readInt(); + if ($format != (int)0xFFFFFFFF) { + throw new Zend_Search_Lucene_Exception('Wrong segments file format'); + } + + // read version + $this->_version = $segmentsFile->readLong(); + // read counter + $this->_segmentNameCounter = $segmentsFile->readInt(); + // read segment counter + $this->_segments = $segmentsFile->readInt(); + } + + $this->_newSegments = array(); + $this->_currentSegment = null; + } + + /** + * Adds a document to this index. + * + * @param Zend_Search_Lucene_Document $document + */ + public function addDocument(Zend_Search_Lucene_Document $document) + { + if ($this->_currentSegment === null) { + $this->_currentSegment = + new Zend_Search_Lucene_Index_SegmentWriter($this->_directory, $this->_newSegmentName()); + } + $this->_currentSegment->addDocument($document); + $this->_version++; + } + + + + /** + * Update segments file by adding current segment to a list + * @todo !!!!!Finish the implementation + * + * @throws Zend_Search_Lucene_Exception + */ + private function _updateSegments() + { + $segmentsFile = $this->_directory->getFileObject('segments'); + $newSegmentFile = $this->_directory->createFile('segments.new'); + + $newSegmentFile->writeInt((int)0xFFFFFFFF); + $newSegmentFile->writeLong($this->_version); + $newSegmentFile->writeInt($this->_segmentNameCounter); + $newSegmentFile->writeInt($this->_segments + count($this->_newSegments)); + + $segmentsFile->seek(20); + $newSegmentFile->writeBytes($segmentsFile->readBytes($this->_directory->fileLength('segments') - 20)); + + foreach ($this->_newSegments as $segmentName => $segmentInfo) { + $newSegmentFile->writeString($segmentName); + $newSegmentFile->writeInt($segmentInfo->count()); + } + + $this->_directory->renameFile('segments.new', 'segments'); + } + + + /** + * Commit current changes + * returns array of new segments + * + * @return array + */ + public function commit() + { + if ($this->_currentSegment !== null) { + $newSegment = $this->_currentSegment->close(); + if ($newSegment !== null) { + $this->_newSegments[$newSegment->getName()] = $newSegment; + } + $this->_currentSegment = null; + } + + if (count($this->_newSegments) != 0) { + $this->_updateSegments(); + } + + $result = $this->_newSegments; + $this->_newSegments = array(); + + return $result; + } + + + /** + * Merges the provided indexes into this index. + * + * @param array $readers + * @return void + */ + public function addIndexes($readers) + { + /** + * @todo implementation + */ + } + + + /** + * Returns the number of documents currently in this index. + * + * @return integer + */ + public function docCount($readers) + { + /** + * @todo implementation + */ + } + + + /** + * Flushes all changes to an index and closes all associated files. + * + */ + public function close() + { + /** + * @todo implementation + */ + } + + + /** + * Merges all segments together into a single segment, optimizing + * an index for search. + * + * return void + */ + public function optimize() + { + /** + * @todo implementation + */ + } + + /** + * Get name for new segment + * + * @return string + */ + private function _newSegmentName() + { + return '_' . base_convert($this->_segmentNameCounter++, 10, 36); + } + +} diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query.php new file mode 100644 index 00000000..dd8698e8 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query.php @@ -0,0 +1,98 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +abstract class Zend_Search_Lucene_Search_Query +{ + + /** + * query boost factor + * + * @var float + */ + private $_boost = 1.0; + + /** + * Query weight + * + * @var Zend_Search_Lucene_Search_Weight + */ + protected $_weight; + + + /** + * Gets the boost for this clause. Documents matching + * this clause will (in addition to the normal weightings) have their score + * multiplied by boost. The boost is 1.0 by default. + * + * @return float + */ + public function getBoost() + { + return $this->_boost; + } + + /** + * Sets the boost for this query clause to $boost. + * + * @param float $boost + */ + public function setBoost($boost) + { + $this->_boost = $boost; + } + + /** + * Score specified document + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + abstract public function score($docId, $reader); + + /** + * Constructs an appropriate Weight implementation for this query. + * + * @param Zend_Search_Lucene $reader + * @return Zend_Search_Lucene_Search_Weight + */ + abstract protected function _createWeight($reader); + + /** + * Constructs an initializes a Weight for a query. + * + * @param Zend_Search_Lucene $reader + */ + protected function _initWeight($reader) + { + $this->_weight = $this->_createWeight($reader); + $sum = $this->_weight->sumOfSquaredWeights(); + $queryNorm = $reader->getSimilarity()->queryNorm($sum); + $this->_weight->normalize($queryNorm); + } + +}
\ No newline at end of file diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/MultiTerm.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/MultiTerm.php new file mode 100644 index 00000000..4a99c0f7 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/MultiTerm.php @@ -0,0 +1,437 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Search_Query */ +require_once 'Zend/Search/Lucene/Search/Query.php'; + +/** Zend_Search_Lucene_Search_Weight_MultiTerm */ +require_once 'Zend/Search/Lucene/Search/Weight/MultiTerm.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Search_Query +{ + + /** + * Terms to find. + * Array of Zend_Search_Lucene_Index_Term + * + * @var array + */ + private $_terms = array(); + + /** + * Term signs. + * If true then term is required. + * If false then term is prohibited. + * If null then term is neither prohibited, nor required + * + * If array is null then all terms are required + * + * @var array + */ + + private $_signs = array(); + + /** + * Result vector. + * Bitset or array of document IDs + * (depending from Bitset extension availability). + * + * @var mixed + */ + private $_resVector = null; + + /** + * Terms positions vectors. + * Array of Arrays: + * term1Id => (docId => array( pos1, pos2, ... ), ...) + * term2Id => (docId => array( pos1, pos2, ... ), ...) + * + * @var array + */ + private $_termsPositions = array(); + + + /** + * A score factor based on the fraction of all query terms + * that a document contains. + * float for conjunction queries + * array of float for non conjunction queries + * + * @var mixed + */ + private $_coord = null; + + + /** + * Terms weights + * array of Zend_Search_Lucene_Search_Weight + * + * @var array + */ + private $_weights = array(); + + + /** + * Class constructor. Create a new multi-term query object. + * + * @param array $terms Array of Zend_Search_Lucene_Index_Term objects + * @param array $signs Array of signs. Sign is boolean|null. + * @return void + */ + public function __construct($terms = null, $signs = null) + { + /** + * @todo Check contents of $terms and $signs before adding them. + */ + if (is_array($terms)) { + $this->_terms = $terms; + + $this->_signs = null; + // Check if all terms are required + if (is_array($signs)) { + foreach ($signs as $sign ) { + if ($sign !== true) { + $this->_signs = $signs; + continue; + } + } + } + } + } + + + /** + * Add a $term (Zend_Search_Lucene_Index_Term) to this query. + * + * The sign is specified as: + * TRUE - term is required + * FALSE - term is prohibited + * NULL - term is neither prohibited, nor required + * + * @param Zend_Search_Lucene_Index_Term $term + * @param boolean|null $sign + * @return void + */ + public function addTerm(Zend_Search_Lucene_Index_Term $term, $sign=null) { + $this->_terms[] = $term; + + /** + * @todo This is not good. Sometimes $this->_signs is an array, sometimes + * it is null, even when there are terms. It will be changed so that + * it is always an array. + */ + if ($this->_signs === null) { + if ($sign !== null) { + $this->_signs = array(); + foreach ($this->_terms as $term) { + $this->_signs[] = null; + } + $this->_signs[] = $sign; + } + } else { + $this->_signs[] = $sign; + } + } + + + /** + * Returns query term + * + * @return array + */ + public function getTerms() + { + return $this->_terms; + } + + + /** + * Return terms signs + * + * @return array + */ + public function getSigns() + { + return $this->_signs; + } + + + /** + * Set weight for specified term + * + * @param integer $num + * @param Zend_Search_Lucene_Search_Weight_Term $weight + */ + public function setWeight($num, $weight) + { + $this->_weights[$num] = $weight; + } + + + /** + * Constructs an appropriate Weight implementation for this query. + * + * @param Zend_Search_Lucene $reader + * @return Zend_Search_Lucene_Search_Weight + */ + protected function _createWeight($reader) + { + return new Zend_Search_Lucene_Search_Weight_MultiTerm($this, $reader); + } + + + /** + * Calculate result vector for Conjunction query + * (like '+something +another') + * + * @param Zend_Search_Lucene $reader + */ + private function _calculateConjunctionResult($reader) + { + if (extension_loaded('bitset')) { + foreach( $this->_terms as $termId=>$term ) { + if($this->_resVector === null) { + $this->_resVector = bitset_from_array($reader->termDocs($term)); + } else { + $this->_resVector = bitset_intersection( + $this->_resVector, + bitset_from_array($reader->termDocs($term)) ); + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + } else { + foreach( $this->_terms as $termId=>$term ) { + if($this->_resVector === null) { + $this->_resVector = array_flip($reader->termDocs($term)); + } else { + $termDocs = array_flip($reader->termDocs($term)); + foreach($this->_resVector as $key=>$value) { + if (!isset( $termDocs[$key] )) { + unset( $this->_resVector[$key] ); + } + } + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + } + } + + + /** + * Calculate result vector for non Conjunction query + * (like '+something -another') + * + * @param Zend_Search_Lucene $reader + */ + private function _calculateNonConjunctionResult($reader) + { + if (extension_loaded('bitset')) { + $required = null; + $neither = bitset_empty(); + $prohibited = bitset_empty(); + + foreach ($this->_terms as $termId => $term) { + $termDocs = bitset_from_array($reader->termDocs($term)); + + if ($this->_signs[$termId] === true) { + // required + if ($required !== null) { + $required = bitset_intersection($required, $termDocs); + } else { + $required = $termDocs; + } + } elseif ($this->_signs[$termId] === false) { + // prohibited + $prohibited = bitset_union($prohibited, $termDocs); + } else { + // neither required, nor prohibited + $neither = bitset_union($neither, $termDocs); + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + + if ($required === null) { + $required = $neither; + } + $this->_resVector = bitset_intersection( $required, + bitset_invert($prohibited, $reader->count()) ); + } else { + $required = null; + $neither = array(); + $prohibited = array(); + + foreach ($this->_terms as $termId => $term) { + $termDocs = array_flip($reader->termDocs($term)); + + if ($this->_signs[$termId] === true) { + // required + if ($required !== null) { + // substitute for bitset_intersection + foreach ($required as $key => $value) { + if (!isset( $termDocs[$key] )) { + unset($required[$key]); + } + } + } else { + $required = $termDocs; + } + } elseif ($this->_signs[$termId] === false) { + // prohibited + // substitute for bitset_union + foreach ($termDocs as $key => $value) { + $prohibited[$key] = $value; + } + } else { + // neither required, nor prohibited + // substitute for bitset_union + foreach ($termDocs as $key => $value) { + $neither[$key] = $value; + } + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + + if ($required === null) { + $required = $neither; + } + + foreach ($required as $key=>$value) { + if (isset( $prohibited[$key] )) { + unset($required[$key]); + } + } + $this->_resVector = $required; + } + } + + + /** + * Score calculator for conjunction queries (all terms are required) + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function _conjunctionScore($docId, $reader) + { + if ($this->_coord === null) { + $this->_coord = $reader->getSimilarity()->coord(count($this->_terms), + count($this->_terms) ); + } + + $score = 0.0; + + foreach ($this->_terms as $termId=>$term) { + $score += $reader->getSimilarity()->tf(count($this->_termsPositions[$termId][$docId]) ) * + $this->_weights[$termId]->getValue() * + $reader->norm($docId, $term->field); + } + + return $score * $this->_coord; + } + + + /** + * Score calculator for non conjunction queries (not all terms are required) + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function _nonConjunctionScore($docId, $reader) + { + if ($this->_coord === null) { + $this->_coord = array(); + + $maxCoord = 0; + foreach ($this->_signs as $sign) { + if ($sign !== false /* not prohibited */) { + $maxCoord++; + } + } + + for ($count = 0; $count <= $maxCoord; $count++) { + $this->_coord[$count] = $reader->getSimilarity()->coord($count, $maxCoord); + } + } + + $score = 0.0; + $matchedTerms = 0; + foreach ($this->_terms as $termId=>$term) { + // Check if term is + if ($this->_signs[$termId] !== false && // not prohibited + isset($this->_termsPositions[$termId][$docId]) // matched + ) { + $matchedTerms++; + $score += + $reader->getSimilarity()->tf(count($this->_termsPositions[$termId][$docId]) ) * + $this->_weights[$termId]->getValue() * + $reader->norm($docId, $term->field); + } + } + + return $score * $this->_coord[$matchedTerms]; + } + + /** + * Score specified document + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function score($docId, $reader) + { + if($this->_resVector === null) { + if ($this->_signs === null) { + $this->_calculateConjunctionResult($reader); + } else { + $this->_calculateNonConjunctionResult($reader); + } + + $this->_initWeight($reader); + } + + if ( (extension_loaded('bitset')) ? + bitset_in($this->_resVector, $docId) : + isset($this->_resVector[$docId]) ) { + if ($this->_signs === null) { + return $this->_conjunctionScore($docId, $reader); + } else { + return $this->_nonConjunctionScore($docId, $reader); + } + } else { + return 0; + } + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/Phrase.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/Phrase.php new file mode 100644 index 00000000..3e52666b --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/Phrase.php @@ -0,0 +1,424 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * Zend_Search_Lucene_Search_Query + */ +require_once 'Zend/Search/Lucene/Search/Query.php'; + +/** + * Zend_Search_Lucene_Search_Weight_MultiTerm + */ +require_once 'Zend/Search/Lucene/Search/Weight/Phrase.php'; + + +/** + * A Query that matches documents containing a particular sequence of terms. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Search_Query_Phrase extends Zend_Search_Lucene_Search_Query +{ + /** + * Terms to find. + * Array of Zend_Search_Lucene_Index_Term objects. + * + * @var array + */ + private $_terms; + + /** + * Term positions (relative positions of terms within the phrase). + * Array of integers + * + * @var array + */ + private $_offsets; + + /** + * Sets the number of other words permitted between words in query phrase. + * If zero, then this is an exact phrase search. For larger values this works + * like a WITHIN or NEAR operator. + * + * The slop is in fact an edit-distance, where the units correspond to + * moves of terms in the query phrase out of position. For example, to switch + * the order of two words requires two moves (the first move places the words + * atop one another), so to permit re-orderings of phrases, the slop must be + * at least two. + * More exact matches are scored higher than sloppier matches, thus search + * results are sorted by exactness. + * + * The slop is zero by default, requiring exact matches. + * + * @var unknown_type + */ + private $_slop; + + /** + * Result vector. + * Bitset or array of document IDs + * (depending from Bitset extension availability). + * + * @var mixed + */ + private $_resVector = null; + + /** + * Terms positions vectors. + * Array of Arrays: + * term1Id => (docId => array( pos1, pos2, ... ), ...) + * term2Id => (docId => array( pos1, pos2, ... ), ...) + * + * @var array + */ + private $_termsPositions = array(); + + /** + * Class constructor. Create a new prase query. + * + * @param string $field Field to search. + * @param array $terms Terms to search Array of strings. + * @param array $offsets Relative term positions. Array of integers. + * @throws Zend_Search_Lucene_Exception + */ + public function __construct($terms = null, $offsets = null, $field = null) + { + $this->_slop = 0; + + if (is_array($terms)) { + $this->_terms = array(); + foreach ($terms as $termId => $termText) { + $this->_terms[$termId] = ($field !== null)? new Zend_Search_Lucene_Index_Term($termText, $field): + new Zend_Search_Lucene_Index_Term($termText); + } + } else if ($terms === null) { + $this->_terms = array(); + } else { + throw new Zend_Search_Lucene_Exception('terms argument must be array of strings or null'); + } + + if (is_array($offsets)) { + if (count($this->_terms) != count($offsets)) { + throw new Zend_Search_Lucene_Exception('terms and offsets arguments must have the same size.'); + } + $this->_offsets = $offsets; + } else if ($offsets === null) { + $this->_offsets = array(); + foreach ($this->_terms as $termId => $term) { + $position = count($this->_offsets); + $this->_offsets[$termId] = $position; + } + } else { + throw new Zend_Search_Lucene_Exception('offsets argument must be array of strings or null'); + } + } + + /** + * Set slop + * + * @param integer $slop + */ + public function setSlop($slop) + { + $this->_slop = $slop; + } + + + /** + * Get slop + * + * @return integer + */ + public function getSlop() + { + return $this->_slop; + } + + + /** + * Adds a term to the end of the query phrase. + * The relative position of the term is specified explicitly or the one immediately + * after the last term added. + * + * @param Zend_Search_Lucene_Index_Term $term + * @param integer $position + */ + public function addTerm(Zend_Search_Lucene_Index_Term $term, $position = null) { + if ((count($this->_terms) != 0)&&(end($this->_terms)->field != $term->field)) { + throw new Zend_Search_Lucene_Exception('All phrase terms must be in the same field: ' . + $term->field . ':' . $term->text); + } + + $this->_terms[] = $term; + if ($position !== null) { + $this->_offsets[] = $position; + } else if (count($this->_offsets) != 0) { + $this->_offsets[] = end($this->_offsets) + 1; + } else { + $this->_offsets[] = 0; + } + } + + + /** + * Returns query term + * + * @return array + */ + public function getTerms() + { + return $this->_terms; + } + + + /** + * Set weight for specified term + * + * @param integer $num + * @param Zend_Search_Lucene_Search_Weight_Term $weight + */ + public function setWeight($num, $weight) + { + $this->_weights[$num] = $weight; + } + + + /** + * Constructs an appropriate Weight implementation for this query. + * + * @param Zend_Search_Lucene $reader + * @return Zend_Search_Lucene_Search_Weight + */ + protected function _createWeight($reader) + { + return new Zend_Search_Lucene_Search_Weight_Phrase($this, $reader); + } + + + /** + * Calculate result vector + * + * @param Zend_Search_Lucene $reader + */ + private function _calculateResult($reader) + { + if (extension_loaded('bitset')) { + foreach( $this->_terms as $termId=>$term ) { + if($this->_resVector === null) { + $this->_resVector = bitset_from_array($reader->termDocs($term)); + } else { + $this->_resVector = bitset_intersection( + $this->_resVector, + bitset_from_array($reader->termDocs($term)) ); + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + } else { + foreach( $this->_terms as $termId=>$term ) { + if($this->_resVector === null) { + $this->_resVector = array_flip($reader->termDocs($term)); + } else { + $termDocs = array_flip($reader->termDocs($term)); + foreach($this->_resVector as $key=>$value) { + if (!isset( $termDocs[$key] )) { + unset( $this->_resVector[$key] ); + } + } + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + } + } + + + /** + * Score calculator for exact phrase queries (terms sequence is fixed) + * + * @param integer $docId + * @return float + */ + public function _exactPhraseFreq($docId) + { + $freq = 0; + + // Term Id with lowest cardinality + $lowCardTermId = null; + + // Calculate $lowCardTermId + foreach ($this->_terms as $termId => $term) { + if ($lowCardTermId === null || + count($this->_termsPositions[$termId][$docId]) < + count($this->_termsPositions[$lowCardTermId][$docId]) ) { + $lowCardTermId = $termId; + } + } + + // Walk through positions of the term with lowest cardinality + foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos) { + // We expect phrase to be found + $freq++; + + // Walk through other terms + foreach ($this->_terms as $termId => $term) { + if ($termId != $lowCardTermId) { + $expectedPosition = $lowCardPos + + ($this->_offsets[$termId] - + $this->_offsets[$lowCardTermId]); + + if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) { + $freq--; // Phrase wasn't found. + break; + } + } + } + } + + return $freq; + } + + /** + * Score calculator for sloppy phrase queries (terms sequence is fixed) + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function _sloppyPhraseFreq($docId, Zend_Search_Lucene $reader) + { + $freq = 0; + + $phraseQueue = array(); + $phraseQueue[0] = array(); // empty phrase + $lastTerm = null; + + // Walk through the terms to create phrases. + foreach ($this->_terms as $termId => $term) { + $queueSize = count($phraseQueue); + $firstPass = true; + + // Walk through the term positions. + // Each term position produces a set of phrases. + foreach ($this->_termsPositions[$termId][$docId] as $termPosition ) { + if ($firstPass) { + for ($count = 0; $count < $queueSize; $count++) { + $phraseQueue[$count][$termId] = $termPosition; + } + } else { + for ($count = 0; $count < $queueSize; $count++) { + if ($lastTerm !== null && + abs( $termPosition - $phraseQueue[$count][$lastTerm] - + ($this->_offsets[$termId] - $this->_offsets[$lastTerm])) > $this->_slop) { + continue; + } + + $newPhraseId = count($phraseQueue); + $phraseQueue[$newPhraseId] = $phraseQueue[$count]; + $phraseQueue[$newPhraseId][$termId] = $termPosition; + } + + } + + $firstPass = false; + } + $lastTerm = $termId; + } + + + foreach ($phraseQueue as $phrasePos) { + $minDistance = null; + + for ($shift = -$this->_slop; $shift <= $this->_slop; $shift++) { + $distance = 0; + $start = reset($phrasePos) - reset($this->_offsets) + $shift; + + foreach ($this->_terms as $termId => $term) { + $distance += abs($phrasePos[$termId] - $this->_offsets[$termId] - $start); + + if($distance > $this->_slop) { + break; + } + } + + if ($minDistance === null || $distance < $minDistance) { + $minDistance = $distance; + } + } + + if ($minDistance <= $this->_slop) { + $freq += $reader->getSimilarity()->sloppyFreq($minDistance); + } + } + + return $freq; + } + + + /** + * Score specified document + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function score($docId, $reader) + { + // optimize zero-term case + if (count($this->_terms) == 0) { + return 0; + } + + if($this->_resVector === null) { + $this->_calculateResult($reader); + $this->_initWeight($reader); + } + + if ( (extension_loaded('bitset')) ? + bitset_in($this->_resVector, $docId) : + isset($this->_resVector[$docId]) ) { + if ($this->_slop == 0) { + $freq = $this->_exactPhraseFreq($docId); + } else { + $freq = $this->_sloppyPhraseFreq($docId, $reader); + } + +/* + return $reader->getSimilarity()->tf($freq) * + $this->_weight->getValue() * + $reader->norm($docId, reset($this->_terms)->field); +*/ + if ($freq != 0) { + $tf = $reader->getSimilarity()->tf($freq); + $weight = $this->_weight->getValue(); + $norm = $reader->norm($docId, reset($this->_terms)->field); + + return $tf*$weight*$norm; + } + } else { + return 0; + } + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/Term.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/Term.php new file mode 100644 index 00000000..d622f845 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/Term.php @@ -0,0 +1,126 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Search_Query */ +require_once 'Zend/Search/Lucene/Search/Query.php'; + +/** Zend_Search_Lucene_Search_Weight_Term */ +require_once 'Zend/Search/Lucene/Search/Weight/Term.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Search_Query_Term extends Zend_Search_Lucene_Search_Query +{ + /** + * Term to find. + * + * @var Zend_Search_Lucene_Index_Term + */ + private $_term; + + /** + * Term sign. + * If true then term is required + * If false then term is prohibited. + * + * @var bool + */ + private $_sign; + + /** + * Documents vector. + * Bitset or array of document IDs + * (depending from Bitset extension availability). + * + * @var mixed + */ + private $_docVector = null; + + /** + * Term positions vector. + * Array: docId => array( pos1, pos2, ... ) + * + * @var array + */ + private $_termPositions; + + + /** + * Zend_Search_Lucene_Search_Query_Term constructor + * + * @param Zend_Search_Lucene_Index_Term $term + * @param boolean $sign + */ + public function __construct( $term, $sign = true ) + { + $this->_term = $term; + $this->_sign = $sign; + } + + + /** + * Constructs an appropriate Weight implementation for this query. + * + * @param Zend_Search_Lucene $reader + * @return Zend_Search_Lucene_Search_Weight + */ + protected function _createWeight($reader) + { + return new Zend_Search_Lucene_Search_Weight_Term($this->_term, $this, $reader); + } + + /** + * Score specified document + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function score( $docId, $reader ) + { + if($this->_docVector===null) { + if (extension_loaded('bitset')) { + $this->_docVector = bitset_from_array( $reader->termDocs($this->_term) ); + } else { + $this->_docVector = array_flip($reader->termDocs($this->_term)); + } + + $this->_termPositions = $reader->termPositions($this->_term); + $this->_initWeight($reader); + } + + $match = extension_loaded('bitset') ? bitset_in($this->_docVector, $docId) : + isset($this->_docVector[$docId]); + if ($this->_sign && $match) { + return $reader->getSimilarity()->tf(count($this->_termPositions[$docId]) ) * + $this->_weight->getValue() * + $reader->norm($docId, $this->_term->field); + } else { + return 0; + } + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/QueryHit.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/QueryHit.php new file mode 100644 index 00000000..65290a9e --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/QueryHit.php @@ -0,0 +1,106 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Search_QueryHit +{ + /** + * Object handle of the index + * @var Zend_Search_Lucene + */ + protected $_index = null; + + /** + * Object handle of the document associated with this hit + * @var Zend_Search_Lucene_Document + */ + protected $_document = null; + + /** + * Number of the document in the index + * @var integer + */ + public $id; + + /** + * Score of the hit + * @var float + */ + public $score; + + + /** + * Constructor - pass object handle of Zend_Search_Lucene index that produced + * the hit so the document can be retrieved easily from the hit. + * + * @param Zend_Search_Lucene $index + */ + + public function __construct(Zend_Search_Lucene $index) + { + $this->_index = $index; + } + + + /** + * Convenience function for getting fields from the document + * associated with this hit. + * + * @param string $offset + * @return string + */ + public function __get($offset) + { + return $this->getDocument()->getFieldValue($offset); + } + + + /** + * Return the document object for this hit + * + * @return Zend_Search_Lucene_Document + */ + public function getDocument() + { + if (!$this->_document instanceof Zend_Search_Lucene_Document) { + $this->_document = $this->_index->getDocument($this->id); + } + + return $this->_document; + } + + + /** + * Return the index object for this hit + * + * @return Zend_Search_Lucene + */ + public function getIndex() + { + return $this->_index; + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/QueryParser.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/QueryParser.php new file mode 100644 index 00000000..9387afca --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/QueryParser.php @@ -0,0 +1,140 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Search_QueryTokenizer */ +require_once 'Zend/Search/Lucene/Search/QueryTokenizer.php'; + +/** Zend_Search_Lucene_Index_Term */ +require_once 'Zend/Search/Lucene/Index/Term.php'; + +/** Zend_Search_Lucene_Search_Query_Term */ +require_once 'Zend/Search/Lucene/Search/Query/Term.php'; + +/** Zend_Search_Lucene_Search_Query_MultiTerm */ +require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php'; + +/** Zend_Search_Lucene_Search_Query_Phrase */ +require_once 'Zend/Search/Lucene/Search/Query/Phrase.php'; + + +/** Zend_Search_Lucene_Exception */ +require_once 'Zend/Search/Lucene/Exception.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Search_QueryParser +{ + + /** + * Parses a query string, returning a Zend_Search_Lucene_Search_Query + * + * @param string $strQuery + * @return Zend_Search_Lucene_Search_Query + */ + static public function parse($strQuery) + { + $tokens = new Zend_Search_Lucene_Search_QueryTokenizer($strQuery); + + // Empty query + if (!$tokens->count()) { + throw new Zend_Search_Lucene_Exception('Syntax error: query string cannot be empty.'); + } + + // Term query + if ($tokens->count() == 1) { + if ($tokens->current()->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD) { + return new Zend_Search_Lucene_Search_Query_Term(new Zend_Search_Lucene_Index_Term($tokens->current()->text, 'contents')); + } else { + throw new Zend_Search_Lucene_Exception('Syntax error: query string must contain at least one word.'); + } + } + + + /** + * MultiTerm Query + * + * Process each token that was returned by the tokenizer. + */ + $terms = array(); + $signs = array(); + $prevToken = null; + $openBrackets = 0; + $field = 'contents'; + foreach ($tokens as $token) { + switch ($token->type) { + case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD: + $terms[] = new Zend_Search_Lucene_Index_Term($token->text, $field); + $field = 'contents'; + if ($prevToken !== null && + $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) { + if ($prevToken->text == "+") { + $signs[] = true; + } else { + $signs[] = false; + } + } else { + $signs[] = null; + } + break; + case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN: + if ($prevToken !== null && + $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) { + throw new Zend_Search_Lucene_Exception('Syntax error: sign operator must be followed by a word.'); + } + break; + case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_FIELD: + $field = $token->text; + // let previous token to be signed as next $prevToken + $token = $prevToken; + break; + case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_BRACKET: + $token->text=='(' ? $openBrackets++ : $openBrackets--; + } + $prevToken = $token; + } + + // Finish up parsing: check the last token in the query for an opening sign or parenthesis. + if ($prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) { + throw new Zend_Search_Lucene_Exception('Syntax Error: sign operator must be followed by a word.'); + } + + // Finish up parsing: check that every opening bracket has a matching closing bracket. + if ($openBrackets != 0) { + throw new Zend_Search_Lucene_Exception('Syntax Error: mismatched parentheses, every opening must have closing.'); + } + + switch (count($terms)) { + case 0: + throw new Zend_Search_Lucene_Exception('Syntax error: bad term count.'); + case 1: + return new Zend_Search_Lucene_Search_Query_Term($terms[0],$signs[0] !== false); + default: + return new Zend_Search_Lucene_Search_Query_MultiTerm($terms,$signs); + } + } + +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/QueryToken.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/QueryToken.php new file mode 100644 index 00000000..995e0d3c --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/QueryToken.php @@ -0,0 +1,102 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Exception */ +require_once 'Zend/Search/Lucene/Exception.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Search_QueryToken +{ + /** + * Token type Word. + */ + const TOKTYPE_WORD = 0; + + /** + * Token type Field. + * Field indicator in 'field:word' pair + */ + const TOKTYPE_FIELD = 1; + + /** + * Token type Sign. + * '+' (required) or '-' (absentee) sign + */ + const TOKTYPE_SIGN = 2; + + /** + * Token type Bracket. + * '(' or ')' + */ + const TOKTYPE_BRACKET = 3; + + + /** + * Token type. + * + * @var integer + */ + public $type; + + /** + * Token text. + * + * @var integer + */ + public $text; + + + /** + * IndexReader constructor needs token type and token text as a parameters. + * + * @param $tokType integer + * @param $tokText string + */ + public function __construct($tokType, $tokText) + { + switch ($tokType) { + case self::TOKTYPE_BRACKET: + // fall through to the next case + case self::TOKTYPE_FIELD: + // fall through to the next case + case self::TOKTYPE_SIGN: + // fall through to the next case + case self::TOKTYPE_WORD: + break; + default: + throw new Zend_Search_Lucene_Exception("Unrecognized token type \"$tokType\"."); + } + + if (!strlen($tokText)) { + throw new Zend_Search_Lucene_Exception('Token text must be supplied.'); + } + + $this->type = $tokType; + $this->text = $tokText; + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/QueryTokenizer.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/QueryTokenizer.php new file mode 100644 index 00000000..986f8899 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/QueryTokenizer.php @@ -0,0 +1,162 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Search_QueryToken */ +require_once 'Zend/Search/Lucene/Search/QueryToken.php'; + +/** Zend_Search_Lucene_Exception */ +require_once 'Zend/Search/Lucene/Exception.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Search_QueryTokenizer implements Iterator +{ + /** + * inputString tokens. + * + * @var array + */ + protected $_tokens = array(); + + /** + * tokens pointer. + * + * @var integer + */ + protected $_currToken = 0; + + + /** + * QueryTokenize constructor needs query string as a parameter. + * + * @param string $inputString + */ + public function __construct($inputString) + { + if (!strlen($inputString)) { + throw new Zend_Search_Lucene_Exception('Cannot tokenize empty query string.'); + } + + $currentToken = ''; + for ($count = 0; $count < strlen($inputString); $count++) { + if (ctype_alnum( $inputString{$count} )) { + $currentToken .= $inputString{$count}; + } else { + // Previous token is finished + if (strlen($currentToken)) { + $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD, + $currentToken); + $currentToken = ''; + } + + if ($inputString{$count} == '+' || $inputString{$count} == '-') { + $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN, + $inputString{$count}); + } elseif ($inputString{$count} == '(' || $inputString{$count} == ')') { + $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_BRACKET, + $inputString{$count}); + } elseif ($inputString{$count} == ':' && $this->count()) { + if ($this->_tokens[count($this->_tokens)-1]->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD) { + $this->_tokens[count($this->_tokens)-1]->type = Zend_Search_Lucene_Search_QueryToken::TOKTYPE_FIELD; + } + } + } + } + + if (strlen($currentToken)) { + $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD, $currentToken); + } + } + + + /** + * Returns number of tokens + * + * @return integer + */ + public function count() + { + return count($this->_tokens); + } + + + /** + * Returns TRUE if a token exists at the current position. + * + * @return boolean + */ + public function valid() + { + return $this->_currToken < $this->count(); + } + + + /** + * Resets token stream. + * + * @return integer + */ + public function rewind() + { + $this->_currToken = 0; + } + + + /** + * Returns the token at the current position or FALSE if + * the position does not contain a valid token. + * + * @return mixed + */ + public function current() + { + return $this->valid() ? $this->_tokens[$this->_currToken] : false; + } + + + /** + * Returns next token + * + * @return Zend_Search_Lucene_Search_QueryToken + */ + public function next() + { + return ++$this->_currToken; + } + + + /** + * Return the position of the current token. + * + * @return integer + */ + public function key() + { + return $this->_currToken; + } + +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Similarity.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Similarity.php new file mode 100644 index 00000000..8b758213 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Similarity.php @@ -0,0 +1,551 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Search_Similarity_Default */ +require_once 'Zend/Search/Lucene/Search/Similarity/Default.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +abstract class Zend_Search_Lucene_Search_Similarity +{ + /** + * The Similarity implementation used by default. + * + * @var Zend_Search_Lucene_Search_Similarity + */ + static private $_defaultImpl; + + /** + * Cache of decoded bytes. + * Array of floats + * + * @var array + */ + static private $_normTable = array( 0 => 0.0, + 1 => 5.820766E-10, + 2 => 6.9849193E-10, + 3 => 8.1490725E-10, + 4 => 9.313226E-10, + 5 => 1.1641532E-9, + 6 => 1.3969839E-9, + 7 => 1.6298145E-9, + 8 => 1.8626451E-9, + 9 => 2.3283064E-9, + 10 => 2.7939677E-9, + 11 => 3.259629E-9, + 12 => 3.7252903E-9, + 13 => 4.656613E-9, + 14 => 5.5879354E-9, + 15 => 6.519258E-9, + 16 => 7.4505806E-9, + 17 => 9.313226E-9, + 18 => 1.1175871E-8, + 19 => 1.3038516E-8, + 20 => 1.4901161E-8, + 21 => 1.8626451E-8, + 22 => 2.2351742E-8, + 23 => 2.6077032E-8, + 24 => 2.9802322E-8, + 25 => 3.7252903E-8, + 26 => 4.4703484E-8, + 27 => 5.2154064E-8, + 28 => 5.9604645E-8, + 29 => 7.4505806E-8, + 30 => 8.940697E-8, + 31 => 1.0430813E-7, + 32 => 1.1920929E-7, + 33 => 1.4901161E-7, + 34 => 1.7881393E-7, + 35 => 2.0861626E-7, + 36 => 2.3841858E-7, + 37 => 2.9802322E-7, + 38 => 3.5762787E-7, + 39 => 4.172325E-7, + 40 => 4.7683716E-7, + 41 => 5.9604645E-7, + 42 => 7.1525574E-7, + 43 => 8.34465E-7, + 44 => 9.536743E-7, + 45 => 1.1920929E-6, + 46 => 1.4305115E-6, + 47 => 1.66893E-6, + 48 => 1.9073486E-6, + 49 => 2.3841858E-6, + 50 => 2.861023E-6, + 51 => 3.33786E-6, + 52 => 3.8146973E-6, + 53 => 4.7683716E-6, + 54 => 5.722046E-6, + 55 => 6.67572E-6, + 56 => 7.6293945E-6, + 57 => 9.536743E-6, + 58 => 1.1444092E-5, + 59 => 1.335144E-5, + 60 => 1.5258789E-5, + 61 => 1.9073486E-5, + 62 => 2.2888184E-5, + 63 => 2.670288E-5, + 64 => 3.0517578E-5, + 65 => 3.8146973E-5, + 66 => 4.5776367E-5, + 67 => 5.340576E-5, + 68 => 6.1035156E-5, + 69 => 7.6293945E-5, + 70 => 9.1552734E-5, + 71 => 1.0681152E-4, + 72 => 1.2207031E-4, + 73 => 1.5258789E-4, + 74 => 1.8310547E-4, + 75 => 2.1362305E-4, + 76 => 2.4414062E-4, + 77 => 3.0517578E-4, + 78 => 3.6621094E-4, + 79 => 4.272461E-4, + 80 => 4.8828125E-4, + 81 => 6.1035156E-4, + 82 => 7.324219E-4, + 83 => 8.544922E-4, + 84 => 9.765625E-4, + 85 => 0.0012207031, + 86 => 0.0014648438, + 87 => 0.0017089844, + 88 => 0.001953125, + 89 => 0.0024414062, + 90 => 0.0029296875, + 91 => 0.0034179688, + 92 => 0.00390625, + 93 => 0.0048828125, + 94 => 0.005859375, + 95 => 0.0068359375, + 96 => 0.0078125, + 97 => 0.009765625, + 98 => 0.01171875, + 99 => 0.013671875, + 100 => 0.015625, + 101 => 0.01953125, + 102 => 0.0234375, + 103 => 0.02734375, + 104 => 0.03125, + 105 => 0.0390625, + 106 => 0.046875, + 107 => 0.0546875, + 108 => 0.0625, + 109 => 0.078125, + 110 => 0.09375, + 111 => 0.109375, + 112 => 0.125, + 113 => 0.15625, + 114 => 0.1875, + 115 => 0.21875, + 116 => 0.25, + 117 => 0.3125, + 118 => 0.375, + 119 => 0.4375, + 120 => 0.5, + 121 => 0.625, + 122 => 0.75, + 123 => 0.875, + 124 => 1.0, + 125 => 1.25, + 126 => 1.5, + 127 => 1.75, + 128 => 2.0, + 129 => 2.5, + 130 => 3.0, + 131 => 3.5, + 132 => 4.0, + 133 => 5.0, + 134 => 6.0, + 135 => 7.0, + 136 => 8.0, + 137 => 10.0, + 138 => 12.0, + 139 => 14.0, + 140 => 16.0, + 141 => 20.0, + 142 => 24.0, + 143 => 28.0, + 144 => 32.0, + 145 => 40.0, + 146 => 48.0, + 147 => 56.0, + 148 => 64.0, + 149 => 80.0, + 150 => 96.0, + 151 => 112.0, + 152 => 128.0, + 153 => 160.0, + 154 => 192.0, + 155 => 224.0, + 156 => 256.0, + 157 => 320.0, + 158 => 384.0, + 159 => 448.0, + 160 => 512.0, + 161 => 640.0, + 162 => 768.0, + 163 => 896.0, + 164 => 1024.0, + 165 => 1280.0, + 166 => 1536.0, + 167 => 1792.0, + 168 => 2048.0, + 169 => 2560.0, + 170 => 3072.0, + 171 => 3584.0, + 172 => 4096.0, + 173 => 5120.0, + 174 => 6144.0, + 175 => 7168.0, + 176 => 8192.0, + 177 => 10240.0, + 178 => 12288.0, + 179 => 14336.0, + 180 => 16384.0, + 181 => 20480.0, + 182 => 24576.0, + 183 => 28672.0, + 184 => 32768.0, + 185 => 40960.0, + 186 => 49152.0, + 187 => 57344.0, + 188 => 65536.0, + 189 => 81920.0, + 190 => 98304.0, + 191 => 114688.0, + 192 => 131072.0, + 193 => 163840.0, + 194 => 196608.0, + 195 => 229376.0, + 196 => 262144.0, + 197 => 327680.0, + 198 => 393216.0, + 199 => 458752.0, + 200 => 524288.0, + 201 => 655360.0, + 202 => 786432.0, + 203 => 917504.0, + 204 => 1048576.0, + 205 => 1310720.0, + 206 => 1572864.0, + 207 => 1835008.0, + 208 => 2097152.0, + 209 => 2621440.0, + 210 => 3145728.0, + 211 => 3670016.0, + 212 => 4194304.0, + 213 => 5242880.0, + 214 => 6291456.0, + 215 => 7340032.0, + 216 => 8388608.0, + 217 => 1.048576E7, + 218 => 1.2582912E7, + 219 => 1.4680064E7, + 220 => 1.6777216E7, + 221 => 2.097152E7, + 222 => 2.5165824E7, + 223 => 2.9360128E7, + 224 => 3.3554432E7, + 225 => 4.194304E7, + 226 => 5.0331648E7, + 227 => 5.8720256E7, + 228 => 6.7108864E7, + 229 => 8.388608E7, + 230 => 1.00663296E8, + 231 => 1.17440512E8, + 232 => 1.34217728E8, + 233 => 1.6777216E8, + 234 => 2.01326592E8, + 235 => 2.34881024E8, + 236 => 2.68435456E8, + 237 => 3.3554432E8, + 238 => 4.02653184E8, + 239 => 4.69762048E8, + 240 => 5.3687091E8, + 241 => 6.7108864E8, + 242 => 8.0530637E8, + 243 => 9.395241E8, + 244 => 1.07374182E9, + 245 => 1.34217728E9, + 246 => 1.61061274E9, + 247 => 1.87904819E9, + 248 => 2.14748365E9, + 249 => 2.68435456E9, + 250 => 3.22122547E9, + 251 => 3.75809638E9, + 252 => 4.2949673E9, + 253 => 5.3687091E9, + 254 => 6.4424509E9, + 255 => 7.5161928E9 ); + + + /** + * Set the default Similarity implementation used by indexing and search + * code. + * + * @param Zend_Search_Lucene_Search_Similarity $similarity + */ + static public function setDefault(Zend_Search_Lucene_Search_Similarity $similarity) + { + self::$_defaultImpl = $similarity; + } + + + /** + * Return the default Similarity implementation used by indexing and search + * code. + * + * @return Zend_Search_Lucene_Search_Similarity + */ + static public function getDefault() + { + if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) { + self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default(); + } + + return self::$_defaultImpl; + } + + + /** + * Computes the normalization value for a field given the total number of + * terms contained in a field. These values, together with field boosts, are + * stored in an index and multipled into scores for hits on each field by the + * search code. + * + * Matches in longer fields are less precise, so implemenations of this + * method usually return smaller values when 'numTokens' is large, + * and larger values when 'numTokens' is small. + * + * That these values are computed under + * IndexWriter::addDocument(Document) and stored then using + * encodeNorm(float). Thus they have limited precision, and documents + * must be re-indexed if this method is altered. + * + * fieldName - name of field + * numTokens - the total number of tokens contained in fields named + * 'fieldName' of 'doc'. + * Returns a normalization factor for hits on this field of this document + * + * @param string $fieldName + * @param integer $numTokens + * @return float + */ + abstract public function lengthNorm($fieldName, $numTokens); + + /** + * Computes the normalization value for a query given the sum of the squared + * weights of each of the query terms. This value is then multipled into the + * weight of each query term. + * + * This does not affect ranking, but rather just attempts to make scores + * from different queries comparable. + * + * sumOfSquaredWeights - the sum of the squares of query term weights + * Returns a normalization factor for query weights + * + * @param float $sumOfSquaredWeights + * @return float + */ + abstract public function queryNorm($sumOfSquaredWeights); + + + /** + * Decodes a normalization factor stored in an index. + * + * @param integer $byte + * @return float + */ + static public function decodeNorm($byte) + { + return self::$_normTable[$byte & 0xFF]; + } + + + /** + * Encodes a normalization factor for storage in an index. + * + * The encoding uses a five-bit exponent and three-bit mantissa, thus + * representing values from around 7x10^9 to 2x10^-9 with about one + * significant decimal digit of accuracy. Zero is also represented. + * Negative numbers are rounded up to zero. Values too large to represent + * are rounded down to the largest representable value. Positive values too + * small to represent are rounded up to the smallest positive representable + * value. + * + * @param float $f + * @return integer + */ + static function encodeNorm($f) + { + return self::_floatToByte($f); + } + + /** + * Float to byte conversion + * + * @param integer $b + * @return float + */ + static private function _floatToByte($f) + { + // round negatives up to zero + if ($f <= 0.0) { + return 0; + } + + // search for appropriate value + $lowIndex = 0; + $highIndex = 255; + while ($highIndex >= $lowIndex) { + // $mid = ($highIndex - $lowIndex)/2; + $mid = ($highIndex + $lowIndex) >> 1; + $delta = $f - self::$_normTable[$mid]; + + if ($delta < 0) { + $highIndex = $mid-1; + } elseif ($delta > 0) { + $lowIndex = $mid+1; + } else { + return $mid; // We got it! + } + } + + // round to closest value + if ($highIndex != 255 && + $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex+1] - $f ) { + return $highIndex + 1; + } else { + return $highIndex; + } + } + + + /** + * Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the idf(Term, Searcher) + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + * Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when 'freq' is large, and smaller values when 'freq' + * is small. + * + * freq - the frequency of a term within a document + * Returns a score factor based on a term's within-document frequency + * + * @param float $freq + * @return float + */ + abstract public function tf($freq); + + /** + * Computes the amount of a sloppy phrase match, based on an edit distance. + * This value is summed for each sloppy phrase match in a document to form + * the frequency that is passed to tf(float). + * + * A phrase match with a small edit distance to a document passage more + * closely matches the document, so implementations of this method usually + * return larger values when the edit distance is small and smaller values + * when it is large. + * + * distance - the edit distance of this sloppy phrase match + * Returns the frequency increment for this match + * + * @param integer $distance + * @return float + */ + abstract public function sloppyFreq($distance); + + + /** + * Computes a score factor for a simple term or a phrase. + * + * The default implementation is: + * return idfFreq(searcher.docFreq(term), searcher.maxDoc()); + * + * input - the term in question or array of terms + * reader - reader the document collection being searched + * Returns a score factor for the term + * + * @param mixed $input + * @param Zend_Search_Lucene $reader + * @return a score factor for the term + */ + public function idf($input, $reader) + { + if (!is_array($input)) { + return $this->idfFreq($reader->docFreq($input), $reader->count()); + } else { + $idf = 0.0; + foreach ($input as $term) { + $idf += $this->idfFreq($reader->docFreq($term), $reader->count()); + } + return $idf; + } + } + + /** + * Computes a score factor based on a term's document frequency (the number + * of documents which contain the term). This value is multiplied by the + * tf(int) factor for each term in the query and these products are + * then summed to form the initial score for a document. + * + * Terms that occur in fewer documents are better indicators of topic, so + * implemenations of this method usually return larger values for rare terms, + * and smaller values for common terms. + * + * docFreq - the number of documents which contain the term + * numDocs - the total number of documents in the collection + * Returns a score factor based on the term's document frequency + * + * @param integer $docFreq + * @param integer $numDocs + * @return float + */ + abstract public function idfFreq($docFreq, $numDocs); + + /** + * Computes a score factor based on the fraction of all query terms that a + * document contains. This value is multiplied into scores. + * + * The presence of a large portion of the query terms indicates a better + * match with the query, so implemenations of this method usually return + * larger values when the ratio between these parameters is large and smaller + * values when the ratio between them is small. + * + * overlap - the number of query terms matched in the document + * maxOverlap - the total number of terms in the query + * Returns a score factor based on term overlap with the query + * + * @param integer $overlap + * @param integer $maxOverlap + * @return float + */ + abstract public function coord($overlap, $maxOverlap); +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Similarity/Default.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Similarity/Default.php new file mode 100644 index 00000000..1551d8bd --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Similarity/Default.php @@ -0,0 +1,99 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Search_Similarity_Default extends Zend_Search_Lucene_Search_Similarity +{ + + /** + * Implemented as '1/sqrt(numTerms)'. + * + * @param string $fieldName + * @param integer numTerms + * @return float + */ + public function lengthNorm($fieldName, $numTerms) + { + return 1.0/sqrt($numTerms); + } + + /** + * Implemented as '1/sqrt(sumOfSquaredWeights)'. + * + * @param float $sumOfSquaredWeights + * @return float + */ + public function queryNorm($sumOfSquaredWeights) + { + return 1.0/sqrt($sumOfSquaredWeights); + } + + /** + * Implemented as 'sqrt(freq)'. + * + * @param float $freq + * @return float + */ + public function tf($freq) + { + return sqrt($freq); + } + + /** + * Implemented as '1/(distance + 1)'. + * + * @param integer $distance + * @return float + */ + public function sloppyFreq($distance) + { + return 1.0/($distance + 1); + } + + /** + * Implemented as 'log(numDocs/(docFreq+1)) + 1'. + * + * @param integer $docFreq + * @param integer $numDocs + * @return float + */ + public function idfFreq($docFreq, $numDocs) + { + return log($numDocs/(float)($docFreq+1)) + 1.0; + } + + /** + * Implemented as 'overlap/maxOverlap'. + * + * @param integer $overlap + * @param integer $maxOverlap + * @return float + */ + public function coord($overlap, $maxOverlap) + { + return $overlap/(float)$maxOverlap; + } +} diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight.php new file mode 100644 index 00000000..2d5b7a72 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight.php @@ -0,0 +1,59 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * Calculate query weights and build query scorers. + * + * A Weight is constructed by a query Query->createWeight(). + * The sumOfSquaredWeights() method is then called on the top-level + * query to compute the query normalization factor Similarity->queryNorm(float). + * This factor is then passed to normalize(float). At this point the weighting + * is complete. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +abstract class Zend_Search_Lucene_Search_Weight +{ + /** + * The weight for this query. + * + * @return float + */ + abstract public function getValue(); + + /** + * The sum of squared weights of contained query clauses. + * + * @return float + */ + abstract public function sumOfSquaredWeights(); + + /** + * Assigns the query normalization factor to this. + * + * @param $norm + */ + abstract public function normalize($norm); +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/MultiTerm.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/MultiTerm.php new file mode 100644 index 00000000..69528ba4 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/MultiTerm.php @@ -0,0 +1,133 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Search_Weight */ +require_once 'Zend/Search/Lucene/Search/Weight.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Search_Weight_MultiTerm extends Zend_Search_Lucene_Search_Weight +{ + /** + * IndexReader. + * + * @var Zend_Search_Lucene + */ + private $_reader; + + /** + * The query that this concerns. + * + * @var Zend_Search_Lucene_Search_Query_MultiTerm + */ + private $_query; + + /** + * Query terms weights + * Array of Zend_Search_Lucene_Search_Weight_Term + * + * @var array + */ + private $_weights; + + + /** + * Zend_Search_Lucene_Search_Weight_MultiTerm constructor + * query - the query that this concerns. + * reader - index reader + * + * @param Zend_Search_Lucene_Search_Query_MultiTerm $query + * @param Zend_Search_Lucene $reader + */ + public function __construct($query, $reader) + { + $this->_query = $query; + $this->_reader = $reader; + $this->_weights = array(); + + $signs = $query->getSigns(); + + foreach ($query->getTerms() as $num => $term) { + if ($signs === null || $signs[$num] === null || $signs[$num]) { + $this->_weights[$num] = new Zend_Search_Lucene_Search_Weight_Term($term, $query, $reader); + $query->setWeight($num, $this->_weights[$num]); + } + } + } + + + /** + * The weight for this query + * + * @return float + */ + public function getValue() + { + return $this->_query->getBoost(); + } + + + /** + * The sum of squared weights of contained query clauses. + * + * @return float + */ + public function sumOfSquaredWeights() + { + $sum = 0; + foreach ($this->_weights as $weight) { + // sum sub weights + $sum += $weight->sumOfSquaredWeights(); + } + + // boost each sub-weight + $sum *= $this->_query->getBoost() * $this->_query->getBoost(); + + // check for empty query (like '-something -another') + if ($sum == 0) { + $sum = 1.0; + } + return $sum; + } + + + /** + * Assigns the query normalization factor to this. + * + * @param float $queryNorm + */ + public function normalize($queryNorm) + { + // incorporate boost + $queryNorm *= $this->_query->getBoost(); + + foreach ($this->_weights as $weight) { + $weight->normalize($queryNorm); + } + } +} + + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/Phrase.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/Phrase.php new file mode 100644 index 00000000..77e94f28 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/Phrase.php @@ -0,0 +1,138 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * Zend_Search_Lucene_Search_Weight + */ +require_once 'Zend/Search/Lucene/Search/Weight.php'; + +/** + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Search_Weight_Phrase extends Zend_Search_Lucene_Search_Weight +{ + /** + * IndexReader. + * + * @var Zend_Search_Lucene + */ + private $_reader; + + /** + * The query that this concerns. + * + * @var Zend_Search_Lucene_Search_Query_Phrase + */ + private $_query; + + /** + * Weight value + * + * @var float + */ + private $_value; + + /** + * Score factor + * + * @var float + */ + private $_idf; + + /** + * Normalization factor + * + * @var float + */ + private $_queryNorm; + + + /** + * Query weight + * + * @var float + */ + private $_queryWeight; + + + /** + * Zend_Search_Lucene_Search_Weight_Phrase constructor + * + * @param Zend_Search_Lucene_Search_Query_Phrase $query + * @param Zend_Search_Lucene $reader + */ + public function __construct(Zend_Search_Lucene_Search_Query_Phrase $query, Zend_Search_Lucene $reader) + { + $this->_query = $query; + $this->_reader = $reader; + } + + + /** + * The weight for this query + * + * @return float + */ + public function getValue() + { + return $this->_value; + } + + + /** + * The sum of squared weights of contained query clauses. + * + * @return float + */ + public function sumOfSquaredWeights() + { + // compute idf + $this->_idf = $this->_reader->getSimilarity()->idf($this->_query->getTerms(), $this->_reader); + + // compute query weight + $this->_queryWeight = $this->_idf * $this->_query->getBoost(); + + // square it + return $this->_queryWeight * $this->_queryWeight; + } + + + /** + * Assigns the query normalization factor to this. + * + * @param float $queryNorm + */ + public function normalize($queryNorm) + { + $this->_queryNorm = $queryNorm; + + // normalize query weight + $this->_queryWeight *= $queryNorm; + + // idf for documents + $this->_value = $this->_queryWeight * $this->_idf; + } +} + + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/Term.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/Term.php new file mode 100644 index 00000000..3e6102f3 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/Term.php @@ -0,0 +1,144 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Search_Weight */ +require_once 'Zend/Search/Lucene/Search/Weight.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Search_Weight_Term extends Zend_Search_Lucene_Search_Weight +{ + /** + * IndexReader. + * + * @var Zend_Search_Lucene + */ + private $_reader; + + /** + * Term + * + * @var Zend_Search_Lucene_Index_Term + */ + private $_term; + + /** + * The query that this concerns. + * + * @var Zend_Search_Lucene_Search_Query + */ + private $_query; + + /** + * Weight value + * + * @var float + */ + private $_value; + + /** + * Score factor + * + * @var float + */ + private $_idf; + + /** + * Normalization factor + * + * @var float + */ + private $_queryNorm; + + + /** + * Query weight + * + * @var float + */ + private $_queryWeight; + + + /** + * Zend_Search_Lucene_Search_Weight_Term constructor + * reader - index reader + * + * @param Zend_Search_Lucene $reader + */ + public function __construct($term, $query, $reader) + { + $this->_term = $term; + $this->_query = $query; + $this->_reader = $reader; + } + + + /** + * The weight for this query + * + * @return float + */ + public function getValue() + { + return $this->_value; + } + + + /** + * The sum of squared weights of contained query clauses. + * + * @return float + */ + public function sumOfSquaredWeights() + { + // compute idf + $this->_idf = $this->_reader->getSimilarity()->idf($this->_term, $this->_reader); + + // compute query weight + $this->_queryWeight = $this->_idf * $this->_query->getBoost(); + + // square it + return $this->_queryWeight * $this->_queryWeight; + } + + + /** + * Assigns the query normalization factor to this. + * + * @param float $queryNorm + */ + public function normalize($queryNorm) + { + $this->_queryNorm = $queryNorm; + + // normalize query weight + $this->_queryWeight *= $queryNorm; + + // idf for documents + $this->_value = $this->_queryWeight * $this->_idf; + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Storage/Directory.php b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/Directory.php new file mode 100644 index 00000000..48114a76 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/Directory.php @@ -0,0 +1,118 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Storage + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * @package Zend_Search_Lucene + * @subpackage Storage + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +abstract class Zend_Search_Lucene_Storage_Directory +{ + + /** + * Closes the store. + * + * @return void + */ + abstract public function close(); + + /** + * Returns an array of strings, one for each file in the directory. + * + * @return array + */ + abstract public function fileList(); + + /** + * Creates a new, empty file in the directory with the given $filename. + * + * @param string $filename + * @return Zend_Search_Lucene_Storage_File + */ + abstract public function createFile($filename); + + + /** + * Removes an existing $filename in the directory. + * + * @param string $filename + * @return void + */ + abstract public function deleteFile($filename); + + + /** + * Returns true if a file with the given $filename exists. + * + * @param string $filename + * @return boolean + */ + abstract public function fileExists($filename); + + + /** + * Returns the length of a $filename in the directory. + * + * @param string $filename + * @return integer + */ + abstract public function fileLength($filename); + + + /** + * Returns the UNIX timestamp $filename was last modified. + * + * @param string $filename + * @return integer + */ + abstract public function fileModified($filename); + + + /** + * Renames an existing file in the directory. + * + * @param string $from + * @param string $to + * @return void + */ + abstract public function renameFile($from, $to); + + + /** + * Sets the modified time of $filename to now. + * + * @param string $filename + * @return void + */ + abstract public function touchFile($filename); + + + /** + * Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory. + * + * @param string $filename + * @return Zend_Search_Lucene_Storage_File + */ + abstract public function getFileObject($filename); + +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Storage/Directory/Filesystem.php b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/Directory/Filesystem.php new file mode 100644 index 00000000..73d10659 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/Directory/Filesystem.php @@ -0,0 +1,269 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Storage + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Storage_Directory */ +require_once 'Zend/Search/Lucene/Storage/Directory.php'; + +/** Zend_Search_Lucene_Storage_File_Filesystem */ +require_once 'Zend/Search/Lucene/Storage/File/Filesystem.php'; + + +/** + * FileSystem implementation of Directory abstraction. + * + * @package Zend_Search_Lucene + * @subpackage Storage + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Storage_Directory_Filesystem extends Zend_Search_Lucene_Storage_Directory +{ + /** + * Filesystem path to the directory + * + * @var string + */ + private $_dirPath = null; + + /** + * Cache for Zend_Search_Lucene_Storage_File_Filesystem objects + * Array: filename => Zend_Search_Lucene_Storage_File object + * + * @var array + * @throws Zend_Search_Lucene_Exception + */ + private $_fileHandlers; + + + /** + * Utility function to recursive directory creation + * + * @param string $dir + * @param integer $mode + * @param boolean $recursive + * @return boolean + */ + + static public function mkdirs($dir, $mode = 0777, $recursive = true) + { + if (is_null($dir) || $dir === '') { + return false; + } + if (is_dir($dir) || $dir === '/') { + return true; + } + if (self::mkdirs(dirname($dir), $mode, $recursive)) { + return mkdir($dir, $mode); + } + return false; + } + + + /** + * Object constructor + * Checks if $path is a directory or tries to create it. + * + * @param string $path + * @throws Zend_Search_Lucene_Exception + */ + public function __construct($path) + { + if (!is_dir($path)) { + if (file_exists($path)) { + throw new Zend_Search_Lucene_Exception('Path exists, but it\'s not a directory'); + } else { + if (!self::mkdirs($path)) { + throw new Zend_Search_Lucene_Exception("Can't create directory '$path'."); + } + } + } + $this->_dirPath = $path; + $this->_fileHandlers = array(); + } + + + /** + * Closes the store. + * + * @return void + */ + public function close() + { + foreach ($this->_fileHandlers as $fileObject) { + $fileObject->close(); + } + + unset($this->_fileHandlers); + } + + + /** + * Returns an array of strings, one for each file in the directory. + * + * @return array + */ + public function fileList() + { + $result = array(); + + $dirContent = opendir( $this->_dirPath ); + while ($file = readdir($dirContent)) { + if (($file == '..')||($file == '.')) continue; + + $fullName = $this->_dirPath . '/' . $file; + + if( !is_dir($this->_dirPath . '/' . $file) ) { + $result[] = $file; + } + } + + return $result; + } + + /** + * Creates a new, empty file in the directory with the given $filename. + * + * @param string $filename + * @return Zend_Search_Lucene_Storage_File + */ + public function createFile($filename) + { + if (isset($this->_fileHandlers[$filename])) { + $this->_fileHandlers[$filename]->close(); + } + unset($this->_fileHandlers[$filename]); + $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($this->_dirPath . '/' . $filename, 'w+b'); + return $this->_fileHandlers[$filename]; + } + + + /** + * Removes an existing $filename in the directory. + * + * @param string $filename + * @return void + */ + public function deleteFile($filename) + { + if (isset($this->_fileHandlers[$filename])) { + $this->_fileHandlers[$filename]->close(); + } + unset($this->_fileHandlers[$filename]); + unlink($this->_dirPath .'/'. $filename); + } + + + /** + * Returns true if a file with the given $filename exists. + * + * @param string $filename + * @return boolean + */ + public function fileExists($filename) + { + return file_exists($this->_dirPath .'/'. $filename); + } + + + /** + * Returns the length of a $filename in the directory. + * + * @param string $filename + * @return integer + */ + public function fileLength($filename) + { + if (isset( $this->_fileHandlers[$filename] )) { + return $this->_fileHandlers[$filename]->size(); + } + return filesize($this->_dirPath .'/'. $filename); + } + + + /** + * Returns the UNIX timestamp $filename was last modified. + * + * @param string $filename + * @return integer + */ + public function fileModified($filename) + { + return filemtime($this->_dirPath .'/'. $filename); + } + + + /** + * Renames an existing file in the directory. + * + * @param string $from + * @param string $to + * @return void + */ + public function renameFile($from, $to) + { + if ($this->_fileHandlers[$from] !== null) { + $this->_fileHandlers[$from]->close(); + } + unset($this->_fileHandlers[$from]); + + if ($this->_fileHandlers[$to] !== null) { + $this->_fileHandlers[$to]->close(); + } + unset($this->_fileHandlers[$to]); + + if (file_exists($this->_dirPath . '/' . $to)) { + unlink($this->_dirPath . '/' . $to); + } + + return @rename($this->_dirPath . '/' . $from, $this->_dirPath . '/' . $to); + } + + + /** + * Sets the modified time of $filename to now. + * + * @param string $filename + * @return void + */ + public function touchFile($filename) + { + return touch($this->_dirPath .'/'. $filename); + } + + + /** + * Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory. + * + * @param string $filename + * @return Zend_Search_Lucene_Storage_File + */ + public function getFileObject($filename) + { + if (isset( $this->_fileHandlers[$filename] )) { + $this->_fileHandlers[$filename]->seek(0); + return $this->_fileHandlers[$filename]; + } + + $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($this->_dirPath . '/' . $filename, 'rb'); + return $this->_fileHandlers[$filename]; + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Storage/File.php b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/File.php new file mode 100644 index 00000000..f62af33a --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/File.php @@ -0,0 +1,376 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Storage + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * @package Zend_Search_Lucene + * @subpackage Storage + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +abstract class Zend_Search_Lucene_Storage_File +{ + + /** + * Class constructor. Open the file. + */ + abstract public function __construct($filename, $mode='r'); + + + /** + * Reads $length number of bytes at the current position in the + * file and advances the file pointer. + * + * @param integer $length + * @return string + */ + abstract protected function _fread($length=1); + + + /** + * Sets the file position indicator and advances the file pointer. + * The new position, measured in bytes from the beginning of the file, + * is obtained by adding offset to the position specified by whence, + * whose values are defined as follows: + * SEEK_SET - Set position equal to offset bytes. + * SEEK_CUR - Set position to current location plus offset. + * SEEK_END - Set position to end-of-file plus offset. (To move to + * a position before the end-of-file, you need to pass a negative value + * in offset.) + * Upon success, returns 0; otherwise, returns -1 + * + * @param integer $offset + * @param integer $whence + * @return integer + */ + abstract public function seek($offset, $whence=SEEK_SET); + + /** + * Get file position. + * + * @return integer + */ + abstract public function tell(); + + /** + * Writes $length number of bytes (all, if $length===null) to the end + * of the file. + * + * @param string $data + * @param integer $length + */ + abstract protected function _fwrite($data, $length=null); + + + /** + * Reads a byte from the current position in the file + * and advances the file pointer. + * + * @return integer + */ + public function readByte() + { + return ord($this->_fread(1)); + } + + /** + * Writes a byte to the end of the file. + * + * @param integer $byte + */ + public function writeByte($byte) + { + return $this->_fwrite(chr($byte), 1); + } + + /** + * Read num bytes from the current position in the file + * and advances the file pointer. + * + * @param integer $num + * @return string + */ + public function readBytes($num) + { + return $this->_fread($num); + } + + /** + * Writes num bytes of data (all, if $num===null) to the end + * of the file. + * + * @param string $data + * @param integer $num + */ + public function writeBytes($data, $num=null) + { + $this->_fwrite($data, $num); + } + + + /** + * Reads an integer from the current position in the file + * and advances the file pointer. + * + * @return integer + */ + public function readInt() + { + $str = $this->_fread(4); + + return ord($str{0}) << 24 | + ord($str{1}) << 16 | + ord($str{2}) << 8 | + ord($str{3}); + } + + + /** + * Writes an integer to the end of file. + * + * @param integer $value + */ + public function writeInt($value) + { + settype($value, 'integer'); + $this->_fwrite( chr($value>>24 & 0xFF) . + chr($value>>16 & 0xFF) . + chr($value>>8 & 0xFF) . + chr($value & 0xFF), 4 ); + } + + + /** + * Returns a long integer from the current position in the file + * and advances the file pointer. + * + * @return integer + */ + public function readLong() + { + $str = $this->_fread(8); + + /** + * PHP uses long as largest integer. fseek() uses long for offset. + * long has 4 bytes in a lot of systems. 4 bytes are discarded to prevent + * conversion to float. + * So, largest index segment file is 2Gb + */ + return /* ord($str{0}) << 56 | */ + /* ord($str{1}) << 48 | */ + /* ord($str{2}) << 40 | */ + /* ord($str{3}) << 32 | */ + ord($str{4}) << 24 | + ord($str{5}) << 16 | + ord($str{6}) << 8 | + ord($str{7}); + } + + /** + * Writes long integer to the end of file + * + * @param integer $value + */ + public function writeLong($value) + { + /** + * PHP uses long as largest integer. fseek() uses long for offset. + * long has 4 bytes in a lot of systems. 4 bytes are discarded to prevent + * conversion to float. + * So, largest index segment file is 2Gb + */ + settype($value, 'integer'); + $this->_fwrite( "\x00\x00\x00\x00" . + chr($value>>24 & 0xFF) . + chr($value>>16 & 0xFF) . + chr($value>>8 & 0xFF) . + chr($value & 0xFF), 8 ); + } + + + + /** + * Returns a variable-length integer from the current + * position in the file and advances the file pointer. + * + * @return integer + */ + public function readVInt() + { + $nextByte = ord($this->_fread(1)); + $val = $nextByte & 0x7F; + + for ($shift=7; ($nextByte & 0x80) != 0; $shift += 7) { + $nextByte = ord($this->_fread(1)); + $val |= ($nextByte & 0x7F) << $shift; + } + return $val; + } + + /** + * Writes a variable-length integer to the end of file. + * + * @param integer $value + */ + public function writeVInt($value) + { + settype($value, 'integer'); + while ($value > 0x7F) { + $this->_fwrite(chr( ($value & 0x7F)|0x80 )); + $value >>= 7; + } + $this->_fwrite(chr($value)); + } + + + /** + * Reads a string from the current position in the file + * and advances the file pointer. + * + * @return string + */ + public function readString() + { + $strlen = $this->readVInt(); + if ($strlen == 0) { + return ''; + } else { + /** + * This implementation supports only Basic Multilingual Plane + * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support + * "supplementary characters" (characters whose code points are + * greater than 0xFFFF) + * Java 2 represents these characters as a pair of char (16-bit) + * values, the first from the high-surrogates range (0xD800-0xDBFF), + * the second from the low-surrogates range (0xDC00-0xDFFF). Then + * they are encoded as usual UTF-8 characters in six bytes. + * Standard UTF-8 representation uses four bytes for supplementary + * characters. + */ + + $str_val = $this->_fread($strlen); + + for ($count = 0; $count < $strlen; $count++ ) { + if (( ord($str_val{$count}) & 0xC0 ) == 0xC0) { + $addBytes = 1; + if (ord($str_val{$count}) & 0x20 ) { + $addBytes++; + + // Never used. Java2 doesn't encode strings in four bytes + if (ord($str_val{$count}) & 0x10 ) { + $addBytes++; + } + } + $str_val .= $this->_fread($addBytes); + $strlen += $addBytes; + + // Check for null character. Java2 encodes null character + // in two bytes. + if (ord($str_val{$count}) == 0xC0 && + ord($str_val{$count+1}) == 0x80 ) { + $str_val{$count} = 0; + $str_val = substr($str_val,0,$count+1) + . substr($str_val,$count+2); + } + $count += $addBytes; + } + } + + return $str_val; + } + } + + /** + * Writes a string to the end of file. + * + * @param string $str + * @throws Zend_Search_Lucene_Exception + */ + public function writeString($str) + { + /** + * This implementation supports only Basic Multilingual Plane + * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support + * "supplementary characters" (characters whose code points are + * greater than 0xFFFF) + * Java 2 represents these characters as a pair of char (16-bit) + * values, the first from the high-surrogates range (0xD800-0xDBFF), + * the second from the low-surrogates range (0xDC00-0xDFFF). Then + * they are encoded as usual UTF-8 characters in six bytes. + * Standard UTF-8 representation uses four bytes for supplementary + * characters. + */ + + // convert input to a string before iterating string characters + settype($str, 'string'); + + $chars = $strlen = strlen($str); + $containNullChars = false; + + for ($count = 0; $count < $strlen; $count++ ) { + /** + * String is already in Java 2 representation. + * We should only calculate actual string length and replace + * \x00 by \xC0\x80 + */ + if ((ord($str{$count}) & 0xC0) == 0xC0) { + $addBytes = 1; + if (ord($str{$count}) & 0x20 ) { + $addBytes++; + + // Never used. Java2 doesn't encode strings in four bytes + // and we dont't support non-BMP characters + if (ord($str{$count}) & 0x10 ) { + $addBytes++; + } + } + $chars -= $addBytes; + + if (ord($str{$count}) == 0 ) { + $containNullChars = true; + } + $count += $addBytes; + } + } + + if ($chars < 0) { + throw new Zend_Search_Lucene_Exception('Invalid UTF-8 string'); + } + + $this->writeVInt($chars); + if ($containNullChars) { + $this->_fwrite(str_replace($str, "\x00", "\xC0\x80")); + } else { + $this->_fwrite($str); + } + } + + + /** + * Reads binary data from the current position in the file + * and advances the file pointer. + * + * @return string + */ + public function readBinary() + { + return $this->_fread($this->readVInt()); + } +}
\ No newline at end of file diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Storage/File/Filesystem.php b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/File/Filesystem.php new file mode 100644 index 00000000..fc6adcf5 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/File/Filesystem.php @@ -0,0 +1,170 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Storage + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Storage_File */ +require_once 'Zend/Search/Lucene/Storage/File.php'; + +/** Zend_Search_Lucene_Exception */ +require_once 'Zend/Search/Lucene/Exception.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Storage + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + * + */ +class Zend_Search_Lucene_Storage_File_Filesystem extends Zend_Search_Lucene_Storage_File +{ + /** + * Resource of the open file + * + * @var resource + */ + private $_fileHandle; + + + /** + * Class constructor. Open the file. + * + * @param string $filename + * @param string $mode + */ + public function __construct($filename, $mode='rb') + { + global $php_errormsg; + + $trackErrors = ini_get( "track_errors"); + ini_set('track_errors', '1'); + + $this->_fileHandle = @fopen($filename, $mode); + + if ($this->_fileHandle===false) { + ini_set('track_errors', $trackErrors); + throw new Zend_Search_Lucene_Exception($php_errormsg); + } + + ini_set('track_errors', $trackErrors); + } + + + /** + * Sets the file position indicator and advances the file pointer. + * The new position, measured in bytes from the beginning of the file, + * is obtained by adding offset to the position specified by whence, + * whose values are defined as follows: + * SEEK_SET - Set position equal to offset bytes. + * SEEK_CUR - Set position to current location plus offset. + * SEEK_END - Set position to end-of-file plus offset. (To move to + * a position before the end-of-file, you need to pass a negative value + * in offset.) + * Upon success, returns 0; otherwise, returns -1 + * + * @param integer $offset + * @param integer $whence + * @return integer + */ + public function seek($offset, $whence=SEEK_SET) + { + return fseek($this->_fileHandle, $offset, $whence); + } + + + /** + * Get file position. + * + * @return integer + */ + public function tell() + { + return ftell($this->_fileHandle); + } + + + /** + * Close File object + */ + public function close() + { + if ($this->_fileHandle !== null ) { + @fclose($this->_fileHandle); + $this->_fileHandle = null; + } + } + + /** + * Get the size of the already opened file + * + * @return integer + */ + public function size() + { + $position = ftell($this->_fileHandle); + fseek($this->_fileHandle, 0, SEEK_END); + $size = ftell($this->_fileHandle); + fseek($this->_fileHandle,$position); + + return $size; + } + + /** + * Read a $length bytes from the file and advance the file pointer. + * + * @param integer $length + * @return string + */ + protected function _fread($length=1) + { + if ($length == 0) { + return ''; + } + + if ($length < 1024) { + return fread($this->_fileHandle, $length); + } + + $data = ''; + while ( $length > 0 && ($nextBlock = fread($this->_fileHandle, $length)) != false ) { + $data .= $nextBlock; + $length -= strlen($nextBlock); + } + return $data; + } + + + /** + * Writes $length number of bytes (all, if $length===null) to the end + * of the file. + * + * @param string $data + * @param integer $length + */ + protected function _fwrite($data, $length=null) + { + if ($length === null ) { + fwrite($this->_fileHandle, $data); + } else { + fwrite($this->_fileHandle, $data, $length); + } + } +} + diff --git a/buildscripts/texbuilder/Zend/Search/TODO.txt b/buildscripts/texbuilder/Zend/Search/TODO.txt new file mode 100644 index 00000000..06f7b487 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/TODO.txt @@ -0,0 +1,14 @@ +@todo + +- Improve API: fix ZSearchMultiTermQuery($terms, $signs); + +- Analysis and indexing engine + +- Additional queries: phrase, wildcard, proximity, and range + +- Better class-level docblocks (most functions okay) + +- Some Windows issues(?) during indexing + +- Finish renaming classes to PEAR-like conventions + diff --git a/buildscripts/texbuilder/build.php b/buildscripts/texbuilder/build.php index 4e2d8be6..20418248 100644 --- a/buildscripts/texbuilder/build.php +++ b/buildscripts/texbuilder/build.php @@ -10,6 +10,10 @@ $mainTexFile = dirname(__FILE__).'/prado3_quick_start.tex'; //page root location
$base = realpath(dirname(__FILE__).'/../../demos/quickstart/protected/pages/');
+//search index data directory
+$index_dir = realpath(dirname(__FILE__).'/../../demos/quickstart/protected/index/data');
+
+
//list page into chapters
$pages['Getting Started'] = array(
'GettingStarted/Introduction.page',
@@ -94,6 +98,11 @@ $pages['Advanced Topics'] = array( 'Advanced/Error.page',
'Advanced/Performance.page');
+$pages['Client-side Scripting'] = array(
+ 'Advanced/Scripts.page',
+ 'Advanced/Scripts1.page',
+ 'Advanced/Scripts2.page',
+ 'Advanced/Scripts3.page');
//-------------- END CONFIG ------------------
@@ -273,12 +282,51 @@ function get_section_label($section) return '\hypertarget{'.str_replace('/', '.', $section).'}{}';
}
+
+function set_header_id($content, $count)
+{
+ global $header_count;
+ $header_count = $count*100;
+ $content = preg_replace_callback('/<h1>/', "h1", $content);
+ $content = preg_replace_callback('/<h2>/', "h2", $content);
+ $content = preg_replace_callback('/<h3>/', "h3", $content);
+ return $content;
+}
+
+function h1($matches)
+{
+ global $header_count;
+ return "<h1 id=\"".(++$header_count)."\">";
+}
+
+function h2($matches)
+{
+ global $header_count;
+ return "<h2 id=\"".(++$header_count)."\">";
+}
+
+function h3($matches)
+{
+ global $header_count;
+ return "<h3 id=\"".(++$header_count)."\">";
+}
+
+$header_count = 0;
+
//--------------- BEGIN PROCESSING -------------------
+
+//--------------- Indexer -------------------
+
+require_once('create_index.php');
+$indexer = new quickstart_index($index_dir);
+
// ---------------- Create the Tex files ---------
$count = 1;
+$j = 1;
$current_path = '';
echo "Compiling .page files to Latex files\n\n";
+
foreach($pages as $chapter => $sections)
{
$content = '\chapter{'.$chapter.'}'.get_chapter_label($chapter);
@@ -289,8 +337,16 @@ foreach($pages as $chapter => $sections) echo " Adding $section\n";
$page = $base.'/'.$section;
$current_path = $page;
+
+ //add id to <h1>, <h2>, <3>
+ $content = set_header_id(file_get_contents($page),$j++);
+ file_put_contents($page, $content);
+
$content .= get_section_label($section);
- $content .= parse_html($page,file_get_contents($page));
+ $file_content = file_get_contents($page);
+ $tex = parse_html($page,$file_content);
+ $content .= $tex;
+ $indexer->add($file_content,$section, filemtime($page));
}
//var_dump($content);
@@ -299,6 +355,7 @@ foreach($pages as $chapter => $sections) echo "\n";
}
+$indexer->commit();
if($argc <= 1 && $count > 1)
{
diff --git a/buildscripts/texbuilder/create_index.php b/buildscripts/texbuilder/create_index.php new file mode 100644 index 00000000..db72c453 --- /dev/null +++ b/buildscripts/texbuilder/create_index.php @@ -0,0 +1,87 @@ +<?php
+
+// Create quickstart search index
+require_once (dirname(__FILE__).'/Zend/Search/Lucene.php');
+
+class quickstart_index
+{
+ private $_index;
+ private $_dir;
+
+ public function __construct($index_file)
+ {
+ $this->_index = new Zend_Search_Lucene($index_file, true);
+ $this->_dir = $index_file;
+ echo "Building search index...\n";
+ }
+
+ public function add($content, $section, $mtime)
+ {
+ foreach($this->split_headings($content) as $headers)
+ {
+ $doc = new Zend_Search_Lucene_Document();
+ $link = "index.php?page=".preg_replace('/\/|\\\/', '.', $section);
+ $link = str_replace('.page', '', $link).'#'.$headers['section'];
+
+ //unsearchable text
+ $doc->addField(Zend_Search_Lucene_Field::UnIndexed('link', $link));
+ $doc->addField(Zend_Search_Lucene_Field::UnIndexed('mtime', $mtime));
+ $doc->addField(Zend_Search_Lucene_Field::UnIndexed('title', $headers['title']));
+ $doc->addField(Zend_Search_Lucene_Field::UnIndexed('text', $headers['content']));
+
+ //searchable text
+ $doc->addField(Zend_Search_Lucene_Field::Keyword('page', strtolower($headers['title'])));
+ $body = strtolower($this->sanitize($headers['content']));
+ $doc->addField(Zend_Search_Lucene_Field::Unstored('contents',$body));
+ $this->_index->addDocument($doc);
+ }
+ }
+
+ function sanitize($input)
+ {
+ return htmlentities(strip_tags( $input ));
+ }
+
+ public function index()
+ {
+ return $this->_index;
+ }
+
+ protected function split_headings($html)
+ {
+ $html = preg_replace('/<\/?com:TContent[^<]*>/', '', $html);
+
+ $html = preg_replace('/<b>([^<]*)<\/b>/', '$1', $html);
+ $html = preg_replace('/<i>([^<]*)<\/i>/', '$1', $html);
+ $html = preg_replace('/<tt>([^<]*)<\/tt>/', '$1', $html);
+
+ $html = preg_replace('/<h1([^>]*)>([^<]*)<\/h1>/', '<hh$1>$2</hh>', $html);
+ $html = preg_replace('/<h2([^>]*)>([^<]*)<\/h2>/', '<hh$1>$2</hh>', $html);
+ $html = preg_replace('/<h3([^>]*)>([^<]*)<\/h3>/', '<hh$1>$2</hh>', $html);
+
+
+ $sections = preg_split('/<hh[^>]*>([^<]+)<\/hh>/', $html,-1);
+ $headers = array();
+ preg_match_all('/<hh([^>]*)>([^<]+)<\/hh>/', $html, $headers);
+ $contents = array();
+ for($i = 1, $t = count($sections); $i < $t; $i++)
+ {
+ $content['title'] = trim($this->sanitize($headers[2][$i-1]));
+ $sec = array();
+ preg_match('/"([^"]*)"/', $headers[1][$i-1], $sec);
+ $content['section'] = str_replace('"', '',$sec[0]);
+ $content['content'] = trim($this->sanitize($sections[$i]));
+ $contents[] = $content;
+ }
+
+ return $contents;
+ }
+
+ public function commit()
+ {
+ $this->_index->commit();
+ $count = $this->_index->count();
+ echo "\nSaving search index ({$count}) to {$this->_dir}\n\n";
+ }
+}
+?>
\ No newline at end of file diff --git a/buildscripts/texbuilder/prado3_quick_start.tex b/buildscripts/texbuilder/prado3_quick_start.tex index a4105685..4fc1bbd6 100644 --- a/buildscripts/texbuilder/prado3_quick_start.tex +++ b/buildscripts/texbuilder/prado3_quick_start.tex @@ -114,5 +114,6 @@ OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \include{ch7}
\include{ch8}
\include{ch9}
+\include{ch10}
\end{document}
|