diff options
Diffstat (limited to 'buildscripts/texbuilder/Zend/Search/Lucene/Index')
6 files changed, 0 insertions, 1401 deletions
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/FieldInfo.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/FieldInfo.php deleted file mode 100644 index eaca4ecf..00000000 --- a/buildscripts/texbuilder/Zend/Search/Lucene/Index/FieldInfo.php +++ /dev/null @@ -1,43 +0,0 @@ -<?php -/** - * Zend Framework - * - * LICENSE - * - * This source file is subject to version 1.0 of the Zend Framework - * license, that is bundled with this package in the file LICENSE, and - * is available through the world-wide-web at the following URL: - * http://www.zend.com/license/framework/1_0.txt. If you did not receive - * a copy of the Zend Framework license and are unable to obtain it - * through the world-wide-web, please send a note to license@zend.com - * so we can mail you a copy immediately. - * - * @package Zend_Search_Lucene - * @subpackage Index - * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) - * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 - */ - - -/** - * @package Zend_Search_Lucene - * @subpackage Index - * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. 
(http://www.zend.com) - * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 - */ -class Zend_Search_Lucene_Index_FieldInfo -{ - public $name; - public $isIndexed; - public $number; - public $storeTermVector; - - public function __construct( $name, $isIndexed, $number, $storeTermVector ) - { - $this->name = $name; - $this->isIndexed = $isIndexed; - $this->number = $number; - $this->storeTermVector = $storeTermVector; - } -} - diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentInfo.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentInfo.php deleted file mode 100644 index f5c596a0..00000000 --- a/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentInfo.php +++ /dev/null @@ -1,412 +0,0 @@ -<?php -/** - * Zend Framework - * - * LICENSE - * - * This source file is subject to version 1.0 of the Zend Framework - * license, that is bundled with this package in the file LICENSE, and - * is available through the world-wide-web at the following URL: - * http://www.zend.com/license/framework/1_0.txt. If you did not receive - * a copy of the Zend Framework license and are unable to obtain it - * through the world-wide-web, please send a note to license@zend.com - * so we can mail you a copy immediately. - * - * @package Zend_Search_Lucene - * @subpackage Index - * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) - * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 - */ - - -/** Zend_Search_Lucene_Exception */ -require_once 'Zend/Search/Lucene/Exception.php'; - - -/** - * @package Zend_Search_Lucene - * @subpackage Index - * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. 
(http://www.zend.com) - * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 - */ -class Zend_Search_Lucene_Index_SegmentInfo -{ - /** - * Number of docs in a segment - * - * @var integer - */ - private $_docCount; - - /** - * Segment name - * - * @var string - */ - private $_name; - - /** - * Term Dictionary Index - * Array of the Zend_Search_Lucene_Index_Term objects - * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos - * - * @var array - */ - private $_termDictionary; - - /** - * Term Dictionary Index TermInfos - * Array of the Zend_Search_Lucene_Index_TermInfo objects - * - * @var array - */ - private $_termDictionaryInfos; - - /** - * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment - * - * @var array - */ - private $_fields; - - /** - * Field positions in a dictionary. - * (Term dictionary contains filelds ordered by names) - * - * @var array - */ - private $_fieldsDicPositions; - - - /** - * Associative array where the key is the file name and the value is data offset - * in a compound segment file (.csf). - * - * @var array - */ - private $_segFiles; - - /** - * File system adapter. - * - * @var Zend_Search_Lucene_Storage_Directory_Filesystem - */ - private $_directory; - - /** - * Normalization factors. - * An array fieldName => normVector - * normVector is a binary string. - * Each byte corresponds to an indexed document in a segment and - * encodes normalization factor (float value, encoded by - * Zend_Search_Lucene_Search_Similarity::encodeNorm()) - * - * @var array - */ - private $_norms = array(); - - /** - * Zend_Search_Lucene_Index_SegmentInfo constructor needs Segmentname, - * Documents count and Directory as a parameter. 
- * - * @param string $name - * @param integer $docCount - * @param Zend_Search_Lucene_Storage_Directory $directory - */ - public function __construct($name, $docCount, $directory) - { - $this->_name = $name; - $this->_docCount = $docCount; - $this->_directory = $directory; - $this->_termDictionary = null; - - $this->_segFiles = array(); - $cfsFile = $this->_directory->getFileObject($name . '.cfs'); - $segFilesCount = $cfsFile->readVInt(); - - for ($count = 0; $count < $segFilesCount; $count++) { - $dataOffset = $cfsFile->readLong(); - $fileName = $cfsFile->readString(); - $this->_segFiles[$fileName] = $dataOffset; - } - - $fnmFile = $this->openCompoundFile('.fnm'); - $fieldsCount = $fnmFile->readVInt(); - $fieldNames = array(); - $fieldNums = array(); - $this->_fields = array(); - for ($count=0; $count < $fieldsCount; $count++) { - $fieldName = $fnmFile->readString(); - $fieldBits = $fnmFile->readByte(); - $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName, - $fieldBits & 1, - $count, - $fieldBits & 2 ); - if ($fieldBits & 0x10) { - // norms are omitted for the indexed field - $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount); - } - - $fieldNums[$count] = $count; - $fieldNames[$count] = $fieldName; - } - array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums); - $this->_fieldsDicPositions = array_flip($fieldNums); - } - - /** - * Opens index file stoted within compound index file - * - * @param string $extension - * @throws Zend_Search_Lucene_Exception - * @return Zend_Search_Lucene_Storage_File - */ - public function openCompoundFile($extension) - { - $filename = $this->_name . $extension; - - if( !isset($this->_segFiles[ $filename ]) ) { - throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain ' - . $filename . ' file.' 
); - } - - $file = $this->_directory->getFileObject( $this->_name.".cfs" ); - $file->seek( $this->_segFiles[ $filename ] ); - return $file; - } - - /** - * Returns field index or -1 if field is not found - * - * @param string $fieldName - * @return integer - */ - public function getFieldNum($fieldName) - { - foreach( $this->_fields as $field ) { - if( $field->name == $fieldName ) { - return $field->number; - } - } - - return -1; - } - - /** - * Returns field info for specified field - * - * @param integer $fieldNum - * @return ZSearchFieldInfo - */ - public function getField($fieldNum) - { - return $this->_fields[$fieldNum]; - } - - /** - * Returns array of fields. - * if $indexed parameter is true, then returns only indexed fields. - * - * @param boolean $indexed - * @return array - */ - public function getFields($indexed = false) - { - $result = array(); - foreach( $this->_fields as $field ) { - if( (!$indexed) || $field->isIndexed ) { - $result[ $field->name ] = $field->name; - } - } - return $result; - } - - /** - * Returns the total number of documents in this segment. 
- * - * @return integer - */ - public function count() - { - return $this->_docCount; - } - - - /** - * Loads Term dictionary from TermInfoIndex file - */ - protected function _loadDictionary() - { - if ($this->_termDictionary !== null) { - return; - } - - $this->_termDictionary = array(); - $this->_termDictionaryInfos = array(); - - $tiiFile = $this->openCompoundFile('.tii'); - $tiVersion = $tiiFile->readInt(); - if ($tiVersion != (int)0xFFFFFFFE) { - throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format'); - } - - $indexTermCount = $tiiFile->readLong(); - $tiiFile->readInt(); // IndexInterval - $skipInterval = $tiiFile->readInt(); - - $prevTerm = ''; - $freqPointer = 0; - $proxPointer = 0; - $indexPointer = 0; - for ($count = 0; $count < $indexTermCount; $count++) { - $termPrefixLength = $tiiFile->readVInt(); - $termSuffix = $tiiFile->readString(); - $termValue = substr( $prevTerm, 0, $termPrefixLength ) . $termSuffix; - - $termFieldNum = $tiiFile->readVInt(); - $docFreq = $tiiFile->readVInt(); - $freqPointer += $tiiFile->readVInt(); - $proxPointer += $tiiFile->readVInt(); - if( $docFreq >= $skipInterval ) { - $skipDelta = $tiiFile->readVInt(); - } else { - $skipDelta = 0; - } - - $indexPointer += $tiiFile->readVInt(); - - $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue,$termFieldNum); - $this->_termDictionaryInfos[] = - new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer); - $prevTerm = $termValue; - } - } - - - /** - * Return segment name - * - * @return string - */ - public function getName() - { - return $this->_name; - } - - - /** - * Scans terms dictionary and returns term info - * - * @param Zend_Search_Lucene_Index_Term $term - * @return Zend_Search_Lucene_Index_TermInfo - */ - public function getTermInfo($term) - { - $this->_loadDictionary(); - - $searchField = $this->getFieldNum($term->field); - - if ($searchField == -1) { - return null; - } - $searchDicField 
= $this->_fieldsDicPositions[$searchField]; - - // search for appropriate value in dictionary - $lowIndex = 0; - $highIndex = count($this->_termDictionary)-1; - while ($highIndex >= $lowIndex) { - // $mid = ($highIndex - $lowIndex)/2; - $mid = ($highIndex + $lowIndex) >> 1; - $midTerm = $this->_termDictionary[$mid]; - - $delta = $searchDicField - $this->_fieldsDicPositions[$midTerm->field]; - if ($delta == 0) { - $delta = strcmp($term->text, $midTerm->text); - } - - if ($delta < 0) { - $highIndex = $mid-1; - } elseif ($delta > 0) { - $lowIndex = $mid+1; - } else { - return $this->_termDictionaryInfos[$mid]; // We got it! - } - } - - if ($highIndex == -1) { - // Term is out of the dictionary range - return null; - } - - $prevPosition = $highIndex; - $prevTerm = $this->_termDictionary[$prevPosition]; - $prevTermInfo = $this->_termDictionaryInfos[ $prevPosition ]; - - $tisFile = $this->openCompoundFile('.tis'); - $tiVersion = $tisFile->readInt(); - if ($tiVersion != (int)0xFFFFFFFE) { - throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format'); - } - - $termCount = $tisFile->readLong(); - $indexInterval = $tisFile->readInt(); - $skipInterval = $tisFile->readInt(); - - $tisFile->seek($prevTermInfo->indexPointer - 20 /* header size*/, SEEK_CUR); - - $termValue = $prevTerm->text; - $termFieldNum = $prevTerm->field; - $freqPointer = $prevTermInfo->freqPointer; - $proxPointer = $prevTermInfo->proxPointer; - for ($count = $prevPosition*$indexInterval + 1; - $count < $termCount && - ( $this->_fieldsDicPositions[ $termFieldNum ] < $searchDicField || - ($this->_fieldsDicPositions[ $termFieldNum ] == $searchDicField && - strcmp($termValue, $term->text) < 0) ); - $count++) { - $termPrefixLength = $tisFile->readVInt(); - $termSuffix = $tisFile->readString(); - $termFieldNum = $tisFile->readVInt(); - $termValue = substr( $termValue, 0, $termPrefixLength ) . 
$termSuffix; - - $docFreq = $tisFile->readVInt(); - $freqPointer += $tisFile->readVInt(); - $proxPointer += $tisFile->readVInt(); - if( $docFreq >= $skipInterval ) { - $skipOffset = $tisFile->readVInt(); - } else { - $skipOffset = 0; - } - } - - if ($termFieldNum == $searchField && $termValue == $term->text) { - return new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset); - } else { - return null; - } - } - - /** - * Returns normalization factor for specified documents - * - * @param integer $id - * @param string $fieldName - * @return string - */ - public function norm($id, $fieldName) - { - $fieldNum = $this->getFieldNum($fieldName); - - if ( !($this->_fields[$fieldNum]->isIndexed) ) { - return null; - } - - if ( !isset( $this->_norms[$fieldNum] )) { - $fFile = $this->openCompoundFile('.f' . $fieldNum); - $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount); - } - - return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum]{$id}) ); - } -} - diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php deleted file mode 100644 index f90d6ed3..00000000 --- a/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php +++ /dev/null @@ -1,491 +0,0 @@ -<?php -/** - * Zend Framework - * - * LICENSE - * - * This source file is subject to version 1.0 of the Zend Framework - * license, that is bundled with this package in the file LICENSE, and - * is available through the world-wide-web at the following URL: - * http://www.zend.com/license/framework/1_0.txt. If you did not receive - * a copy of the Zend Framework license and are unable to obtain it - * through the world-wide-web, please send a note to license@zend.com - * so we can mail you a copy immediately. - * - * @package Zend_Search_Lucene - * @subpackage Index - * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. 
(http://www.zend.com) - * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 - */ - - -/** Zend_Search_Lucene_Exception */ -require_once 'Zend/Search/Lucene/Exception.php'; - -/** Zend_Search_Lucene_Analysis_Analyzer */ -require_once 'Zend/Search/Lucene/Analysis/Analyzer.php'; - -/** Zend_Search_Lucene_Index_SegmentInfo */ -require_once 'Zend/Search/Lucene/Index/SegmentInfo.php'; - - -/** - * @package Zend_Search_Lucene - * @subpackage Index - * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) - * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 - */ -class Zend_Search_Lucene_Index_SegmentWriter -{ - /** - * Expert: The fraction of terms in the "dictionary" which should be stored - * in RAM. Smaller values use more memory, but make searching slightly - * faster, while larger values use less memory and make searching slightly - * slower. Searching is typically not dominated by dictionary lookup, so - * tweaking this is rarely useful. - * - * @var integer - */ - static public $indexInterval = 128; - - /** Expert: The fraction of TermDocs entries stored in skip tables. - * Larger values result in smaller indexes, greater acceleration, but fewer - * accelerable cases, while smaller values result in bigger indexes, - * less acceleration and more - * accelerable cases. More detailed experiments would be useful here. - * - * 0x0x7FFFFFFF indicates that we don't use skip data - * Default value is 16 - * - * @var integer - */ - static public $skipInterval = 0x7FFFFFFF; - - /** - * Number of docs in a segment - * - * @var integer - */ - private $_docCount; - - /** - * Segment name - * - * @var string - */ - private $_name; - - /** - * File system adapter. - * - * @var Zend_Search_Lucene_Storage_Directory - */ - private $_directory; - - /** - * List of the index files. 
- * Used for automatic compound file generation - * - * @var unknown_type - */ - private $_files; - - /** - * Term Dictionary - * Array of the Zend_Search_Lucene_Index_Term objects - * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos - * - * @var array - */ - private $_termDictionary; - - /** - * Documents, which contain the term - * - * @var array - */ - private $_termDocs; - - /** - * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment - * - * @var array - */ - private $_fields; - - /** - * Normalization factors. - * An array fieldName => normVector - * normVector is a binary string. - * Each byte corresponds to an indexed document in a segment and - * encodes normalization factor (float value, encoded by - * Zend_Search_Lucene_Search_Similarity::encodeNorm()) - * - * @var array - */ - private $_norms; - - - /** - * '.fdx' file - Stored Fields, the field index. - * - * @var Zend_Search_Lucene_Storage_File - */ - private $_fdxFile; - - /** - * '.fdx' file - Stored Fields, the field data. - * - * @var Zend_Search_Lucene_Storage_File - */ - private $_fdtFile; - - - /** - * Object constructor. 
- * - * @param Zend_Search_Lucene_Storage_Directory $directory - * @param string $name - */ - public function __construct($directory, $name) - { - $this->_directory = $directory; - $this->_name = $name; - $this->_docCount = 0; - - $this->_fields = array(); - $this->_termDocs = array(); - $this->_files = array(); - $this->_norms = array(); - - $this->_fdxFile = null; - $this->_fdtFile = null; - } - - - /** - * Add field to the segment - * - * @param Zend_Search_Lucene_Field $field - */ - private function _addFieldInfo(Zend_Search_Lucene_Field $field) - { - if (!isset($this->_fields[$field->name])) { - $this->_fields[$field->name] = - new Zend_Search_Lucene_Index_FieldInfo($field->name, - $field->isIndexed, - count($this->_fields), - $field->storeTermVector); - } else { - $this->_fields[$field->name]->isIndexed |= $field->isIndexed; - $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector; - } - } - - - /** - * Adds a document to this segment. - * - * @param Zend_Search_Lucene_Document $document - * @throws Zend_Search_Lucene_Exception - */ - public function addDocument(Zend_Search_Lucene_Document $document) - { - $storedFields = array(); - - foreach ($document->getFieldNames() as $fieldName) { - $field = $document->getField($fieldName); - $this->_addFieldInfo($field); - - if ($field->storeTermVector) { - /** - * @todo term vector storing support - */ - throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.'); - } - - if ($field->isIndexed) { - if ($field->isTokenized) { - $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue); - } else { - $tokenList = array(); - $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue)); - } - - $position = 0; - foreach ($tokenList as $token) { - $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name); - $termKey = $term->key(); - - if 
(!isset($this->_termDictionary[$termKey])) { - // New term - $this->_termDictionary[$termKey] = $term; - $this->_termDocs[$termKey] = array(); - $this->_termDocs[$termKey][$this->_docCount] = array(); - } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) { - // Existing term, but new term entry - $this->_termDocs[$termKey][$this->_docCount] = array(); - } - $position += $token->getPositionIncrement(); - $this->_termDocs[$termKey][$this->_docCount][] = $position; - } - } - - if ($field->isStored) { - $storedFields[] = $field; - } - } - - if (count($storedFields) != 0) { - if (!isset($this->_fdxFile)) { - $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx'); - $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt'); - - $this->_files[] = $this->_name . '.fdx'; - $this->_files[] = $this->_name . '.fdt'; - } - - $this->_fdxFile->writeLong($this->_fdtFile->tell()); - - $this->_fdtFile->writeVInt(count($storedFields)); - foreach ($storedFields as $field) { - $this->_fdtFile->writeVInt($this->_fields[$field->name]->number); - $this->_fdtFile->writeByte($field->isTokenized ? 0x01 : 0x00 | - $field->isBinary ? 0x02 : 0x00 | - 0x00 /* 0x04 - third bit, compressed (ZLIB) */ ); - if ($field->isBinary) { - $this->_fdtFile->writeVInt(strlen($field->stringValue)); - $this->_fdtFile->writeBytes($field->stringValue); - } else { - $this->_fdtFile->writeString($field->stringValue); - } - } - } - - $this->_docCount++; - } - - - /** - * Dump Field Info (.fnm) segment file - */ - private function _dumpFNM() - { - $fnmFile = $this->_directory->createFile($this->_name . '.fnm'); - $fnmFile->writeVInt(count($this->_fields)); - - foreach ($this->_fields as $field) { - $fnmFile->writeString($field->name); - $fnmFile->writeByte(($field->isIndexed ? 0x01 : 0x00) | - ($field->storeTermVector ? 
0x02 : 0x00) | -// not supported yet 0x04 /* term positions are stored with the term vectors */ | -// not supported yet 0x08 /* term offsets are stored with the term vectors */ | -/* not supported yet */ 0x10 /* norms are omitted for the indexed field */ - ); - } - - $this->_files[] = $this->_name . '.fnm'; - } - - - /** - * Dump Term Dictionary segment file entry. - * Used to write entry to .tis or .tii files - * - * @param Zend_Search_Lucene_Storage_File $dicFile - * @param Zend_Search_Lucene_Index_Term $prevTerm - * @param Zend_Search_Lucene_Index_Term $term - * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo - * @param Zend_Search_Lucene_Index_TermInfo $termInfo - */ - private function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile, - &$prevTerm, Zend_Search_Lucene_Index_Term $term, - &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo) - { - if (isset($prevTerm) && $prevTerm->field == $term->field) { - $prefixLength = 0; - while ($prefixLength < strlen($prevTerm->text) && - $prefixLength < strlen($term->text) && - $prevTerm->text{$prefixLength} == $term->text{$prefixLength} - ) { - $prefixLength++; - } - // Write preffix length - $dicFile->writeVInt($prefixLength); - // Write suffix - $dicFile->writeString( substr($term->text, $prefixLength) ); - } else { - // Write preffix length - $dicFile->writeVInt(0); - // Write suffix - $dicFile->writeString($term->text); - } - // Write field number - $dicFile->writeVInt($term->field); - // DocFreq (the count of documents which contain the term) - $dicFile->writeVInt($termInfo->docFreq); - - $prevTerm = $term; - - if (!isset($prevTermInfo)) { - // Write FreqDelta - $dicFile->writeVInt($termInfo->freqPointer); - // Write ProxDelta - $dicFile->writeVInt($termInfo->proxPointer); - } else { - // Write FreqDelta - $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer); - // Write ProxDelta - $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer); - } - // Write 
SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval - if ($termInfo->skipOffset != 0) { - $dicFile->writeVInt($termInfo->skipOffset); - } - - $prevTermInfo = $termInfo; - } - - /** - * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files - */ - private function _dumpDictionary() - { - $tisFile = $this->_directory->createFile($this->_name . '.tis'); - $tisFile->writeInt((int)0xFFFFFFFE); - $tisFile->writeLong(count($this->_termDictionary)); - $tisFile->writeInt(self::$indexInterval); - $tisFile->writeInt(self::$skipInterval); - - $tiiFile = $this->_directory->createFile($this->_name . '.tii'); - $tiiFile->writeInt((int)0xFFFFFFFE); - $tiiFile->writeLong((int)((count($this->_termDictionary) - 1)/self::$indexInterval) + 1); - $tiiFile->writeInt(self::$indexInterval); - $tiiFile->writeInt(self::$skipInterval); - - $frqFile = $this->_directory->createFile($this->_name . '.frq'); - $prxFile = $this->_directory->createFile($this->_name . '.prx'); - - $termKeys = array_keys($this->_termDictionary); - sort($termKeys, SORT_STRING); - - $termCount = 0; - - $prevTerm = null; - $prevTermInfo = null; - $prevIndexTerm = null; - $prevIndexTermInfo = null; - $prevIndexPosition = 0; - - foreach ($termKeys as $termId) { - $freqPointer = $frqFile->tell(); - $proxPointer = $prxFile->tell(); - - $prevDoc = 0; - foreach ($this->_termDocs[$termId] as $docId => $termPositions) { - $docDelta = ($docId - $prevDoc)*2; - $prevDoc = $docId; - if (count($termPositions) > 1) { - $frqFile->writeVInt($docDelta); - $frqFile->writeVInt(count($termPositions)); - } else { - $frqFile->writeVInt($docDelta + 1); - } - - $prevPosition = 0; - foreach ($termPositions as $position) { - $prxFile->writeVInt($position - $prevPosition); - $prevPosition = $position; - } - } - - if (count($this->_termDocs[$termId]) >= self::$skipInterval) { - /** - * @todo Write Skip Data to a freq file. 
- * It's not used now, but must be implemented to be compatible with Lucene - */ - $skipOffset = $frqFile->tell() - $freqPointer; - } else { - $skipOffset = 0; - } - - $term = new Zend_Search_Lucene_Index_Term($this->_termDictionary[$termId]->text, - $this->_fields[$this->_termDictionary[$termId]->field]->number); - $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($this->_termDocs[$termId]), - $freqPointer, $proxPointer, $skipOffset); - - $this->_dumpTermDictEntry($tisFile, $prevTerm, $term, $prevTermInfo, $termInfo); - - if ($termCount % self::$indexInterval == 0) { - $this->_dumpTermDictEntry($tiiFile, $prevIndexTerm, $term, $prevIndexTermInfo, $termInfo); - - $indexPosition = $tisFile->tell(); - $tiiFile->writeVInt($indexPosition - $prevIndexPosition); - $prevIndexPosition = $indexPosition; - } - $termCount++; - } - - $this->_files[] = $this->_name . '.tis'; - $this->_files[] = $this->_name . '.tii'; - $this->_files[] = $this->_name . '.frq'; - $this->_files[] = $this->_name . '.prx'; - } - - - /** - * Generate compound index file - */ - private function _generateCFS() - { - $cfsFile = $this->_directory->createFile($this->_name . 
'.cfs'); - $cfsFile->writeVInt(count($this->_files)); - - $dataOffsetPointers = array(); - foreach ($this->_files as $fileName) { - $dataOffsetPointers[$fileName] = $cfsFile->tell(); - $cfsFile->writeLong(0); // write dummy data - $cfsFile->writeString($fileName); - } - - foreach ($this->_files as $fileName) { - // Get actual data offset - $dataOffset = $cfsFile->tell(); - // Seek to the data offset pointer - $cfsFile->seek($dataOffsetPointers[$fileName]); - // Write actual data offset value - $cfsFile->writeLong($dataOffset); - // Seek back to the end of file - $cfsFile->seek($dataOffset); - - $dataFile = $this->_directory->getFileObject($fileName); - $cfsFile->writeBytes($dataFile->readBytes($this->_directory->fileLength($fileName))); - - $this->_directory->deleteFile($fileName); - } - } - - - /** - * Close segment, write it to disk and return segment info - * - * @return Zend_Search_Lucene_Index_SegmentInfo - */ - public function close() - { - if ($this->_docCount == 0) { - return null; - } - - $this->_dumpFNM(); - $this->_dumpDictionary(); - - $this->_generateCFS(); - - return new Zend_Search_Lucene_Index_SegmentInfo($this->_name, - $this->_docCount, - $this->_directory); - } - -} - diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/Term.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/Term.php deleted file mode 100644 index e30ce587..00000000 --- a/buildscripts/texbuilder/Zend/Search/Lucene/Index/Term.php +++ /dev/null @@ -1,70 +0,0 @@ -<?php -/** - * Zend Framework - * - * LICENSE - * - * This source file is subject to version 1.0 of the Zend Framework - * license, that is bundled with this package in the file LICENSE, and - * is available through the world-wide-web at the following URL: - * http://www.zend.com/license/framework/1_0.txt. If you did not receive - * a copy of the Zend Framework license and are unable to obtain it - * through the world-wide-web, please send a note to license@zend.com - * so we can mail you a copy immediately. 
- * - * @package Zend_Search_Lucene - * @subpackage Index - * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) - * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 - */ - - -/** - * A Term represents a word from text. This is the unit of search. It is - * composed of two elements, the text of the word, as a string, and the name of - * the field that the text occured in, an interned string. - * - * Note that terms may represent more than words from text fields, but also - * things like dates, email addresses, urls, etc. - * - * @package Zend_Search_Lucene - * @subpackage Index - * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) - * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 - */ -class Zend_Search_Lucene_Index_Term -{ - /** - * Field name or field number (depending from context) - * - * @var mixed - */ - public $field; - - /** - * Term value - * - * @var string - */ - public $text; - - - /** - * @todo docblock - */ - public function __construct( $text, $field = 'contents' ) - { - $this->field = $field; - $this->text = $text; - } - - - /** - * @todo docblock - */ - public function key() - { - return $this->field . chr(0) . $this->text; - } -} - diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/TermInfo.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/TermInfo.php deleted file mode 100644 index ddef721d..00000000 --- a/buildscripts/texbuilder/Zend/Search/Lucene/Index/TermInfo.php +++ /dev/null @@ -1,77 +0,0 @@ -<?php -/** - * Zend Framework - * - * LICENSE - * - * This source file is subject to version 1.0 of the Zend Framework - * license, that is bundled with this package in the file LICENSE, and - * is available through the world-wide-web at the following URL: - * http://www.zend.com/license/framework/1_0.txt. 
If you did not receive - * a copy of the Zend Framework license and are unable to obtain it - * through the world-wide-web, please send a note to license@zend.com - * so we can mail you a copy immediately. - * - * @package Zend_Search_Lucene - * @subpackage Index - * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) - * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 - */ - - -/** - * A Zend_Search_Lucene_Index_TermInfo represents a record of information stored for a term. - * - * @package Zend_Search_Lucene - * @subpackage Index - * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) - * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 - */ -class Zend_Search_Lucene_Index_TermInfo -{ - /** - * The number of documents which contain the term. - * - * @var integer - */ - public $docFreq; - - /** - * Data offset in a Frequencies file. - * - * @var integer - */ - public $freqPointer; - - /** - * Data offset in a Positions file. - * - * @var integer - */ - public $proxPointer; - - /** - * ScipData offset in a Frequencies file. - * - * @var integer - */ - public $skipOffset; - - /** - * Term offset of the _next_ term in a TermDictionary file. 
- * Used only for Term Index - * - * @var integer - */ - public $indexPointer; - - public function __construct($docFreq, $freqPointer, $proxPointer, $skipOffset, $indexPointer = null) - { - $this->docFreq = $docFreq; - $this->freqPointer = $freqPointer; - $this->proxPointer = $proxPointer; - $this->skipOffset = $skipOffset; - $this->indexPointer = $indexPointer; - } -} - diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/Writer.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/Writer.php deleted file mode 100644 index da4af000..00000000 --- a/buildscripts/texbuilder/Zend/Search/Lucene/Index/Writer.php +++ /dev/null @@ -1,308 +0,0 @@ -<?php -/** - * Zend Framework - * - * LICENSE - * - * This source file is subject to version 1.0 of the Zend Framework - * license, that is bundled with this package in the file LICENSE, and - * is available through the world-wide-web at the following URL: - * http://www.zend.com/license/framework/1_0.txt. If you did not receive - * a copy of the Zend Framework license and are unable to obtain it - * through the world-wide-web, please send a note to license@zend.com - * so we can mail you a copy immediately. - * - * @package Zend_Search_Lucene - * @subpackage Index - * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) - * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 - */ - - -/** Zend_Search_Lucene_Index_SegmentWriter */ -require_once 'Zend/Search/Lucene/Index/SegmentWriter.php'; - -/** Zend_Search_Lucene_Index_SegmentInfo */ -require_once 'Zend/Search/Lucene/Index/SegmentInfo.php'; - - -/** - * @package Zend_Search_Lucene - * @subpackage Index - * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. 
/**
 * Appends documents to an index by creating new segments and registering
 * them in the index 'segments' file.
 *
 * Documents passed to addDocument() are collected by a
 * Zend_Search_Lucene_Index_SegmentWriter; commit() closes that segment and
 * rewrites the 'segments' file so that it lists the new segment(s).
 *
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Index_Writer
{
    /**
     * @todo Implement segment merger
     * @todo Implement mergeFactor, minMergeDocs, maxMergeDocs usage.
     * @todo Implement Analyzer substitution
     * @todo Implement Zend_Search_Lucene_Storage_DirectoryRAM and Zend_Search_Lucene_Storage_FileRAM to use it for
     *       temporary index files
     * @todo Directory lock processing
     */

    /**
     * File system adapter.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    private $_directory = null;


    /**
     * Index version.
     * Incremented by addDocument() for every document added.
     *
     * @var integer
     */
    private $_version;

    /**
     * Segment name counter.
     * Used to name new segments.
     *
     * @var integer
     */
    private $_segmentNameCounter;

    /**
     * Number of segments recorded in the 'segments' file.
     *
     * @var integer
     */
    private $_segments;

    /**
     * Determines how often segment indices
     * are merged by addDocument().
     * Not used by this class yet (see the todo list above).
     *
     * @var integer
     */
    public $mergeFactor;

    /**
     * Determines the minimal number of documents required before
     * the buffered in-memory documents are merged and a new Segment
     * is created.
     * Not used by this class yet (see the todo list above).
     *
     * @var integer
     */
    public $minMergeDocs;

    /**
     * Determines the largest number of documents ever merged by addDocument().
     * Not used by this class yet (see the todo list above).
     *
     * @var integer
     */
    public $maxMergeDocs;

    /**
     * Segments created by this writer since the last commit().
     * Maps segment name => object returned by
     * Zend_Search_Lucene_Index_SegmentWriter::close().
     *
     * @var array
     */
    private $_newSegments;

    /**
     * Segment currently receiving documents, or null when none is open.
     *
     * @var Zend_Search_Lucene_Index_SegmentWriter
     */
    private $_currentSegment;

    /**
     * Opens the index for writing.
     *
     * With $create set, the 'deletable' and 'segments' files and every
     * '*.cfs' file are removed from the directory and fresh, empty
     * 'segments' and 'deletable' files are written. Otherwise the header of
     * the existing 'segments' file is read.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param boolean $create true to create the index or overwrite the existing one
     * @throws Zend_Search_Lucene_Exception if the existing 'segments' file does not
     *                                      start with the expected format marker
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $create = false)
    {
        $this->_directory = $directory;

        if ($create) {
            // Drop the files that make up a previous index.
            foreach ($this->_directory->fileList() as $file) {
                if ($file === 'deletable' ||
                    $file === 'segments'  ||
                    substr($file, -4) === '.cfs') {
                    $this->_directory->deleteFile($file);
                }
            }

            $segmentsFile = $this->_directory->createFile('segments');
            // write format marker
            // NOTE(review): (int)0xFFFFFFFF is -1 on 32-bit PHP but 4294967295 on
            // 64-bit builds - confirm writeInt()/readInt() agree with that.
            $segmentsFile->writeInt((int)0xFFFFFFFF);
            // write version
            $segmentsFile->writeLong(0);
            // write name counter
            $segmentsFile->writeInt(0);
            // write segment counter
            $segmentsFile->writeInt(0);

            $deletableFile = $this->_directory->createFile('deletable');
            // write counter
            $deletableFile->writeInt(0);

            $this->_version            = 0;
            $this->_segmentNameCounter = 0;
            $this->_segments           = 0;
        } else {
            $segmentsFile = $this->_directory->getFileObject('segments');

            $format = $segmentsFile->readInt();
            if ($format != (int)0xFFFFFFFF) {
                throw new Zend_Search_Lucene_Exception('Wrong segments file format');
            }

            // read version
            $this->_version            = $segmentsFile->readLong();
            // read name counter
            $this->_segmentNameCounter = $segmentsFile->readInt();
            // read segment counter
            $this->_segments           = $segmentsFile->readInt();
        }

        $this->_newSegments    = array();
        $this->_currentSegment = null;
    }

    /**
     * Adds a document to this index.
     *
     * A new segment writer is opened on demand; the document only becomes
     * part of the 'segments' file once commit() is called.
     *
     * @param Zend_Search_Lucene_Document $document
     */
    public function addDocument(Zend_Search_Lucene_Document $document)
    {
        if ($this->_currentSegment === null) {
            $this->_currentSegment =
                new Zend_Search_Lucene_Index_SegmentWriter($this->_directory, $this->_newSegmentName());
        }
        $this->_currentSegment->addDocument($document);
        $this->_version++;
    }


    /**
     * Rewrites the 'segments' file so that it also lists the new segments.
     *
     * The new content is written to 'segments.new' (fresh header, the segment
     * entries copied from the current file, then one entry per new segment)
     * which is finally renamed to 'segments'.
     *
     * @todo !!!!!Finish the implementation
     *
     * @throws Zend_Search_Lucene_Exception
     */
    private function _updateSegments()
    {
        // Header written below: int + long + int + int; the original code
        // treats it as 20 bytes (presumably 4-byte ints and an 8-byte long).
        $headerLength = 20;
        $added        = count($this->_newSegments);

        $segmentsFile   = $this->_directory->getFileObject('segments');
        $newSegmentFile = $this->_directory->createFile('segments.new');

        $newSegmentFile->writeInt((int)0xFFFFFFFF);
        $newSegmentFile->writeLong($this->_version);
        $newSegmentFile->writeInt($this->_segmentNameCounter);
        $newSegmentFile->writeInt($this->_segments + $added);

        // Copy the already registered segment entries verbatim.
        $segmentsFile->seek($headerLength);
        $newSegmentFile->writeBytes(
            $segmentsFile->readBytes($this->_directory->fileLength('segments') - $headerLength)
        );

        foreach ($this->_newSegments as $segmentName => $segmentInfo) {
            $newSegmentFile->writeString($segmentName);
            $newSegmentFile->writeInt($segmentInfo->count());
        }

        $this->_directory->renameFile('segments.new', 'segments');

        // Keep the in-memory counter in step with the file just written;
        // otherwise the next commit() of this writer would store a segment
        // count that is too small for the entries the file contains.
        $this->_segments += $added;
    }


    /**
     * Commits current changes.
     *
     * Closes the segment being written (if any), registers all pending
     * segments in the 'segments' file and clears the pending list.
     *
     * @return array segments added by this commit, keyed by segment name;
     *               empty when there was nothing to commit
     * @throws Zend_Search_Lucene_Exception
     */
    public function commit()
    {
        if ($this->_currentSegment !== null) {
            $newSegment = $this->_currentSegment->close();
            if ($newSegment !== null) {
                $this->_newSegments[$newSegment->getName()] = $newSegment;
            }
            $this->_currentSegment = null;
        }

        if (count($this->_newSegments) != 0) {
            $this->_updateSegments();
        }

        $result = $this->_newSegments;
        $this->_newSegments = array();

        return $result;
    }


    /**
     * Merges the provided indexes into this index.
     * Not implemented yet; currently does nothing.
     *
     * @param array $readers
     * @return void
     */
    public function addIndexes($readers)
    {
        /**
         * @todo implementation
         */
    }


    /**
     * Returns the number of documents currently in this index.
     * Not implemented yet; currently returns nothing (null).
     *
     * @param mixed $readers unused; optional so that the method can be called
     *                       without arguments, as its description implies
     * @return integer
     */
    public function docCount($readers = null)
    {
        /**
         * @todo implementation
         */
    }


    /**
     * Flushes all changes to an index and closes all associated files.
     * Not implemented yet; currently does nothing.
     *
     * @return void
     */
    public function close()
    {
        /**
         * @todo implementation
         */
    }


    /**
     * Merges all segments together into a single segment, optimizing
     * an index for search.
     * Not implemented yet; currently does nothing.
     *
     * @return void
     */
    public function optimize()
    {
        /**
         * @todo implementation
         */
    }

    /**
     * Get name for new segment: '_' followed by the current segment name
     * counter in base 36. The counter is incremented afterwards.
     *
     * @return string
     */
    private function _newSegmentName()
    {
        return '_' . base_convert($this->_segmentNameCounter++, 10, 36);
    }

}