diff options
Diffstat (limited to 'buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php')
-rw-r--r-- | buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php | 491 |
1 files changed, 0 insertions, 491 deletions
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to version 1.0 of the Zend Framework
 * license, that is bundled with this package in the file LICENSE, and
 * is available through the world-wide-web at the following URL:
 * http://www.zend.com/license/framework/1_0.txt. If you did not receive
 * a copy of the Zend Framework license and are unable to obtain it
 * through the world-wide-web, please send a note to license@zend.com
 * so we can mail you a copy immediately.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */


/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';

/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';


/**
 * Writes one index segment.
 *
 * Documents are collected with addDocument(): stored field values are written
 * immediately to the '.fdx'/'.fdt' files, while postings are accumulated in
 * memory. close() then writes the field infos ('.fnm'), the term dictionary
 * ('.tis'/'.tii'), the frequency ('.frq') and position ('.prx') files and packs
 * every generated file into a single compound ('.cfs') file.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Expert: The fraction of terms in the "dictionary" which should be stored
     * in RAM.  Smaller values use more memory, but make searching slightly
     * faster, while larger values use less memory and make searching slightly
     * slower.  Searching is typically not dominated by dictionary lookup, so
     * tweaking this is rarely useful.
     *
     * @var integer
     */
    static public $indexInterval = 128;

    /**
     * Expert: The fraction of TermDocs entries stored in skip tables.
     * Larger values result in smaller indexes, greater acceleration, but fewer
     * accelerable cases, while smaller values result in bigger indexes,
     * less acceleration and more accelerable cases.
     * More detailed experiments would be useful here.
     *
     * 0x7FFFFFFF indicates that we don't use skip data. That is the value
     * assigned here, because skip data is not written by this class yet
     * (see the todo in _dumpDictionary()).
     *
     * NOTE(review): the original comment said "Default value is 16", which
     * does not match the assigned value; presumably it refers to the default
     * of the reference Lucene implementation - confirm.
     *
     * @var integer
     */
    static public $skipInterval = 0x7FFFFFFF;

    /**
     * Number of docs in a segment
     *
     * @var integer
     */
    private $_docCount;

    /**
     * Segment name
     *
     * @var string
     */
    private $_name;

    /**
     * File system adapter.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    private $_directory;

    /**
     * List of the index files (file names).
     * Used for automatic compound file generation
     *
     * @var array
     */
    private $_files;

    /**
     * Term Dictionary
     * Array of the Zend_Search_Lucene_Index_Term objects, indexed by term key
     * (Zend_Search_Lucene_Index_Term::key()).
     *
     * @var array
     */
    private $_termDictionary;

    /**
     * Documents, which contain the term.
     * Structure: termKey => array(docId => array(position, ...))
     *
     * @var array
     */
    private $_termDocs;

    /**
     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment,
     * indexed by field name.
     *
     * @var array
     */
    private $_fields;

    /**
     * Normalization factors.
     * An array fieldName => normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * Only initialized by this class; norms are not written yet.
     *
     * @var array
     */
    private $_norms;


    /**
     * '.fdx' file - Stored Fields, the field index.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_fdxFile;

    /**
     * '.fdt' file - Stored Fields, the field data.
     *
     * @var Zend_Search_Lucene_Storage_File
     */
    private $_fdtFile;


    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct($directory, $name)
    {
        $this->_directory = $directory;
        $this->_name      = $name;
        $this->_docCount  = 0;

        $this->_fields         = array();
        // Must be an array even when no indexed field is ever added:
        // _dumpDictionary() counts and iterates it unconditionally.
        $this->_termDictionary = array();
        $this->_termDocs       = array();
        $this->_files          = array();
        $this->_norms          = array();

        $this->_fdxFile = null;
        $this->_fdtFile = null;
    }


    /**
     * Add field to the segment
     *
     * A new field gets the next free field number; for an already known field
     * the "indexed" and "store term vector" flags are merged with the new ones.
     *
     * @param Zend_Search_Lucene_Field $field
     */
    private function _addFieldInfo(Zend_Search_Lucene_Field $field)
    {
        if (!isset($this->_fields[$field->name])) {
            $this->_fields[$field->name] =
                                new Zend_Search_Lucene_Index_FieldInfo($field->name,
                                                                       $field->isIndexed,
                                                                       count($this->_fields),
                                                                       $field->storeTermVector);
        } else {
            $this->_fields[$field->name]->isIndexed       |= $field->isIndexed;
            $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector;
        }
    }


    /**
     * Adds a document to this segment.
     *
     * Indexed fields are tokenized (or taken as one token) and their postings
     * are collected in memory; stored fields are written to the '.fdx'/'.fdt'
     * files right away.
     *
     * @param Zend_Search_Lucene_Document $document
     * @throws Zend_Search_Lucene_Exception if a field requests term vector storing
     */
    public function addDocument(Zend_Search_Lucene_Document $document)
    {
        $storedFields = array();

        foreach ($document->getFieldNames() as $fieldName) {
            $field = $document->getField($fieldName);
            $this->_addFieldInfo($field);

            if ($field->storeTermVector) {
                /**
                 * @todo term vector storing support
                 */
                throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
            }

            if ($field->isIndexed) {
                if ($field->isTokenized) {
                    $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
                } else {
                    // Untokenized field: the whole value is a single token
                    $tokenList = array();
                    $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue));
                }

                $position = 0;
                foreach ($tokenList as $token) {
                    $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                    $termKey = $term->key();

                    if (!isset($this->_termDictionary[$termKey])) {
                        // New term
                        $this->_termDictionary[$termKey] = $term;
                        $this->_termDocs[$termKey] = array();
                        $this->_termDocs[$termKey][$this->_docCount] = array();
                    } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                        // Existing term, but new term entry
                        $this->_termDocs[$termKey][$this->_docCount] = array();
                    }
                    $position += $token->getPositionIncrement();
                    $this->_termDocs[$termKey][$this->_docCount][] = $position;
                }
            }

            if ($field->isStored) {
                $storedFields[] = $field;
            }
        }

        if (count($storedFields) != 0) {
            if (!isset($this->_fdxFile)) {
                // Stored-field files are created lazily, on the first document with stored fields
                $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
                $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');

                $this->_files[] = $this->_name . '.fdx';
                $this->_files[] = $this->_name . '.fdt';
            }

            // Index entry: position of this document's data in the '.fdt' file
            $this->_fdxFile->writeLong($this->_fdtFile->tell());

            $this->_fdtFile->writeVInt(count($storedFields));
            foreach ($storedFields as $field) {
                $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
                // Every flag is parenthesized: '?:' binds looser than '|', so an
                // unparenthesized chain does not OR the bits together (and is a
                // fatal error as of PHP 8).
                $this->_fdtFile->writeByte(($field->isTokenized ? 0x01 : 0x00) |
                                           ($field->isBinary    ? 0x02 : 0x00) |
                                           0x00 /* 0x04 - third bit, compressed (ZLIB) */ );
                if ($field->isBinary) {
                    $this->_fdtFile->writeVInt(strlen($field->stringValue));
                    $this->_fdtFile->writeBytes($field->stringValue);
                } else {
                    $this->_fdtFile->writeString($field->stringValue);
                }
            }
        }

        $this->_docCount++;
    }


    /**
     * Dump Field Info (.fnm) segment file
     *
     * Writes the number of fields followed by name and flag byte of each field.
     */
    private function _dumpFNM()
    {
        $fnmFile = $this->_directory->createFile($this->_name . '.fnm');
        $fnmFile->writeVInt(count($this->_fields));

        foreach ($this->_fields as $field) {
            $fnmFile->writeString($field->name);
            // 0x04 (term positions stored with the term vectors) and
            // 0x08 (term offsets stored with the term vectors) are not supported yet.
            // 0x10 (norms are omitted for the indexed field) is always set,
            // since norms are not supported yet.
            $fnmFile->writeByte(($field->isIndexed       ? 0x01 : 0x00) |
                                ($field->storeTermVector ? 0x02 : 0x00) |
                                0x10
                               );
        }

        $this->_files[] = $this->_name . '.fnm';
    }


    /**
     * Dump Term Dictionary segment file entry.
     * Used to write entry to .tis or .tii files
     *
     * The term text is written as a (shared prefix length, suffix) pair relative
     * to the previous term of the same field; freq/prox pointers are written as
     * deltas to the previous entry. $prevTerm and $prevTermInfo are updated to
     * the entry just written.
     *
     * The 'field' property of the passed terms holds the field number
     * (see _dumpDictionary()).
     *
     * @param Zend_Search_Lucene_Storage_File $dicFile
     * @param Zend_Search_Lucene_Index_Term|null $prevTerm
     * @param Zend_Search_Lucene_Index_Term $term
     * @param Zend_Search_Lucene_Index_TermInfo|null $prevTermInfo
     * @param Zend_Search_Lucene_Index_TermInfo $termInfo
     */
    private function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                        &$prevTerm,     Zend_Search_Lucene_Index_Term $term,
                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
    {
        if (isset($prevTerm) && $prevTerm->field == $term->field) {
            $prevLength   = strlen($prevTerm->text);
            $termLength   = strlen($term->text);
            $prefixLength = 0;
            // '[]' string offsets are used instead of '{}', which was removed in PHP 8
            while ($prefixLength < $prevLength  &&
                   $prefixLength < $termLength  &&
                   $prevTerm->text[$prefixLength] == $term->text[$prefixLength]
                  ) {
                $prefixLength++;
            }
            // Write prefix length
            $dicFile->writeVInt($prefixLength);
            // Write suffix
            $dicFile->writeString( substr($term->text, $prefixLength) );
        } else {
            // Write prefix length
            $dicFile->writeVInt(0);
            // Write suffix
            $dicFile->writeString($term->text);
        }
        // Write field number
        $dicFile->writeVInt($term->field);
        // DocFreq (the count of documents which contain the term)
        $dicFile->writeVInt($termInfo->docFreq);

        $prevTerm = $term;

        if (!isset($prevTermInfo)) {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer);
        } else {
            // Write FreqDelta
            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
            // Write ProxDelta
            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
        }
        // Write SkipOffset - it's written only when it's not 0
        // (_dumpDictionary() sets it when the term's doc count reaches self::$skipInterval)
        if ($termInfo->skipOffset != 0) {
            $dicFile->writeVInt($termInfo->skipOffset);
        }

        $prevTermInfo = $termInfo;
    }

    /**
     * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
     *
     * Also writes the frequency (.frq) and position (.prx) files, since the
     * dictionary entries point into them.
     */
    private function _dumpDictionary()
    {
        $termTotal = count($this->_termDictionary);

        // One index entry is written for every self::$indexInterval-th term
        // (terms 0, indexInterval, 2*indexInterval, ...), and none at all
        // for an empty dictionary.
        $indexTermTotal = ($termTotal > 0)
                        ? (int)(($termTotal - 1)/self::$indexInterval) + 1
                        : 0;

        $tisFile = $this->_directory->createFile($this->_name . '.tis');
        $tisFile->writeInt((int)0xFFFFFFFE);
        $tisFile->writeLong($termTotal);
        $tisFile->writeInt(self::$indexInterval);
        $tisFile->writeInt(self::$skipInterval);

        $tiiFile = $this->_directory->createFile($this->_name . '.tii');
        $tiiFile->writeInt((int)0xFFFFFFFE);
        $tiiFile->writeLong($indexTermTotal);
        $tiiFile->writeInt(self::$indexInterval);
        $tiiFile->writeInt(self::$skipInterval);

        $frqFile = $this->_directory->createFile($this->_name . '.frq');
        $prxFile = $this->_directory->createFile($this->_name . '.prx');

        // Terms are written in the (string) order of their keys
        $termKeys = array_keys($this->_termDictionary);
        sort($termKeys, SORT_STRING);

        $termCount = 0;

        $prevTerm     = null;
        $prevTermInfo = null;
        $prevIndexTerm     = null;
        $prevIndexTermInfo = null;
        $prevIndexPosition = 0;

        foreach ($termKeys as $termId) {
            $freqPointer = $frqFile->tell();
            $proxPointer = $prxFile->tell();

            $prevDoc = 0;
            foreach ($this->_termDocs[$termId] as $docId => $termPositions) {
                // Doc delta is shifted left by one bit; the lowest bit set
                // means "term frequency is 1" and no frequency value follows.
                $docDelta = ($docId - $prevDoc)*2;
                $prevDoc = $docId;
                if (count($termPositions) > 1) {
                    $frqFile->writeVInt($docDelta);
                    $frqFile->writeVInt(count($termPositions));
                } else {
                    $frqFile->writeVInt($docDelta + 1);
                }

                // Positions are delta-encoded within the document
                $prevPosition = 0;
                foreach ($termPositions as $position) {
                    $prxFile->writeVInt($position - $prevPosition);
                    $prevPosition = $position;
                }
            }

            if (count($this->_termDocs[$termId]) >= self::$skipInterval) {
                /**
                 * @todo Write Skip Data to a freq file.
                 * It's not used now, but must be implemented to be compatible with Lucene
                 */
                $skipOffset = $frqFile->tell() - $freqPointer;
            } else {
                $skipOffset = 0;
            }

            // Dictionary entries carry the field number instead of the field name
            $term = new Zend_Search_Lucene_Index_Term($this->_termDictionary[$termId]->text,
                                                      $this->_fields[$this->_termDictionary[$termId]->field]->number);
            $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($this->_termDocs[$termId]),
                                            $freqPointer, $proxPointer, $skipOffset);

            $this->_dumpTermDictEntry($tisFile, $prevTerm, $term, $prevTermInfo, $termInfo);

            if ($termCount % self::$indexInterval == 0) {
                $this->_dumpTermDictEntry($tiiFile, $prevIndexTerm, $term, $prevIndexTermInfo, $termInfo);

                // Index entry additionally stores the (delta-encoded) '.tis' file position
                $indexPosition = $tisFile->tell();
                $tiiFile->writeVInt($indexPosition - $prevIndexPosition);
                $prevIndexPosition = $indexPosition;
            }
            $termCount++;
        }

        $this->_files[] = $this->_name . '.tis';
        $this->_files[] = $this->_name . '.tii';
        $this->_files[] = $this->_name . '.frq';
        $this->_files[] = $this->_name . '.prx';
    }


    /**
     * Generate compound index file
     *
     * Writes a directory of (data offset, file name) entries followed by the
     * contents of every segment file, and deletes each source file after it
     * has been copied.
     */
    private function _generateCFS()
    {
        $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
        $cfsFile->writeVInt(count($this->_files));

        // First pass: write the directory with placeholder offsets
        $dataOffsetPointers = array();
        foreach ($this->_files as $fileName) {
            $dataOffsetPointers[$fileName] = $cfsFile->tell();
            $cfsFile->writeLong(0); // write dummy data
            $cfsFile->writeString($fileName);
        }

        // Second pass: append file data and patch the real offsets
        foreach ($this->_files as $fileName) {
            // Get actual data offset
            $dataOffset = $cfsFile->tell();
            // Seek to the data offset pointer
            $cfsFile->seek($dataOffsetPointers[$fileName]);
            // Write actual data offset value
            $cfsFile->writeLong($dataOffset);
            // Seek back to the end of file
            $cfsFile->seek($dataOffset);

            $dataFile = $this->_directory->getFileObject($fileName);
            $cfsFile->writeBytes($dataFile->readBytes($this->_directory->fileLength($fileName)));

            $this->_directory->deleteFile($fileName);
        }
    }


    /**
     * Close segment, write it to disk and return segment info
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo|null null if no document was added
     */
    public function close()
    {
        if ($this->_docCount == 0) {
            return null;
        }

        $this->_dumpFNM();
        $this->_dumpDictionary();

        $this->_generateCFS();

        return new Zend_Search_Lucene_Index_SegmentInfo($this->_name,
                                                        $this->_docCount,
                                                        $this->_directory);
    }

}