normVector
     * normVector is a binary string.
     * Each byte corresponds to an indexed document in a segment and
     * encodes normalization factor (float value, encoded by
     * Zend_Search_Lucene_Search_Similarity::encodeNorm())
     *
     * @var array
     */
    private $_norms = array();

    /**
     * Zend_Search_Lucene_Index_SegmentInfo constructor needs Segmentname,
     * Documents count and Directory as a parameter.
     *
     * Reads the table of contents of the segment's compound (.cfs) file and
     * the field infos (.fnm) stored inside it.
     *
     * @param string $name
     * @param integer $docCount
     * @param Zend_Search_Lucene_Storage_Directory $directory
     */
    public function __construct($name, $docCount, $directory)
    {
        $this->_name           = $name;
        $this->_docCount       = $docCount;
        $this->_directory      = $directory;
        $this->_termDictionary = null;

        // Compound file directory: sub-file name => data offset inside the .cfs file
        $this->_segFiles = array();
        $cfsFile         = $this->_directory->getFileObject($name . '.cfs');
        $segFilesCount   = $cfsFile->readVInt();

        for ($count = 0; $count < $segFilesCount; $count++) {
            $dataOffset = $cfsFile->readLong();
            $fileName   = $cfsFile->readString();
            $this->_segFiles[$fileName] = $dataOffset;
        }

        // Field infos
        $fnmFile     = $this->openCompoundFile('.fnm');
        $fieldsCount = $fnmFile->readVInt();
        $fieldNames  = array();
        $fieldNums   = array();

        $this->_fields = array();
        for ($count = 0; $count < $fieldsCount; $count++) {
            $fieldName = $fnmFile->readString();
            $fieldBits = $fnmFile->readByte();

            // The field number is the field's position in the .fnm file
            $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
                                                                            $fieldBits & 1,
                                                                            $count,
                                                                            $fieldBits & 2);

            if ($fieldBits & 0x10) {
                // norms are omitted for the indexed field:
                // pre-fill the norm vector with the encoded value of 1.0 for every document
                $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)),
                                                   $docCount);
            }

            $fieldNums[$count]  = $count;
            $fieldNames[$count] = $fieldName;
        }

        // Order field numbers by field name. Names are compared as strings (SORT_STRING),
        // not with PHP's numeric-string juggling, so that names like '10' and '9' are
        // ordered the same way strcmp() orders term texts in getTermInfo().
        array_multisort($fieldNames, SORT_ASC, SORT_STRING, $fieldNums);

        // field number => position of the field in the name-ordered list
        $this->_fieldsDicPositions = array_flip($fieldNums);
    }

    /**
     * Opens index file stored within compound index file
     *
     * The returned file object is the compound (.cfs) file positioned
     * at the offset recorded for the requested sub-file.
     *
     * @param string $extension
     * @throws Zend_Search_Lucene_Exception
     * @return Zend_Search_Lucene_Storage_File
     */
    public function openCompoundFile($extension)
    {
        $filename = $this->_name . $extension;

        if (!isset($this->_segFiles[$filename])) {
            throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                                                 . $filename . ' file.');
        }

        $file = $this->_directory->getFileObject($this->_name . ".cfs");
        $file->seek($this->_segFiles[$filename]);

        return $file;
    }

    /**
     * Returns field index or -1 if field is not found
     *
     * @param string $fieldName
     * @return integer
     */
    public function getFieldNum($fieldName)
    {
        // Compare as strings: a loose == would treat numeric-looking names
        // such as '10' and '1e1' (or '1' and '01') as the same field.
        $searchName = (string) $fieldName;

        foreach ($this->_fields as $field) {
            if ((string) $field->name === $searchName) {
                return $field->number;
            }
        }

        return -1;
    }

    /**
     * Returns field info for specified field
     *
     * @param integer $fieldNum
     * @return Zend_Search_Lucene_Index_FieldInfo
     */
    public function getField($fieldNum)
    {
        return $this->_fields[$fieldNum];
    }

    /**
     * Returns array of fields.
     * if $indexed parameter is true, then returns only indexed fields.
     *
     * The result is keyed by field name; each value is the same field name.
     *
     * @param boolean $indexed
     * @return array
     */
    public function getFields($indexed = false)
    {
        $result = array();

        foreach ($this->_fields as $field) {
            if ($indexed && !$field->isIndexed) {
                continue;
            }
            $result[$field->name] = $field->name;
        }

        return $result;
    }

    /**
     * Returns the total number of documents in this segment.
*
     * @return integer
     */
    public function count()
    {
        return $this->_docCount;
    }

    /**
     * Loads Term dictionary from TermInfoIndex file
     *
     * Does nothing if the dictionary is already loaded. Fills
     * $this->_termDictionary (terms) and $this->_termDictionaryInfos
     * (term infos) with parallel, equally indexed entries.
     *
     * @throws Zend_Search_Lucene_Exception
     */
    protected function _loadDictionary()
    {
        if ($this->_termDictionary !== null) {
            return;
        }

        $this->_termDictionary      = array();
        $this->_termDictionaryInfos = array();

        $tiiFile   = $this->openCompoundFile('.tii');
        $tiVersion = $tiiFile->readInt();
        if ($tiVersion != (int)0xFFFFFFFE) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        $indexTermCount = $tiiFile->readLong();
                          $tiiFile->readInt(); // IndexInterval (value is not needed here)
        $skipInterval   = $tiiFile->readInt();

        // Term text is prefix-compressed against the previous term;
        // the file pointers are stored as deltas and accumulated here.
        $prevTerm     = '';
        $freqPointer  = 0;
        $proxPointer  = 0;
        $indexPointer = 0;

        for ($count = 0; $count < $indexTermCount; $count++) {
            $termPrefixLength = $tiiFile->readVInt();
            $termSuffix       = $tiiFile->readString();
            $termValue        = substr($prevTerm, 0, $termPrefixLength) . $termSuffix;

            $termFieldNum = $tiiFile->readVInt();
            $docFreq      = $tiiFile->readVInt();
            $freqPointer += $tiiFile->readVInt();
            $proxPointer += $tiiFile->readVInt();

            // The skip value is only present for terms with docFreq >= skipInterval
            if ($docFreq >= $skipInterval) {
                $skipDelta = $tiiFile->readVInt();
            } else {
                $skipDelta = 0;
            }

            $indexPointer += $tiiFile->readVInt();

            // Note: the term's "field" here is the field number, not the field name
            $this->_termDictionary[]      = new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
            $this->_termDictionaryInfos[] = new Zend_Search_Lucene_Index_TermInfo($docFreq,
                                                                                  $freqPointer,
                                                                                  $proxPointer,
                                                                                  $skipDelta,
                                                                                  $indexPointer);
            $prevTerm = $termValue;
        }
    }

    /**
     * Return segment name
     *
     * @return string
     */
    public function getName()
    {
        return $this->_name;
    }

    /**
     * Scans terms dictionary and returns term info
     *
     * First performs a binary search over the loaded term index; if the term
     * is not an index entry, sequentially scans the .tis file starting from the
     * closest preceding index entry.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @throws Zend_Search_Lucene_Exception
     * @return Zend_Search_Lucene_Index_TermInfo|null  null if the field or the term is not found
     */
    public function getTermInfo($term)
    {
        $this->_loadDictionary();

        $searchField = $this->getFieldNum($term->field);
        if ($searchField == -1) {
            return null;
        }
        $searchDicField = $this->_fieldsDicPositions[$searchField];

        // search for appropriate value in dictionary
        // (entries are compared by field dictionary position first, then by term text)
        $lowIndex  = 0;
        $highIndex = count($this->_termDictionary) - 1;
        while ($highIndex >= $lowIndex) {
            $mid     = ($highIndex + $lowIndex) >> 1;
            $midTerm = $this->_termDictionary[$mid];

            $delta = $searchDicField - $this->_fieldsDicPositions[$midTerm->field];
            if ($delta == 0) {
                $delta = strcmp($term->text, $midTerm->text);
            }

            if ($delta < 0) {
                $highIndex = $mid - 1;
            } elseif ($delta > 0) {
                $lowIndex  = $mid + 1;
            } else {
                return $this->_termDictionaryInfos[$mid]; // We got it!
            }
        }

        if ($highIndex == -1) {
            // Term is out of the dictionary range
            return null;
        }

        // $highIndex now refers to the last index entry that precedes the searched term
        $prevPosition = $highIndex;
        $prevTerm     = $this->_termDictionary[$prevPosition];
        $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

        $tisFile   = $this->openCompoundFile('.tis');
        $tiVersion = $tisFile->readInt();
        if ($tiVersion != (int)0xFFFFFFFE) {
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
        }

        $termCount     = $tisFile->readLong();
        $indexInterval = $tisFile->readInt();
        $skipInterval  = $tisFile->readInt();

        // The seek is relative to the current position, which is just past the header
        // read above, so the header size is subtracted from the stored pointer.
        $tisFile->seek($prevTermInfo->indexPointer - 20 /* header size*/, SEEK_CUR);

        $termValue    = $prevTerm->text;
        $termFieldNum = $prevTerm->field;
        $freqPointer  = $prevTermInfo->freqPointer;
        $proxPointer  = $prevTermInfo->proxPointer;

        // Defined up front so they exist even if the scan loop below never runs
        $docFreq    = 0;
        $skipOffset = 0;

        // Scan forward while the current term still sorts before the searched one
        for ($count = $prevPosition*$indexInterval + 1;
             $count < $termCount &&
             ( $this->_fieldsDicPositions[$termFieldNum] < $searchDicField ||
              ($this->_fieldsDicPositions[$termFieldNum] == $searchDicField &&
               strcmp($termValue, $term->text) < 0) );
             $count++) {
            $termPrefixLength = $tisFile->readVInt();
            $termSuffix       = $tisFile->readString();
            $termFieldNum     = $tisFile->readVInt();
            $termValue        = substr($termValue, 0, $termPrefixLength) . $termSuffix;

            $docFreq      = $tisFile->readVInt();
            $freqPointer += $tisFile->readVInt();
            $proxPointer += $tisFile->readVInt();

            if ($docFreq >= $skipInterval) {
                $skipOffset = $tisFile->readVInt();
            } else {
                $skipOffset = 0;
            }
        }

        // Exact byte-wise match is required: a loose == would treat numeric-looking
        // terms such as '10' and '1e1' as equal and return the wrong term info.
        if ($termFieldNum == $searchField && strcmp($termValue, $term->text) === 0) {
            return new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
        }

        return null;
    }

    /**
     * Returns normalization factor for specified documents
     *
     * The norm vector of a field is read from the '.f<fieldNum>' sub-file on
     * first use and cached in $this->_norms.
     *
     * @param integer $id
     * @param string $fieldName
     * @throws Zend_Search_Lucene_Exception
     * @return float|null  decoded norm, or null if the field is unknown or not indexed
     */
    public function norm($id, $fieldName)
    {
        $fieldNum = $this->getFieldNum($fieldName);

        // getFieldNum() reports an unknown field as -1, which is not a key of $this->_fields
        if ($fieldNum == -1 || !$this->_fields[$fieldNum]->isIndexed) {
            return null;
        }

        if (!isset($this->_norms[$fieldNum])) {
            $fFile = $this->openCompoundFile('.f' . $fieldNum);
            $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
        }

        // One byte per document; square brackets are used because the curly brace
        // offset syntax is no longer accepted by current PHP versions.
        return Zend_Search_Lucene_Search_Similarity::decodeNorm(ord($this->_norms[$fieldNum][$id]));
    }
}