From f4de82bcdafba51e4eed9cae6b2d3e5375ffd115 Mon Sep 17 00:00:00 2001 From: xue <> Date: Tue, 9 May 2006 12:11:38 +0000 Subject: --- .../protected/index/Zend/Search/Exception.php | 34 ++ .../protected/index/Zend/Search/Lucene.php | 569 +++++++++++++++++++++ .../index/Zend/Search/Lucene/Analysis/Analyzer.php | 94 ++++ .../Search/Lucene/Analysis/Analyzer/Common.php | 73 +++ .../Lucene/Analysis/Analyzer/Common/Text.php | 76 +++ .../Analyzer/Common/Text/CaseInsensitive.php | 43 ++ .../index/Zend/Search/Lucene/Analysis/Token.php | 170 ++++++ .../Zend/Search/Lucene/Analysis/TokenFilter.php | 45 ++ .../Lucene/Analysis/TokenFilter/LowerCase.php | 55 ++ .../index/Zend/Search/Lucene/Document.php | 109 ++++ .../index/Zend/Search/Lucene/Exception.php | 34 ++ .../protected/index/Zend/Search/Lucene/Field.php | 134 +++++ .../index/Zend/Search/Lucene/Index/FieldInfo.php | 43 ++ .../index/Zend/Search/Lucene/Index/SegmentInfo.php | 412 +++++++++++++++ .../Zend/Search/Lucene/Index/SegmentWriter.php | 491 ++++++++++++++++++ .../index/Zend/Search/Lucene/Index/Term.php | 70 +++ .../index/Zend/Search/Lucene/Index/TermInfo.php | 77 +++ .../index/Zend/Search/Lucene/Index/Writer.php | 308 +++++++++++ .../index/Zend/Search/Lucene/Search/Query.php | 98 ++++ .../Zend/Search/Lucene/Search/Query/MultiTerm.php | 437 ++++++++++++++++ .../Zend/Search/Lucene/Search/Query/Phrase.php | 424 +++++++++++++++ .../index/Zend/Search/Lucene/Search/Query/Term.php | 126 +++++ .../index/Zend/Search/Lucene/Search/QueryHit.php | 106 ++++ .../Zend/Search/Lucene/Search/QueryParser.php | 140 +++++ .../index/Zend/Search/Lucene/Search/QueryToken.php | 102 ++++ .../Zend/Search/Lucene/Search/QueryTokenizer.php | 162 ++++++ .../index/Zend/Search/Lucene/Search/Similarity.php | 551 ++++++++++++++++++++ .../Search/Lucene/Search/Similarity/Default.php | 99 ++++ .../index/Zend/Search/Lucene/Search/Weight.php | 59 +++ .../Zend/Search/Lucene/Search/Weight/MultiTerm.php | 133 +++++ .../Zend/Search/Lucene/Search/Weight/Phrase.php | 138 +++++ .../Zend/Search/Lucene/Search/Weight/Term.php | 144 ++++++ .../index/Zend/Search/Lucene/Storage/Directory.php | 118 +++++ .../Search/Lucene/Storage/Directory/Filesystem.php | 269 ++++++++++ .../index/Zend/Search/Lucene/Storage/File.php | 376 ++++++++++++++ .../Zend/Search/Lucene/Storage/File/Filesystem.php | 170 ++++++ .../protected/index/Zend/Search/TODO.txt | 14 + 37 files changed, 6503 insertions(+) create mode 100644 demos/quickstart/protected/index/Zend/Search/Exception.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Analyzer.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Analyzer/Common.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Token.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/TokenFilter.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Document.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Exception.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Field.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Index/FieldInfo.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Index/SegmentInfo.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Index/SegmentWriter.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Index/Term.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Index/TermInfo.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Index/Writer.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query/MultiTerm.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query/Phrase.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query/Term.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryHit.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryParser.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryToken.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryTokenizer.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity/Default.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Search/Weight.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Search/Weight/MultiTerm.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Search/Weight/Phrase.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Search/Weight/Term.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Storage/Directory.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Storage/Directory/Filesystem.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Storage/File.php create mode 100644 demos/quickstart/protected/index/Zend/Search/Lucene/Storage/File/Filesystem.php create mode 100644 demos/quickstart/protected/index/Zend/Search/TODO.txt (limited to 'demos/quickstart/protected/index/Zend/Search') diff --git a/demos/quickstart/protected/index/Zend/Search/Exception.php b/demos/quickstart/protected/index/Zend/Search/Exception.php new file mode 100644 index 00000000..e0aa2221 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Exception.php @@ -0,0 +1,34 @@ +_directory = $directory; + $this->_closeDirOnExit = false; + } else { + $this->_directory = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory); + $this->_closeDirOnExit = true; + } + + if ($create) { + $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, true); + } else { + $this->_writer = null; + } + + $this->_segmentInfos = array(); + + $segmentsFile = $this->_directory->getFileObject('segments'); + + $format = $segmentsFile->readInt(); + + if ($format != (int)0xFFFFFFFF) { + throw new Zend_Search_Lucene_Exception('Wrong segments file format'); + } + + // read version + $segmentsFile->readLong(); + + // read counter + $segmentsFile->readInt(); + + $segments = $segmentsFile->readInt(); + + $this->_docCount = 0; + + // read segmentInfos + for ($count = 0; $count < $segments; $count++) { + $segName = $segmentsFile->readString(); + $segSize = $segmentsFile->readInt(); + $this->_docCount += $segSize; + + $this->_segmentInfos[$count] = + new Zend_Search_Lucene_Index_SegmentInfo($segName, + $segSize, + $this->_directory); + } + } + + + /** + * Object destructor + */ + public function __destruct() + { + $this->commit(); + + if ($this->_closeDirOnExit) { + $this->_directory->close(); + } + } + + /** + * Returns an instance of Zend_Search_Lucene_Index_Writer for the index + * + * @return Zend_Search_Lucene_Index_Writer + */ + public function getIndexWriter() + { + if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) { + $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory); + } + + return $this->_writer; + } + + + /** + * Returns the Zend_Search_Lucene_Storage_Directory instance for this index. + * + * @return Zend_Search_Lucene_Storage_Directory + */ + public function getDirectory() + { + return $this->_directory; + } + + + /** + * Returns the total number of documents in this index. + * + * @return integer + */ + public function count() + { + return $this->_docCount; + } + + + /** + * Performs a query against the index and returns an array + * of Zend_Search_Lucene_Search_QueryHit objects. + * Input is a string or Zend_Search_Lucene_Search_Query. + * + * @param mixed $query + * @return array ZSearchHit + */ + public function find($query) + { + if (is_string($query)) { + $query = Zend_Search_Lucene_Search_QueryParser::parse($query); + } + + if (!$query instanceof Zend_Search_Lucene_Search_Query) { + throw new Zend_Search_Lucene_Exception('Query must be a string or Zend_Search_Lucene_Search_Query object'); + } + + $this->commit(); + + $hits = array(); + $scores = array(); + + $docNum = $this->count(); + for( $count=0; $count < $docNum; $count++ ) { + $docScore = $query->score( $count, $this); + if( $docScore != 0 ) { + $hit = new Zend_Search_Lucene_Search_QueryHit($this); + $hit->id = $count; + $hit->score = $docScore; + + $hits[] = $hit; + $scores[] = $docScore; + } + } + array_multisort($scores, SORT_DESC, SORT_REGULAR, $hits); + + return $hits; + } + + + /** + * Returns a list of all unique field names that exist in this index. + * + * @param boolean $indexed + * @return array + */ + public function getFieldNames($indexed = false) + { + $result = array(); + foreach( $this->_segmentInfos as $segmentInfo ) { + $result = array_merge($result, $segmentInfo->getFields($indexed)); + } + return $result; + } + + + /** + * Returns a Zend_Search_Lucene_Document object for the document + * number $id in this index. + * + * @param integer|Zend_Search_Lucene_Search_QueryHit $id + * @return Zend_Search_Lucene_Document + */ + public function getDocument($id) + { + if ($id instanceof Zend_Search_Lucene_Search_QueryHit) { + /* @var $id Zend_Search_Lucene_Search_QueryHit */ + $id = $id->id; + } + + if ($id >= $this->_docCount) { + /** + * @todo exception here? + */ + return null; + } + + $segCount = 0; + $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count(); + while( $nextSegmentStartId <= $id ) { + $segCount++; + $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count(); + } + $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count(); + + $fdxFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdx'); + $fdxFile->seek( ($id-$segmentStartId)*8, SEEK_CUR ); + $fieldValuesPosition = $fdxFile->readLong(); + + $fdtFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdt'); + $fdtFile->seek( $fieldValuesPosition, SEEK_CUR ); + $fieldCount = $fdtFile->readVInt(); + + $doc = new Zend_Search_Lucene_Document(); + for( $count = 0; $count < $fieldCount; $count++ ) { + $fieldNum = $fdtFile->readVInt(); + $bits = $fdtFile->readByte(); + + $fieldInfo = $this->_segmentInfos[ $segCount ]->getField($fieldNum); + + if( !($bits & 2) ) { // Text data + $field = new Zend_Search_Lucene_Field($fieldInfo->name, + $fdtFile->readString(), + true, + $fieldInfo->isIndexed, + $bits & 1 ); + } else { + $field = new Zend_Search_Lucene_Field($fieldInfo->name, + $fdtFile->readBinary(), + true, + $fieldInfo->isIndexed, + $bits & 1 ); + } + + $doc->addField($field); + } + + return $doc; + } + + + /** + * Returns an array of all the documents which contain term. + * + * @param Zend_Search_Lucene_Index_Term $term + * @return array + */ + public function termDocs(Zend_Search_Lucene_Index_Term $term) + { + $result = array(); + $segmentStartDocId = 0; + + foreach ($this->_segmentInfos as $segInfo) { + $termInfo = $segInfo->getTermInfo($term); + + if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) { + $segmentStartDocId += $segInfo->count(); + continue; + } + + $frqFile = $segInfo->openCompoundFile('.frq'); + $frqFile->seek($termInfo->freqPointer,SEEK_CUR); + $docId = 0; + for( $count=0; $count < $termInfo->docFreq; $count++ ) { + $docDelta = $frqFile->readVInt(); + if( $docDelta % 2 == 1 ) { + $docId += ($docDelta-1)/2; + } else { + $docId += $docDelta/2; + // read freq + $frqFile->readVInt(); + } + $result[] = $segmentStartDocId + $docId; + } + + $segmentStartDocId += $segInfo->count(); + } + + return $result; + } + + + /** + * Returns an array of all term positions in the documents. + * Return array structure: array( docId => array( pos1, pos2, ...), ...) + * + * @param Zend_Search_Lucene_Index_Term $term + * @return array + */ + public function termPositions(Zend_Search_Lucene_Index_Term $term) + { + $result = array(); + $segmentStartDocId = 0; + foreach( $this->_segmentInfos as $segInfo ) { + $termInfo = $segInfo->getTermInfo($term); + + if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) { + $segmentStartDocId += $segInfo->count(); + continue; + } + + $frqFile = $segInfo->openCompoundFile('.frq'); + $frqFile->seek($termInfo->freqPointer,SEEK_CUR); + $freqs = array(); + $docId = 0; + + for( $count = 0; $count < $termInfo->docFreq; $count++ ) { + $docDelta = $frqFile->readVInt(); + if( $docDelta % 2 == 1 ) { + $docId += ($docDelta-1)/2; + $freqs[ $docId ] = 1; + } else { + $docId += $docDelta/2; + $freqs[ $docId ] = $frqFile->readVInt(); + } + } + + $prxFile = $segInfo->openCompoundFile('.prx'); + $prxFile->seek($termInfo->proxPointer,SEEK_CUR); + foreach ($freqs as $docId => $freq) { + $termPosition = 0; + $positions = array(); + + for ($count = 0; $count < $freq; $count++ ) { + $termPosition += $prxFile->readVInt(); + $positions[] = $termPosition; + } + $result[ $segmentStartDocId + $docId ] = $positions; + } + + $segmentStartDocId += $segInfo->count(); + } + + return $result; + } + + + /** + * Returns the number of documents in this index containing the $term. + * + * @param Zend_Search_Lucene_Index_Term $term + * @return integer + */ + public function docFreq(Zend_Search_Lucene_Index_Term $term) + { + $result = 0; + foreach ($this->_segmentInfos as $segInfo) { + $termInfo = $segInfo->getTermInfo($term); + if ($termInfo !== null) { + $result += $termInfo->docFreq; + } + } + + return $result; + } + + + /** + * Retrive similarity used by index reader + * + * @return Zend_Search_Lucene_Search_Similarity + */ + public function getSimilarity() + { + return Zend_Search_Lucene_Search_Similarity::getDefault(); + } + + + /** + * Returns a normalization factor for "field, document" pair. + * + * @param integer $id + * @param string $fieldName + * @return Zend_Search_Lucene_Document + */ + public function norm( $id, $fieldName ) + { + if( $id >= $this->_docCount ) + return null; + + $segCount = 0; + $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count(); + while( $nextSegmentStartId <= $id ) { + $segCount++; + $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count(); + } + + $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count(); + + return $this->_segmentInfos[ $segCount ]->norm($id - $segmentStartId, $fieldName); + } + + + /** + * Adds a document to this index. + * + * @param Zend_Search_Lucene_Document $document + */ + public function addDocument(Zend_Search_Lucene_Document $document) + { + if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) { + $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory); + } + + $this->_writer->addDocument($document); + } + + + /** + * Commit changes resulting from delete() or undeleteAll() operations. + * + * @todo delete() and undeleteAll processing. + */ + public function commit() + { + if ($this->_writer !== null) { + foreach ($this->_writer->commit() as $segmentName => $segmentInfo) { + if ($segmentInfo !== null) { + $this->_segmentInfos[] = $segmentInfo; + $this->_docCount += $segmentInfo->count(); + } else { + foreach ($this->_segmentInfos as $segId => $segInfo) { + if ($segInfo->getName() == $segmentName) { + unset($this->_segmentInfos[$segId]); + } + } + } + } + } + } + + + /************************************************************************* + @todo UNIMPLEMENTED + *************************************************************************/ + + /** + * Returns an array of all terms in this index. + * + * @todo Implementation + * @return array + */ + public function terms() + { + return array(); + } + + + /** + * Returns true if any documents have been deleted from this index. + * + * @todo Implementation + * @return boolean + */ + public function hasDeletions() + { + return false; + } + + + /** + * Deletes a document from the index. $doc may contain a Zend_Search_Lucene_Document + * or the number of the document to delete. + * + * @todo Implementation + * @param mixed $item_to_del + */ + public function delete($doc) + {} + + + /** + * Undeletes all documents currently marked as deleted in this index. + * + * @todo Implementation + */ + public function undeleteAll() + {} +} \ No newline at end of file diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Analyzer.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Analyzer.php new file mode 100644 index 00000000..8e234c16 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Analyzer.php @@ -0,0 +1,94 @@ +_filters[] = $filter; + } + + /** + * Apply filters to the token. + * + * @param Zend_Search_Lucene_Analysis_Token $token + * @return Zend_Search_Lucene_Analysis_Token + */ + public function normalize(Zend_Search_Lucene_Analysis_Token $token) + { + foreach ($this->_filters as $filter) { + $token = $filter->normalize($token); + } + + return $token; + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php new file mode 100644 index 00000000..2a80c1f8 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php @@ -0,0 +1,76 @@ +normalize($token); + } + + return $tokenStream; + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php new file mode 100644 index 00000000..d77e38d5 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php @@ -0,0 +1,43 @@ +addFilter(new Zend_Search_Lucene_Analysis_TokenFilter_LowerCase()); + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Token.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Token.php new file mode 100644 index 00000000..a60d5d96 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Token.php @@ -0,0 +1,170 @@ +_termText = $text; + $this->_startOffset = $start; + $this->_endOffset = $end; + $this->_type = $type; + + $this->_positionIncrement = 1; + } + + + /** + * positionIncrement setter + * + * @param integer $positionIncrement + */ + public function setPositionIncrement($positionIncrement) + { + $this->_positionIncrement = $positionIncrement; + } + + /** + * Returns the position increment of this Token. + * + * @return integer + */ + public function getPositionIncrement() + { + return $this->_positionIncrement; + } + + /** + * Returns the Token's term text. + * + * @return string + */ + public function getTermText() + { + return $this->_termText; + } + + /** + * Returns this Token's starting offset, the position of the first character + * corresponding to this token in the source text. + * + * Note: + * The difference between getEndOffset() and getStartOffset() may not be equal + * to strlen(Zend_Search_Lucene_Analysis_Token::getTermText()), as the term text may have been altered + * by a stemmer or some other filter. + * + * @return integer + */ + public function getStartOffset() + { + return $this->_startOffset; + } + + /** + * Returns this Token's ending offset, one greater than the position of the + * last character corresponding to this token in the source text. + * + * @return integer + */ + public function getEndOffset() + { + return $this->_endOffset; + } + + /** + * Returns this Token's lexical type. Defaults to 'word'. + * + * @return string + */ + public function getType() + { + return $this->_type; + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/TokenFilter.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/TokenFilter.php new file mode 100644 index 00000000..9ea5125f --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/TokenFilter.php @@ -0,0 +1,45 @@ +getTermText() ), + $srcToken->getStartOffset(), + $srcToken->getEndOffset(), + $srcToken->getType()); + + $newToken->setPositionIncrement($srcToken->getPositionIncrement()); + + return $newToken; + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Document.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Document.php new file mode 100644 index 00000000..29c0c2d9 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Document.php @@ -0,0 +1,109 @@ +getFieldValue($offset); + } + + + /** + * Add a field object to this document. + * + * @param Zend_Search_Lucene_Field $field + */ + public function addField(Zend_Search_Lucene_Field $field) + { + $this->_fields[$field->name] = $field; + } + + + /** + * Return an array with the names of the fields in this document. + * + * @return array + */ + public function getFieldNames() + { + return array_keys($this->_fields); + } + + + /** + * Returns Zend_Search_Lucene_Field object for a named field in this document. + * + * @param string $fieldName + * @return Zend_Search_Lucene_Field + */ + public function getField($fieldName) + { + if (!array_key_exists($fieldName, $this->_fields)) { + throw new Zend_Search_Lucene_Exception("Field name \"$fieldName\" not found in document."); + } + return $this->_fields[$fieldName]; + } + + + /** + * Returns the string value of a named field in this document. + * + * @see __get() + * @return string + */ + public function getFieldValue($fieldName) + { + return $this->getField($fieldName)->stringValue; + } + +} diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Exception.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Exception.php new file mode 100644 index 00000000..5f12c5f6 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Exception.php @@ -0,0 +1,34 @@ +name = $name; + $this->stringValue = $stringValue; + $this->isStored = $isStored; + $this->isIndexed = $isIndexed; + $this->isTokenized = $isTokenized; + $this->isBinary = $isBinary; + + $this->storeTermVector = false; + $this->boost = 1.0; + } + + + /** + * Constructs a String-valued Field that is not tokenized, but is indexed + * and stored. Useful for non-text fields, e.g. date or url. + * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function Keyword($name, $value) + { + return new self($name, $value, true, true, false); + } + + + /** + * Constructs a String-valued Field that is not tokenized nor indexed, + * but is stored in the index, for return with hits. + * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function UnIndexed($name, $value) + { + return new self($name, $value, true, false, false); + } + + + /** + * Constructs a Binary String valued Field that is not tokenized nor indexed, + * but is stored in the index, for return with hits. + * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function Binary($name, $value) + { + return new self($name, $value, true, false, false, true); + } + + /** + * Constructs a String-valued Field that is tokenized and indexed, + * and is stored in the index, for return with hits. Useful for short text + * fields, like "title" or "subject". Term vector will not be stored for this field. + * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function Text($name, $value) + { + return new self($name, $value, true, true, true); + } + + + /** + * Constructs a String-valued Field that is tokenized and indexed, + * but that is not stored in the index. + * + * @param string $name + * @param string $value + * @return Zend_Search_Lucene_Field + */ + static public function UnStored($name, $value) + { + return new self($name, $value, false, true, true); + } + +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Index/FieldInfo.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Index/FieldInfo.php new file mode 100644 index 00000000..eaca4ecf --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Index/FieldInfo.php @@ -0,0 +1,43 @@ +name = $name; + $this->isIndexed = $isIndexed; + $this->number = $number; + $this->storeTermVector = $storeTermVector; + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Index/SegmentInfo.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Index/SegmentInfo.php new file mode 100644 index 00000000..f5c596a0 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Index/SegmentInfo.php @@ -0,0 +1,412 @@ + normVector + * normVector is a binary string. + * Each byte corresponds to an indexed document in a segment and + * encodes normalization factor (float value, encoded by + * Zend_Search_Lucene_Search_Similarity::encodeNorm()) + * + * @var array + */ + private $_norms = array(); + + /** + * Zend_Search_Lucene_Index_SegmentInfo constructor needs Segmentname, + * Documents count and Directory as a parameter. + * + * @param string $name + * @param integer $docCount + * @param Zend_Search_Lucene_Storage_Directory $directory + */ + public function __construct($name, $docCount, $directory) + { + $this->_name = $name; + $this->_docCount = $docCount; + $this->_directory = $directory; + $this->_termDictionary = null; + + $this->_segFiles = array(); + $cfsFile = $this->_directory->getFileObject($name . '.cfs'); + $segFilesCount = $cfsFile->readVInt(); + + for ($count = 0; $count < $segFilesCount; $count++) { + $dataOffset = $cfsFile->readLong(); + $fileName = $cfsFile->readString(); + $this->_segFiles[$fileName] = $dataOffset; + } + + $fnmFile = $this->openCompoundFile('.fnm'); + $fieldsCount = $fnmFile->readVInt(); + $fieldNames = array(); + $fieldNums = array(); + $this->_fields = array(); + for ($count=0; $count < $fieldsCount; $count++) { + $fieldName = $fnmFile->readString(); + $fieldBits = $fnmFile->readByte(); + $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName, + $fieldBits & 1, + $count, + $fieldBits & 2 ); + if ($fieldBits & 0x10) { + // norms are omitted for the indexed field + $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount); + } + + $fieldNums[$count] = $count; + $fieldNames[$count] = $fieldName; + } + array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums); + $this->_fieldsDicPositions = array_flip($fieldNums); + } + + /** + * Opens index file stoted within compound index file + * + * @param string $extension + * @throws Zend_Search_Lucene_Exception + * @return Zend_Search_Lucene_Storage_File + */ + public function openCompoundFile($extension) + { + $filename = $this->_name . $extension; + + if( !isset($this->_segFiles[ $filename ]) ) { + throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain ' + . $filename . ' file.' ); + } + + $file = $this->_directory->getFileObject( $this->_name.".cfs" ); + $file->seek( $this->_segFiles[ $filename ] ); + return $file; + } + + /** + * Returns field index or -1 if field is not found + * + * @param string $fieldName + * @return integer + */ + public function getFieldNum($fieldName) + { + foreach( $this->_fields as $field ) { + if( $field->name == $fieldName ) { + return $field->number; + } + } + + return -1; + } + + /** + * Returns field info for specified field + * + * @param integer $fieldNum + * @return ZSearchFieldInfo + */ + public function getField($fieldNum) + { + return $this->_fields[$fieldNum]; + } + + /** + * Returns array of fields. + * if $indexed parameter is true, then returns only indexed fields. + * + * @param boolean $indexed + * @return array + */ + public function getFields($indexed = false) + { + $result = array(); + foreach( $this->_fields as $field ) { + if( (!$indexed) || $field->isIndexed ) { + $result[ $field->name ] = $field->name; + } + } + return $result; + } + + /** + * Returns the total number of documents in this segment. + * + * @return integer + */ + public function count() + { + return $this->_docCount; + } + + + /** + * Loads Term dictionary from TermInfoIndex file + */ + protected function _loadDictionary() + { + if ($this->_termDictionary !== null) { + return; + } + + $this->_termDictionary = array(); + $this->_termDictionaryInfos = array(); + + $tiiFile = $this->openCompoundFile('.tii'); + $tiVersion = $tiiFile->readInt(); + if ($tiVersion != (int)0xFFFFFFFE) { + throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format'); + } + + $indexTermCount = $tiiFile->readLong(); + $tiiFile->readInt(); // IndexInterval + $skipInterval = $tiiFile->readInt(); + + $prevTerm = ''; + $freqPointer = 0; + $proxPointer = 0; + $indexPointer = 0; + for ($count = 0; $count < $indexTermCount; $count++) { + $termPrefixLength = $tiiFile->readVInt(); + $termSuffix = $tiiFile->readString(); + $termValue = substr( $prevTerm, 0, $termPrefixLength ) . $termSuffix; + + $termFieldNum = $tiiFile->readVInt(); + $docFreq = $tiiFile->readVInt(); + $freqPointer += $tiiFile->readVInt(); + $proxPointer += $tiiFile->readVInt(); + if( $docFreq >= $skipInterval ) { + $skipDelta = $tiiFile->readVInt(); + } else { + $skipDelta = 0; + } + + $indexPointer += $tiiFile->readVInt(); + + $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue,$termFieldNum); + $this->_termDictionaryInfos[] = + new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer); + $prevTerm = $termValue; + } + } + + + /** + * Return segment name + * + * @return string + */ + public function getName() + { + return $this->_name; + } + + + /** + * Scans terms dictionary and returns term info + * + * @param Zend_Search_Lucene_Index_Term $term + * @return Zend_Search_Lucene_Index_TermInfo + */ + public function getTermInfo($term) + { + $this->_loadDictionary(); + + $searchField = $this->getFieldNum($term->field); + + if ($searchField == -1) { + return null; + } + $searchDicField = $this->_fieldsDicPositions[$searchField]; + + // search for appropriate value in dictionary + $lowIndex = 0; + $highIndex = count($this->_termDictionary)-1; + while ($highIndex >= $lowIndex) { + // $mid = ($highIndex - $lowIndex)/2; + $mid = ($highIndex + $lowIndex) >> 1; + $midTerm = $this->_termDictionary[$mid]; + + $delta = $searchDicField - $this->_fieldsDicPositions[$midTerm->field]; + if ($delta == 0) { + $delta = strcmp($term->text, $midTerm->text); + } + + if ($delta < 0) { + $highIndex = $mid-1; + } elseif ($delta > 0) { + $lowIndex = $mid+1; + } else { + return $this->_termDictionaryInfos[$mid]; // We got it! + } + } + + if ($highIndex == -1) { + // Term is out of the dictionary range + return null; + } + + $prevPosition = $highIndex; + $prevTerm = $this->_termDictionary[$prevPosition]; + $prevTermInfo = $this->_termDictionaryInfos[ $prevPosition ]; + + $tisFile = $this->openCompoundFile('.tis'); + $tiVersion = $tisFile->readInt(); + if ($tiVersion != (int)0xFFFFFFFE) { + throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format'); + } + + $termCount = $tisFile->readLong(); + $indexInterval = $tisFile->readInt(); + $skipInterval = $tisFile->readInt(); + + $tisFile->seek($prevTermInfo->indexPointer - 20 /* header size*/, SEEK_CUR); + + $termValue = $prevTerm->text; + $termFieldNum = $prevTerm->field; + $freqPointer = $prevTermInfo->freqPointer; + $proxPointer = $prevTermInfo->proxPointer; + for ($count = $prevPosition*$indexInterval + 1; + $count < $termCount && + ( $this->_fieldsDicPositions[ $termFieldNum ] < $searchDicField || + ($this->_fieldsDicPositions[ $termFieldNum ] == $searchDicField && + strcmp($termValue, $term->text) < 0) ); + $count++) { + $termPrefixLength = $tisFile->readVInt(); + $termSuffix = $tisFile->readString(); + $termFieldNum = $tisFile->readVInt(); + $termValue = substr( $termValue, 0, $termPrefixLength ) . $termSuffix; + + $docFreq = $tisFile->readVInt(); + $freqPointer += $tisFile->readVInt(); + $proxPointer += $tisFile->readVInt(); + if( $docFreq >= $skipInterval ) { + $skipOffset = $tisFile->readVInt(); + } else { + $skipOffset = 0; + } + } + + if ($termFieldNum == $searchField && $termValue == $term->text) { + return new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset); + } else { + return null; + } + } + + /** + * Returns normalization factor for specified documents + * + * @param integer $id + * @param string $fieldName + * @return string + */ + public function norm($id, $fieldName) + { + $fieldNum = $this->getFieldNum($fieldName); + + if ( !($this->_fields[$fieldNum]->isIndexed) ) { + return null; + } + + if ( !isset( $this->_norms[$fieldNum] )) { + $fFile = $this->openCompoundFile('.f' . $fieldNum); + $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount); + } + + return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum]{$id}) ); + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Index/SegmentWriter.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Index/SegmentWriter.php new file mode 100644 index 00000000..f90d6ed3 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Index/SegmentWriter.php @@ -0,0 +1,491 @@ + normVector + * normVector is a binary string. + * Each byte corresponds to an indexed document in a segment and + * encodes normalization factor (float value, encoded by + * Zend_Search_Lucene_Search_Similarity::encodeNorm()) + * + * @var array + */ + private $_norms; + + + /** + * '.fdx' file - Stored Fields, the field index. + * + * @var Zend_Search_Lucene_Storage_File + */ + private $_fdxFile; + + /** + * '.fdx' file - Stored Fields, the field data. + * + * @var Zend_Search_Lucene_Storage_File + */ + private $_fdtFile; + + + /** + * Object constructor. + * + * @param Zend_Search_Lucene_Storage_Directory $directory + * @param string $name + */ + public function __construct($directory, $name) + { + $this->_directory = $directory; + $this->_name = $name; + $this->_docCount = 0; + + $this->_fields = array(); + $this->_termDocs = array(); + $this->_files = array(); + $this->_norms = array(); + + $this->_fdxFile = null; + $this->_fdtFile = null; + } + + + /** + * Add field to the segment + * + * @param Zend_Search_Lucene_Field $field + */ + private function _addFieldInfo(Zend_Search_Lucene_Field $field) + { + if (!isset($this->_fields[$field->name])) { + $this->_fields[$field->name] = + new Zend_Search_Lucene_Index_FieldInfo($field->name, + $field->isIndexed, + count($this->_fields), + $field->storeTermVector); + } else { + $this->_fields[$field->name]->isIndexed |= $field->isIndexed; + $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector; + } + } + + + /** + * Adds a document to this segment. + * + * @param Zend_Search_Lucene_Document $document + * @throws Zend_Search_Lucene_Exception + */ + public function addDocument(Zend_Search_Lucene_Document $document) + { + $storedFields = array(); + + foreach ($document->getFieldNames() as $fieldName) { + $field = $document->getField($fieldName); + $this->_addFieldInfo($field); + + if ($field->storeTermVector) { + /** + * @todo term vector storing support + */ + throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.'); + } + + if ($field->isIndexed) { + if ($field->isTokenized) { + $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue); + } else { + $tokenList = array(); + $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue)); + } + + $position = 0; + foreach ($tokenList as $token) { + $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name); + $termKey = $term->key(); + + if (!isset($this->_termDictionary[$termKey])) { + // New term + $this->_termDictionary[$termKey] = $term; + $this->_termDocs[$termKey] = array(); + $this->_termDocs[$termKey][$this->_docCount] = array(); + } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) { + // Existing term, but new term entry + $this->_termDocs[$termKey][$this->_docCount] = array(); + } + $position += $token->getPositionIncrement(); + $this->_termDocs[$termKey][$this->_docCount][] = $position; + } + } + + if ($field->isStored) { + $storedFields[] = $field; + } + } + + if (count($storedFields) != 0) { + if (!isset($this->_fdxFile)) { + $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx'); + $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt'); + + $this->_files[] = $this->_name . '.fdx'; + $this->_files[] = $this->_name . '.fdt'; + } + + $this->_fdxFile->writeLong($this->_fdtFile->tell()); + + $this->_fdtFile->writeVInt(count($storedFields)); + foreach ($storedFields as $field) { + $this->_fdtFile->writeVInt($this->_fields[$field->name]->number); + $this->_fdtFile->writeByte($field->isTokenized ? 0x01 : 0x00 | + $field->isBinary ? 0x02 : 0x00 | + 0x00 /* 0x04 - third bit, compressed (ZLIB) */ ); + if ($field->isBinary) { + $this->_fdtFile->writeVInt(strlen($field->stringValue)); + $this->_fdtFile->writeBytes($field->stringValue); + } else { + $this->_fdtFile->writeString($field->stringValue); + } + } + } + + $this->_docCount++; + } + + + /** + * Dump Field Info (.fnm) segment file + */ + private function _dumpFNM() + { + $fnmFile = $this->_directory->createFile($this->_name . '.fnm'); + $fnmFile->writeVInt(count($this->_fields)); + + foreach ($this->_fields as $field) { + $fnmFile->writeString($field->name); + $fnmFile->writeByte(($field->isIndexed ? 0x01 : 0x00) | + ($field->storeTermVector ? 0x02 : 0x00) | +// not supported yet 0x04 /* term positions are stored with the term vectors */ | +// not supported yet 0x08 /* term offsets are stored with the term vectors */ | +/* not supported yet */ 0x10 /* norms are omitted for the indexed field */ + ); + } + + $this->_files[] = $this->_name . '.fnm'; + } + + + /** + * Dump Term Dictionary segment file entry. + * Used to write entry to .tis or .tii files + * + * @param Zend_Search_Lucene_Storage_File $dicFile + * @param Zend_Search_Lucene_Index_Term $prevTerm + * @param Zend_Search_Lucene_Index_Term $term + * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo + * @param Zend_Search_Lucene_Index_TermInfo $termInfo + */ + private function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile, + &$prevTerm, Zend_Search_Lucene_Index_Term $term, + &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo) + { + if (isset($prevTerm) && $prevTerm->field == $term->field) { + $prefixLength = 0; + while ($prefixLength < strlen($prevTerm->text) && + $prefixLength < strlen($term->text) && + $prevTerm->text{$prefixLength} == $term->text{$prefixLength} + ) { + $prefixLength++; + } + // Write preffix length + $dicFile->writeVInt($prefixLength); + // Write suffix + $dicFile->writeString( substr($term->text, $prefixLength) ); + } else { + // Write preffix length + $dicFile->writeVInt(0); + // Write suffix + $dicFile->writeString($term->text); + } + // Write field number + $dicFile->writeVInt($term->field); + // DocFreq (the count of documents which contain the term) + $dicFile->writeVInt($termInfo->docFreq); + + $prevTerm = $term; + + if (!isset($prevTermInfo)) { + // Write FreqDelta + $dicFile->writeVInt($termInfo->freqPointer); + // Write ProxDelta + $dicFile->writeVInt($termInfo->proxPointer); + } else { + // Write FreqDelta + $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer); + // Write ProxDelta + $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer); + } + // Write SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval + if ($termInfo->skipOffset != 0) { + $dicFile->writeVInt($termInfo->skipOffset); + } + + $prevTermInfo = $termInfo; + } + + /** + * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files + */ + private function _dumpDictionary() + { + $tisFile = $this->_directory->createFile($this->_name . '.tis'); + $tisFile->writeInt((int)0xFFFFFFFE); + $tisFile->writeLong(count($this->_termDictionary)); + $tisFile->writeInt(self::$indexInterval); + $tisFile->writeInt(self::$skipInterval); + + $tiiFile = $this->_directory->createFile($this->_name . '.tii'); + $tiiFile->writeInt((int)0xFFFFFFFE); + $tiiFile->writeLong((int)((count($this->_termDictionary) - 1)/self::$indexInterval) + 1); + $tiiFile->writeInt(self::$indexInterval); + $tiiFile->writeInt(self::$skipInterval); + + $frqFile = $this->_directory->createFile($this->_name . '.frq'); + $prxFile = $this->_directory->createFile($this->_name . '.prx'); + + $termKeys = array_keys($this->_termDictionary); + sort($termKeys, SORT_STRING); + + $termCount = 0; + + $prevTerm = null; + $prevTermInfo = null; + $prevIndexTerm = null; + $prevIndexTermInfo = null; + $prevIndexPosition = 0; + + foreach ($termKeys as $termId) { + $freqPointer = $frqFile->tell(); + $proxPointer = $prxFile->tell(); + + $prevDoc = 0; + foreach ($this->_termDocs[$termId] as $docId => $termPositions) { + $docDelta = ($docId - $prevDoc)*2; + $prevDoc = $docId; + if (count($termPositions) > 1) { + $frqFile->writeVInt($docDelta); + $frqFile->writeVInt(count($termPositions)); + } else { + $frqFile->writeVInt($docDelta + 1); + } + + $prevPosition = 0; + foreach ($termPositions as $position) { + $prxFile->writeVInt($position - $prevPosition); + $prevPosition = $position; + } + } + + if (count($this->_termDocs[$termId]) >= self::$skipInterval) { + /** + * @todo Write Skip Data to a freq file. + * It's not used now, but must be implemented to be compatible with Lucene + */ + $skipOffset = $frqFile->tell() - $freqPointer; + } else { + $skipOffset = 0; + } + + $term = new Zend_Search_Lucene_Index_Term($this->_termDictionary[$termId]->text, + $this->_fields[$this->_termDictionary[$termId]->field]->number); + $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($this->_termDocs[$termId]), + $freqPointer, $proxPointer, $skipOffset); + + $this->_dumpTermDictEntry($tisFile, $prevTerm, $term, $prevTermInfo, $termInfo); + + if ($termCount % self::$indexInterval == 0) { + $this->_dumpTermDictEntry($tiiFile, $prevIndexTerm, $term, $prevIndexTermInfo, $termInfo); + + $indexPosition = $tisFile->tell(); + $tiiFile->writeVInt($indexPosition - $prevIndexPosition); + $prevIndexPosition = $indexPosition; + } + $termCount++; + } + + $this->_files[] = $this->_name . '.tis'; + $this->_files[] = $this->_name . '.tii'; + $this->_files[] = $this->_name . '.frq'; + $this->_files[] = $this->_name . '.prx'; + } + + + /** + * Generate compound index file + */ + private function _generateCFS() + { + $cfsFile = $this->_directory->createFile($this->_name . '.cfs'); + $cfsFile->writeVInt(count($this->_files)); + + $dataOffsetPointers = array(); + foreach ($this->_files as $fileName) { + $dataOffsetPointers[$fileName] = $cfsFile->tell(); + $cfsFile->writeLong(0); // write dummy data + $cfsFile->writeString($fileName); + } + + foreach ($this->_files as $fileName) { + // Get actual data offset + $dataOffset = $cfsFile->tell(); + // Seek to the data offset pointer + $cfsFile->seek($dataOffsetPointers[$fileName]); + // Write actual data offset value + $cfsFile->writeLong($dataOffset); + // Seek back to the end of file + $cfsFile->seek($dataOffset); + + $dataFile = $this->_directory->getFileObject($fileName); + $cfsFile->writeBytes($dataFile->readBytes($this->_directory->fileLength($fileName))); + + $this->_directory->deleteFile($fileName); + } + } + + + /** + * Close segment, write it to disk and return segment info + * + * @return Zend_Search_Lucene_Index_SegmentInfo + */ + public function close() + { + if ($this->_docCount == 0) { + return null; + } + + $this->_dumpFNM(); + $this->_dumpDictionary(); + + $this->_generateCFS(); + + return new Zend_Search_Lucene_Index_SegmentInfo($this->_name, + $this->_docCount, + $this->_directory); + } + +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Index/Term.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Index/Term.php new file mode 100644 index 00000000..e30ce587 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Index/Term.php @@ -0,0 +1,70 @@ +field = $field; + $this->text = $text; + } + + + /** + * @todo docblock + */ + public function key() + { + return $this->field . chr(0) . $this->text; + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Index/TermInfo.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Index/TermInfo.php new file mode 100644 index 00000000..ddef721d --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Index/TermInfo.php @@ -0,0 +1,77 @@ +docFreq = $docFreq; + $this->freqPointer = $freqPointer; + $this->proxPointer = $proxPointer; + $this->skipOffset = $skipOffset; + $this->indexPointer = $indexPointer; + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Index/Writer.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Index/Writer.php new file mode 100644 index 00000000..da4af000 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Index/Writer.php @@ -0,0 +1,308 @@ +_directory = $directory; + + if ($create) { + foreach ($this->_directory->fileList() as $file) { + if ($file == 'deletable' || + $file == 'segments' || + substr($file, strlen($file)-4) == '.cfs') { + $this->_directory->deleteFile($file); + } + } + $segmentsFile = $this->_directory->createFile('segments'); + $segmentsFile->writeInt((int)0xFFFFFFFF); + // write version + $segmentsFile->writeLong(0); + // write name counter + $segmentsFile->writeInt(0); + // write segment counter + $segmentsFile->writeInt(0); + + $deletableFile = $this->_directory->createFile('deletable'); + // write counter + $deletableFile->writeInt(0); + + $this->_version = 0; + $this->_segmentNameCounter = 0; + $this->_segments = 0; + } else { + $segmentsFile = $this->_directory->getFileObject('segments'); + $format = $segmentsFile->readInt(); + if ($format != (int)0xFFFFFFFF) { + throw new Zend_Search_Lucene_Exception('Wrong segments file format'); + } + + // read version + $this->_version = $segmentsFile->readLong(); + // read counter + $this->_segmentNameCounter = $segmentsFile->readInt(); + // read segment counter + $this->_segments = $segmentsFile->readInt(); + } + + $this->_newSegments = array(); + $this->_currentSegment = null; + } + + /** + * Adds a document to this index. + * + * @param Zend_Search_Lucene_Document $document + */ + public function addDocument(Zend_Search_Lucene_Document $document) + { + if ($this->_currentSegment === null) { + $this->_currentSegment = + new Zend_Search_Lucene_Index_SegmentWriter($this->_directory, $this->_newSegmentName()); + } + $this->_currentSegment->addDocument($document); + $this->_version++; + } + + + + /** + * Update segments file by adding current segment to a list + * @todo !!!!!Finish the implementation + * + * @throws Zend_Search_Lucene_Exception + */ + private function _updateSegments() + { + $segmentsFile = $this->_directory->getFileObject('segments'); + $newSegmentFile = $this->_directory->createFile('segments.new'); + + $newSegmentFile->writeInt((int)0xFFFFFFFF); + $newSegmentFile->writeLong($this->_version); + $newSegmentFile->writeInt($this->_segmentNameCounter); + $newSegmentFile->writeInt($this->_segments + count($this->_newSegments)); + + $segmentsFile->seek(20); + $newSegmentFile->writeBytes($segmentsFile->readBytes($this->_directory->fileLength('segments') - 20)); + + foreach ($this->_newSegments as $segmentName => $segmentInfo) { + $newSegmentFile->writeString($segmentName); + $newSegmentFile->writeInt($segmentInfo->count()); + } + + $this->_directory->renameFile('segments.new', 'segments'); + } + + + /** + * Commit current changes + * returns array of new segments + * + * @return array + */ + public function commit() + { + if ($this->_currentSegment !== null) { + $newSegment = $this->_currentSegment->close(); + if ($newSegment !== null) { + $this->_newSegments[$newSegment->getName()] = $newSegment; + } + $this->_currentSegment = null; + } + + if (count($this->_newSegments) != 0) { + $this->_updateSegments(); + } + + $result = $this->_newSegments; + $this->_newSegments = array(); + + return $result; + } + + + /** + * Merges the provided indexes into this index. + * + * @param array $readers + * @return void + */ + public function addIndexes($readers) + { + /** + * @todo implementation + */ + } + + + /** + * Returns the number of documents currently in this index. + * + * @return integer + */ + public function docCount($readers) + { + /** + * @todo implementation + */ + } + + + /** + * Flushes all changes to an index and closes all associated files. + * + */ + public function close() + { + /** + * @todo implementation + */ + } + + + /** + * Merges all segments together into a single segment, optimizing + * an index for search. + * + * return void + */ + public function optimize() + { + /** + * @todo implementation + */ + } + + /** + * Get name for new segment + * + * @return string + */ + private function _newSegmentName() + { + return '_' . base_convert($this->_segmentNameCounter++, 10, 36); + } + +} diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query.php new file mode 100644 index 00000000..dd8698e8 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query.php @@ -0,0 +1,98 @@ +_boost; + } + + /** + * Sets the boost for this query clause to $boost. + * + * @param float $boost + */ + public function setBoost($boost) + { + $this->_boost = $boost; + } + + /** + * Score specified document + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + abstract public function score($docId, $reader); + + /** + * Constructs an appropriate Weight implementation for this query. + * + * @param Zend_Search_Lucene $reader + * @return Zend_Search_Lucene_Search_Weight + */ + abstract protected function _createWeight($reader); + + /** + * Constructs an initializes a Weight for a query. + * + * @param Zend_Search_Lucene $reader + */ + protected function _initWeight($reader) + { + $this->_weight = $this->_createWeight($reader); + $sum = $this->_weight->sumOfSquaredWeights(); + $queryNorm = $reader->getSimilarity()->queryNorm($sum); + $this->_weight->normalize($queryNorm); + } + +} \ No newline at end of file diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query/MultiTerm.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query/MultiTerm.php new file mode 100644 index 00000000..4a99c0f7 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query/MultiTerm.php @@ -0,0 +1,437 @@ + (docId => array( pos1, pos2, ... ), ...) + * term2Id => (docId => array( pos1, pos2, ... ), ...) + * + * @var array + */ + private $_termsPositions = array(); + + + /** + * A score factor based on the fraction of all query terms + * that a document contains. + * float for conjunction queries + * array of float for non conjunction queries + * + * @var mixed + */ + private $_coord = null; + + + /** + * Terms weights + * array of Zend_Search_Lucene_Search_Weight + * + * @var array + */ + private $_weights = array(); + + + /** + * Class constructor. Create a new multi-term query object. + * + * @param array $terms Array of Zend_Search_Lucene_Index_Term objects + * @param array $signs Array of signs. Sign is boolean|null. + * @return void + */ + public function __construct($terms = null, $signs = null) + { + /** + * @todo Check contents of $terms and $signs before adding them. + */ + if (is_array($terms)) { + $this->_terms = $terms; + + $this->_signs = null; + // Check if all terms are required + if (is_array($signs)) { + foreach ($signs as $sign ) { + if ($sign !== true) { + $this->_signs = $signs; + continue; + } + } + } + } + } + + + /** + * Add a $term (Zend_Search_Lucene_Index_Term) to this query. + * + * The sign is specified as: + * TRUE - term is required + * FALSE - term is prohibited + * NULL - term is neither prohibited, nor required + * + * @param Zend_Search_Lucene_Index_Term $term + * @param boolean|null $sign + * @return void + */ + public function addTerm(Zend_Search_Lucene_Index_Term $term, $sign=null) { + $this->_terms[] = $term; + + /** + * @todo This is not good. Sometimes $this->_signs is an array, sometimes + * it is null, even when there are terms. It will be changed so that + * it is always an array. + */ + if ($this->_signs === null) { + if ($sign !== null) { + $this->_signs = array(); + foreach ($this->_terms as $term) { + $this->_signs[] = null; + } + $this->_signs[] = $sign; + } + } else { + $this->_signs[] = $sign; + } + } + + + /** + * Returns query term + * + * @return array + */ + public function getTerms() + { + return $this->_terms; + } + + + /** + * Return terms signs + * + * @return array + */ + public function getSigns() + { + return $this->_signs; + } + + + /** + * Set weight for specified term + * + * @param integer $num + * @param Zend_Search_Lucene_Search_Weight_Term $weight + */ + public function setWeight($num, $weight) + { + $this->_weights[$num] = $weight; + } + + + /** + * Constructs an appropriate Weight implementation for this query. + * + * @param Zend_Search_Lucene $reader + * @return Zend_Search_Lucene_Search_Weight + */ + protected function _createWeight($reader) + { + return new Zend_Search_Lucene_Search_Weight_MultiTerm($this, $reader); + } + + + /** + * Calculate result vector for Conjunction query + * (like '+something +another') + * + * @param Zend_Search_Lucene $reader + */ + private function _calculateConjunctionResult($reader) + { + if (extension_loaded('bitset')) { + foreach( $this->_terms as $termId=>$term ) { + if($this->_resVector === null) { + $this->_resVector = bitset_from_array($reader->termDocs($term)); + } else { + $this->_resVector = bitset_intersection( + $this->_resVector, + bitset_from_array($reader->termDocs($term)) ); + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + } else { + foreach( $this->_terms as $termId=>$term ) { + if($this->_resVector === null) { + $this->_resVector = array_flip($reader->termDocs($term)); + } else { + $termDocs = array_flip($reader->termDocs($term)); + foreach($this->_resVector as $key=>$value) { + if (!isset( $termDocs[$key] )) { + unset( $this->_resVector[$key] ); + } + } + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + } + } + + + /** + * Calculate result vector for non Conjunction query + * (like '+something -another') + * + * @param Zend_Search_Lucene $reader + */ + private function _calculateNonConjunctionResult($reader) + { + if (extension_loaded('bitset')) { + $required = null; + $neither = bitset_empty(); + $prohibited = bitset_empty(); + + foreach ($this->_terms as $termId => $term) { + $termDocs = bitset_from_array($reader->termDocs($term)); + + if ($this->_signs[$termId] === true) { + // required + if ($required !== null) { + $required = bitset_intersection($required, $termDocs); + } else { + $required = $termDocs; + } + } elseif ($this->_signs[$termId] === false) { + // prohibited + $prohibited = bitset_union($prohibited, $termDocs); + } else { + // neither required, nor prohibited + $neither = bitset_union($neither, $termDocs); + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + + if ($required === null) { + $required = $neither; + } + $this->_resVector = bitset_intersection( $required, + bitset_invert($prohibited, $reader->count()) ); + } else { + $required = null; + $neither = array(); + $prohibited = array(); + + foreach ($this->_terms as $termId => $term) { + $termDocs = array_flip($reader->termDocs($term)); + + if ($this->_signs[$termId] === true) { + // required + if ($required !== null) { + // substitute for bitset_intersection + foreach ($required as $key => $value) { + if (!isset( $termDocs[$key] )) { + unset($required[$key]); + } + } + } else { + $required = $termDocs; + } + } elseif ($this->_signs[$termId] === false) { + // prohibited + // substitute for bitset_union + foreach ($termDocs as $key => $value) { + $prohibited[$key] = $value; + } + } else { + // neither required, nor prohibited + // substitute for bitset_union + foreach ($termDocs as $key => $value) { + $neither[$key] = $value; + } + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + + if ($required === null) { + $required = $neither; + } + + foreach ($required as $key=>$value) { + if (isset( $prohibited[$key] )) { + unset($required[$key]); + } + } + $this->_resVector = $required; + } + } + + + /** + * Score calculator for conjunction queries (all terms are required) + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function _conjunctionScore($docId, $reader) + { + if ($this->_coord === null) { + $this->_coord = $reader->getSimilarity()->coord(count($this->_terms), + count($this->_terms) ); + } + + $score = 0.0; + + foreach ($this->_terms as $termId=>$term) { + $score += $reader->getSimilarity()->tf(count($this->_termsPositions[$termId][$docId]) ) * + $this->_weights[$termId]->getValue() * + $reader->norm($docId, $term->field); + } + + return $score * $this->_coord; + } + + + /** + * Score calculator for non conjunction queries (not all terms are required) + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function _nonConjunctionScore($docId, $reader) + { + if ($this->_coord === null) { + $this->_coord = array(); + + $maxCoord = 0; + foreach ($this->_signs as $sign) { + if ($sign !== false /* not prohibited */) { + $maxCoord++; + } + } + + for ($count = 0; $count <= $maxCoord; $count++) { + $this->_coord[$count] = $reader->getSimilarity()->coord($count, $maxCoord); + } + } + + $score = 0.0; + $matchedTerms = 0; + foreach ($this->_terms as $termId=>$term) { + // Check if term is + if ($this->_signs[$termId] !== false && // not prohibited + isset($this->_termsPositions[$termId][$docId]) // matched + ) { + $matchedTerms++; + $score += + $reader->getSimilarity()->tf(count($this->_termsPositions[$termId][$docId]) ) * + $this->_weights[$termId]->getValue() * + $reader->norm($docId, $term->field); + } + } + + return $score * $this->_coord[$matchedTerms]; + } + + /** + * Score specified document + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function score($docId, $reader) + { + if($this->_resVector === null) { + if ($this->_signs === null) { + $this->_calculateConjunctionResult($reader); + } else { + $this->_calculateNonConjunctionResult($reader); + } + + $this->_initWeight($reader); + } + + if ( (extension_loaded('bitset')) ? + bitset_in($this->_resVector, $docId) : + isset($this->_resVector[$docId]) ) { + if ($this->_signs === null) { + return $this->_conjunctionScore($docId, $reader); + } else { + return $this->_nonConjunctionScore($docId, $reader); + } + } else { + return 0; + } + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query/Phrase.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query/Phrase.php new file mode 100644 index 00000000..3e52666b --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query/Phrase.php @@ -0,0 +1,424 @@ + (docId => array( pos1, pos2, ... ), ...) + * term2Id => (docId => array( pos1, pos2, ... ), ...) + * + * @var array + */ + private $_termsPositions = array(); + + /** + * Class constructor. Create a new prase query. + * + * @param string $field Field to search. + * @param array $terms Terms to search Array of strings. + * @param array $offsets Relative term positions. Array of integers. + * @throws Zend_Search_Lucene_Exception + */ + public function __construct($terms = null, $offsets = null, $field = null) + { + $this->_slop = 0; + + if (is_array($terms)) { + $this->_terms = array(); + foreach ($terms as $termId => $termText) { + $this->_terms[$termId] = ($field !== null)? new Zend_Search_Lucene_Index_Term($termText, $field): + new Zend_Search_Lucene_Index_Term($termText); + } + } else if ($terms === null) { + $this->_terms = array(); + } else { + throw new Zend_Search_Lucene_Exception('terms argument must be array of strings or null'); + } + + if (is_array($offsets)) { + if (count($this->_terms) != count($offsets)) { + throw new Zend_Search_Lucene_Exception('terms and offsets arguments must have the same size.'); + } + $this->_offsets = $offsets; + } else if ($offsets === null) { + $this->_offsets = array(); + foreach ($this->_terms as $termId => $term) { + $position = count($this->_offsets); + $this->_offsets[$termId] = $position; + } + } else { + throw new Zend_Search_Lucene_Exception('offsets argument must be array of strings or null'); + } + } + + /** + * Set slop + * + * @param integer $slop + */ + public function setSlop($slop) + { + $this->_slop = $slop; + } + + + /** + * Get slop + * + * @return integer + */ + public function getSlop() + { + return $this->_slop; + } + + + /** + * Adds a term to the end of the query phrase. + * The relative position of the term is specified explicitly or the one immediately + * after the last term added. + * + * @param Zend_Search_Lucene_Index_Term $term + * @param integer $position + */ + public function addTerm(Zend_Search_Lucene_Index_Term $term, $position = null) { + if ((count($this->_terms) != 0)&&(end($this->_terms)->field != $term->field)) { + throw new Zend_Search_Lucene_Exception('All phrase terms must be in the same field: ' . + $term->field . ':' . $term->text); + } + + $this->_terms[] = $term; + if ($position !== null) { + $this->_offsets[] = $position; + } else if (count($this->_offsets) != 0) { + $this->_offsets[] = end($this->_offsets) + 1; + } else { + $this->_offsets[] = 0; + } + } + + + /** + * Returns query term + * + * @return array + */ + public function getTerms() + { + return $this->_terms; + } + + + /** + * Set weight for specified term + * + * @param integer $num + * @param Zend_Search_Lucene_Search_Weight_Term $weight + */ + public function setWeight($num, $weight) + { + $this->_weights[$num] = $weight; + } + + + /** + * Constructs an appropriate Weight implementation for this query. + * + * @param Zend_Search_Lucene $reader + * @return Zend_Search_Lucene_Search_Weight + */ + protected function _createWeight($reader) + { + return new Zend_Search_Lucene_Search_Weight_Phrase($this, $reader); + } + + + /** + * Calculate result vector + * + * @param Zend_Search_Lucene $reader + */ + private function _calculateResult($reader) + { + if (extension_loaded('bitset')) { + foreach( $this->_terms as $termId=>$term ) { + if($this->_resVector === null) { + $this->_resVector = bitset_from_array($reader->termDocs($term)); + } else { + $this->_resVector = bitset_intersection( + $this->_resVector, + bitset_from_array($reader->termDocs($term)) ); + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + } else { + foreach( $this->_terms as $termId=>$term ) { + if($this->_resVector === null) { + $this->_resVector = array_flip($reader->termDocs($term)); + } else { + $termDocs = array_flip($reader->termDocs($term)); + foreach($this->_resVector as $key=>$value) { + if (!isset( $termDocs[$key] )) { + unset( $this->_resVector[$key] ); + } + } + } + + $this->_termsPositions[$termId] = $reader->termPositions($term); + } + } + } + + + /** + * Score calculator for exact phrase queries (terms sequence is fixed) + * + * @param integer $docId + * @return float + */ + public function _exactPhraseFreq($docId) + { + $freq = 0; + + // Term Id with lowest cardinality + $lowCardTermId = null; + + // Calculate $lowCardTermId + foreach ($this->_terms as $termId => $term) { + if ($lowCardTermId === null || + count($this->_termsPositions[$termId][$docId]) < + count($this->_termsPositions[$lowCardTermId][$docId]) ) { + $lowCardTermId = $termId; + } + } + + // Walk through positions of the term with lowest cardinality + foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos) { + // We expect phrase to be found + $freq++; + + // Walk through other terms + foreach ($this->_terms as $termId => $term) { + if ($termId != $lowCardTermId) { + $expectedPosition = $lowCardPos + + ($this->_offsets[$termId] - + $this->_offsets[$lowCardTermId]); + + if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) { + $freq--; // Phrase wasn't found. + break; + } + } + } + } + + return $freq; + } + + /** + * Score calculator for sloppy phrase queries (terms sequence is fixed) + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function _sloppyPhraseFreq($docId, Zend_Search_Lucene $reader) + { + $freq = 0; + + $phraseQueue = array(); + $phraseQueue[0] = array(); // empty phrase + $lastTerm = null; + + // Walk through the terms to create phrases. + foreach ($this->_terms as $termId => $term) { + $queueSize = count($phraseQueue); + $firstPass = true; + + // Walk through the term positions. + // Each term position produces a set of phrases. + foreach ($this->_termsPositions[$termId][$docId] as $termPosition ) { + if ($firstPass) { + for ($count = 0; $count < $queueSize; $count++) { + $phraseQueue[$count][$termId] = $termPosition; + } + } else { + for ($count = 0; $count < $queueSize; $count++) { + if ($lastTerm !== null && + abs( $termPosition - $phraseQueue[$count][$lastTerm] - + ($this->_offsets[$termId] - $this->_offsets[$lastTerm])) > $this->_slop) { + continue; + } + + $newPhraseId = count($phraseQueue); + $phraseQueue[$newPhraseId] = $phraseQueue[$count]; + $phraseQueue[$newPhraseId][$termId] = $termPosition; + } + + } + + $firstPass = false; + } + $lastTerm = $termId; + } + + + foreach ($phraseQueue as $phrasePos) { + $minDistance = null; + + for ($shift = -$this->_slop; $shift <= $this->_slop; $shift++) { + $distance = 0; + $start = reset($phrasePos) - reset($this->_offsets) + $shift; + + foreach ($this->_terms as $termId => $term) { + $distance += abs($phrasePos[$termId] - $this->_offsets[$termId] - $start); + + if($distance > $this->_slop) { + break; + } + } + + if ($minDistance === null || $distance < $minDistance) { + $minDistance = $distance; + } + } + + if ($minDistance <= $this->_slop) { + $freq += $reader->getSimilarity()->sloppyFreq($minDistance); + } + } + + return $freq; + } + + + /** + * Score specified document + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function score($docId, $reader) + { + // optimize zero-term case + if (count($this->_terms) == 0) { + return 0; + } + + if($this->_resVector === null) { + $this->_calculateResult($reader); + $this->_initWeight($reader); + } + + if ( (extension_loaded('bitset')) ? + bitset_in($this->_resVector, $docId) : + isset($this->_resVector[$docId]) ) { + if ($this->_slop == 0) { + $freq = $this->_exactPhraseFreq($docId); + } else { + $freq = $this->_sloppyPhraseFreq($docId, $reader); + } + +/* + return $reader->getSimilarity()->tf($freq) * + $this->_weight->getValue() * + $reader->norm($docId, reset($this->_terms)->field); +*/ + if ($freq != 0) { + $tf = $reader->getSimilarity()->tf($freq); + $weight = $this->_weight->getValue(); + $norm = $reader->norm($docId, reset($this->_terms)->field); + + return $tf*$weight*$norm; + } + } else { + return 0; + } + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query/Term.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query/Term.php new file mode 100644 index 00000000..d622f845 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Query/Term.php @@ -0,0 +1,126 @@ + array( pos1, pos2, ... ) + * + * @var array + */ + private $_termPositions; + + + /** + * Zend_Search_Lucene_Search_Query_Term constructor + * + * @param Zend_Search_Lucene_Index_Term $term + * @param boolean $sign + */ + public function __construct( $term, $sign = true ) + { + $this->_term = $term; + $this->_sign = $sign; + } + + + /** + * Constructs an appropriate Weight implementation for this query. + * + * @param Zend_Search_Lucene $reader + * @return Zend_Search_Lucene_Search_Weight + */ + protected function _createWeight($reader) + { + return new Zend_Search_Lucene_Search_Weight_Term($this->_term, $this, $reader); + } + + /** + * Score specified document + * + * @param integer $docId + * @param Zend_Search_Lucene $reader + * @return float + */ + public function score( $docId, $reader ) + { + if($this->_docVector===null) { + if (extension_loaded('bitset')) { + $this->_docVector = bitset_from_array( $reader->termDocs($this->_term) ); + } else { + $this->_docVector = array_flip($reader->termDocs($this->_term)); + } + + $this->_termPositions = $reader->termPositions($this->_term); + $this->_initWeight($reader); + } + + $match = extension_loaded('bitset') ? bitset_in($this->_docVector, $docId) : + isset($this->_docVector[$docId]); + if ($this->_sign && $match) { + return $reader->getSimilarity()->tf(count($this->_termPositions[$docId]) ) * + $this->_weight->getValue() * + $reader->norm($docId, $this->_term->field); + } else { + return 0; + } + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryHit.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryHit.php new file mode 100644 index 00000000..65290a9e --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryHit.php @@ -0,0 +1,106 @@ +_index = $index; + } + + + /** + * Convenience function for getting fields from the document + * associated with this hit. + * + * @param string $offset + * @return string + */ + public function __get($offset) + { + return $this->getDocument()->getFieldValue($offset); + } + + + /** + * Return the document object for this hit + * + * @return Zend_Search_Lucene_Document + */ + public function getDocument() + { + if (!$this->_document instanceof Zend_Search_Lucene_Document) { + $this->_document = $this->_index->getDocument($this->id); + } + + return $this->_document; + } + + + /** + * Return the index object for this hit + * + * @return Zend_Search_Lucene + */ + public function getIndex() + { + return $this->_index; + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryParser.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryParser.php new file mode 100644 index 00000000..9387afca --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryParser.php @@ -0,0 +1,140 @@ +count()) { + throw new Zend_Search_Lucene_Exception('Syntax error: query string cannot be empty.'); + } + + // Term query + if ($tokens->count() == 1) { + if ($tokens->current()->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD) { + return new Zend_Search_Lucene_Search_Query_Term(new Zend_Search_Lucene_Index_Term($tokens->current()->text, 'contents')); + } else { + throw new Zend_Search_Lucene_Exception('Syntax error: query string must contain at least one word.'); + } + } + + + /** + * MultiTerm Query + * + * Process each token that was returned by the tokenizer. + */ + $terms = array(); + $signs = array(); + $prevToken = null; + $openBrackets = 0; + $field = 'contents'; + foreach ($tokens as $token) { + switch ($token->type) { + case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD: + $terms[] = new Zend_Search_Lucene_Index_Term($token->text, $field); + $field = 'contents'; + if ($prevToken !== null && + $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) { + if ($prevToken->text == "+") { + $signs[] = true; + } else { + $signs[] = false; + } + } else { + $signs[] = null; + } + break; + case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN: + if ($prevToken !== null && + $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) { + throw new Zend_Search_Lucene_Exception('Syntax error: sign operator must be followed by a word.'); + } + break; + case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_FIELD: + $field = $token->text; + // let previous token to be signed as next $prevToken + $token = $prevToken; + break; + case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_BRACKET: + $token->text=='(' ? $openBrackets++ : $openBrackets--; + } + $prevToken = $token; + } + + // Finish up parsing: check the last token in the query for an opening sign or parenthesis. + if ($prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) { + throw new Zend_Search_Lucene_Exception('Syntax Error: sign operator must be followed by a word.'); + } + + // Finish up parsing: check that every opening bracket has a matching closing bracket. + if ($openBrackets != 0) { + throw new Zend_Search_Lucene_Exception('Syntax Error: mismatched parentheses, every opening must have closing.'); + } + + switch (count($terms)) { + case 0: + throw new Zend_Search_Lucene_Exception('Syntax error: bad term count.'); + case 1: + return new Zend_Search_Lucene_Search_Query_Term($terms[0],$signs[0] !== false); + default: + return new Zend_Search_Lucene_Search_Query_MultiTerm($terms,$signs); + } + } + +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryToken.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryToken.php new file mode 100644 index 00000000..995e0d3c --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryToken.php @@ -0,0 +1,102 @@ +type = $tokType; + $this->text = $tokText; + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryTokenizer.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryTokenizer.php new file mode 100644 index 00000000..986f8899 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/QueryTokenizer.php @@ -0,0 +1,162 @@ +_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD, + $currentToken); + $currentToken = ''; + } + + if ($inputString{$count} == '+' || $inputString{$count} == '-') { + $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN, + $inputString{$count}); + } elseif ($inputString{$count} == '(' || $inputString{$count} == ')') { + $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_BRACKET, + $inputString{$count}); + } elseif ($inputString{$count} == ':' && $this->count()) { + if ($this->_tokens[count($this->_tokens)-1]->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD) { + $this->_tokens[count($this->_tokens)-1]->type = Zend_Search_Lucene_Search_QueryToken::TOKTYPE_FIELD; + } + } + } + } + + if (strlen($currentToken)) { + $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD, $currentToken); + } + } + + + /** + * Returns number of tokens + * + * @return integer + */ + public function count() + { + return count($this->_tokens); + } + + + /** + * Returns TRUE if a token exists at the current position. + * + * @return boolean + */ + public function valid() + { + return $this->_currToken < $this->count(); + } + + + /** + * Resets token stream. + * + * @return integer + */ + public function rewind() + { + $this->_currToken = 0; + } + + + /** + * Returns the token at the current position or FALSE if + * the position does not contain a valid token. + * + * @return mixed + */ + public function current() + { + return $this->valid() ? $this->_tokens[$this->_currToken] : false; + } + + + /** + * Returns next token + * + * @return Zend_Search_Lucene_Search_QueryToken + */ + public function next() + { + return ++$this->_currToken; + } + + + /** + * Return the position of the current token. + * + * @return integer + */ + public function key() + { + return $this->_currToken; + } + +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php new file mode 100644 index 00000000..8b758213 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php @@ -0,0 +1,551 @@ + 0.0, + 1 => 5.820766E-10, + 2 => 6.9849193E-10, + 3 => 8.1490725E-10, + 4 => 9.313226E-10, + 5 => 1.1641532E-9, + 6 => 1.3969839E-9, + 7 => 1.6298145E-9, + 8 => 1.8626451E-9, + 9 => 2.3283064E-9, + 10 => 2.7939677E-9, + 11 => 3.259629E-9, + 12 => 3.7252903E-9, + 13 => 4.656613E-9, + 14 => 5.5879354E-9, + 15 => 6.519258E-9, + 16 => 7.4505806E-9, + 17 => 9.313226E-9, + 18 => 1.1175871E-8, + 19 => 1.3038516E-8, + 20 => 1.4901161E-8, + 21 => 1.8626451E-8, + 22 => 2.2351742E-8, + 23 => 2.6077032E-8, + 24 => 2.9802322E-8, + 25 => 3.7252903E-8, + 26 => 4.4703484E-8, + 27 => 5.2154064E-8, + 28 => 5.9604645E-8, + 29 => 7.4505806E-8, + 30 => 8.940697E-8, + 31 => 1.0430813E-7, + 32 => 1.1920929E-7, + 33 => 1.4901161E-7, + 34 => 1.7881393E-7, + 35 => 2.0861626E-7, + 36 => 2.3841858E-7, + 37 => 2.9802322E-7, + 38 => 3.5762787E-7, + 39 => 4.172325E-7, + 40 => 4.7683716E-7, + 41 => 5.9604645E-7, + 42 => 7.1525574E-7, + 43 => 8.34465E-7, + 44 => 9.536743E-7, + 45 => 1.1920929E-6, + 46 => 1.4305115E-6, + 47 => 1.66893E-6, + 48 => 1.9073486E-6, + 49 => 2.3841858E-6, + 50 => 2.861023E-6, + 51 => 3.33786E-6, + 52 => 3.8146973E-6, + 53 => 4.7683716E-6, + 54 => 5.722046E-6, + 55 => 6.67572E-6, + 56 => 7.6293945E-6, + 57 => 9.536743E-6, + 58 => 1.1444092E-5, + 59 => 1.335144E-5, + 60 => 1.5258789E-5, + 61 => 1.9073486E-5, + 62 => 2.2888184E-5, + 63 => 2.670288E-5, + 64 => 3.0517578E-5, + 65 => 3.8146973E-5, + 66 => 4.5776367E-5, + 67 => 5.340576E-5, + 68 => 6.1035156E-5, + 69 => 7.6293945E-5, + 70 => 9.1552734E-5, + 71 => 1.0681152E-4, + 72 => 1.2207031E-4, + 73 => 1.5258789E-4, + 74 => 1.8310547E-4, + 75 => 2.1362305E-4, + 76 => 2.4414062E-4, + 77 => 3.0517578E-4, + 78 => 3.6621094E-4, + 79 => 4.272461E-4, + 80 => 4.8828125E-4, + 81 => 6.1035156E-4, + 82 => 7.324219E-4, + 83 => 8.544922E-4, + 84 => 9.765625E-4, + 85 => 0.0012207031, + 86 => 0.0014648438, + 87 => 0.0017089844, + 88 => 0.001953125, + 89 => 0.0024414062, + 90 => 0.0029296875, + 91 => 0.0034179688, + 92 => 0.00390625, + 93 => 0.0048828125, + 94 => 0.005859375, + 95 => 0.0068359375, + 96 => 0.0078125, + 97 => 0.009765625, + 98 => 0.01171875, + 99 => 0.013671875, + 100 => 0.015625, + 101 => 0.01953125, + 102 => 0.0234375, + 103 => 0.02734375, + 104 => 0.03125, + 105 => 0.0390625, + 106 => 0.046875, + 107 => 0.0546875, + 108 => 0.0625, + 109 => 0.078125, + 110 => 0.09375, + 111 => 0.109375, + 112 => 0.125, + 113 => 0.15625, + 114 => 0.1875, + 115 => 0.21875, + 116 => 0.25, + 117 => 0.3125, + 118 => 0.375, + 119 => 0.4375, + 120 => 0.5, + 121 => 0.625, + 122 => 0.75, + 123 => 0.875, + 124 => 1.0, + 125 => 1.25, + 126 => 1.5, + 127 => 1.75, + 128 => 2.0, + 129 => 2.5, + 130 => 3.0, + 131 => 3.5, + 132 => 4.0, + 133 => 5.0, + 134 => 6.0, + 135 => 7.0, + 136 => 8.0, + 137 => 10.0, + 138 => 12.0, + 139 => 14.0, + 140 => 16.0, + 141 => 20.0, + 142 => 24.0, + 143 => 28.0, + 144 => 32.0, + 145 => 40.0, + 146 => 48.0, + 147 => 56.0, + 148 => 64.0, + 149 => 80.0, + 150 => 96.0, + 151 => 112.0, + 152 => 128.0, + 153 => 160.0, + 154 => 192.0, + 155 => 224.0, + 156 => 256.0, + 157 => 320.0, + 158 => 384.0, + 159 => 448.0, + 160 => 512.0, + 161 => 640.0, + 162 => 768.0, + 163 => 896.0, + 164 => 1024.0, + 165 => 1280.0, + 166 => 1536.0, + 167 => 1792.0, + 168 => 2048.0, + 169 => 2560.0, + 170 => 3072.0, + 171 => 3584.0, + 172 => 4096.0, + 173 => 5120.0, + 174 => 6144.0, + 175 => 7168.0, + 176 => 8192.0, + 177 => 10240.0, + 178 => 12288.0, + 179 => 14336.0, + 180 => 16384.0, + 181 => 20480.0, + 182 => 24576.0, + 183 => 28672.0, + 184 => 32768.0, + 185 => 40960.0, + 186 => 49152.0, + 187 => 57344.0, + 188 => 65536.0, + 189 => 81920.0, + 190 => 98304.0, + 191 => 114688.0, + 192 => 131072.0, + 193 => 163840.0, + 194 => 196608.0, + 195 => 229376.0, + 196 => 262144.0, + 197 => 327680.0, + 198 => 393216.0, + 199 => 458752.0, + 200 => 524288.0, + 201 => 655360.0, + 202 => 786432.0, + 203 => 917504.0, + 204 => 1048576.0, + 205 => 1310720.0, + 206 => 1572864.0, + 207 => 1835008.0, + 208 => 2097152.0, + 209 => 2621440.0, + 210 => 3145728.0, + 211 => 3670016.0, + 212 => 4194304.0, + 213 => 5242880.0, + 214 => 6291456.0, + 215 => 7340032.0, + 216 => 8388608.0, + 217 => 1.048576E7, + 218 => 1.2582912E7, + 219 => 1.4680064E7, + 220 => 1.6777216E7, + 221 => 2.097152E7, + 222 => 2.5165824E7, + 223 => 2.9360128E7, + 224 => 3.3554432E7, + 225 => 4.194304E7, + 226 => 5.0331648E7, + 227 => 5.8720256E7, + 228 => 6.7108864E7, + 229 => 8.388608E7, + 230 => 1.00663296E8, + 231 => 1.17440512E8, + 232 => 1.34217728E8, + 233 => 1.6777216E8, + 234 => 2.01326592E8, + 235 => 2.34881024E8, + 236 => 2.68435456E8, + 237 => 3.3554432E8, + 238 => 4.02653184E8, + 239 => 4.69762048E8, + 240 => 5.3687091E8, + 241 => 6.7108864E8, + 242 => 8.0530637E8, + 243 => 9.395241E8, + 244 => 1.07374182E9, + 245 => 1.34217728E9, + 246 => 1.61061274E9, + 247 => 1.87904819E9, + 248 => 2.14748365E9, + 249 => 2.68435456E9, + 250 => 3.22122547E9, + 251 => 3.75809638E9, + 252 => 4.2949673E9, + 253 => 5.3687091E9, + 254 => 6.4424509E9, + 255 => 7.5161928E9 ); + + + /** + * Set the default Similarity implementation used by indexing and search + * code. + * + * @param Zend_Search_Lucene_Search_Similarity $similarity + */ + static public function setDefault(Zend_Search_Lucene_Search_Similarity $similarity) + { + self::$_defaultImpl = $similarity; + } + + + /** + * Return the default Similarity implementation used by indexing and search + * code. + * + * @return Zend_Search_Lucene_Search_Similarity + */ + static public function getDefault() + { + if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) { + self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default(); + } + + return self::$_defaultImpl; + } + + + /** + * Computes the normalization value for a field given the total number of + * terms contained in a field. These values, together with field boosts, are + * stored in an index and multipled into scores for hits on each field by the + * search code. + * + * Matches in longer fields are less precise, so implemenations of this + * method usually return smaller values when 'numTokens' is large, + * and larger values when 'numTokens' is small. + * + * That these values are computed under + * IndexWriter::addDocument(Document) and stored then using + * encodeNorm(float). Thus they have limited precision, and documents + * must be re-indexed if this method is altered. + * + * fieldName - name of field + * numTokens - the total number of tokens contained in fields named + * 'fieldName' of 'doc'. + * Returns a normalization factor for hits on this field of this document + * + * @param string $fieldName + * @param integer $numTokens + * @return float + */ + abstract public function lengthNorm($fieldName, $numTokens); + + /** + * Computes the normalization value for a query given the sum of the squared + * weights of each of the query terms. This value is then multipled into the + * weight of each query term. + * + * This does not affect ranking, but rather just attempts to make scores + * from different queries comparable. + * + * sumOfSquaredWeights - the sum of the squares of query term weights + * Returns a normalization factor for query weights + * + * @param float $sumOfSquaredWeights + * @return float + */ + abstract public function queryNorm($sumOfSquaredWeights); + + + /** + * Decodes a normalization factor stored in an index. + * + * @param integer $byte + * @return float + */ + static public function decodeNorm($byte) + { + return self::$_normTable[$byte & 0xFF]; + } + + + /** + * Encodes a normalization factor for storage in an index. + * + * The encoding uses a five-bit exponent and three-bit mantissa, thus + * representing values from around 7x10^9 to 2x10^-9 with about one + * significant decimal digit of accuracy. Zero is also represented. + * Negative numbers are rounded up to zero. Values too large to represent + * are rounded down to the largest representable value. Positive values too + * small to represent are rounded up to the smallest positive representable + * value. + * + * @param float $f + * @return integer + */ + static function encodeNorm($f) + { + return self::_floatToByte($f); + } + + /** + * Float to byte conversion + * + * @param integer $b + * @return float + */ + static private function _floatToByte($f) + { + // round negatives up to zero + if ($f <= 0.0) { + return 0; + } + + // search for appropriate value + $lowIndex = 0; + $highIndex = 255; + while ($highIndex >= $lowIndex) { + // $mid = ($highIndex - $lowIndex)/2; + $mid = ($highIndex + $lowIndex) >> 1; + $delta = $f - self::$_normTable[$mid]; + + if ($delta < 0) { + $highIndex = $mid-1; + } elseif ($delta > 0) { + $lowIndex = $mid+1; + } else { + return $mid; // We got it! + } + } + + // round to closest value + if ($highIndex != 255 && + $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex+1] - $f ) { + return $highIndex + 1; + } else { + return $highIndex; + } + } + + + /** + * Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the idf(Term, Searcher) + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + * Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when 'freq' is large, and smaller values when 'freq' + * is small. + * + * freq - the frequency of a term within a document + * Returns a score factor based on a term's within-document frequency + * + * @param float $freq + * @return float + */ + abstract public function tf($freq); + + /** + * Computes the amount of a sloppy phrase match, based on an edit distance. + * This value is summed for each sloppy phrase match in a document to form + * the frequency that is passed to tf(float). + * + * A phrase match with a small edit distance to a document passage more + * closely matches the document, so implementations of this method usually + * return larger values when the edit distance is small and smaller values + * when it is large. + * + * distance - the edit distance of this sloppy phrase match + * Returns the frequency increment for this match + * + * @param integer $distance + * @return float + */ + abstract public function sloppyFreq($distance); + + + /** + * Computes a score factor for a simple term or a phrase. + * + * The default implementation is: + * return idfFreq(searcher.docFreq(term), searcher.maxDoc()); + * + * input - the term in question or array of terms + * reader - reader the document collection being searched + * Returns a score factor for the term + * + * @param mixed $input + * @param Zend_Search_Lucene $reader + * @return a score factor for the term + */ + public function idf($input, $reader) + { + if (!is_array($input)) { + return $this->idfFreq($reader->docFreq($input), $reader->count()); + } else { + $idf = 0.0; + foreach ($input as $term) { + $idf += $this->idfFreq($reader->docFreq($term), $reader->count()); + } + return $idf; + } + } + + /** + * Computes a score factor based on a term's document frequency (the number + * of documents which contain the term). This value is multiplied by the + * tf(int) factor for each term in the query and these products are + * then summed to form the initial score for a document. + * + * Terms that occur in fewer documents are better indicators of topic, so + * implemenations of this method usually return larger values for rare terms, + * and smaller values for common terms. + * + * docFreq - the number of documents which contain the term + * numDocs - the total number of documents in the collection + * Returns a score factor based on the term's document frequency + * + * @param integer $docFreq + * @param integer $numDocs + * @return float + */ + abstract public function idfFreq($docFreq, $numDocs); + + /** + * Computes a score factor based on the fraction of all query terms that a + * document contains. This value is multiplied into scores. + * + * The presence of a large portion of the query terms indicates a better + * match with the query, so implemenations of this method usually return + * larger values when the ratio between these parameters is large and smaller + * values when the ratio between them is small. + * + * overlap - the number of query terms matched in the document + * maxOverlap - the total number of terms in the query + * Returns a score factor based on term overlap with the query + * + * @param integer $overlap + * @param integer $maxOverlap + * @return float + */ + abstract public function coord($overlap, $maxOverlap); +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity/Default.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity/Default.php new file mode 100644 index 00000000..1551d8bd --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity/Default.php @@ -0,0 +1,99 @@ +createWeight(). + * The sumOfSquaredWeights() method is then called on the top-level + * query to compute the query normalization factor Similarity->queryNorm(float). + * This factor is then passed to normalize(float). At this point the weighting + * is complete. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +abstract class Zend_Search_Lucene_Search_Weight +{ + /** + * The weight for this query. + * + * @return float + */ + abstract public function getValue(); + + /** + * The sum of squared weights of contained query clauses. + * + * @return float + */ + abstract public function sumOfSquaredWeights(); + + /** + * Assigns the query normalization factor to this. + * + * @param $norm + */ + abstract public function normalize($norm); +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Weight/MultiTerm.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Weight/MultiTerm.php new file mode 100644 index 00000000..69528ba4 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Weight/MultiTerm.php @@ -0,0 +1,133 @@ +_query = $query; + $this->_reader = $reader; + $this->_weights = array(); + + $signs = $query->getSigns(); + + foreach ($query->getTerms() as $num => $term) { + if ($signs === null || $signs[$num] === null || $signs[$num]) { + $this->_weights[$num] = new Zend_Search_Lucene_Search_Weight_Term($term, $query, $reader); + $query->setWeight($num, $this->_weights[$num]); + } + } + } + + + /** + * The weight for this query + * + * @return float + */ + public function getValue() + { + return $this->_query->getBoost(); + } + + + /** + * The sum of squared weights of contained query clauses. + * + * @return float + */ + public function sumOfSquaredWeights() + { + $sum = 0; + foreach ($this->_weights as $weight) { + // sum sub weights + $sum += $weight->sumOfSquaredWeights(); + } + + // boost each sub-weight + $sum *= $this->_query->getBoost() * $this->_query->getBoost(); + + // check for empty query (like '-something -another') + if ($sum == 0) { + $sum = 1.0; + } + return $sum; + } + + + /** + * Assigns the query normalization factor to this. + * + * @param float $queryNorm + */ + public function normalize($queryNorm) + { + // incorporate boost + $queryNorm *= $this->_query->getBoost(); + + foreach ($this->_weights as $weight) { + $weight->normalize($queryNorm); + } + } +} + + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Weight/Phrase.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Weight/Phrase.php new file mode 100644 index 00000000..77e94f28 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Weight/Phrase.php @@ -0,0 +1,138 @@ +_query = $query; + $this->_reader = $reader; + } + + + /** + * The weight for this query + * + * @return float + */ + public function getValue() + { + return $this->_value; + } + + + /** + * The sum of squared weights of contained query clauses. + * + * @return float + */ + public function sumOfSquaredWeights() + { + // compute idf + $this->_idf = $this->_reader->getSimilarity()->idf($this->_query->getTerms(), $this->_reader); + + // compute query weight + $this->_queryWeight = $this->_idf * $this->_query->getBoost(); + + // square it + return $this->_queryWeight * $this->_queryWeight; + } + + + /** + * Assigns the query normalization factor to this. + * + * @param float $queryNorm + */ + public function normalize($queryNorm) + { + $this->_queryNorm = $queryNorm; + + // normalize query weight + $this->_queryWeight *= $queryNorm; + + // idf for documents + $this->_value = $this->_queryWeight * $this->_idf; + } +} + + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Weight/Term.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Weight/Term.php new file mode 100644 index 00000000..3e6102f3 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Weight/Term.php @@ -0,0 +1,144 @@ +_term = $term; + $this->_query = $query; + $this->_reader = $reader; + } + + + /** + * The weight for this query + * + * @return float + */ + public function getValue() + { + return $this->_value; + } + + + /** + * The sum of squared weights of contained query clauses. + * + * @return float + */ + public function sumOfSquaredWeights() + { + // compute idf + $this->_idf = $this->_reader->getSimilarity()->idf($this->_term, $this->_reader); + + // compute query weight + $this->_queryWeight = $this->_idf * $this->_query->getBoost(); + + // square it + return $this->_queryWeight * $this->_queryWeight; + } + + + /** + * Assigns the query normalization factor to this. + * + * @param float $queryNorm + */ + public function normalize($queryNorm) + { + $this->_queryNorm = $queryNorm; + + // normalize query weight + $this->_queryWeight *= $queryNorm; + + // idf for documents + $this->_value = $this->_queryWeight * $this->_idf; + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Storage/Directory.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Storage/Directory.php new file mode 100644 index 00000000..48114a76 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Storage/Directory.php @@ -0,0 +1,118 @@ + Zend_Search_Lucene_Storage_File object + * + * @var array + * @throws Zend_Search_Lucene_Exception + */ + private $_fileHandlers; + + + /** + * Utility function to recursive directory creation + * + * @param string $dir + * @param integer $mode + * @param boolean $recursive + * @return boolean + */ + + static public function mkdirs($dir, $mode = 0777, $recursive = true) + { + if (is_null($dir) || $dir === '') { + return false; + } + if (is_dir($dir) || $dir === '/') { + return true; + } + if (self::mkdirs(dirname($dir), $mode, $recursive)) { + return mkdir($dir, $mode); + } + return false; + } + + + /** + * Object constructor + * Checks if $path is a directory or tries to create it. + * + * @param string $path + * @throws Zend_Search_Lucene_Exception + */ + public function __construct($path) + { + if (!is_dir($path)) { + if (file_exists($path)) { + throw new Zend_Search_Lucene_Exception('Path exists, but it\'s not a directory'); + } else { + if (!self::mkdirs($path)) { + throw new Zend_Search_Lucene_Exception("Can't create directory '$path'."); + } + } + } + $this->_dirPath = $path; + $this->_fileHandlers = array(); + } + + + /** + * Closes the store. + * + * @return void + */ + public function close() + { + foreach ($this->_fileHandlers as $fileObject) { + $fileObject->close(); + } + + unset($this->_fileHandlers); + } + + + /** + * Returns an array of strings, one for each file in the directory. + * + * @return array + */ + public function fileList() + { + $result = array(); + + $dirContent = opendir( $this->_dirPath ); + while ($file = readdir($dirContent)) { + if (($file == '..')||($file == '.')) continue; + + $fullName = $this->_dirPath . '/' . $file; + + if( !is_dir($this->_dirPath . '/' . $file) ) { + $result[] = $file; + } + } + + return $result; + } + + /** + * Creates a new, empty file in the directory with the given $filename. + * + * @param string $filename + * @return Zend_Search_Lucene_Storage_File + */ + public function createFile($filename) + { + if (isset($this->_fileHandlers[$filename])) { + $this->_fileHandlers[$filename]->close(); + } + unset($this->_fileHandlers[$filename]); + $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($this->_dirPath . '/' . $filename, 'w+b'); + return $this->_fileHandlers[$filename]; + } + + + /** + * Removes an existing $filename in the directory. + * + * @param string $filename + * @return void + */ + public function deleteFile($filename) + { + if (isset($this->_fileHandlers[$filename])) { + $this->_fileHandlers[$filename]->close(); + } + unset($this->_fileHandlers[$filename]); + unlink($this->_dirPath .'/'. $filename); + } + + + /** + * Returns true if a file with the given $filename exists. + * + * @param string $filename + * @return boolean + */ + public function fileExists($filename) + { + return file_exists($this->_dirPath .'/'. $filename); + } + + + /** + * Returns the length of a $filename in the directory. + * + * @param string $filename + * @return integer + */ + public function fileLength($filename) + { + if (isset( $this->_fileHandlers[$filename] )) { + return $this->_fileHandlers[$filename]->size(); + } + return filesize($this->_dirPath .'/'. $filename); + } + + + /** + * Returns the UNIX timestamp $filename was last modified. + * + * @param string $filename + * @return integer + */ + public function fileModified($filename) + { + return filemtime($this->_dirPath .'/'. $filename); + } + + + /** + * Renames an existing file in the directory. + * + * @param string $from + * @param string $to + * @return void + */ + public function renameFile($from, $to) + { + if ($this->_fileHandlers[$from] !== null) { + $this->_fileHandlers[$from]->close(); + } + unset($this->_fileHandlers[$from]); + + if ($this->_fileHandlers[$to] !== null) { + $this->_fileHandlers[$to]->close(); + } + unset($this->_fileHandlers[$to]); + + if (file_exists($this->_dirPath . '/' . $to)) { + unlink($this->_dirPath . '/' . $to); + } + + return @rename($this->_dirPath . '/' . $from, $this->_dirPath . '/' . $to); + } + + + /** + * Sets the modified time of $filename to now. + * + * @param string $filename + * @return void + */ + public function touchFile($filename) + { + return touch($this->_dirPath .'/'. $filename); + } + + + /** + * Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory. + * + * @param string $filename + * @return Zend_Search_Lucene_Storage_File + */ + public function getFileObject($filename) + { + if (isset( $this->_fileHandlers[$filename] )) { + $this->_fileHandlers[$filename]->seek(0); + return $this->_fileHandlers[$filename]; + } + + $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($this->_dirPath . '/' . $filename, 'rb'); + return $this->_fileHandlers[$filename]; + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Storage/File.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Storage/File.php new file mode 100644 index 00000000..f62af33a --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Storage/File.php @@ -0,0 +1,376 @@ +_fread(1)); + } + + /** + * Writes a byte to the end of the file. + * + * @param integer $byte + */ + public function writeByte($byte) + { + return $this->_fwrite(chr($byte), 1); + } + + /** + * Read num bytes from the current position in the file + * and advances the file pointer. + * + * @param integer $num + * @return string + */ + public function readBytes($num) + { + return $this->_fread($num); + } + + /** + * Writes num bytes of data (all, if $num===null) to the end + * of the file. + * + * @param string $data + * @param integer $num + */ + public function writeBytes($data, $num=null) + { + $this->_fwrite($data, $num); + } + + + /** + * Reads an integer from the current position in the file + * and advances the file pointer. + * + * @return integer + */ + public function readInt() + { + $str = $this->_fread(4); + + return ord($str{0}) << 24 | + ord($str{1}) << 16 | + ord($str{2}) << 8 | + ord($str{3}); + } + + + /** + * Writes an integer to the end of file. + * + * @param integer $value + */ + public function writeInt($value) + { + settype($value, 'integer'); + $this->_fwrite( chr($value>>24 & 0xFF) . + chr($value>>16 & 0xFF) . + chr($value>>8 & 0xFF) . + chr($value & 0xFF), 4 ); + } + + + /** + * Returns a long integer from the current position in the file + * and advances the file pointer. + * + * @return integer + */ + public function readLong() + { + $str = $this->_fread(8); + + /** + * PHP uses long as largest integer. fseek() uses long for offset. + * long has 4 bytes in a lot of systems. 4 bytes are discarded to prevent + * conversion to float. + * So, largest index segment file is 2Gb + */ + return /* ord($str{0}) << 56 | */ + /* ord($str{1}) << 48 | */ + /* ord($str{2}) << 40 | */ + /* ord($str{3}) << 32 | */ + ord($str{4}) << 24 | + ord($str{5}) << 16 | + ord($str{6}) << 8 | + ord($str{7}); + } + + /** + * Writes long integer to the end of file + * + * @param integer $value + */ + public function writeLong($value) + { + /** + * PHP uses long as largest integer. fseek() uses long for offset. + * long has 4 bytes in a lot of systems. 4 bytes are discarded to prevent + * conversion to float. + * So, largest index segment file is 2Gb + */ + settype($value, 'integer'); + $this->_fwrite( "\x00\x00\x00\x00" . + chr($value>>24 & 0xFF) . + chr($value>>16 & 0xFF) . + chr($value>>8 & 0xFF) . + chr($value & 0xFF), 8 ); + } + + + + /** + * Returns a variable-length integer from the current + * position in the file and advances the file pointer. + * + * @return integer + */ + public function readVInt() + { + $nextByte = ord($this->_fread(1)); + $val = $nextByte & 0x7F; + + for ($shift=7; ($nextByte & 0x80) != 0; $shift += 7) { + $nextByte = ord($this->_fread(1)); + $val |= ($nextByte & 0x7F) << $shift; + } + return $val; + } + + /** + * Writes a variable-length integer to the end of file. + * + * @param integer $value + */ + public function writeVInt($value) + { + settype($value, 'integer'); + while ($value > 0x7F) { + $this->_fwrite(chr( ($value & 0x7F)|0x80 )); + $value >>= 7; + } + $this->_fwrite(chr($value)); + } + + + /** + * Reads a string from the current position in the file + * and advances the file pointer. + * + * @return string + */ + public function readString() + { + $strlen = $this->readVInt(); + if ($strlen == 0) { + return ''; + } else { + /** + * This implementation supports only Basic Multilingual Plane + * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support + * "supplementary characters" (characters whose code points are + * greater than 0xFFFF) + * Java 2 represents these characters as a pair of char (16-bit) + * values, the first from the high-surrogates range (0xD800-0xDBFF), + * the second from the low-surrogates range (0xDC00-0xDFFF). Then + * they are encoded as usual UTF-8 characters in six bytes. + * Standard UTF-8 representation uses four bytes for supplementary + * characters. + */ + + $str_val = $this->_fread($strlen); + + for ($count = 0; $count < $strlen; $count++ ) { + if (( ord($str_val{$count}) & 0xC0 ) == 0xC0) { + $addBytes = 1; + if (ord($str_val{$count}) & 0x20 ) { + $addBytes++; + + // Never used. Java2 doesn't encode strings in four bytes + if (ord($str_val{$count}) & 0x10 ) { + $addBytes++; + } + } + $str_val .= $this->_fread($addBytes); + $strlen += $addBytes; + + // Check for null character. Java2 encodes null character + // in two bytes. + if (ord($str_val{$count}) == 0xC0 && + ord($str_val{$count+1}) == 0x80 ) { + $str_val{$count} = 0; + $str_val = substr($str_val,0,$count+1) + . substr($str_val,$count+2); + } + $count += $addBytes; + } + } + + return $str_val; + } + } + + /** + * Writes a string to the end of file. + * + * @param string $str + * @throws Zend_Search_Lucene_Exception + */ + public function writeString($str) + { + /** + * This implementation supports only Basic Multilingual Plane + * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support + * "supplementary characters" (characters whose code points are + * greater than 0xFFFF) + * Java 2 represents these characters as a pair of char (16-bit) + * values, the first from the high-surrogates range (0xD800-0xDBFF), + * the second from the low-surrogates range (0xDC00-0xDFFF). Then + * they are encoded as usual UTF-8 characters in six bytes. + * Standard UTF-8 representation uses four bytes for supplementary + * characters. + */ + + // convert input to a string before iterating string characters + settype($str, 'string'); + + $chars = $strlen = strlen($str); + $containNullChars = false; + + for ($count = 0; $count < $strlen; $count++ ) { + /** + * String is already in Java 2 representation. + * We should only calculate actual string length and replace + * \x00 by \xC0\x80 + */ + if ((ord($str{$count}) & 0xC0) == 0xC0) { + $addBytes = 1; + if (ord($str{$count}) & 0x20 ) { + $addBytes++; + + // Never used. Java2 doesn't encode strings in four bytes + // and we dont't support non-BMP characters + if (ord($str{$count}) & 0x10 ) { + $addBytes++; + } + } + $chars -= $addBytes; + + if (ord($str{$count}) == 0 ) { + $containNullChars = true; + } + $count += $addBytes; + } + } + + if ($chars < 0) { + throw new Zend_Search_Lucene_Exception('Invalid UTF-8 string'); + } + + $this->writeVInt($chars); + if ($containNullChars) { + $this->_fwrite(str_replace($str, "\x00", "\xC0\x80")); + } else { + $this->_fwrite($str); + } + } + + + /** + * Reads binary data from the current position in the file + * and advances the file pointer. + * + * @return string + */ + public function readBinary() + { + return $this->_fread($this->readVInt()); + } +} \ No newline at end of file diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Storage/File/Filesystem.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Storage/File/Filesystem.php new file mode 100644 index 00000000..fc6adcf5 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Storage/File/Filesystem.php @@ -0,0 +1,170 @@ +_fileHandle = @fopen($filename, $mode); + + if ($this->_fileHandle===false) { + ini_set('track_errors', $trackErrors); + throw new Zend_Search_Lucene_Exception($php_errormsg); + } + + ini_set('track_errors', $trackErrors); + } + + + /** + * Sets the file position indicator and advances the file pointer. + * The new position, measured in bytes from the beginning of the file, + * is obtained by adding offset to the position specified by whence, + * whose values are defined as follows: + * SEEK_SET - Set position equal to offset bytes. + * SEEK_CUR - Set position to current location plus offset. + * SEEK_END - Set position to end-of-file plus offset. (To move to + * a position before the end-of-file, you need to pass a negative value + * in offset.) + * Upon success, returns 0; otherwise, returns -1 + * + * @param integer $offset + * @param integer $whence + * @return integer + */ + public function seek($offset, $whence=SEEK_SET) + { + return fseek($this->_fileHandle, $offset, $whence); + } + + + /** + * Get file position. + * + * @return integer + */ + public function tell() + { + return ftell($this->_fileHandle); + } + + + /** + * Close File object + */ + public function close() + { + if ($this->_fileHandle !== null ) { + @fclose($this->_fileHandle); + $this->_fileHandle = null; + } + } + + /** + * Get the size of the already opened file + * + * @return integer + */ + public function size() + { + $position = ftell($this->_fileHandle); + fseek($this->_fileHandle, 0, SEEK_END); + $size = ftell($this->_fileHandle); + fseek($this->_fileHandle,$position); + + return $size; + } + + /** + * Read a $length bytes from the file and advance the file pointer. + * + * @param integer $length + * @return string + */ + protected function _fread($length=1) + { + if ($length == 0) { + return ''; + } + + if ($length < 1024) { + return fread($this->_fileHandle, $length); + } + + $data = ''; + while ( $length > 0 && ($nextBlock = fread($this->_fileHandle, $length)) != false ) { + $data .= $nextBlock; + $length -= strlen($nextBlock); + } + return $data; + } + + + /** + * Writes $length number of bytes (all, if $length===null) to the end + * of the file. + * + * @param string $data + * @param integer $length + */ + protected function _fwrite($data, $length=null) + { + if ($length === null ) { + fwrite($this->_fileHandle, $data); + } else { + fwrite($this->_fileHandle, $data, $length); + } + } +} + diff --git a/demos/quickstart/protected/index/Zend/Search/TODO.txt b/demos/quickstart/protected/index/Zend/Search/TODO.txt new file mode 100644 index 00000000..06f7b487 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/TODO.txt @@ -0,0 +1,14 @@ +@todo + +- Improve API: fix ZSearchMultiTermQuery($terms, $signs); + +- Analysis and indexing engine + +- Additional queries: phrase, wildcard, proximity, and range + +- Better class-level docblocks (most functions okay) + +- Some Windows issues(?) during indexing + +- Finish renaming classes to PEAR-like conventions + -- cgit v1.2.3