<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to version 1.0 of the Zend Framework
 * license, that is bundled with this package in the file LICENSE, and
 * is available through the world-wide-web at the following URL:
 * http://www.zend.com/license/framework/1_0.txt. If you did not receive
 * a copy of the Zend Framework license and are unable to obtain it
 * through the world-wide-web, please send a note to license@zend.com
 * so we can mail you a copy immediately.
 *
 * @package    Zend_Search_Lucene
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */


/** Zend_Search_Lucene_Exception */
require_once 'Zend/Search/Lucene/Exception.php';

/** Zend_Search_Lucene_Document */
require_once 'Zend/Search/Lucene/Document.php';

/** Zend_Search_Lucene_Storage_Directory */
require_once 'Zend/Search/Lucene/Storage/Directory/Filesystem.php';

/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';

/** Zend_Search_Lucene_Index_TermInfo */
require_once 'Zend/Search/Lucene/Index/TermInfo.php';

/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';

/** Zend_Search_Lucene_Index_FieldInfo */
require_once 'Zend/Search/Lucene/Index/FieldInfo.php';

/** Zend_Search_Lucene_Index_Writer */
require_once 'Zend/Search/Lucene/Index/Writer.php';

/** Zend_Search_Lucene_Search_QueryParser */
require_once 'Zend/Search/Lucene/Search/QueryParser.php';

/** Zend_Search_Lucene_Search_QueryHit */
require_once 'Zend/Search/Lucene/Search/QueryHit.php';

/** Zend_Search_Lucene_Search_Similarity */
require_once 'Zend/Search/Lucene/Search/Similarity.php';


/**
 * Reader/searcher facade over an on-disk index.
 *
 * The index is a list of segments; document ids are global and are assigned
 * by concatenating the segments in the order they are held in
 * $_segmentInfos. Writing is delegated to a lazily created
 * Zend_Search_Lucene_Index_Writer.
 *
 * @package    Zend_Search_Lucene
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene
{
    /**
     * File system adapter.
     *
     * @var Zend_Search_Lucene_Storage_Directory
     */
    private $_directory = null;

    /**
     * File system adapter closing option.
     * True when the directory object was created by this class (and must
     * therefore be closed by it), false when it was supplied by the caller.
     *
     * @var boolean
     */
    private $_closeDirOnExit = true;

    /**
     * Writer for this index, not instantiated unless required.
     *
     * @var Zend_Search_Lucene_Index_Writer
     */
    private $_writer = null;

    /**
     * Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index.
     * Kept as a gap-free, zero-based list.
     *
     * @var array Zend_Search_Lucene_Index_SegmentInfo
     */
    private $_segmentInfos = array();

    /**
     * Number of documents in this index (sum of all segment sizes).
     *
     * @var integer
     */
    private $_docCount = 0;


    /**
     * Opens the index.
     *
     * $directory is either a Zend_Search_Lucene_Storage_Directory_Filesystem
     * object (used as is and left open on destruction) or any other value,
     * which is handed to the Filesystem directory constructor (typically a
     * path to the index folder); in that case the directory is closed when
     * this object is destroyed.
     *
     * @param mixed   $directory
     * @param boolean $create     When true, a writer is constructed with its
     *                            second argument set to true before the
     *                            'segments' file is read (presumably this
     *                            initialises a fresh index - confirm against
     *                            Zend_Search_Lucene_Index_Writer).
     * @throws Zend_Search_Lucene_Exception  If no directory is given or the
     *                                       segments file has an unexpected
     *                                       format marker.
     */
    public function __construct($directory = null, $create = false)
    {
        if ($directory === null) {
            // Use the exception class this file actually loads and documents.
            throw new Zend_Search_Lucene_Exception('No index directory specified');
        }

        if ($directory instanceof Zend_Search_Lucene_Storage_Directory_Filesystem) {
            $this->_directory      = $directory;
            $this->_closeDirOnExit = false;
        } else {
            $this->_directory      = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory);
            $this->_closeDirOnExit = true;
        }

        if ($create) {
            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, true);
        } else {
            $this->_writer = null;
        }

        $this->_segmentInfos = array();

        $segmentsFile = $this->_directory->getFileObject('segments');

        $format = $segmentsFile->readInt();

        if ($format != (int)0xFFFFFFFF) {
            throw new Zend_Search_Lucene_Exception('Wrong segments file format');
        }

        // read version (value is not used)
        $segmentsFile->readLong();

        // read counter (value is not used)
        $segmentsFile->readInt();

        $segments = $segmentsFile->readInt();

        $this->_docCount = 0;

        // read segmentInfos
        for ($count = 0; $count < $segments; $count++) {
            $segName = $segmentsFile->readString();
            $segSize = $segmentsFile->readInt();
            $this->_docCount += $segSize;

            $this->_segmentInfos[$count] =
                new Zend_Search_Lucene_Index_SegmentInfo($segName, $segSize, $this->_directory);
        }
    }


    /**
     * Object destructor.
     *
     * Commits pending writer changes and closes the directory if it was
     * opened by this object.
     */
    public function __destruct()
    {
        $this->commit();

        if ($this->_closeDirOnExit) {
            $this->_directory->close();
        }
    }


    /**
     * Returns an instance of Zend_Search_Lucene_Index_Writer for the index,
     * creating it on first use.
     *
     * @return Zend_Search_Lucene_Index_Writer
     */
    public function getIndexWriter()
    {
        if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) {
            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory);
        }

        return $this->_writer;
    }


    /**
     * Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
     *
     * @return Zend_Search_Lucene_Storage_Directory
     */
    public function getDirectory()
    {
        return $this->_directory;
    }


    /**
     * Returns the total number of documents in this index.
     *
     * @return integer
     */
    public function count()
    {
        return $this->_docCount;
    }


    /**
     * Performs a query against the index and returns an array
     * of Zend_Search_Lucene_Search_QueryHit objects.
     * Input is a string or Zend_Search_Lucene_Search_Query.
     *
     * Pending writer changes are committed first. Every document id is
     * scored; documents with a zero score are dropped. Hits are ordered by
     * descending score, ties by ascending document id.
     *
     * @param mixed $query
     * @return array Zend_Search_Lucene_Search_QueryHit
     * @throws Zend_Search_Lucene_Exception  If $query is neither a string
     *                                       nor a query object.
     */
    public function find($query)
    {
        if (is_string($query)) {
            $query = Zend_Search_Lucene_Search_QueryParser::parse($query);
        }

        if (!$query instanceof Zend_Search_Lucene_Search_Query) {
            throw new Zend_Search_Lucene_Exception('Query must be a string or Zend_Search_Lucene_Search_Query object');
        }

        $this->commit();

        $hits   = array();
        $scores = array();
        $ids    = array();

        $docNum = $this->count();
        for ($count = 0; $count < $docNum; $count++) {
            $docScore = $query->score($count, $this);
            if ($docScore != 0) {
                $hit = new Zend_Search_Lucene_Search_QueryHit($this);
                $hit->id    = $count;
                $hit->score = $docScore;

                $hits[]   = $hit;
                $scores[] = $docScore;
                $ids[]    = $count;
            }
        }

        // The explicit id column resolves equal scores, so array_multisort
        // never has to compare the hit objects (which reference this index).
        array_multisort($scores, SORT_DESC, SORT_REGULAR,
                        $ids,    SORT_ASC,  SORT_NUMERIC,
                        $hits);

        return $hits;
    }


    /**
     * Returns a list of all unique field names that exist in this index.
     *
     * The result is the array_merge() of getFields($indexed) of every
     * segment.
     *
     * @param boolean $indexed
     * @return array
     */
    public function getFieldNames($indexed = false)
    {
        $result = array();
        foreach ($this->_segmentInfos as $segmentInfo) {
            $result = array_merge($result, $segmentInfo->getFields($indexed));
        }
        return $result;
    }


    /**
     * Returns a Zend_Search_Lucene_Document object for the document
     * number $id in this index.
     *
     * @param integer|Zend_Search_Lucene_Search_QueryHit $id
     * @return Zend_Search_Lucene_Document|null  Null when $id is outside
     *                                           the range of known documents.
     */
    public function getDocument($id)
    {
        if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
            /* @var $id Zend_Search_Lucene_Search_QueryHit */
            $id = $id->id;
        }

        if ($id < 0 || $id >= $this->_docCount) {
            /**
             * @todo exception here?
             */
            return null;
        }

        $location = $this->_locateSegment($id);
        if ($location === null) {
            return null;
        }
        list($segCount, $segmentStartId) = $location;
        $segmentInfo = $this->_segmentInfos[$segCount];

        // The .fdx file holds one 8-byte entry per document: the position of
        // the document's stored fields inside the .fdt file.
        $fdxFile = $segmentInfo->openCompoundFile('.fdx');
        $fdxFile->seek(($id - $segmentStartId) * 8, SEEK_CUR);
        $fieldValuesPosition = $fdxFile->readLong();

        $fdtFile = $segmentInfo->openCompoundFile('.fdt');
        $fdtFile->seek($fieldValuesPosition, SEEK_CUR);
        $fieldCount = $fdtFile->readVInt();

        $doc = new Zend_Search_Lucene_Document();
        for ($count = 0; $count < $fieldCount; $count++) {
            $fieldNum = $fdtFile->readVInt();
            $bits     = $fdtFile->readByte();

            $fieldInfo = $segmentInfo->getField($fieldNum);

            // Bit 0x2 selects binary vs. text payload; bit 0x1 is forwarded
            // as the last Field constructor argument (presumably the
            // "tokenized" flag - confirm against Zend_Search_Lucene_Field).
            if (!($bits & 2)) { // Text data
                $field = new Zend_Search_Lucene_Field($fieldInfo->name,
                                                      $fdtFile->readString(),
                                                      true,
                                                      $fieldInfo->isIndexed,
                                                      $bits & 1);
            } else {
                $field = new Zend_Search_Lucene_Field($fieldInfo->name,
                                                      $fdtFile->readBinary(),
                                                      true,
                                                      $fieldInfo->isIndexed,
                                                      $bits & 1);
            }

            $doc->addField($field);
        }

        return $doc;
    }


    /**
     * Returns an array of all the documents which contain term.
     *
     * Document ids are global: each segment-local id is offset by the total
     * size of the segments that precede it.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @return array
     */
    public function termDocs(Zend_Search_Lucene_Index_Term $term)
    {
        $result = array();
        $segmentStartDocId = 0;

        foreach ($this->_segmentInfos as $segInfo) {
            $termInfo = $segInfo->getTermInfo($term);

            if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
                $segmentStartDocId += $segInfo->count();
                continue;
            }

            $frqFile = $segInfo->openCompoundFile('.frq');
            $frqFile->seek($termInfo->freqPointer, SEEK_CUR);
            $docId = 0;
            for ($count = 0; $count < $termInfo->docFreq; $count++) {
                // Each entry encodes a doc id delta in the upper bits; an
                // odd value carries no separate frequency, an even value is
                // followed by one.
                $docDelta = $frqFile->readVInt();
                if ($docDelta % 2 == 1) {
                    $docId += ($docDelta - 1) / 2;
                } else {
                    $docId += $docDelta / 2;
                    // read freq (value is not needed here)
                    $frqFile->readVInt();
                }

                $result[] = $segmentStartDocId + $docId;
            }

            $segmentStartDocId += $segInfo->count();
        }

        return $result;
    }


    /**
     * Returns an array of all term positions in the documents.
     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @return array
     */
    public function termPositions(Zend_Search_Lucene_Index_Term $term)
    {
        $result = array();
        $segmentStartDocId = 0;

        foreach ($this->_segmentInfos as $segInfo) {
            $termInfo = $segInfo->getTermInfo($term);

            if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
                $segmentStartDocId += $segInfo->count();
                continue;
            }

            // First pass: per-document term frequencies from the .frq file.
            $frqFile = $segInfo->openCompoundFile('.frq');
            $frqFile->seek($termInfo->freqPointer, SEEK_CUR);
            $freqs = array();
            $docId = 0;

            for ($count = 0; $count < $termInfo->docFreq; $count++) {
                $docDelta = $frqFile->readVInt();
                if ($docDelta % 2 == 1) {
                    // Odd delta: frequency of one is implied.
                    $docId += ($docDelta - 1) / 2;
                    $freqs[$docId] = 1;
                } else {
                    $docId += $docDelta / 2;
                    $freqs[$docId] = $frqFile->readVInt();
                }
            }

            // Second pass: positions from the .prx file, stored as deltas
            // that restart from zero for every document.
            $prxFile = $segInfo->openCompoundFile('.prx');
            $prxFile->seek($termInfo->proxPointer, SEEK_CUR);
            foreach ($freqs as $docId => $freq) {
                $termPosition = 0;
                $positions = array();

                for ($count = 0; $count < $freq; $count++) {
                    $termPosition += $prxFile->readVInt();
                    $positions[] = $termPosition;
                }

                $result[$segmentStartDocId + $docId] = $positions;
            }

            $segmentStartDocId += $segInfo->count();
        }

        return $result;
    }


    /**
     * Returns the number of documents in this index containing the $term.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @return integer
     */
    public function docFreq(Zend_Search_Lucene_Index_Term $term)
    {
        $result = 0;
        foreach ($this->_segmentInfos as $segInfo) {
            $termInfo = $segInfo->getTermInfo($term);
            // Same "term present" test as termDocs()/termPositions().
            if ($termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
                $result += $termInfo->docFreq;
            }
        }

        return $result;
    }


    /**
     * Retrieve similarity used by index reader
     *
     * @return Zend_Search_Lucene_Search_Similarity
     */
    public function getSimilarity()
    {
        return Zend_Search_Lucene_Search_Similarity::getDefault();
    }


    /**
     * Returns a normalization factor for "field, document" pair.
     *
     * @param integer $id
     * @param string $fieldName
     * @return mixed  Whatever the owning segment's norm() returns
     *                (presumably a float - confirm against
     *                Zend_Search_Lucene_Index_SegmentInfo), or null when
     *                $id is outside the range of known documents.
     */
    public function norm($id, $fieldName)
    {
        if ($id < 0 || $id >= $this->_docCount) {
            return null;
        }

        $location = $this->_locateSegment($id);
        if ($location === null) {
            return null;
        }
        list($segCount, $segmentStartId) = $location;

        return $this->_segmentInfos[$segCount]->norm($id - $segmentStartId, $fieldName);
    }


    /**
     * Adds a document to this index.
     *
     * The document is handed to the writer (created on demand); it becomes
     * part of this reader's segment list on the next commit().
     *
     * @param Zend_Search_Lucene_Document $document
     */
    public function addDocument(Zend_Search_Lucene_Document $document)
    {
        if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) {
            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory);
        }

        $this->_writer->addDocument($document);
    }


    /**
     * Commit changes resulting from delete() or undeleteAll() operations.
     *
     * Applies the writer's commit result to the in-memory segment list:
     * an entry "name => SegmentInfo" is a new segment to append, an entry
     * "name => null" names a segment to drop. $_docCount is kept equal to
     * the sum of the remaining segment sizes and the list is re-indexed so
     * it stays gap-free.
     *
     * @todo delete() and undeleteAll processing.
     */
    public function commit()
    {
        if ($this->_writer === null) {
            return;
        }

        $segmentsRemoved = false;

        foreach ($this->_writer->commit() as $segmentName => $segmentInfo) {
            if ($segmentInfo !== null) {
                $this->_segmentInfos[] = $segmentInfo;
                $this->_docCount += $segmentInfo->count();
                continue;
            }

            foreach ($this->_segmentInfos as $segId => $segInfo) {
                if ($segInfo->getName() == $segmentName) {
                    // A dropped segment no longer contributes documents.
                    $this->_docCount -= $segInfo->count();
                    unset($this->_segmentInfos[$segId]);
                    $segmentsRemoved = true;
                }
            }
        }

        if ($segmentsRemoved) {
            // Close the holes left by unset() so positional access stays valid.
            $this->_segmentInfos = array_values($this->_segmentInfos);
        }
    }


    /**
     * Finds the segment that holds global document number $id.
     *
     * @param integer $id
     * @return array|null  array(segment key in $_segmentInfos, global id of
     *                     the segment's first document), or null when no
     *                     segment covers $id.
     */
    private function _locateSegment($id)
    {
        $segmentStartId = 0;

        foreach ($this->_segmentInfos as $segCount => $segmentInfo) {
            $segmentSize = $segmentInfo->count();
            if ($id < $segmentStartId + $segmentSize) {
                return array($segCount, $segmentStartId);
            }
            $segmentStartId += $segmentSize;
        }

        return null;
    }


    /*************************************************************************
    @todo UNIMPLEMENTED
    *************************************************************************/

    /**
     * Returns an array of all terms in this index.
     *
     * Not implemented: always returns an empty array.
     *
     * @todo Implementation
     * @return array
     */
    public function terms()
    {
        return array();
    }


    /**
     * Returns true if any documents have been deleted from this index.
     *
     * Not implemented: always returns false.
     *
     * @todo Implementation
     * @return boolean
     */
    public function hasDeletions()
    {
        return false;
    }


    /**
     * Deletes a document from the index. $doc may contain a Zend_Search_Lucene_Document
     * or the number of the document to delete.
     *
     * Not implemented: currently a no-op.
     *
     * @todo Implementation
     * @param mixed $doc
     */
    public function delete($doc)
    {}


    /**
     * Undeletes all documents currently marked as deleted in this index.
     *
     * Not implemented: currently a no-op.
     *
     * @todo Implementation
     */
    public function undeleteAll()
    {}
}