summaryrefslogtreecommitdiff
path: root/buildscripts/texbuilder/Zend/Search/Lucene.php
diff options
context:
space:
mode:
Diffstat (limited to 'buildscripts/texbuilder/Zend/Search/Lucene.php')
-rw-r--r--buildscripts/texbuilder/Zend/Search/Lucene.php569
1 files changed, 569 insertions, 0 deletions
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene.php b/buildscripts/texbuilder/Zend/Search/Lucene.php
new file mode 100644
index 00000000..700a8b8a
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene.php
@@ -0,0 +1,569 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/** Zend_Search_Lucene_Exception */
+require_once 'Zend/Search/Lucene/Exception.php';
+
+/** Zend_Search_Lucene_Document */
+require_once 'Zend/Search/Lucene/Document.php';
+
+/** Zend_Search_Lucene_Storage_Directory */
+require_once 'Zend/Search/Lucene/Storage/Directory/Filesystem.php';
+
+/** Zend_Search_Lucene_Index_Term */
+require_once 'Zend/Search/Lucene/Index/Term.php';
+
+/** Zend_Search_Lucene_Index_TermInfo */
+require_once 'Zend/Search/Lucene/Index/TermInfo.php';
+
+/** Zend_Search_Lucene_Index_SegmentInfo */
+require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
+
+/** Zend_Search_Lucene_Index_FieldInfo */
+require_once 'Zend/Search/Lucene/Index/FieldInfo.php';
+
+/** Zend_Search_Lucene_Index_Writer */
+require_once 'Zend/Search/Lucene/Index/Writer.php';
+
+/** Zend_Search_Lucene_Search_QueryParser */
+require_once 'Zend/Search/Lucene/Search/QueryParser.php';
+
+/** Zend_Search_Lucene_Search_QueryHit */
+require_once 'Zend/Search/Lucene/Search/QueryHit.php';
+
+/** Zend_Search_Lucene_Search_Similarity */
+require_once 'Zend/Search/Lucene/Search/Similarity.php';
+
+
+/**
+ * @package Zend_Search_Lucene
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+class Zend_Search_Lucene
+{
+ /**
+ * File system adapter.
+ *
+ * Set by the constructor: either the directory object passed in, or a
+ * Zend_Search_Lucene_Storage_Directory_Filesystem built from a path.
+ *
+ * @var Zend_Search_Lucene_Storage_Directory
+ */
+ private $_directory = null;
+
+ /**
+ * File system adapter closing option
+ *
+ * When true the destructor calls close() on the directory. The constructor
+ * sets it to false if the caller supplied a ready-made directory object.
+ *
+ * @var boolean
+ */
+ private $_closeDirOnExit = true;
+
+ /**
+ * Writer for this index, not instantiated unless required.
+ *
+ * Stays null until the index is opened in create mode or a writer is
+ * requested through getIndexWriter() / addDocument().
+ *
+ * @var Zend_Search_Lucene_Index_Writer
+ */
+ private $_writer = null;
+
+ /**
+ * Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index.
+ *
+ * Filled from the 'segments' file by the constructor and updated by commit().
+ *
+ * @var array Zend_Search_Lucene_Index_SegmentInfo
+ */
+ private $_segmentInfos = array();
+
+ /**
+ * Number of documents in this index.
+ *
+ * Maintained as the sum of the sizes of the known segments.
+ *
+ * @var integer
+ */
+ private $_docCount = 0;
+
+
+    /**
+     * Opens the index.
+     *
+     * $directory is either a string with the path to the index folder or a
+     * Zend_Search_Lucene_Storage_Directory_Filesystem object. A directory
+     * object supplied by the caller is left open by the destructor; one
+     * created here from a path is closed by it.
+     *
+     * After the directory is resolved, the 'segments' file is read to build
+     * the list of segments and the total document count.
+     *
+     * @param mixed   $directory path string or Zend_Search_Lucene_Storage_Directory_Filesystem
+     * @param boolean $create    when true, an index writer is instantiated with its
+     *                           second argument set to true before the segments file is
+     *                           read (presumably this initialises a new index - confirm
+     *                           against Zend_Search_Lucene_Index_Writer)
+     * @throws Zend_Search_Lucene_Exception if no directory is given or the segments
+     *                                      file does not start with the expected format marker
+     */
+    public function __construct($directory = null, $create = false)
+    {
+        if ($directory === null) {
+            // Use the exception class this file loads and documents in @throws.
+            throw new Zend_Search_Lucene_Exception('No index directory specified');
+        }
+
+        if ($directory instanceof Zend_Search_Lucene_Storage_Directory_Filesystem) {
+            // Caller owns the directory object: do not close it on destruction.
+            $this->_directory      = $directory;
+            $this->_closeDirOnExit = false;
+        } else {
+            $this->_directory      = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory);
+            $this->_closeDirOnExit = true;
+        }
+
+        if ($create) {
+            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, true);
+        } else {
+            $this->_writer = null;
+        }
+
+        $this->_segmentInfos = array();
+
+        $segmentsFile = $this->_directory->getFileObject('segments');
+
+        $format = $segmentsFile->readInt();
+
+        if ($format != (int)0xFFFFFFFF) {
+            throw new Zend_Search_Lucene_Exception('Wrong segments file format');
+        }
+
+        // read version (value is not used)
+        $segmentsFile->readLong();
+
+        // read counter (value is not used)
+        $segmentsFile->readInt();
+
+        $segments = $segmentsFile->readInt();
+
+        $this->_docCount = 0;
+
+        // read segmentInfos: one (name, size) pair per segment
+        for ($count = 0; $count < $segments; $count++) {
+            $segName = $segmentsFile->readString();
+            $segSize = $segmentsFile->readInt();
+            $this->_docCount += $segSize;
+
+            $this->_segmentInfos[$count] =
+                                new Zend_Search_Lucene_Index_SegmentInfo($segName,
+                                                                         $segSize,
+                                                                         $this->_directory);
+        }
+    }
+
+
+    /**
+     * Object destructor.
+     *
+     * Commits any pending writer changes, then closes the directory unless
+     * it was supplied by the caller as an object.
+     */
+    public function __destruct()
+    {
+        $this->commit();
+
+        if (!$this->_closeDirOnExit) {
+            return;
+        }
+
+        $this->_directory->close();
+    }
+
+    /**
+     * Returns the Zend_Search_Lucene_Index_Writer for this index, creating
+     * it on first use.
+     *
+     * @return Zend_Search_Lucene_Index_Writer
+     */
+    public function getIndexWriter()
+    {
+        if ($this->_writer instanceof Zend_Search_Lucene_Index_Writer) {
+            return $this->_writer;
+        }
+
+        // No writer yet: create one bound to this index's directory.
+        $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory);
+
+        return $this->_writer;
+    }
+
+
+ /**
+ * Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
+ *
+ * This is the object stored by the constructor; it is returned as is.
+ *
+ * @return Zend_Search_Lucene_Storage_Directory
+ */
+ public function getDirectory()
+ {
+ return $this->_directory;
+ }
+
+
+ /**
+ * Returns the total number of documents in this index.
+ *
+ * The value is the cached counter built from the segment sizes; pending
+ * writer changes are only reflected after commit() has run.
+ *
+ * @return integer
+ */
+ public function count()
+ {
+ return $this->_docCount;
+ }
+
+
+    /**
+     * Performs a query against the index and returns an array
+     * of Zend_Search_Lucene_Search_QueryHit objects, best score first.
+     * Input is a string or Zend_Search_Lucene_Search_Query.
+     *
+     * A string is parsed with Zend_Search_Lucene_Search_QueryParser. Pending
+     * writer changes are committed before scoring. Every document id in the
+     * index is scored; documents whose score is 0 are left out. Hits with
+     * equal scores are ordered by ascending document id.
+     *
+     * @param mixed $query string or Zend_Search_Lucene_Search_Query
+     * @return array Zend_Search_Lucene_Search_QueryHit
+     * @throws Zend_Search_Lucene_Exception if $query is neither a string nor a query object
+     */
+    public function find($query)
+    {
+        if (is_string($query)) {
+            $query = Zend_Search_Lucene_Search_QueryParser::parse($query);
+        }
+
+        if (!$query instanceof Zend_Search_Lucene_Search_Query) {
+            throw new Zend_Search_Lucene_Exception('Query must be a string or Zend_Search_Lucene_Search_Query object');
+        }
+
+        // Merge pending writer segments into this reader before scoring.
+        $this->commit();
+
+        $hits   = array();
+        $scores = array();
+        $ids    = array();
+
+        $docNum = $this->count();
+        for ($id = 0; $id < $docNum; $id++) {
+            $docScore = $query->score($id, $this);
+            if ($docScore != 0) {
+                $hit = new Zend_Search_Lucene_Search_QueryHit($this);
+                $hit->id    = $id;
+                $hit->score = $docScore;
+
+                $hits[]   = $hit;
+                $scores[] = $docScore;
+                $ids[]    = $id;
+            }
+        }
+
+        // Sort by score (descending) and break ties on the document id.
+        // The ids are unique, so array_multisort never has to fall back to
+        // comparing the QueryHit objects themselves, which reference the index.
+        array_multisort($scores, SORT_DESC, SORT_REGULAR,
+                        $ids,    SORT_ASC,  SORT_NUMERIC,
+                        $hits);
+
+        return $hits;
+    }
+
+
+    /**
+     * Returns the field names that exist in this index.
+     *
+     * The per-segment results of getFields($indexed) are combined with
+     * array_merge(), so names are only de-duplicated if getFields() returns
+     * them under string keys (presumably keyed by field name - verify against
+     * Zend_Search_Lucene_Index_SegmentInfo).
+     *
+     * @param boolean $indexed passed through to each segment's getFields()
+     * @return array
+     */
+    public function getFieldNames($indexed = false)
+    {
+        $names = array();
+
+        foreach ($this->_segmentInfos as $segment) {
+            $segmentFields = $segment->getFields($indexed);
+            $names         = array_merge($names, $segmentFields);
+        }
+
+        return $names;
+    }
+
+
+    /**
+     * Returns a Zend_Search_Lucene_Document object for the document
+     * number $id in this index.
+     *
+     * A QueryHit may be passed instead of a number; its id property is used.
+     * The segment holding the document is found by walking the segment list
+     * and accumulating segment sizes. The stored fields are then read from
+     * that segment's '.fdx' and '.fdt' files.
+     *
+     * @param integer|Zend_Search_Lucene_Search_QueryHit $id
+     * @return Zend_Search_Lucene_Document|null null when $id is outside the index
+     */
+    public function getDocument($id)
+    {
+        if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
+            /* @var $id Zend_Search_Lucene_Search_QueryHit */
+            $id = $id->id;
+        }
+
+        if ($id < 0 || $id >= $this->_docCount) {
+            /**
+             * @todo exception here?
+             */
+            return null;
+        }
+
+        // Locate the segment containing $id. Iterating (rather than indexing
+        // from 0) keeps this correct even if the array keys have gaps.
+        $segmentInfo    = null;
+        $segmentStartId = 0;
+        foreach ($this->_segmentInfos as $candidate) {
+            $segmentSize = $candidate->count();
+            if ($id < $segmentStartId + $segmentSize) {
+                $segmentInfo = $candidate;
+                break;
+            }
+            $segmentStartId += $segmentSize;
+        }
+
+        if ($segmentInfo === null) {
+            // Document counter and segment list disagree; treat as "not found".
+            return null;
+        }
+
+        // The code reads one long per document, 8 bytes apart, from '.fdx';
+        // that value is used as the offset of the document's data in '.fdt'.
+        $fdxFile = $segmentInfo->openCompoundFile('.fdx');
+        $fdxFile->seek(($id - $segmentStartId) * 8, SEEK_CUR);
+        $fieldValuesPosition = $fdxFile->readLong();
+
+        $fdtFile = $segmentInfo->openCompoundFile('.fdt');
+        $fdtFile->seek($fieldValuesPosition, SEEK_CUR);
+        $fieldCount = $fdtFile->readVInt();
+
+        $doc = new Zend_Search_Lucene_Document();
+        for ($count = 0; $count < $fieldCount; $count++) {
+            $fieldNum = $fdtFile->readVInt();
+            $bits     = $fdtFile->readByte();
+
+            $fieldInfo = $segmentInfo->getField($fieldNum);
+
+            // Bit value 2 clear => text data, set => binary data.
+            if (!($bits & 2)) {
+                $value = $fdtFile->readString();
+            } else {
+                $value = $fdtFile->readBinary();
+            }
+
+            // Bit value 1 is forwarded as the last constructor argument
+            // (presumably the "tokenized" flag - confirm against Zend_Search_Lucene_Field).
+            $field = new Zend_Search_Lucene_Field($fieldInfo->name,
+                                                  $value,
+                                                  true,
+                                                  $fieldInfo->isIndexed,
+                                                  $bits & 1);
+
+            $doc->addField($field);
+        }
+
+        return $doc;
+    }
+
+
+    /**
+     * Returns an array of the ids of all documents which contain $term.
+     *
+     * Each segment is asked for the term's info; segments that do not know
+     * the term are skipped. Document ids are decoded from the segment's
+     * '.frq' file and offset by the number of documents in the preceding
+     * segments.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return array
+     */
+    public function termDocs(Zend_Search_Lucene_Index_Term $term)
+    {
+        $docIds      = array();
+        $segmentBase = 0;
+
+        foreach ($this->_segmentInfos as $segment) {
+            $info = $segment->getTermInfo($term);
+
+            if ($info instanceof Zend_Search_Lucene_Index_TermInfo) {
+                $frq = $segment->openCompoundFile('.frq');
+                $frq->seek($info->freqPointer, SEEK_CUR);
+
+                $localId = 0;
+                for ($i = 0; $i < $info->docFreq; $i++) {
+                    $delta = $frq->readVInt();
+
+                    if ($delta % 2 != 1) {
+                        // Even value: doc delta is value/2 and a separate
+                        // frequency entry follows; read it and discard it.
+                        $localId += $delta / 2;
+                        $frq->readVInt();
+                    } else {
+                        // Odd value: doc delta is (value-1)/2, nothing follows.
+                        $localId += ($delta - 1) / 2;
+                    }
+
+                    $docIds[] = $segmentBase + $localId;
+                }
+            }
+
+            // Next segment's ids start after this segment's documents.
+            $segmentBase += $segment->count();
+        }
+
+        return $docIds;
+    }
+
+
+    /**
+     * Returns an array of all positions of $term in the documents.
+     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
+     *
+     * For every segment that knows the term, the per-document frequencies
+     * are decoded from '.frq' first; then that many position deltas per
+     * document are read from '.prx' and accumulated into positions.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return array
+     */
+    public function termPositions(Zend_Search_Lucene_Index_Term $term)
+    {
+        $positionsByDoc = array();
+        $segmentBase    = 0;
+
+        foreach ($this->_segmentInfos as $segment) {
+            $info = $segment->getTermInfo($term);
+
+            if ($info instanceof Zend_Search_Lucene_Index_TermInfo) {
+                // Step 1: segment-local doc id => term frequency.
+                $frq = $segment->openCompoundFile('.frq');
+                $frq->seek($info->freqPointer, SEEK_CUR);
+
+                $freqByDoc = array();
+                $localId   = 0;
+                for ($i = 0; $i < $info->docFreq; $i++) {
+                    $delta = $frq->readVInt();
+
+                    if ($delta % 2 != 1) {
+                        // Even value: explicit frequency follows.
+                        $localId += $delta / 2;
+                        $freqByDoc[$localId] = $frq->readVInt();
+                    } else {
+                        // Odd value: frequency is 1.
+                        $localId += ($delta - 1) / 2;
+                        $freqByDoc[$localId] = 1;
+                    }
+                }
+
+                // Step 2: read $freq position deltas for each document.
+                $prx = $segment->openCompoundFile('.prx');
+                $prx->seek($info->proxPointer, SEEK_CUR);
+
+                foreach ($freqByDoc as $docInSegment => $freq) {
+                    $position  = 0;
+                    $positions = array();
+
+                    for ($i = 0; $i < $freq; $i++) {
+                        $position   += $prx->readVInt();
+                        $positions[] = $position;
+                    }
+
+                    $positionsByDoc[$segmentBase + $docInSegment] = $positions;
+                }
+            }
+
+            // Next segment's ids start after this segment's documents.
+            $segmentBase += $segment->count();
+        }
+
+        return $positionsByDoc;
+    }
+
+
+    /**
+     * Returns the number of documents in this index containing the $term.
+     *
+     * The result is the sum of the term's docFreq over all segments that
+     * return term info for it.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return integer
+     */
+    public function docFreq(Zend_Search_Lucene_Index_Term $term)
+    {
+        $total = 0;
+
+        foreach ($this->_segmentInfos as $segment) {
+            $info = $segment->getTermInfo($term);
+
+            if ($info === null) {
+                // Term does not occur in this segment.
+                continue;
+            }
+
+            $total += $info->docFreq;
+        }
+
+        return $total;
+    }
+
+
+ /**
+ * Retrieve the similarity used by the index reader.
+ *
+ * Always returns the default similarity obtained from
+ * Zend_Search_Lucene_Search_Similarity::getDefault().
+ *
+ * @return Zend_Search_Lucene_Search_Similarity
+ */
+ public function getSimilarity()
+ {
+ return Zend_Search_Lucene_Search_Similarity::getDefault();
+ }
+
+
+    /**
+     * Returns a normalization factor for "field, document" pair.
+     *
+     * The segment holding document $id is found by walking the segment list
+     * and accumulating segment sizes; the call is then delegated to that
+     * segment's norm() with the segment-local document number.
+     *
+     * @param integer $id
+     * @param string $fieldName
+     * @return mixed value returned by the segment's norm() (presumably a float -
+     *               confirm against Zend_Search_Lucene_Index_SegmentInfo), or null
+     *               when $id is outside the index
+     */
+    public function norm( $id, $fieldName )
+    {
+        if ($id < 0 || $id >= $this->_docCount) {
+            return null;
+        }
+
+        // Iterating (rather than indexing from 0) keeps the lookup correct
+        // even if the keys of the segment list have gaps.
+        $segmentStartId = 0;
+        foreach ($this->_segmentInfos as $segmentInfo) {
+            $segmentSize = $segmentInfo->count();
+            if ($id < $segmentStartId + $segmentSize) {
+                return $segmentInfo->norm($id - $segmentStartId, $fieldName);
+            }
+            $segmentStartId += $segmentSize;
+        }
+
+        // Document counter and segment list disagree; treat as "not found".
+        return null;
+    }
+
+
+    /**
+     * Adds a document to this index.
+     *
+     * The document is handed to the index writer, which is created on first
+     * use. It becomes visible to this reader after commit().
+     *
+     * @param Zend_Search_Lucene_Document $document
+     */
+    public function addDocument(Zend_Search_Lucene_Document $document)
+    {
+        // getIndexWriter() performs the same lazy writer creation.
+        $this->getIndexWriter()->addDocument($document);
+    }
+
+
+    /**
+     * Commits pending index writer changes and brings this reader's segment
+     * list and document counter in line with them.
+     *
+     * The writer's commit() result is read as a map of segment name to
+     * segment info: a non-null info is a segment to add, a null entry names
+     * a segment to drop. Does nothing when no writer has been created.
+     *
+     * @todo delete() and undeleteAll processing.
+     */
+    public function commit()
+    {
+        if ($this->_writer === null) {
+            return;
+        }
+
+        $segmentsRemoved = false;
+
+        foreach ($this->_writer->commit() as $segmentName => $segmentInfo) {
+            if ($segmentInfo !== null) {
+                $this->_segmentInfos[] = $segmentInfo;
+                $this->_docCount      += $segmentInfo->count();
+                continue;
+            }
+
+            foreach ($this->_segmentInfos as $segId => $segInfo) {
+                if ($segInfo->getName() == $segmentName) {
+                    // A dropped segment's documents no longer belong to the index.
+                    $this->_docCount -= $segInfo->count();
+                    unset($this->_segmentInfos[$segId]);
+                    $segmentsRemoved = true;
+                }
+            }
+        }
+
+        if ($segmentsRemoved) {
+            // Keep keys contiguous from 0; order (and thus doc numbering) is preserved.
+            $this->_segmentInfos = array_values($this->_segmentInfos);
+        }
+    }
+
+
+ /*************************************************************************
+ @todo UNIMPLEMENTED
+ *************************************************************************/
+
+ /**
+ * Returns an array of all terms in this index.
+ *
+ * Not implemented yet: currently always returns an empty array.
+ *
+ * @todo Implementation
+ * @return array
+ */
+ public function terms()
+ {
+ return array();
+ }
+
+
+ /**
+ * Returns true if any documents have been deleted from this index.
+ *
+ * Not implemented yet: currently always returns false.
+ *
+ * @todo Implementation
+ * @return boolean
+ */
+ public function hasDeletions()
+ {
+ return false;
+ }
+
+
+ /**
+ * Deletes a document from the index. $doc may contain a Zend_Search_Lucene_Document
+ * or the number of the document to delete.
+ *
+ * Not implemented yet: the method body is empty and the call has no effect.
+ *
+ * @todo Implementation
+ * @param mixed $doc
+ */
+ public function delete($doc)
+ {}
+
+
+ /**
+ * Undeletes all documents currently marked as deleted in this index.
+ *
+ * Not implemented yet: the method body is empty and the call has no effect.
+ *
+ * @todo Implementation
+ */
+ public function undeleteAll()
+ {}
+} \ No newline at end of file