summaryrefslogtreecommitdiff
path: root/buildscripts/texbuilder/Zend/Search/Lucene/Index
diff options
context:
space:
mode:
authorwei <>2006-05-07 03:34:25 +0000
committerwei <>2006-05-07 03:34:25 +0000
commit30eddf57c8de433e8ea02b9e552c8e1744a505a7 (patch)
tree9e81f3a15f9a695cb96c5cc4dd80de5a3a0bb7b2 /buildscripts/texbuilder/Zend/Search/Lucene/Index
parent0bb2822f68dfe3cf568affd4acf0d8120d9d53c7 (diff)
Add search to quickstart demo.
Diffstat (limited to 'buildscripts/texbuilder/Zend/Search/Lucene/Index')
-rw-r--r--buildscripts/texbuilder/Zend/Search/Lucene/Index/FieldInfo.php43
-rw-r--r--buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentInfo.php412
-rw-r--r--buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php491
-rw-r--r--buildscripts/texbuilder/Zend/Search/Lucene/Index/Term.php70
-rw-r--r--buildscripts/texbuilder/Zend/Search/Lucene/Index/TermInfo.php77
-rw-r--r--buildscripts/texbuilder/Zend/Search/Lucene/Index/Writer.php308
6 files changed, 1401 insertions, 0 deletions
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/FieldInfo.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/FieldInfo.php
new file mode 100644
index 00000000..eaca4ecf
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/FieldInfo.php
@@ -0,0 +1,43 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/**
+ * Plain record describing one field of an index segment.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+class Zend_Search_Lucene_Index_FieldInfo
+{
+    /**
+     * Field name
+     *
+     * @var string
+     */
+    public $name;
+
+    /**
+     * Truthy when the field is indexed
+     *
+     * @var mixed
+     */
+    public $isIndexed;
+
+    /**
+     * Number of the field within its segment
+     *
+     * @var integer
+     */
+    public $number;
+
+    /**
+     * Truthy when term vectors are stored for the field
+     *
+     * @var mixed
+     */
+    public $storeTermVector;
+
+    /**
+     * Stores the given values as they are; nothing is validated or converted.
+     *
+     * @param string  $name
+     * @param mixed   $isIndexed
+     * @param integer $number
+     * @param mixed   $storeTermVector
+     */
+    public function __construct( $name, $isIndexed, $number, $storeTermVector )
+    {
+        $this->number          = $number;
+        $this->name            = $name;
+        $this->storeTermVector = $storeTermVector;
+        $this->isIndexed       = $isIndexed;
+    }
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentInfo.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentInfo.php
new file mode 100644
index 00000000..f5c596a0
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentInfo.php
@@ -0,0 +1,412 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/** Zend_Search_Lucene_Exception */
+require_once 'Zend/Search/Lucene/Exception.php';
+
+
+/**
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+class Zend_Search_Lucene_Index_SegmentInfo
+{
+ /**
+ * Number of docs in a segment
+ *
+ * @var integer
+ */
+ private $_docCount;
+
+ /**
+ * Segment name
+ *
+ * @var string
+ */
+ private $_name;
+
+ /**
+ * Term Dictionary Index
+ * Array of the Zend_Search_Lucene_Index_Term objects
+ * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
+ *
+ * @var array
+ */
+ private $_termDictionary;
+
+ /**
+ * Term Dictionary Index TermInfos
+ * Array of the Zend_Search_Lucene_Index_TermInfo objects
+ *
+ * @var array
+ */
+ private $_termDictionaryInfos;
+
+ /**
+ * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
+ *
+ * @var array
+ */
+ private $_fields;
+
+ /**
+ * Field positions in a dictionary.
+ * (Term dictionary contains fields ordered by names)
+ *
+ * @var array
+ */
+ private $_fieldsDicPositions;
+
+
+ /**
+ * Associative array where the key is the file name and the value is data offset
+ * in a compound segment file (.cfs).
+ *
+ * @var array
+ */
+ private $_segFiles;
+
+ /**
+ * File system adapter.
+ *
+ * @var Zend_Search_Lucene_Storage_Directory_Filesystem
+ */
+ private $_directory;
+
+ /**
+ * Normalization factors.
+ * An array fieldNum => normVector
+ * normVector is a binary string.
+ * Each byte corresponds to an indexed document in a segment and
+ * encodes normalization factor (float value, encoded by
+ * Zend_Search_Lucene_Search_Similarity::encodeNorm())
+ *
+ * @var array
+ */
+ private $_norms = array();
+
+    /**
+     * Opens the segment's compound file, reads its file table and the field
+     * infos (.fnm) and precomputes the position of every field in the term
+     * dictionary ordering.
+     *
+     * @param string $name       Segment name; the compound file is "<name>.cfs"
+     * @param integer $docCount  Number of documents in the segment
+     * @param Zend_Search_Lucene_Storage_Directory $directory
+     * @throws Zend_Search_Lucene_Exception if the compound file has no .fnm entry
+     */
+    public function __construct($name, $docCount, $directory)
+    {
+        $this->_name = $name;
+        $this->_docCount = $docCount;
+        $this->_directory = $directory;
+        // null means "term dictionary index not loaded yet" (see _loadDictionary())
+        $this->_termDictionary = null;
+
+        // Compound file table: entry count followed by (data offset, file name) pairs
+        $this->_segFiles = array();
+        $cfsFile = $this->_directory->getFileObject($name . '.cfs');
+        $segFilesCount = $cfsFile->readVInt();
+
+        for ($count = 0; $count < $segFilesCount; $count++) {
+            $dataOffset = $cfsFile->readLong();
+            $fileName = $cfsFile->readString();
+            $this->_segFiles[$fileName] = $dataOffset;
+        }
+
+        // Field infos: field count followed by (name, flag byte) pairs
+        $fnmFile = $this->openCompoundFile('.fnm');
+        $fieldsCount = $fnmFile->readVInt();
+        $fieldNames = array();
+        $fieldNums = array();
+        $this->_fields = array();
+        for ($count=0; $count < $fieldsCount; $count++) {
+            $fieldName = $fnmFile->readString();
+            $fieldBits = $fnmFile->readByte();
+            // bit 0x01 - field is indexed, bit 0x02 - term vector is stored
+            $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
+                                                                           $fieldBits & 1,
+                                                                           $count,
+                                                                           $fieldBits & 2 );
+            if ($fieldBits & 0x10) {
+                // norms are omitted for the indexed field,
+                // so every document gets the encoded norm of 1.0
+                $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
+            }
+
+            $fieldNums[$count] = $count;
+            $fieldNames[$count] = $fieldName;
+        }
+
+        // Order the field numbers by field name to get each field's position in
+        // the term dictionary. Names must be compared as plain strings:
+        // SORT_REGULAR would compare numeric-looking names ("9", "10") as
+        // numbers and disagree with the string ordering of the dictionary.
+        array_multisort($fieldNames, SORT_ASC, SORT_STRING, $fieldNums);
+        $this->_fieldsDicPositions = array_flip($fieldNums);
+    }
+
+    /**
+     * Opens an index file stored within the compound index file.
+     *
+     * The returned object is the compound (.cfs) file itself, positioned at
+     * the offset recorded for the requested member file.
+     *
+     * @param string $extension  Member file extension including the dot, e.g. '.fnm'
+     * @throws Zend_Search_Lucene_Exception if the compound file has no such member
+     * @return Zend_Search_Lucene_Storage_File
+     */
+    public function openCompoundFile($extension)
+    {
+        $memberName = $this->_name . $extension;
+        $isKnown    = isset($this->_segFiles[$memberName]);
+
+        if (!$isKnown) {
+            throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
+                                                 . $memberName . ' file.');
+        }
+
+        $compound = $this->_directory->getFileObject($this->_name . '.cfs');
+        $compound->seek($this->_segFiles[$memberName]);
+
+        return $compound;
+    }
+
+    /**
+     * Looks up the number of the field with the given name.
+     *
+     * @param string $fieldName
+     * @return integer  Field number, or -1 if the segment has no such field
+     */
+    public function getFieldNum($fieldName)
+    {
+        foreach ($this->_fields as $fieldInfo) {
+            if ($fieldInfo->name != $fieldName) {
+                continue;
+            }
+
+            return $fieldInfo->number;
+        }
+
+        // No field carries that name
+        return -1;
+    }
+
+    /**
+     * Returns the field info stored under the given field number.
+     *
+     * @param integer $fieldNum
+     * @return Zend_Search_Lucene_Index_FieldInfo
+     */
+    public function getField($fieldNum)
+    {
+        $fieldInfo = $this->_fields[$fieldNum];
+
+        return $fieldInfo;
+    }
+
+    /**
+     * Returns the names of the segment's fields as an array name => name.
+     * When $indexed is true, only indexed fields are listed.
+     *
+     * @param boolean $indexed
+     * @return array
+     */
+    public function getFields($indexed = false)
+    {
+        $names = array();
+
+        foreach ($this->_fields as $fieldInfo) {
+            // Skip non-indexed fields when only indexed ones were requested
+            if ($indexed && !$fieldInfo->isIndexed) {
+                continue;
+            }
+
+            $names[$fieldInfo->name] = $fieldInfo->name;
+        }
+
+        return $names;
+    }
+
+    /**
+     * Reports how many documents this segment holds, as passed to the constructor.
+     *
+     * @return integer
+     */
+    public function count()
+    {
+        $documents = $this->_docCount;
+
+        return $documents;
+    }
+
+
+    /**
+     * Lazily reads the term dictionary index (.tii) into memory.
+     *
+     * Fills $_termDictionary with the index terms and $_termDictionaryInfos
+     * with the matching term infos. Does nothing when the dictionary has
+     * already been loaded.
+     *
+     * @throws Zend_Search_Lucene_Exception if the .tii member is missing or has an unexpected version
+     */
+    protected function _loadDictionary()
+    {
+        if ($this->_termDictionary !== null) {
+            // Already loaded
+            return;
+        }
+
+        $this->_termDictionary      = array();
+        $this->_termDictionaryInfos = array();
+
+        $tiiFile = $this->openCompoundFile('.tii');
+        if ($tiiFile->readInt() != (int)0xFFFFFFFE) {
+            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
+        }
+
+        $entryCount = $tiiFile->readLong();
+        $tiiFile->readInt(); // IndexInterval, not needed here
+        $skipInterval = $tiiFile->readInt();
+
+        // Term text is prefix-compressed against the previous entry and the
+        // three pointers are stored as deltas, so running values are kept.
+        $lastText = '';
+        $freqPos  = 0;
+        $proxPos  = 0;
+        $tisPos   = 0;
+
+        for ($entry = 0; $entry < $entryCount; $entry++) {
+            $sharedLength = $tiiFile->readVInt();
+            $lastText     = substr($lastText, 0, $sharedLength) . $tiiFile->readString();
+
+            $fieldNum = $tiiFile->readVInt();
+            $docFreq  = $tiiFile->readVInt();
+            $freqPos  = $freqPos + $tiiFile->readVInt();
+            $proxPos  = $proxPos + $tiiFile->readVInt();
+
+            // The skip delta is only present for terms with enough documents
+            $skipDelta = ($docFreq >= $skipInterval) ? $tiiFile->readVInt() : 0;
+
+            $tisPos = $tisPos + $tiiFile->readVInt();
+
+            $this->_termDictionary[]      = new Zend_Search_Lucene_Index_Term($lastText, $fieldNum);
+            $this->_termDictionaryInfos[] = new Zend_Search_Lucene_Index_TermInfo(
+                $docFreq, $freqPos, $proxPos, $skipDelta, $tisPos
+            );
+        }
+    }
+
+
+    /**
+     * Gives the segment name this object was constructed with.
+     *
+     * @return string
+     */
+    public function getName()
+    {
+        $segmentName = $this->_name;
+
+        return $segmentName;
+    }
+
+
+ /**
+ * Scans terms dictionary and returns term info
+ *
+ * The in-memory term index (.tii) is binary-searched first; if the term is
+ * not an index term itself, the term dictionary (.tis) is scanned
+ * sequentially, starting right after the closest preceding index term.
+ *
+ * @param Zend_Search_Lucene_Index_Term $term Term to look up; its field is a field name
+ * @throws Zend_Search_Lucene_Exception on a missing or wrongly versioned .tii/.tis file
+ * @return Zend_Search_Lucene_Index_TermInfo Term info, or null if the term is not in this segment
+ */
+ public function getTermInfo($term)
+ {
+ $this->_loadDictionary();
+
+ // Dictionary terms carry field numbers, so translate the field name first
+ $searchField = $this->getFieldNum($term->field);
+
+ if ($searchField == -1) {
+ // The segment has no such field, hence no such term
+ return null;
+ }
+ // Position of the searched field in the dictionary's field ordering
+ $searchDicField = $this->_fieldsDicPositions[$searchField];
+
+ // search for appropriate value in dictionary
+ // (binary search; terms compare by field dictionary position, then by strcmp() of the text)
+ $lowIndex = 0;
+ $highIndex = count($this->_termDictionary)-1;
+ while ($highIndex >= $lowIndex) {
+ // $mid = ($highIndex - $lowIndex)/2;
+ $mid = ($highIndex + $lowIndex) >> 1;
+ $midTerm = $this->_termDictionary[$mid];
+
+ $delta = $searchDicField - $this->_fieldsDicPositions[$midTerm->field];
+ if ($delta == 0) {
+ $delta = strcmp($term->text, $midTerm->text);
+ }
+
+ if ($delta < 0) {
+ $highIndex = $mid-1;
+ } elseif ($delta > 0) {
+ $lowIndex = $mid+1;
+ } else {
+ return $this->_termDictionaryInfos[$mid]; // We got it!
+ }
+ }
+
+ // No exact hit: $highIndex is now the last index term sorting before the
+ // searched term, or -1 if every index term sorts after it
+ if ($highIndex == -1) {
+ // Term is out of the dictionary range
+ return null;
+ }
+
+ $prevPosition = $highIndex;
+ $prevTerm = $this->_termDictionary[$prevPosition];
+ $prevTermInfo = $this->_termDictionaryInfos[ $prevPosition ];
+
+ // .tis header: version (int), term count (long), index interval (int), skip interval (int)
+ $tisFile = $this->openCompoundFile('.tis');
+ $tiVersion = $tisFile->readInt();
+ if ($tiVersion != (int)0xFFFFFFFE) {
+ throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
+ }
+
+ $termCount = $tisFile->readLong();
+ $indexInterval = $tisFile->readInt();
+ $skipInterval = $tisFile->readInt();
+
+ // Relative seek to the entry following the index term; the header read above is
+ // subtracted from the pointer.
+ // NOTE(review): assumes indexPointer counts from the start of the .tis data - confirm
+ $tisFile->seek($prevTermInfo->indexPointer - 20 /* header size*/, SEEK_CUR);
+
+ // Running state of the scan, seeded from the index term
+ $termValue = $prevTerm->text;
+ $termFieldNum = $prevTerm->field;
+ $freqPointer = $prevTermInfo->freqPointer;
+ $proxPointer = $prevTermInfo->proxPointer;
+ // Read entries while the current term still sorts before the searched one
+ for ($count = $prevPosition*$indexInterval + 1;
+ $count < $termCount &&
+ ( $this->_fieldsDicPositions[ $termFieldNum ] < $searchDicField ||
+ ($this->_fieldsDicPositions[ $termFieldNum ] == $searchDicField &&
+ strcmp($termValue, $term->text) < 0) );
+ $count++) {
+ $termPrefixLength = $tisFile->readVInt();
+ $termSuffix = $tisFile->readString();
+ $termFieldNum = $tisFile->readVInt();
+ // Term text is stored as a shared-prefix length plus suffix relative to the previous term
+ $termValue = substr( $termValue, 0, $termPrefixLength ) . $termSuffix;
+
+ $docFreq = $tisFile->readVInt();
+ // Pointers are stored as deltas against the previous entry
+ $freqPointer += $tisFile->readVInt();
+ $proxPointer += $tisFile->readVInt();
+ if( $docFreq >= $skipInterval ) {
+ $skipOffset = $tisFile->readVInt();
+ } else {
+ $skipOffset = 0;
+ }
+ }
+
+ // The scan stopped at the first term not sorting before the searched one; it is a hit only if it is equal
+ if ($termFieldNum == $searchField && $termValue == $term->text) {
+ return new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
+ } else {
+ return null;
+ }
+ }
+
+    /**
+     * Returns the normalization factor of a field for the specified document.
+     *
+     * The norm vector of a field is read from the compound file on first use
+     * and cached in $_norms afterwards.
+     *
+     * @param integer $id        Document number within the segment
+     * @param string $fieldName
+     * @throws Zend_Search_Lucene_Exception if the norm file of the field is missing
+     * @return mixed  Result of Zend_Search_Lucene_Search_Similarity::decodeNorm(),
+     *                or null if the field is unknown or not indexed
+     */
+    public function norm($id, $fieldName)
+    {
+        $fieldNum = $this->getFieldNum($fieldName);
+
+        // getFieldNum() reports -1 for an unknown field; -1 is not a valid
+        // index into $_fields, so answer null just as for a non-indexed field
+        if ($fieldNum == -1 || !($this->_fields[$fieldNum]->isIndexed)) {
+            return null;
+        }
+
+        if ( !isset( $this->_norms[$fieldNum] )) {
+            // One byte per document, stored in the '.f<fieldNum>' member file
+            $fFile = $this->openCompoundFile('.f' . $fieldNum);
+            $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
+        }
+
+        // Square brackets: the curly-brace string offset syntax no longer exists in PHP 8
+        return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) );
+    }
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php
new file mode 100644
index 00000000..f90d6ed3
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php
@@ -0,0 +1,491 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/** Zend_Search_Lucene_Exception */
+require_once 'Zend/Search/Lucene/Exception.php';
+
+/** Zend_Search_Lucene_Analysis_Analyzer */
+require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
+
+/** Zend_Search_Lucene_Index_SegmentInfo */
+require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
+
+
+/**
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+class Zend_Search_Lucene_Index_SegmentWriter
+{
+ /**
+ * Expert: The fraction of terms in the "dictionary" which should be stored
+ * in RAM. Smaller values use more memory, but make searching slightly
+ * faster, while larger values use less memory and make searching slightly
+ * slower. Searching is typically not dominated by dictionary lookup, so
+ * tweaking this is rarely useful.
+ *
+ * @var integer
+ */
+ static public $indexInterval = 128;
+
+ /** Expert: The fraction of TermDocs entries stored in skip tables.
+ * Larger values result in smaller indexes, greater acceleration, but fewer
+ * accelerable cases, while smaller values result in bigger indexes,
+ * less acceleration and more
+ * accelerable cases. More detailed experiments would be useful here.
+ *
+ * 0x7FFFFFFF indicates that we don't use skip data
+ * Default value is 16
+ *
+ * @var integer
+ */
+ static public $skipInterval = 0x7FFFFFFF;
+
+ /**
+ * Number of docs in a segment
+ *
+ * @var integer
+ */
+ private $_docCount;
+
+ /**
+ * Segment name
+ *
+ * @var string
+ */
+ private $_name;
+
+ /**
+ * File system adapter.
+ *
+ * @var Zend_Search_Lucene_Storage_Directory
+ */
+ private $_directory;
+
+ /**
+ * List of the index files.
+ * Used for automatic compound file generation
+ *
+ * @var array
+ */
+ private $_files;
+
+ /**
+ * Term Dictionary
+ * Array of the Zend_Search_Lucene_Index_Term objects
+ * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
+ *
+ * @var array
+ */
+ private $_termDictionary;
+
+ /**
+ * Documents, which contain the term
+ *
+ * @var array
+ */
+ private $_termDocs;
+
+ /**
+ * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
+ *
+ * @var array
+ */
+ private $_fields;
+
+ /**
+ * Normalization factors.
+ * An array fieldName => normVector
+ * normVector is a binary string.
+ * Each byte corresponds to an indexed document in a segment and
+ * encodes normalization factor (float value, encoded by
+ * Zend_Search_Lucene_Search_Similarity::encodeNorm())
+ *
+ * @var array
+ */
+ private $_norms;
+
+
+ /**
+ * '.fdx' file - Stored Fields, the field index.
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ private $_fdxFile;
+
+ /**
+ * '.fdt' file - Stored Fields, the field data.
+ *
+ * @var Zend_Search_Lucene_Storage_File
+ */
+ private $_fdtFile;
+
+
+    /**
+     * Object constructor.
+     *
+     * Starts an empty segment: no documents, fields, terms or files yet.
+     * The stored-field files are created lazily by addDocument().
+     *
+     * @param Zend_Search_Lucene_Storage_Directory $directory
+     * @param string $name  Segment name, used as base name of the segment files
+     */
+    public function __construct($directory, $name)
+    {
+        $this->_directory = $directory;
+        $this->_name = $name;
+        $this->_docCount = 0;
+
+        $this->_fields = array();
+        // Must be an array from the start: _dumpDictionary() passes it to
+        // count() and array_keys() even if no term has ever been added
+        $this->_termDictionary = array();
+        $this->_termDocs = array();
+        $this->_files = array();
+        $this->_norms = array();
+
+        $this->_fdxFile = null;
+        $this->_fdtFile = null;
+    }
+
+
+    /**
+     * Registers a document field with the segment.
+     *
+     * A field name seen for the first time gets the next free field number.
+     * For a name that is already registered, the indexed and term vector
+     * flags are OR-ed with those of the new field.
+     *
+     * @param Zend_Search_Lucene_Field $field
+     */
+    private function _addFieldInfo(Zend_Search_Lucene_Field $field)
+    {
+        if (isset($this->_fields[$field->name])) {
+            // Known field: merge the flags into the existing info object
+            $known = $this->_fields[$field->name];
+            $known->isIndexed       |= $field->isIndexed;
+            $known->storeTermVector |= $field->storeTermVector;
+            return;
+        }
+
+        // New field: its number is the count of fields registered so far
+        $this->_fields[$field->name] =
+            new Zend_Search_Lucene_Index_FieldInfo($field->name,
+                                                   $field->isIndexed,
+                                                   count($this->_fields),
+                                                   $field->storeTermVector);
+    }
+
+
+    /**
+     * Adds a document to this segment.
+     *
+     * Indexed fields are tokenized (or taken as a single token) and their term
+     * positions are collected in memory; stored fields are written to the
+     * .fdx/.fdt files right away.
+     *
+     * @param Zend_Search_Lucene_Document $document
+     * @throws Zend_Search_Lucene_Exception if a field asks for term vector storage
+     */
+    public function addDocument(Zend_Search_Lucene_Document $document)
+    {
+        $storedFields = array();
+
+        foreach ($document->getFieldNames() as $fieldName) {
+            $field = $document->getField($fieldName);
+            $this->_addFieldInfo($field);
+
+            if ($field->storeTermVector) {
+                /**
+                 * @todo term vector storing support
+                 */
+                throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
+            }
+
+            if ($field->isIndexed) {
+                if ($field->isTokenized) {
+                    $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue);
+                } else {
+                    // Untokenized field: the whole value is one token
+                    $tokenList = array();
+                    $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue));
+                }
+
+                $position = 0;
+                foreach ($tokenList as $token) {
+                    $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
+                    $termKey = $term->key();
+
+                    if (!isset($this->_termDictionary[$termKey])) {
+                        // New term
+                        $this->_termDictionary[$termKey] = $term;
+                        $this->_termDocs[$termKey] = array();
+                        $this->_termDocs[$termKey][$this->_docCount] = array();
+                    } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
+                        // Existing term, but new term entry
+                        $this->_termDocs[$termKey][$this->_docCount] = array();
+                    }
+                    $position += $token->getPositionIncrement();
+                    $this->_termDocs[$termKey][$this->_docCount][] = $position;
+                }
+            }
+
+            if ($field->isStored) {
+                $storedFields[] = $field;
+            }
+        }
+
+        if (count($storedFields) != 0) {
+            if (!isset($this->_fdxFile)) {
+                // First document with stored fields: create the stored-field files
+                $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx');
+                $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt');
+
+                $this->_files[] = $this->_name . '.fdx';
+                $this->_files[] = $this->_name . '.fdt';
+            }
+
+            // .fdx holds the position of the document's data within .fdt
+            $this->_fdxFile->writeLong($this->_fdtFile->tell());
+
+            $this->_fdtFile->writeVInt(count($storedFields));
+            foreach ($storedFields as $field) {
+                $this->_fdtFile->writeVInt($this->_fields[$field->name]->number);
+                // Each flag needs its own parentheses: "|" binds tighter than "?:",
+                // so the unparenthesized form does not combine the two bits
+                // (and nested unparenthesized ternaries are a compile error in PHP 8)
+                $this->_fdtFile->writeByte(($field->isTokenized ? 0x01 : 0x00) |
+                                           ($field->isBinary    ? 0x02 : 0x00) |
+                                           0x00 /* 0x04 - third bit, compressed (ZLIB) */ );
+                if ($field->isBinary) {
+                    $this->_fdtFile->writeVInt(strlen($field->stringValue));
+                    $this->_fdtFile->writeBytes($field->stringValue);
+                } else {
+                    $this->_fdtFile->writeString($field->stringValue);
+                }
+            }
+        }
+
+        $this->_docCount++;
+    }
+
+
+    /**
+     * Writes the Field Info (.fnm) segment file: the number of fields,
+     * then the name and a flag byte for every field.
+     */
+    private function _dumpFNM()
+    {
+        $fnmName = $this->_name . '.fnm';
+        $fnmFile = $this->_directory->createFile($fnmName);
+        $fnmFile->writeVInt(count($this->_fields));
+
+        foreach ($this->_fields as $fieldInfo) {
+            // 0x10 (norms are omitted for the indexed field) is always set.
+            // 0x04 (term positions stored with the term vectors) and
+            // 0x08 (term offsets stored with the term vectors) are not supported yet.
+            $flags = 0x10;
+            if ($fieldInfo->isIndexed) {
+                $flags |= 0x01;
+            }
+            if ($fieldInfo->storeTermVector) {
+                $flags |= 0x02;
+            }
+
+            $fnmFile->writeString($fieldInfo->name);
+            $fnmFile->writeByte($flags);
+        }
+
+        $this->_files[] = $fnmName;
+    }
+
+
+    /**
+     * Dump Term Dictionary segment file entry.
+     * Used to write entry to .tis or .tii files
+     *
+     * The term text is written as the length of the prefix shared with the
+     * previous term (of the same field) plus the remaining suffix; the freq
+     * and prox pointers are written as deltas against the previous entry.
+     * $prevTerm and $prevTermInfo are passed by reference and are replaced
+     * with $term and $termInfo, ready for the next call.
+     *
+     * @param Zend_Search_Lucene_Storage_File $dicFile
+     * @param Zend_Search_Lucene_Index_Term $prevTerm          null for the first entry
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo  null for the first entry
+     * @param Zend_Search_Lucene_Index_TermInfo $termInfo
+     */
+    private function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
+                                        &$prevTerm,     Zend_Search_Lucene_Index_Term     $term,
+                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
+    {
+        if (isset($prevTerm) && $prevTerm->field == $term->field) {
+            // Count the leading bytes both term texts have in common.
+            // Square brackets are used for the string offsets: the curly-brace
+            // offset syntax no longer exists in PHP 8.
+            $maxPrefixLength = min(strlen($prevTerm->text), strlen($term->text));
+            $prefixLength = 0;
+            while ($prefixLength < $maxPrefixLength &&
+                   $prevTerm->text[$prefixLength] === $term->text[$prefixLength]
+                  ) {
+                $prefixLength++;
+            }
+            // Write prefix length
+            $dicFile->writeVInt($prefixLength);
+            // Write suffix
+            $dicFile->writeString( substr($term->text, $prefixLength) );
+        } else {
+            // First entry or a different field: nothing is shared
+            // Write prefix length
+            $dicFile->writeVInt(0);
+            // Write suffix
+            $dicFile->writeString($term->text);
+        }
+        // Write field number
+        $dicFile->writeVInt($term->field);
+        // DocFreq (the count of documents which contain the term)
+        $dicFile->writeVInt($termInfo->docFreq);
+
+        $prevTerm = $term;
+
+        if (!isset($prevTermInfo)) {
+            // Write FreqDelta
+            $dicFile->writeVInt($termInfo->freqPointer);
+            // Write ProxDelta
+            $dicFile->writeVInt($termInfo->proxPointer);
+        } else {
+            // Write FreqDelta
+            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer);
+            // Write ProxDelta
+            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer);
+        }
+        // Write SkipOffset - it's not 0 when $termInfo->docFreq >= self::$skipInterval
+        if ($termInfo->skipOffset != 0) {
+            $dicFile->writeVInt($termInfo->skipOffset);
+        }
+
+        $prevTermInfo = $termInfo;
+    }
+
+    /**
+     * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
+     *
+     * Also writes the frequencies (.frq) and positions (.prx) files. Terms are
+     * written in string order of their keys; every self::$indexInterval-th
+     * term (starting with the first one) is written to the .tii file as well.
+     */
+    private function _dumpDictionary()
+    {
+        // The dictionary may never have been touched if no indexed term was added
+        $termDictionary = is_array($this->_termDictionary) ? $this->_termDictionary : array();
+        $termTotal      = count($termDictionary);
+
+        // One index term per started interval; none at all for an empty dictionary
+        // (the plain "(n - 1)/interval + 1" formula would announce 1 index term
+        // for n = 0 although none is written below)
+        $indexTermTotal = ($termTotal == 0) ? 0
+                                            : (int)(($termTotal - 1)/self::$indexInterval) + 1;
+
+        $tisFile = $this->_directory->createFile($this->_name . '.tis');
+        $tisFile->writeInt((int)0xFFFFFFFE);
+        $tisFile->writeLong($termTotal);
+        $tisFile->writeInt(self::$indexInterval);
+        $tisFile->writeInt(self::$skipInterval);
+
+        $tiiFile = $this->_directory->createFile($this->_name . '.tii');
+        $tiiFile->writeInt((int)0xFFFFFFFE);
+        $tiiFile->writeLong($indexTermTotal);
+        $tiiFile->writeInt(self::$indexInterval);
+        $tiiFile->writeInt(self::$skipInterval);
+
+        $frqFile = $this->_directory->createFile($this->_name . '.frq');
+        $prxFile = $this->_directory->createFile($this->_name . '.prx');
+
+        // Keys are "<field name>\0<term text>", so string order sorts by field name, then text
+        $termKeys = array_keys($termDictionary);
+        sort($termKeys, SORT_STRING);
+
+        $termCount = 0;
+
+        $prevTerm     = null;
+        $prevTermInfo = null;
+        $prevIndexTerm     = null;
+        $prevIndexTermInfo = null;
+        $prevIndexPosition = 0;
+
+        foreach ($termKeys as $termId) {
+            $freqPointer = $frqFile->tell();
+            $proxPointer = $prxFile->tell();
+
+            $prevDoc = 0;
+            foreach ($this->_termDocs[$termId] as $docId => $termPositions) {
+                // Document delta is shifted left by one; the low bit set means
+                // "frequency is 1" and no separate frequency value follows
+                $docDelta = ($docId - $prevDoc)*2;
+                $prevDoc = $docId;
+                if (count($termPositions) > 1) {
+                    $frqFile->writeVInt($docDelta);
+                    $frqFile->writeVInt(count($termPositions));
+                } else {
+                    $frqFile->writeVInt($docDelta + 1);
+                }
+
+                // Positions are written as deltas within the document
+                $prevPosition = 0;
+                foreach ($termPositions as $position) {
+                    $prxFile->writeVInt($position - $prevPosition);
+                    $prevPosition = $position;
+                }
+            }
+
+            if (count($this->_termDocs[$termId]) >= self::$skipInterval) {
+                /**
+                 * @todo Write Skip Data to a freq file.
+                 * It's not used now, but must be implemented to be compatible with Lucene
+                 */
+                $skipOffset = $frqFile->tell() - $freqPointer;
+            } else {
+                $skipOffset = 0;
+            }
+
+            // In-memory terms carry field names; the files store field numbers
+            $term = new Zend_Search_Lucene_Index_Term($termDictionary[$termId]->text,
+                                                      $this->_fields[$termDictionary[$termId]->field]->number);
+            $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($this->_termDocs[$termId]),
+                                                              $freqPointer, $proxPointer, $skipOffset);
+
+            $this->_dumpTermDictEntry($tisFile, $prevTerm, $term, $prevTermInfo, $termInfo);
+
+            if ($termCount % self::$indexInterval == 0) {
+                $this->_dumpTermDictEntry($tiiFile, $prevIndexTerm, $term, $prevIndexTermInfo, $termInfo);
+
+                // IndexDelta: position in .tis right after the entry just written
+                $indexPosition = $tisFile->tell();
+                $tiiFile->writeVInt($indexPosition - $prevIndexPosition);
+                $prevIndexPosition = $indexPosition;
+            }
+            $termCount++;
+        }
+
+        $this->_files[] = $this->_name . '.tis';
+        $this->_files[] = $this->_name . '.tii';
+        $this->_files[] = $this->_name . '.frq';
+        $this->_files[] = $this->_name . '.prx';
+    }
+
+
+    /**
+     * Packs all files collected in $_files into one compound index file (.cfs).
+     *
+     * The file table is written first with placeholder offsets; each offset
+     * is patched once the position of the member's data is known. Every
+     * member file is deleted from the directory after it has been copied.
+     */
+    private function _generateCFS()
+    {
+        $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
+        $cfsFile->writeVInt(count($this->_files));
+
+        // Pass 1: file table; remember where each offset has to be patched in
+        $offsetSlots = array();
+        foreach ($this->_files as $memberName) {
+            $offsetSlots[$memberName] = $cfsFile->tell();
+            $cfsFile->writeLong(0); // write dummy data
+            $cfsFile->writeString($memberName);
+        }
+
+        // Pass 2: append each member's data and patch its offset
+        foreach ($this->_files as $memberName) {
+            // The data of this member starts at the current end of file
+            $dataStart = $cfsFile->tell();
+
+            // Go back to the placeholder, store the real offset, return to the end
+            $cfsFile->seek($offsetSlots[$memberName]);
+            $cfsFile->writeLong($dataStart);
+            $cfsFile->seek($dataStart);
+
+            $memberFile   = $this->_directory->getFileObject($memberName);
+            $memberLength = $this->_directory->fileLength($memberName);
+            $cfsFile->writeBytes($memberFile->readBytes($memberLength));
+
+            $this->_directory->deleteFile($memberName);
+        }
+    }
+
+
+    /**
+     * Finishes the segment: writes the field infos, the dictionary files and
+     * the compound file, then opens the result as a segment info object.
+     *
+     * @return Zend_Search_Lucene_Index_SegmentInfo  null if no document was added
+     */
+    public function close()
+    {
+        $isEmpty = ($this->_docCount == 0);
+        if ($isEmpty) {
+            // Nothing was added, so nothing is written
+            return null;
+        }
+
+        $this->_dumpFNM();
+        $this->_dumpDictionary();
+        $this->_generateCFS();
+
+        $segmentInfo = new Zend_Search_Lucene_Index_SegmentInfo($this->_name,
+                                                                $this->_docCount,
+                                                                $this->_directory);
+        return $segmentInfo;
+    }
+
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/Term.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/Term.php
new file mode 100644
index 00000000..e30ce587
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/Term.php
@@ -0,0 +1,70 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/**
+ * A Term is the unit of search: the text of a word together with the field
+ * the text occurred in.
+ *
+ * Terms are not limited to words from text fields; they may also represent
+ * things like dates, email addresses, urls, etc.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+class Zend_Search_Lucene_Index_Term
+{
+    /**
+     * Field name or field number (depending on context)
+     *
+     * @var mixed
+     */
+    public $field;
+
+    /**
+     * Term value
+     *
+     * @var string
+     */
+    public $text;
+
+
+    /**
+     * Creates a term from its text and its field.
+     *
+     * @param string $text  Term value
+     * @param mixed $field  Field name or field number; 'contents' if omitted
+     */
+    public function __construct( $text, $field = 'contents' )
+    {
+        $this->text  = $text;
+        $this->field = $field;
+    }
+
+
+    /**
+     * Builds a string key for the term: the field and the text joined by a
+     * NUL byte.
+     *
+     * @return string
+     */
+    public function key()
+    {
+        $separator = chr(0);
+
+        return $this->field . $separator . $this->text;
+    }
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/TermInfo.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/TermInfo.php
new file mode 100644
index 00000000..ddef721d
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/TermInfo.php
@@ -0,0 +1,77 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/**
+ * Record of the information stored for a single term of the term dictionary.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+class Zend_Search_Lucene_Index_TermInfo
+{
+    /**
+     * The number of documents which contain the term.
+     *
+     * @var integer
+     */
+    public $docFreq;
+
+    /**
+     * Data offset in a Frequencies file.
+     *
+     * @var integer
+     */
+    public $freqPointer;
+
+    /**
+     * Data offset in a Positions file.
+     *
+     * @var integer
+     */
+    public $proxPointer;
+
+    /**
+     * SkipData offset in a Frequencies file.
+     *
+     * @var integer
+     */
+    public $skipOffset;
+
+    /**
+     * Term offset of the _next_ term in a TermDictionary file.
+     * Used only for Term Index
+     *
+     * @var integer
+     */
+    public $indexPointer;
+
+    /**
+     * Stores the given values as they are.
+     *
+     * @param integer $docFreq
+     * @param integer $freqPointer
+     * @param integer $proxPointer
+     * @param integer $skipOffset
+     * @param integer $indexPointer  Only meaningful for Term Index entries; null if omitted
+     */
+    public function __construct($docFreq, $freqPointer, $proxPointer, $skipOffset, $indexPointer = null)
+    {
+        $this->indexPointer = $indexPointer;
+        $this->skipOffset   = $skipOffset;
+        $this->proxPointer  = $proxPointer;
+        $this->freqPointer  = $freqPointer;
+        $this->docFreq      = $docFreq;
+    }
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/Writer.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/Writer.php
new file mode 100644
index 00000000..da4af000
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/Writer.php
@@ -0,0 +1,308 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/** Zend_Search_Lucene_Index_SegmentWriter */
+require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
+
+/** Zend_Search_Lucene_Index_SegmentInfo */
+require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
+
+
+/**
+ * @package Zend_Search_Lucene
+ * @subpackage Index
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+class Zend_Search_Lucene_Index_Writer
+{
+    /**
+     * @todo Implement segment merger
+     * @todo Implement mergeFactor, minMergeDocs, maxMergeDocs usage.
+     * @todo Implement Analyzer substitution
+     * @todo Implement Zend_Search_Lucene_Storage_DirectoryRAM and Zend_Search_Lucene_Storage_FileRAM to use it for
+     *       temporary index files
+     * @todo Directory lock processing
+     */
+
+    /**
+     * File system adapter.
+     *
+     * @var Zend_Search_Lucene_Storage_Directory
+     */
+    private $_directory = null;
+
+
+    /**
+     * Index version.
+     * Counts how often the index has been changed by adding or deleting docs.
+     *
+     * @var integer
+     */
+    private $_version;
+
+    /**
+     * Segment name counter.
+     * Used to name new segments.
+     *
+     * @var integer
+     */
+    private $_segmentNameCounter;
+
+    /**
+     * Number of segments currently recorded in the 'segments' file.
+     * Kept in sync with the file on every successful commit.
+     *
+     * @var integer
+     */
+    private $_segments;
+
+    /**
+     * Determines how often segment indices
+     * are merged by addDocument().
+     *
+     * @var integer
+     */
+    public $mergeFactor;
+
+    /**
+     * Determines the minimal number of documents required before
+     * the buffered in-memory documents are merged and a new Segment
+     * is created.
+     *
+     * @var integer
+     */
+    public $minMergeDocs;
+
+    /**
+     * Determines the largest number of documents ever merged by addDocument().
+     *
+     * @var integer
+     */
+    public $maxMergeDocs;
+
+    /**
+     * List of the segments created by the index writer since the last commit.
+     * Array of Zend_Search_Lucene_Index_SegmentInfo objects, keyed by segment name.
+     *
+     * @var array
+     */
+    private $_newSegments;
+
+    /**
+     * Current segment to add documents to (null when no segment is open).
+     *
+     * @var Zend_Search_Lucene_Index_SegmentWriter
+     */
+    private $_currentSegment;
+
+    /**
+     * Opens the index for writing.
+     *
+     * The directory must be a Zend_Search_Lucene_Storage_Directory object
+     * (enforced by the type hint). The optional second parameter, when true,
+     * creates the index or overwrites an existing one: the 'deletable' and
+     * 'segments' files and every '*.cfs' file are removed and fresh, empty
+     * 'segments' and 'deletable' files are written.
+     *
+     * @param Zend_Search_Lucene_Storage_Directory $directory
+     * @param boolean $create
+     * @throws Zend_Search_Lucene_Exception if an existing 'segments' file has an unexpected format marker
+     */
+    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $create = false)
+    {
+        $this->_directory = $directory;
+
+        if ($create) {
+            // Remove the files of any previous index before writing a new one.
+            foreach ($this->_directory->fileList() as $file) {
+                if ($file === 'deletable' ||
+                    $file === 'segments'  ||
+                    substr($file, -4) === '.cfs') {
+                    $this->_directory->deleteFile($file);
+                }
+            }
+            $segmentsFile = $this->_directory->createFile('segments');
+            // write format marker
+            $segmentsFile->writeInt((int)0xFFFFFFFF);
+            // write version
+            $segmentsFile->writeLong(0);
+            // write name counter
+            $segmentsFile->writeInt(0);
+            // write segment counter
+            $segmentsFile->writeInt(0);
+
+            $deletableFile = $this->_directory->createFile('deletable');
+            // write counter
+            $deletableFile->writeInt(0);
+
+            $this->_version            = 0;
+            $this->_segmentNameCounter = 0;
+            $this->_segments           = 0;
+        } else {
+            $segmentsFile = $this->_directory->getFileObject('segments');
+            $format = $segmentsFile->readInt();
+            // Loose comparison kept on purpose: the type/sign readInt() yields
+            // is not visible here. NOTE(review): confirm on 64-bit platforms.
+            if ($format != (int)0xFFFFFFFF) {
+                throw new Zend_Search_Lucene_Exception('Wrong segments file format');
+            }
+
+            // read version
+            $this->_version = $segmentsFile->readLong();
+            // read counter
+            $this->_segmentNameCounter = $segmentsFile->readInt();
+            // read segment counter
+            $this->_segments = $segmentsFile->readInt();
+        }
+
+        $this->_newSegments    = array();
+        $this->_currentSegment = null;
+    }
+
+    /**
+     * Adds a document to this index.
+     *
+     * A new segment writer is opened lazily on the first document added
+     * after construction or after a commit. Each added document increments
+     * the index version.
+     *
+     * @param Zend_Search_Lucene_Document $document
+     */
+    public function addDocument(Zend_Search_Lucene_Document $document)
+    {
+        if ($this->_currentSegment === null) {
+            $this->_currentSegment =
+                new Zend_Search_Lucene_Index_SegmentWriter($this->_directory, $this->_newSegmentName());
+        }
+        $this->_currentSegment->addDocument($document);
+        $this->_version++;
+    }
+
+
+
+    /**
+     * Update segments file by appending the newly created segments to the list.
+     *
+     * Writes a 'segments.new' file (header, existing segment entries copied
+     * from the current 'segments' file, then the new entries) and renames it
+     * over 'segments'. Afterwards the in-memory segment counter is advanced so
+     * that a later commit on the same writer writes a correct segment count.
+     *
+     * @todo !!!!!Finish the implementation
+     *
+     * @throws Zend_Search_Lucene_Exception
+     */
+    private function _updateSegments()
+    {
+        $addedCount = count($this->_newSegments);
+
+        $segmentsFile   = $this->_directory->getFileObject('segments');
+        $newSegmentFile = $this->_directory->createFile('segments.new');
+
+        // Header: format marker, version, name counter, segment count.
+        $newSegmentFile->writeInt((int)0xFFFFFFFF);
+        $newSegmentFile->writeLong($this->_version);
+        $newSegmentFile->writeInt($this->_segmentNameCounter);
+        $newSegmentFile->writeInt($this->_segments + $addedCount);
+
+        // Copy the existing segment entries, i.e. everything after the header
+        // (the header is treated as 20 bytes long). Nothing to copy when the
+        // index has no segments yet.
+        $tailLength = $this->_directory->fileLength('segments') - 20;
+        if ($tailLength > 0) {
+            $segmentsFile->seek(20);
+            $newSegmentFile->writeBytes($segmentsFile->readBytes($tailLength));
+        }
+
+        foreach ($this->_newSegments as $segmentName => $segmentInfo) {
+            $newSegmentFile->writeString($segmentName);
+            $newSegmentFile->writeInt($segmentInfo->count());
+        }
+
+        $this->_directory->renameFile('segments.new', 'segments');
+
+        // The file now lists the new segments too; without this the next
+        // commit would write a segment count that omits them.
+        $this->_segments += $addedCount;
+    }
+
+
+    /**
+     * Commit current changes.
+     *
+     * Closes the currently open segment (if any), records it, rewrites the
+     * 'segments' file when there is at least one new segment, and resets the
+     * list of pending segments.
+     *
+     * @return array new segments (Zend_Search_Lucene_Index_SegmentInfo objects keyed by segment name)
+     */
+    public function commit()
+    {
+        if ($this->_currentSegment !== null) {
+            $newSegment = $this->_currentSegment->close();
+            if ($newSegment !== null) {
+                $this->_newSegments[$newSegment->getName()] = $newSegment;
+            }
+            $this->_currentSegment = null;
+        }
+
+        if (count($this->_newSegments) != 0) {
+            $this->_updateSegments();
+        }
+
+        $result = $this->_newSegments;
+        $this->_newSegments = array();
+
+        return $result;
+    }
+
+
+    /**
+     * Merges the provided indexes into this index.
+     * Not implemented yet: currently does nothing.
+     *
+     * @param array $readers
+     * @return void
+     */
+    public function addIndexes($readers)
+    {
+        /**
+         * @todo implementation
+         */
+    }
+
+
+    /**
+     * Returns the number of documents currently in this index.
+     * Not implemented yet: currently returns nothing (null).
+     *
+     * @param mixed $readers unused; accepted only for backward compatibility
+     * @return integer
+     */
+    public function docCount($readers = null)
+    {
+        /**
+         * @todo implementation
+         */
+    }
+
+
+    /**
+     * Flushes all changes to an index and closes all associated files.
+     * Not implemented yet: currently does nothing.
+     *
+     * @return void
+     */
+    public function close()
+    {
+        /**
+         * @todo implementation
+         */
+    }
+
+
+    /**
+     * Merges all segments together into a single segment, optimizing
+     * an index for search.
+     * Not implemented yet: currently does nothing.
+     *
+     * @return void
+     */
+    public function optimize()
+    {
+        /**
+         * @todo implementation
+         */
+    }
+
+    /**
+     * Get name for new segment.
+     *
+     * The name is an underscore followed by the current segment name counter
+     * in base 36; the counter is incremented as a side effect.
+     *
+     * @return string
+     */
+    private function _newSegmentName()
+    {
+        return '_' . base_convert($this->_segmentNameCounter++, 10, 36);
+    }
+
+}