diff options
| author | wei <> | 2006-05-07 03:34:25 +0000 | 
|---|---|---|
| committer | wei <> | 2006-05-07 03:34:25 +0000 | 
| commit | 30eddf57c8de433e8ea02b9e552c8e1744a505a7 (patch) | |
| tree | 9e81f3a15f9a695cb96c5cc4dd80de5a3a0bb7b2 /buildscripts/texbuilder | |
| parent | 0bb2822f68dfe3cf568affd4acf0d8120d9d53c7 (diff) | |
Add search to quickstart demo.
Diffstat (limited to 'buildscripts/texbuilder')
42 files changed, 6729 insertions, 1 deletions
| diff --git a/buildscripts/texbuilder/Zend/Exception.php b/buildscripts/texbuilder/Zend/Exception.php new file mode 100644 index 00000000..ab5e4e95 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Exception.php @@ -0,0 +1,28 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package    Zend + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * @package    Zend + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Exception extends Exception +{} + diff --git a/buildscripts/texbuilder/Zend/LICENSE.txt b/buildscripts/texbuilder/Zend/LICENSE.txt new file mode 100644 index 00000000..bfd3ff11 --- /dev/null +++ b/buildscripts/texbuilder/Zend/LICENSE.txt @@ -0,0 +1,52 @@ +----------------------------------------------------------------------- +                 The Zend Framework License, Version 1.0 +  Copyright (c) 2005 Zend Technologies USA, Inc. All rights reserved. +-----------------------------------------------------------------------  + +Redistribution and use in source and binary forms, with or without +modification, is permitted provided that the following conditions +are met: + +  1. 
Redistributions of source code must retain the above copyright +     notice, this list of conditions and the following disclaimer.  +  +  2. Redistributions in binary form must reproduce the above  +     copyright notice, this list of conditions and the following  +     disclaimer in the documentation and/or other materials provided +     with the distribution. +  +  3. The names "Zend" and "Zend Framework" must not be used to endorse +     or promote products derived from this software without prior +     permission from Zend Technologies USA, Inc. For written +     permission, please contact license@zend.com.  +  +  4. Zend Technologies USA, Inc. may publish revised and/or new +     versions of the license from time to time. Each version will +     be given a distinguishing version number. +     Once covered code has been published under a particular version +     of the license, you may always continue to use it under the +     terms of that version. You may also choose to use such covered +     code under the terms of any subsequent version of the license +     published by Zend Technologies USA, Inc. No one other than Zend +     Technologies USA, Inc. has the right to modify the terms +     applicable to covered code created under this License. + +  5. Redistributions of any form whatsoever must retain the following +     acknowledgment: +     "This product includes the Zend Framework, freely available at +     http://www.zend.com" + +THIS SOFTWARE IS PROVIDED BY ZEND TECHNOLOGIES USA, INC. ``AS IS'' AND  +ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A  +PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ZEND +TECHNOLOGIES USA, INC.  
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. + +----------------------------------------------------------------------- diff --git a/buildscripts/texbuilder/Zend/Search/Exception.php b/buildscripts/texbuilder/Zend/Search/Exception.php new file mode 100644 index 00000000..e0aa2221 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Exception.php @@ -0,0 +1,34 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package    Zend_Search_Lucene + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * Framework base exception + */ +require_once 'Zend/Exception.php'; + + +/** + * @package    Zend_Search_Lucene + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Exception extends Zend_Exception +{} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene.php b/buildscripts/texbuilder/Zend/Search/Lucene.php new file mode 100644 index 00000000..700a8b8a --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene.php @@ -0,0 +1,569 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package    Zend_Search_Lucene + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Exception */ +require_once 'Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene_Document */ +require_once 'Zend/Search/Lucene/Document.php'; + +/** Zend_Search_Lucene_Storage_Directory */ +require_once 'Zend/Search/Lucene/Storage/Directory/Filesystem.php'; + +/** Zend_Search_Lucene_Index_Term */ +require_once 'Zend/Search/Lucene/Index/Term.php'; + +/** Zend_Search_Lucene_Index_TermInfo */ +require_once 'Zend/Search/Lucene/Index/TermInfo.php'; + +/** Zend_Search_Lucene_Index_SegmentInfo */ +require_once 'Zend/Search/Lucene/Index/SegmentInfo.php'; + +/** Zend_Search_Lucene_Index_FieldInfo */ +require_once 'Zend/Search/Lucene/Index/FieldInfo.php'; + +/** Zend_Search_Lucene_Index_Writer */ +require_once 'Zend/Search/Lucene/Index/Writer.php'; + +/** Zend_Search_Lucene_Search_QueryParser */ +require_once 'Zend/Search/Lucene/Search/QueryParser.php'; + +/** Zend_Search_Lucene_Search_QueryHit */ +require_once 'Zend/Search/Lucene/Search/QueryHit.php'; + +/** Zend_Search_Lucene_Search_Similarity */ +require_once 'Zend/Search/Lucene/Search/Similarity.php'; + + +/** + * @package    Zend_Search_Lucene + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene +{ +    /** +     * File system adapter. +     * +     * @var Zend_Search_Lucene_Storage_Directory +     */ +    private $_directory = null; + +    /** +     * File system adapter closing option +     * +     * @var boolean +     */ +    private $_closeDirOnExit = true; + +    /** +     * Writer for this index, not instantiated unless required. 
+     * +     * @var Zend_Search_Lucene_Index_Writer +     */ +    private $_writer = null; + +    /** +     * Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index. +     * +     * @var array Zend_Search_Lucene_Index_SegmentInfo +     */ +    private $_segmentInfos = array(); + +    /** +     * Number of documents in this index. +     * +     * @var integer +     */ +    private $_docCount = 0; + + +    /** +     * Opens the index. +     * +     * IndexReader constructor needs Directory as a parameter. It should be +     * a string with a path to the index folder or a Directory object. +     * +     * @param mixed $directory +     * @throws Zend_Search_Lucene_Exception +     */ +    public function __construct($directory = null, $create = false) +    { +        if ($directory === null) { +            throw new Zend_Search_Exception('No index directory specified'); +        } + +        if ($directory instanceof Zend_Search_Lucene_Storage_Directory_Filesystem) { +            $this->_directory      = $directory; +            $this->_closeDirOnExit = false; +        } else { +            $this->_directory      = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory); +            $this->_closeDirOnExit = true; +        } + +        if ($create) { +            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory, true); +        } else { +            $this->_writer = null; +        } + +        $this->_segmentInfos = array(); + +        $segmentsFile = $this->_directory->getFileObject('segments'); + +        $format = $segmentsFile->readInt(); + +        if ($format != (int)0xFFFFFFFF) { +            throw new Zend_Search_Lucene_Exception('Wrong segments file format'); +        } + +        // read version +        $segmentsFile->readLong(); + +        // read counter +        $segmentsFile->readInt(); + +        $segments = $segmentsFile->readInt(); + +        $this->_docCount = 0; + +        // read segmentInfos +        for 
($count = 0; $count < $segments; $count++) { +            $segName = $segmentsFile->readString(); +            $segSize = $segmentsFile->readInt(); +            $this->_docCount += $segSize; + +            $this->_segmentInfos[$count] = +                                new Zend_Search_Lucene_Index_SegmentInfo($segName, +                                                                         $segSize, +                                                                         $this->_directory); +        } +    } + + +    /** +     * Object destructor +     */ +    public function __destruct() +    { +        $this->commit(); + +        if ($this->_closeDirOnExit) { +            $this->_directory->close(); +        } +    } + +    /** +     * Returns an instance of Zend_Search_Lucene_Index_Writer for the index +     * +     * @return Zend_Search_Lucene_Index_Writer +     */ +    public function getIndexWriter() +    { +        if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) { +            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory); +        } + +        return $this->_writer; +    } + + +    /** +     * Returns the Zend_Search_Lucene_Storage_Directory instance for this index. +     * +     * @return Zend_Search_Lucene_Storage_Directory +     */ +    public function getDirectory() +    { +        return $this->_directory; +    } + + +    /** +     * Returns the total number of documents in this index. +     * +     * @return integer +     */ +    public function count() +    { +        return $this->_docCount; +    } + + +    /** +     * Performs a query against the index and returns an array +     * of Zend_Search_Lucene_Search_QueryHit objects. +     * Input is a string or Zend_Search_Lucene_Search_Query. 
+     * +     * @param mixed $query +     * @return array ZSearchHit +     */ +    public function find($query) +    { +        if (is_string($query)) { +            $query = Zend_Search_Lucene_Search_QueryParser::parse($query); +        } + +        if (!$query instanceof Zend_Search_Lucene_Search_Query) { +            throw new Zend_Search_Lucene_Exception('Query must be a string or Zend_Search_Lucene_Search_Query object'); +        } + +        $this->commit(); + +        $hits = array(); +        $scores = array(); + +        $docNum = $this->count(); +        for( $count=0; $count < $docNum; $count++ ) { +            $docScore = $query->score( $count, $this); +            if( $docScore != 0 ) { +                $hit = new Zend_Search_Lucene_Search_QueryHit($this); +                $hit->id = $count; +                $hit->score = $docScore; + +                $hits[] = $hit; +                $scores[] = $docScore; +            } +        } +        array_multisort($scores, SORT_DESC, SORT_REGULAR, $hits); + +        return $hits; +    } + + +    /** +     * Returns a list of all unique field names that exist in this index. +     * +     * @param boolean $indexed +     * @return array +     */ +    public function getFieldNames($indexed = false) +    { +        $result = array(); +        foreach( $this->_segmentInfos as $segmentInfo ) { +            $result = array_merge($result, $segmentInfo->getFields($indexed)); +        } +        return $result; +    } + + +    /** +     * Returns a Zend_Search_Lucene_Document object for the document +     * number $id in this index. 
+     * +     * @param integer|Zend_Search_Lucene_Search_QueryHit $id +     * @return Zend_Search_Lucene_Document +     */ +    public function getDocument($id) +    { +        if ($id instanceof Zend_Search_Lucene_Search_QueryHit) { +            /* @var $id Zend_Search_Lucene_Search_QueryHit */ +            $id = $id->id; +        } + +        if ($id >= $this->_docCount) { +            /** +             * @todo exception here? +             */ +            return null; +        } + +        $segCount = 0; +        $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count(); +        while( $nextSegmentStartId <= $id ) { +               $segCount++; +               $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count(); +        } +        $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count(); + +        $fdxFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdx'); +        $fdxFile->seek( ($id-$segmentStartId)*8, SEEK_CUR ); +        $fieldValuesPosition = $fdxFile->readLong(); + +        $fdtFile = $this->_segmentInfos[ $segCount ]->openCompoundFile('.fdt'); +        $fdtFile->seek( $fieldValuesPosition, SEEK_CUR ); +        $fieldCount = $fdtFile->readVInt(); + +        $doc = new Zend_Search_Lucene_Document(); +        for( $count = 0; $count < $fieldCount; $count++ ) { +            $fieldNum = $fdtFile->readVInt(); +            $bits = $fdtFile->readByte(); + +            $fieldInfo = $this->_segmentInfos[ $segCount ]->getField($fieldNum); + +            if( !($bits & 2) ) { // Text data +                $field = new Zend_Search_Lucene_Field($fieldInfo->name, +                                                      $fdtFile->readString(), +                                                      true, +                                                      $fieldInfo->isIndexed, +                                                      $bits & 1 ); +            } else { +                $field = new 
Zend_Search_Lucene_Field($fieldInfo->name, +                                                      $fdtFile->readBinary(), +                                                      true, +                                                      $fieldInfo->isIndexed, +                                                      $bits & 1 ); +            } + +            $doc->addField($field); +        } + +        return $doc; +    } + + +    /** +     * Returns an array of all the documents which contain term. +     * +     * @param Zend_Search_Lucene_Index_Term $term +     * @return array +     */ +    public function termDocs(Zend_Search_Lucene_Index_Term $term) +    { +        $result = array(); +        $segmentStartDocId = 0; + +        foreach ($this->_segmentInfos as $segInfo) { +            $termInfo = $segInfo->getTermInfo($term); + +            if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) { +                $segmentStartDocId += $segInfo->count(); +                continue; +            } + +            $frqFile = $segInfo->openCompoundFile('.frq'); +            $frqFile->seek($termInfo->freqPointer,SEEK_CUR); +            $docId = 0; +            for( $count=0; $count < $termInfo->docFreq; $count++ ) { +                $docDelta = $frqFile->readVInt(); +                if( $docDelta % 2 == 1 ) { +                    $docId += ($docDelta-1)/2; +                } else { +                    $docId += $docDelta/2; +                    // read freq +                    $frqFile->readVInt(); +                } +                $result[] = $segmentStartDocId + $docId; +            } + +            $segmentStartDocId += $segInfo->count(); +        } + +        return $result; +    } + + +    /** +     * Returns an array of all term positions in the documents. +     * Return array structure: array( docId => array( pos1, pos2, ...), ...) 
+     * +     * @param Zend_Search_Lucene_Index_Term $term +     * @return array +     */ +    public function termPositions(Zend_Search_Lucene_Index_Term $term) +    { +        $result = array(); +        $segmentStartDocId = 0; +        foreach( $this->_segmentInfos as $segInfo ) { +            $termInfo = $segInfo->getTermInfo($term); + +            if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) { +                $segmentStartDocId += $segInfo->count(); +                continue; +            } + +            $frqFile = $segInfo->openCompoundFile('.frq'); +            $frqFile->seek($termInfo->freqPointer,SEEK_CUR); +            $freqs = array(); +            $docId = 0; + +            for( $count = 0; $count < $termInfo->docFreq; $count++ ) { +                $docDelta = $frqFile->readVInt(); +                if( $docDelta % 2 == 1 ) { +                    $docId += ($docDelta-1)/2; +                    $freqs[ $docId ] = 1; +                } else { +                    $docId += $docDelta/2; +                    $freqs[ $docId ] = $frqFile->readVInt(); +                } +            } + +            $prxFile = $segInfo->openCompoundFile('.prx'); +            $prxFile->seek($termInfo->proxPointer,SEEK_CUR); +            foreach ($freqs as $docId => $freq) { +                $termPosition = 0; +                $positions = array(); + +                for ($count = 0; $count < $freq; $count++ ) { +                    $termPosition += $prxFile->readVInt(); +                    $positions[] = $termPosition; +                } +                $result[ $segmentStartDocId + $docId ] = $positions; +            } + +            $segmentStartDocId += $segInfo->count(); +        } + +        return $result; +    } + + +    /** +     * Returns the number of documents in this index containing the $term. 
+     * +     * @param Zend_Search_Lucene_Index_Term $term +     * @return integer +     */ +    public function docFreq(Zend_Search_Lucene_Index_Term $term) +    { +        $result = 0; +        foreach ($this->_segmentInfos as $segInfo) { +            $termInfo = $segInfo->getTermInfo($term); +            if ($termInfo !== null) { +                $result += $termInfo->docFreq; +            } +        } + +        return $result; +    } + + +    /** +     * Retrive similarity used by index reader +     * +     * @return Zend_Search_Lucene_Search_Similarity +     */ +    public function getSimilarity() +    { +        return Zend_Search_Lucene_Search_Similarity::getDefault(); +    } + + +    /** +     * Returns a normalization factor for "field, document" pair. +     * +     * @param integer $id +     * @param string $fieldName +     * @return Zend_Search_Lucene_Document +     */ +    public function norm( $id, $fieldName ) +    { +        if( $id >= $this->_docCount ) +            return null; + +        $segCount = 0; +        $nextSegmentStartId = $this->_segmentInfos[ 0 ]->count(); +        while( $nextSegmentStartId <= $id ) { +               $segCount++; +               $nextSegmentStartId += $this->_segmentInfos[ $segCount ]->count(); +        } + +        $segmentStartId = $nextSegmentStartId - $this->_segmentInfos[ $segCount ]->count(); + +        return $this->_segmentInfos[ $segCount ]->norm($id - $segmentStartId, $fieldName); +    } + + +    /** +     * Adds a document to this index. +     * +     * @param Zend_Search_Lucene_Document $document +     */ +    public function addDocument(Zend_Search_Lucene_Document $document) +    { +        if (!$this->_writer instanceof Zend_Search_Lucene_Index_Writer) { +            $this->_writer = new Zend_Search_Lucene_Index_Writer($this->_directory); +        } + +        $this->_writer->addDocument($document); +    } + + +    /** +     * Commit changes resulting from delete() or undeleteAll() operations. 
+     * +     * @todo delete() and undeleteAll processing. +     */ +    public function commit() +    { +        if ($this->_writer !== null) { +            foreach ($this->_writer->commit() as $segmentName => $segmentInfo) { +                if ($segmentInfo !== null) { +                    $this->_segmentInfos[] = $segmentInfo; +                    $this->_docCount += $segmentInfo->count(); +                } else { +                    foreach ($this->_segmentInfos as $segId => $segInfo) { +                        if ($segInfo->getName() == $segmentName) { +                            unset($this->_segmentInfos[$segId]); +                        } +                    } +                } +            } +        } +    } + + +    /************************************************************************* +    @todo UNIMPLEMENTED +    *************************************************************************/ + +    /** +     * Returns an array of all terms in this index. +     * +     * @todo Implementation +     * @return array +     */ +    public function terms() +    { +        return array(); +    } + + +    /** +     * Returns true if any documents have been deleted from this index. +     * +     * @todo Implementation +     * @return boolean +     */ +    public function hasDeletions() +    { +        return false; +    } + + +    /** +     * Deletes a document from the index.  $doc may contain a Zend_Search_Lucene_Document +     * or the number of the document to delete. +     * +     * @todo Implementation +     * @param mixed $item_to_del +     */ +    public function delete($doc) +    {} + + +    /** +     * Undeletes all documents currently marked as deleted in this index. +     * +     * @todo Implementation +     */ +    public function undeleteAll() +    {} +}
\ No newline at end of file diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer.php new file mode 100644 index 00000000..8e234c16 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer.php @@ -0,0 +1,94 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package    Zend_Search_Lucene + * @subpackage Analysis + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Analysis_Token */ +require_once 'Zend/Search/Lucene/Analysis/Token.php'; + +/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */ +require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php'; + +/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */ +require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php'; + + + +/** + * An Analyzer is used to analyze text. + * It thus represents a policy for extracting index terms from text. + * + * Note: + * Lucene Java implementation is oriented to streams. It provides effective work + * with a huge documents (more then 20Mb). + * But engine itself is not oriented such documents. + * Thus Zend_Search_Lucene analysis API works with data strings and sets (arrays). 
+ * + * @package    Zend_Search_Lucene + * @subpackage Analysis + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + +abstract class Zend_Search_Lucene_Analysis_Analyzer +{ +    /** +     * The Analyzer implementation used by default. +     * +     * @var Zend_Search_Lucene_Analysis_Analyzer +     */ +    static private $_defaultImpl; + +    /** +     * Tokenize text to a terms +     * Returns array of Zend_Search_Lucene_Analysis_Token objects +     * +     * @param string $data +     * @return array +     */ +    abstract public function tokenize($data); + + +    /** +     * Set the default Analyzer implementation used by indexing code. +     * +     * @param Zend_Search_Lucene_Analysis_Analyzer $similarity +     */ +    static public function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer) +    { +        self::$_defaultImpl = $analyzer; +    } + + +    /** +     * Return the default Analyzer implementation used by indexing code. 
+     * +     * @return Zend_Search_Lucene_Analysis_Analyzer +     */ +    static public function getDefault() +    { +        if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) { +            self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive(); +        } + +        return self::$_defaultImpl; +    } + +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common.php new file mode 100644 index 00000000..5c61e5b5 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common.php @@ -0,0 +1,73 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package    Zend_Search_Lucene + * @subpackage Analysis + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Analysis_Analyzer */ +require_once 'Zend/Search/Lucene/Analysis/Analyzer.php'; + + +/** + * Common implementation of the Zend_Search_Lucene_Analysis_Analyzer interface. + * There are several standard standard subclasses provided by Zend_Search_Lucene/Analysis + * subpackage: Zend_Search_Lucene_Analysis_Analyzer_Common_Text, ZSearchHTMLAnalyzer, ZSearchXMLAnalyzer. 
+ * + * @todo ZSearchHTMLAnalyzer and ZSearchXMLAnalyzer implementation + * + * @package    Zend_Search_Lucene + * @subpackage Analysis + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +abstract class Zend_Search_Lucene_Analysis_Analyzer_Common extends Zend_Search_Lucene_Analysis_Analyzer +{ +    /** +     * The set of Token filters applied to the Token stream. +     * Array of Zend_Search_Lucene_Analysis_TokenFilter objects. +     * +     * @var array +     */ +    private $_filters = array(); + +    /** +     * Add Token filter to the Analyzer +     * +     * @param Zend_Search_Lucene_Analysis_TokenFilter $filter +     */ +    public function addFilter(Zend_Search_Lucene_Analysis_TokenFilter $filter) +    { +        $this->_filters[] = $filter; +    } + +    /** +     * Apply filters to the token. +     * +     * @param Zend_Search_Lucene_Analysis_Token $token +     * @return Zend_Search_Lucene_Analysis_Token +     */ +    public function normalize(Zend_Search_Lucene_Analysis_Token $token) +    { +        foreach ($this->_filters as $filter) { +            $token = $filter->normalize($token); +        } + +        return $token; +    } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php new file mode 100644 index 00000000..2a80c1f8 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php @@ -0,0 +1,76 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. 
/**
 * Analyzer that splits text into tokens made of consecutive alphabetic
 * characters (as classified by ctype_alpha()); everything else is treated
 * as a separator.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Analysis_Analyzer_Common_Text extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Tokenize text into terms.
     * Returns array of Zend_Search_Lucene_Analysis_Token objects.
     *
     * Each token records its byte offsets in $data: the start offset is the
     * index of its first character and the end offset is one past its last
     * character. Every token is passed through normalize(); tokens that a
     * filter removed (normalize() returned null) are left out of the result.
     *
     * @param string $data
     * @return array
     */
    public function tokenize($data)
    {
        $data        = (string) $data;
        $tokenStream = array();

        // The input never changes inside the loop, so measure it once.
        $length   = strlen($data);
        $position = 0;

        while ($position < $length) {
            // skip everything that is not a letter
            while ($position < $length && !ctype_alpha($data[$position])) {
                $position++;
            }

            $termStartPosition = $position;

            // read token
            while ($position < $length && ctype_alpha($data[$position])) {
                $position++;
            }

            // Empty token, end of stream.
            if ($position == $termStartPosition) {
                break;
            }

            $token = new Zend_Search_Lucene_Analysis_Token(
                substr($data, $termStartPosition, $position - $termStartPosition),
                $termStartPosition,
                $position
            );

            $token = $this->normalize($token);

            // A filter may remove the token by returning null; keep such
            // holes out of the resulting stream.
            if ($token !== null) {
                $tokenStream[] = $token;
            }
        }

        return $tokenStream;
    }
}
/**
 * Text analyzer that lower-cases every token it produces.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_Text
{
    /**
     * Registers the lower-case token filter on the new analyzer.
     */
    public function __construct()
    {
        $lowerCaseFilter = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCase();

        $this->addFilter($lowerCaseFilter);
    }
}
/**
 * A single term occurrence produced by an analyzer: the term text, its
 * offsets in the source text, a lexical type and a position increment.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Analysis_Token
{
    /**
     * The text of the term.
     *
     * @var string
     */
    private $_termText;

    /**
     * Start in source text.
     *
     * @var integer
     */
    private $_startOffset;

    /**
     * End in source text.
     *
     * @var integer
     */
    private $_endOffset;

    /**
     * Lexical type.
     *
     * @var string
     */
    private $_type;

    /**
     * The position of this token relative to the previous Token.
     *
     * Every new token starts with an increment of one.
     *
     * Some common uses for this are:
     * Set it to zero to put multiple terms in the same position. This is
     * useful if, e.g., a word has multiple stems: searches for phrases
     * including either stem will match. In this case, all but the first
     * stem's increment should be set to zero, and the increment of the
     * first instance should be one. Repeating a token with an increment of
     * zero can also be used to boost the scores of matches on that token.
     *
     * Set it to values greater than one to inhibit exact phrase matches.
     * If, for example, one does not want phrases to match across removed
     * stop words, then one could build a stop word filter that removes stop
     * words and also sets the increment to the number of stop words removed
     * before each non-stop word. Then exact phrase queries will only match
     * when the terms occur with no intervening stop words.
     *
     * @var integer
     */
    private $_positionIncrement = 1;


    /**
     * Object constructor
     *
     * @param string  $text   term text
     * @param integer $start  offset of the first character in the source text
     * @param integer $end    offset one past the last character in the source text
     * @param string  $type   lexical type
     */
    public function __construct($text, $start, $end, $type = 'word')
    {
        $this->_type        = $type;
        $this->_endOffset   = $end;
        $this->_startOffset = $start;
        $this->_termText    = $text;
    }


    /**
     * Returns the Token's term text.
     *
     * @return string
     */
    public function getTermText()
    {
        return $this->_termText;
    }

    /**
     * Returns this Token's starting offset, the position of the first character
     * corresponding to this token in the source text.
     *
     * Note:
     * The difference between getEndOffset() and getStartOffset() may not be equal
     * to strlen(Zend_Search_Lucene_Analysis_Token::getTermText()), as the term
     * text may have been altered by a stemmer or some other filter.
     *
     * @return integer
     */
    public function getStartOffset()
    {
        return $this->_startOffset;
    }

    /**
     * Returns this Token's ending offset, one greater than the position of the
     * last character corresponding to this token in the source text.
     *
     * @return integer
     */
    public function getEndOffset()
    {
        return $this->_endOffset;
    }

    /**
     * Returns this Token's lexical type.  Defaults to 'word'.
     *
     * @return string
     */
    public function getType()
    {
        return $this->_type;
    }

    /**
     * Returns the position increment of this Token.
     *
     * @return integer
     */
    public function getPositionIncrement()
    {
        return $this->_positionIncrement;
    }

    /**
     * positionIncrement setter
     *
     * @param integer $positionIncrement
     */
    public function setPositionIncrement($positionIncrement)
    {
        $this->_positionIncrement = $positionIncrement;
    }
}
/**
 * Token filter converts (normalizes) a Token or removes it from a token stream.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
abstract class Zend_Search_Lucene_Analysis_TokenFilter
{
    /**
     * Normalize the given Token, or remove it from the stream by
     * returning null.
     *
     * @param Zend_Search_Lucene_Analysis_Token $srcToken  token to process
     * @return Zend_Search_Lucene_Analysis_Token|null  the resulting token, or null to drop it
     */
    public abstract function normalize(Zend_Search_Lucene_Analysis_Token $srcToken);
}
/**
 * Lower case Token filter.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Analysis_TokenFilter_LowerCase extends Zend_Search_Lucene_Analysis_TokenFilter
{
    /**
     * Builds a new Token whose term text is the strtolower() form of the
     * source token's text. Offsets, lexical type and position increment are
     * copied from the source token, which itself is left untouched.
     *
     * @param Zend_Search_Lucene_Analysis_Token $srcToken
     * @return Zend_Search_Lucene_Analysis_Token
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
    {
        $loweredText = strtolower($srcToken->getTermText());
        $startOffset = $srcToken->getStartOffset();
        $endOffset   = $srcToken->getEndOffset();
        $type        = $srcToken->getType();

        $result = new Zend_Search_Lucene_Analysis_Token($loweredText, $startOffset, $endOffset, $type);
        $result->setPositionIncrement($srcToken->getPositionIncrement());

        return $result;
    }
}
/**
 * A Document is a set of fields. Each field has a name and a textual value.
 *
 * @package    Zend_Search_Lucene
 * @subpackage document
 * @copyright  Copyright (c) 2005-2006 Zend Technologies Inc. (http://www.zend.com)
 * @license    Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Document
{
    /**
     * Fields of this document: an associative array of
     * Zend_Search_Lucene_Field objects keyed by field name.
     *
     * @var array
     */
    protected $_fields = array();

    /**
     * Boost value of the document; 1.0 unless changed.
     *
     * @var float
     */
    public $boost = 1.0;


    /**
     * Magic accessor that forwards to getFieldValue(), so a field's string
     * value can be read as a property of the document.
     *
     * @param  string $offset  field name
     * @return string
     */
    public function __get($offset)
    {
        return $this->getFieldValue($offset);
    }


    /**
     * Add a field object to this document.
     *
     * The field is stored under its name, so a field added later with the
     * same name replaces the earlier one.
     *
     * @param Zend_Search_Lucene_Field $field
     */
    public function addField(Zend_Search_Lucene_Field $field)
    {
        $fieldName = $field->name;

        $this->_fields[$fieldName] = $field;
    }


    /**
     * Return an array with the names of the fields in this document.
     *
     * @return array
     */
    public function getFieldNames()
    {
        $names = array();

        foreach ($this->_fields as $fieldName => $unusedField) {
            $names[] = $fieldName;
        }

        return $names;
    }


    /**
     * Returns Zend_Search_Lucene_Field object for a named field in this document.
     *
     * @param string $fieldName
     * @throws Zend_Search_Lucene_Exception  if the document has no field with that name
     * @return Zend_Search_Lucene_Field
     */
    public function getField($fieldName)
    {
        if (array_key_exists($fieldName, $this->_fields)) {
            return $this->_fields[$fieldName];
        }

        throw new Zend_Search_Lucene_Exception("Field name \"$fieldName\" not found in document.");
    }


    /**
     * Returns the string value of a named field in this document.
     *
     * @see __get()
     * @param string $fieldName
     * @throws Zend_Search_Lucene_Exception  if the document has no field with that name
     * @return string
     */
    public function getFieldValue($fieldName)
    {
        $field = $this->getField($fieldName);

        return $field->stringValue;
    }
}
/**
 * Exception type thrown by the Zend_Search_Lucene component.
 *
 * Adds no behavior of its own; it only gives callers a Lucene-specific
 * class to catch.
 *
 * @package    Zend_Search_Lucene
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Exception extends Zend_Search_Exception
{
}
/**
 * A field is a section of a Document.  Each field has two parts,
 * a name and a value. Values may be free text or they may be atomic
 * keywords, which are not further processed. Such keywords may
 * be used to represent dates, urls, etc.  Fields are optionally
 * stored in the index, so that they may be returned with hits
 * on the document.
 *
 * @package    Zend_Search_Lucene
 * @subpackage document
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Field
{
    // NOTE(review): $kind is declared but never assigned anywhere in this class.
    public $kind;

    public $name        = 'body';
    public $stringValue = null;
    public $isStored    = false;
    public $isIndexed   = true;
    public $isTokenized = true;
    public $isBinary    = false;

    public $storeTermVector = false;

    public $boost = 1.0;

    /**
     * Creates a field with explicit storage/indexing flags.
     *
     * The term vector flag is always reset to false and the boost to 1.0.
     *
     * @param string  $name
     * @param string  $stringValue
     * @param boolean $isStored
     * @param boolean $isIndexed
     * @param boolean $isTokenized
     * @param boolean $isBinary
     */
    public function __construct($name, $stringValue, $isStored, $isIndexed, $isTokenized, $isBinary = false)
    {
        $this->storeTermVector = false;
        $this->boost           = 1.0;

        $this->name        = $name;
        $this->stringValue = $stringValue;

        $this->isStored    = $isStored;
        $this->isIndexed   = $isIndexed;
        $this->isTokenized = $isTokenized;
        $this->isBinary    = $isBinary;
    }


    /**
     * Constructs a String-valued Field that is not tokenized, but is indexed
     * and stored.  Useful for non-text fields, e.g. date or url.
     *
     * @param string $name
     * @param string $value
     * @return Zend_Search_Lucene_Field
     */
    public static function Keyword($name, $value)
    {
        $isStored    = true;
        $isIndexed   = true;
        $isTokenized = false;

        return new self($name, $value, $isStored, $isIndexed, $isTokenized);
    }


    /**
     * Constructs a String-valued Field that is not tokenized nor indexed,
     * but is stored in the index, for return with hits.
     *
     * @param string $name
     * @param string $value
     * @return Zend_Search_Lucene_Field
     */
    public static function UnIndexed($name, $value)
    {
        $isStored    = true;
        $isIndexed   = false;
        $isTokenized = false;

        return new self($name, $value, $isStored, $isIndexed, $isTokenized);
    }


    /**
     * Constructs a Binary String valued Field that is not tokenized nor indexed,
     * but is stored in the index, for return with hits.
     *
     * @param string $name
     * @param string $value
     * @return Zend_Search_Lucene_Field
     */
    public static function Binary($name, $value)
    {
        $isStored    = true;
        $isIndexed   = false;
        $isTokenized = false;
        $isBinary    = true;

        return new self($name, $value, $isStored, $isIndexed, $isTokenized, $isBinary);
    }


    /**
     * Constructs a String-valued Field that is tokenized and indexed,
     * and is stored in the index, for return with hits.  Useful for short text
     * fields, like "title" or "subject". Term vector will not be stored for this field.
     *
     * @param string $name
     * @param string $value
     * @return Zend_Search_Lucene_Field
     */
    public static function Text($name, $value)
    {
        $isStored    = true;
        $isIndexed   = true;
        $isTokenized = true;

        return new self($name, $value, $isStored, $isIndexed, $isTokenized);
    }


    /**
     * Constructs a String-valued Field that is tokenized and indexed,
     * but that is not stored in the index.
     *
     * @param string $name
     * @param string $value
     * @return Zend_Search_Lucene_Field
     */
    public static function UnStored($name, $value)
    {
        $isStored    = false;
        $isIndexed   = true;
        $isTokenized = true;

        return new self($name, $value, $isStored, $isIndexed, $isTokenized);
    }
}
/**
 * Plain value holder describing one field of an index segment.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Index_FieldInfo
{
    /** Field name. */
    public $name;

    /** Flag: the field is indexed. */
    public $isIndexed;

    /** Field number. */
    public $number;

    /** Flag: a term vector is stored for the field. */
    public $storeTermVector;

    /**
     * Stores the given values as-is in the public properties of the same name.
     *
     * @param string  $name
     * @param mixed   $isIndexed
     * @param integer $number
     * @param mixed   $storeTermVector
     */
    public function __construct($name, $isIndexed, $number, $storeTermVector)
    {
        $this->number          = $number;
        $this->name            = $name;
        $this->storeTermVector = $storeTermVector;
        $this->isIndexed       = $isIndexed;
    }
}
(http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Exception */ +require_once 'Zend/Search/Lucene/Exception.php'; + + +/** + * @package    Zend_Search_Lucene + * @subpackage Index + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Index_SegmentInfo +{ +    /** +     * Number of docs in a segment +     * +     * @var integer +     */ +    private $_docCount; + +    /** +     * Segment name +     * +     * @var string +     */ +    private $_name; + +    /** +     * Term Dictionary Index +     * Array of the Zend_Search_Lucene_Index_Term objects +     * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos +     * +     * @var array +     */ +    private $_termDictionary; + +    /** +     * Term Dictionary Index TermInfos +     * Array of the Zend_Search_Lucene_Index_TermInfo objects +     * +     * @var array +     */ +    private $_termDictionaryInfos; + +    /** +     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment +     * +     * @var array +     */ +    private $_fields; + +    /** +     * Field positions in a dictionary. +     * (Term dictionary contains filelds ordered by names) +     * +     * @var array +     */ +    private $_fieldsDicPositions; + + +    /** +     * Associative array where the key is the file name and the value is data offset +     * in a compound segment file (.csf). +     * +     * @var array +     */ +    private $_segFiles; + +    /** +     * File system adapter. +     * +     * @var Zend_Search_Lucene_Storage_Directory_Filesystem +     */ +    private $_directory; + +    /** +     * Normalization factors. +     * An array fieldName => normVector +     * normVector is a binary string. 
+     * Each byte corresponds to an indexed document in a segment and +     * encodes normalization factor (float value, encoded by +     * Zend_Search_Lucene_Search_Similarity::encodeNorm()) +     * +     * @var array +     */ +    private $_norms = array(); + +    /** +     * Zend_Search_Lucene_Index_SegmentInfo constructor needs Segmentname, +     * Documents count and Directory as a parameter. +     * +     * @param string $name +     * @param integer $docCount +     * @param Zend_Search_Lucene_Storage_Directory $directory +     */ +    public function __construct($name, $docCount, $directory) +    { +        $this->_name = $name; +        $this->_docCount = $docCount; +        $this->_directory = $directory; +        $this->_termDictionary = null; + +        $this->_segFiles = array(); +        $cfsFile = $this->_directory->getFileObject($name . '.cfs'); +        $segFilesCount = $cfsFile->readVInt(); + +        for ($count = 0; $count < $segFilesCount; $count++) { +            $dataOffset = $cfsFile->readLong(); +            $fileName = $cfsFile->readString(); +            $this->_segFiles[$fileName] = $dataOffset; +        } + +        $fnmFile = $this->openCompoundFile('.fnm'); +        $fieldsCount = $fnmFile->readVInt(); +        $fieldNames = array(); +        $fieldNums  = array(); +        $this->_fields = array(); +        for ($count=0; $count < $fieldsCount; $count++) { +            $fieldName = $fnmFile->readString(); +            $fieldBits = $fnmFile->readByte(); +            $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName, +                                                                            $fieldBits & 1, +                                                                            $count, +                                                                            $fieldBits & 2 ); +            if ($fieldBits & 0x10) { +                // norms are omitted for the indexed field +                
$this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount); +            } + +            $fieldNums[$count]  = $count; +            $fieldNames[$count] = $fieldName; +        } +        array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums); +        $this->_fieldsDicPositions = array_flip($fieldNums); +    } + +    /** +     * Opens index file stoted within compound index file +     * +     * @param string $extension +     * @throws Zend_Search_Lucene_Exception +     * @return Zend_Search_Lucene_Storage_File +     */ +    public function openCompoundFile($extension) +    { +        $filename = $this->_name . $extension; + +        if( !isset($this->_segFiles[ $filename ]) ) { +            throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain ' +                                       . $filename . ' file.' ); +        } + +        $file = $this->_directory->getFileObject( $this->_name.".cfs" ); +        $file->seek( $this->_segFiles[ $filename ] ); +        return $file; +    } + +    /** +     * Returns field index or -1 if field is not found +     * +     * @param string $fieldName +     * @return integer +     */ +    public function getFieldNum($fieldName) +    { +        foreach( $this->_fields as $field ) { +            if( $field->name == $fieldName ) { +                return $field->number; +            } +        } + +        return -1; +    } + +    /** +     * Returns field info for specified field +     * +     * @param integer $fieldNum +     * @return ZSearchFieldInfo +     */ +    public function getField($fieldNum) +    { +        return $this->_fields[$fieldNum]; +    } + +    /** +     * Returns array of fields. +     * if $indexed parameter is true, then returns only indexed fields. 
+     * +     * @param boolean $indexed +     * @return array +     */ +    public function getFields($indexed = false) +    { +        $result = array(); +        foreach( $this->_fields as $field ) { +            if( (!$indexed) || $field->isIndexed ) { +                $result[ $field->name ] = $field->name; +            } +        } +        return $result; +    } + +    /** +     * Returns the total number of documents in this segment. +     * +     * @return integer +     */ +    public function count() +    { +        return $this->_docCount; +    } + + +    /** +     * Loads Term dictionary from TermInfoIndex file +     */ +    protected function _loadDictionary() +    { +        if ($this->_termDictionary !== null) { +            return; +        } + +        $this->_termDictionary = array(); +        $this->_termDictionaryInfos = array(); + +        $tiiFile = $this->openCompoundFile('.tii'); +        $tiVersion = $tiiFile->readInt(); +        if ($tiVersion != (int)0xFFFFFFFE) { +            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format'); +        } + +        $indexTermCount = $tiiFile->readLong(); +                          $tiiFile->readInt();  // IndexInterval +        $skipInterval   = $tiiFile->readInt(); + +        $prevTerm     = ''; +        $freqPointer  =  0; +        $proxPointer  =  0; +        $indexPointer =  0; +        for ($count = 0; $count < $indexTermCount; $count++) { +            $termPrefixLength = $tiiFile->readVInt(); +            $termSuffix       = $tiiFile->readString(); +            $termValue        = substr( $prevTerm, 0, $termPrefixLength ) . 
$termSuffix; + +            $termFieldNum     = $tiiFile->readVInt(); +            $docFreq          = $tiiFile->readVInt(); +            $freqPointer     += $tiiFile->readVInt(); +            $proxPointer     += $tiiFile->readVInt(); +            if( $docFreq >= $skipInterval ) { +                $skipDelta = $tiiFile->readVInt(); +            } else { +                $skipDelta = 0; +            } + +            $indexPointer += $tiiFile->readVInt(); + +            $this->_termDictionary[] =  new Zend_Search_Lucene_Index_Term($termValue,$termFieldNum); +            $this->_termDictionaryInfos[] = +                new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer); +            $prevTerm = $termValue; +        } +    } + + +    /** +     * Return segment name +     * +     * @return string +     */ +    public function getName() +    { +        return $this->_name; +    } + + +    /** +     * Scans terms dictionary and returns term info +     * +     * @param Zend_Search_Lucene_Index_Term $term +     * @return Zend_Search_Lucene_Index_TermInfo +     */ +    public function getTermInfo($term) +    { +        $this->_loadDictionary(); + +        $searchField = $this->getFieldNum($term->field); + +        if ($searchField == -1) { +            return null; +        } +        $searchDicField = $this->_fieldsDicPositions[$searchField]; + +        // search for appropriate value in dictionary +        $lowIndex = 0; +        $highIndex = count($this->_termDictionary)-1; +        while ($highIndex >= $lowIndex) { +            // $mid = ($highIndex - $lowIndex)/2; +            $mid = ($highIndex + $lowIndex) >> 1; +            $midTerm = $this->_termDictionary[$mid]; + +            $delta = $searchDicField - $this->_fieldsDicPositions[$midTerm->field]; +            if ($delta == 0) { +                $delta = strcmp($term->text, $midTerm->text); +            } + +            if ($delta < 0) { +                
$highIndex = $mid-1; +            } elseif ($delta > 0) { +                $lowIndex  = $mid+1; +            } else { +                return $this->_termDictionaryInfos[$mid]; // We got it! +            } +        } + +        if ($highIndex == -1) { +            // Term is out of the dictionary range +            return null; +        } + +        $prevPosition = $highIndex; +        $prevTerm = $this->_termDictionary[$prevPosition]; +        $prevTermInfo = $this->_termDictionaryInfos[ $prevPosition ]; + +        $tisFile = $this->openCompoundFile('.tis'); +        $tiVersion = $tisFile->readInt(); +        if ($tiVersion != (int)0xFFFFFFFE) { +            throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format'); +        } + +        $termCount     = $tisFile->readLong(); +        $indexInterval = $tisFile->readInt(); +        $skipInterval  = $tisFile->readInt(); + +        $tisFile->seek($prevTermInfo->indexPointer - 20 /* header size*/, SEEK_CUR); + +        $termValue    = $prevTerm->text; +        $termFieldNum = $prevTerm->field; +        $freqPointer = $prevTermInfo->freqPointer; +        $proxPointer = $prevTermInfo->proxPointer; +        for ($count = $prevPosition*$indexInterval + 1; +             $count < $termCount && +             ( $this->_fieldsDicPositions[ $termFieldNum ] < $searchDicField || +              ($this->_fieldsDicPositions[ $termFieldNum ] == $searchDicField && +               strcmp($termValue, $term->text) < 0) ); +             $count++) { +            $termPrefixLength = $tisFile->readVInt(); +            $termSuffix       = $tisFile->readString(); +            $termFieldNum     = $tisFile->readVInt(); +            $termValue        = substr( $termValue, 0, $termPrefixLength ) . 
$termSuffix; + +            $docFreq      = $tisFile->readVInt(); +            $freqPointer += $tisFile->readVInt(); +            $proxPointer += $tisFile->readVInt(); +            if( $docFreq >= $skipInterval ) { +                $skipOffset = $tisFile->readVInt(); +            } else { +                $skipOffset = 0; +            } +        } + +        // Compare the text with strcmp(), exactly as the scan loop does: a loose == treats +        // numeric strings such as "1e3" and "1000" as equal and would return the wrong term. +        if ($termFieldNum == $searchField && strcmp($termValue, $term->text) == 0) { +            return new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset); +        } else { +            return null; +        } +    } + +    /** +     * Returns normalization factor for specified documents +     * +     * The norms of a field are read from the '.f<fieldNum>' file on first use +     * and cached in $this->_norms. +     * +     * @param integer $id +     * @param string $fieldName +     * @return mixed  result of Zend_Search_Lucene_Search_Similarity::decodeNorm(), +     *                or null if the field is unknown or not indexed +     */ +    public function norm($id, $fieldName) +    { +        $fieldNum = $this->getFieldNum($fieldName); + +        // getFieldNum() reports an unknown field as -1 (see getTermInfo()); there is no field info to inspect then +        if ( $fieldNum == -1 || !($this->_fields[$fieldNum]->isIndexed) ) { +            return null; +        } + +        if ( !isset( $this->_norms[$fieldNum] )) { +            $fFile = $this->openCompoundFile('.f' . $fieldNum); +            $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount); +        } + +        // One norm byte per document; [] offset syntax ({} offsets were removed in PHP 8) +        return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) ); +    } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php new file mode 100644 index 00000000..f90d6ed3 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/SegmentWriter.php @@ -0,0 +1,491 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. 
If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package    Zend_Search_Lucene + * @subpackage Index + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Exception */ +require_once 'Zend/Search/Lucene/Exception.php'; + +/** Zend_Search_Lucene_Analysis_Analyzer */ +require_once 'Zend/Search/Lucene/Analysis/Analyzer.php'; + +/** Zend_Search_Lucene_Index_SegmentInfo */ +require_once 'Zend/Search/Lucene/Index/SegmentInfo.php'; + + +/** + * @package    Zend_Search_Lucene + * @subpackage Index + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Index_SegmentWriter +{ +    /** +     * Expert: The fraction of terms in the "dictionary" which should be stored +     * in RAM.  Smaller values use more memory, but make searching slightly +     * faster, while larger values use less memory and make searching slightly +     * slower.  Searching is typically not dominated by dictionary lookup, so +     * tweaking this is rarely useful. +     * +     * @var integer +     */ +    static public $indexInterval = 128; + +    /** Expert: The fraction of TermDocs entries stored in skip tables. +     * Larger values result in smaller indexes, greater acceleration, but fewer +     * accelerable cases, while smaller values result in bigger indexes, +     * less acceleration and more +     * accelerable cases. More detailed experiments would be useful here. 
+     * +     * 0x7FFFFFFF indicates that we don't use skip data +     * The value assigned below is 0x7FFFFFFF, i.e. skip data is disabled +     * (the "16" previously quoted here is presumably the Java Lucene default - TODO confirm) +     * +     * @var integer +     */ +    static public $skipInterval = 0x7FFFFFFF; + +    /** +     * Number of docs in a segment +     * +     * @var integer +     */ +    private $_docCount; + +    /** +     * Segment name +     * +     * @var string +     */ +    private $_name; + +    /** +     * File system adapter. +     * +     * @var Zend_Search_Lucene_Storage_Directory +     */ +    private $_directory; + +    /** +     * List of the index files. +     * Used for automatic compound file generation +     * +     * @var array +     */ +    private $_files; + +    /** +     * Term Dictionary +     * Array of the Zend_Search_Lucene_Index_Term objects, +     * keyed by Zend_Search_Lucene_Index_Term::key() +     * +     * @var array +     */ +    private $_termDictionary; + +    /** +     * Documents, which contain the term +     * An array termKey => array(docId => array of term positions) +     * +     * @var array +     */ +    private $_termDocs; + +    /** +     * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment +     * +     * @var array +     */ +    private $_fields; + +    /** +     * Normalization factors. +     * An array fieldName => normVector +     * normVector is a binary string. +     * Each byte corresponds to an indexed document in a segment and +     * encodes normalization factor (float value, encoded by +     * Zend_Search_Lucene_Search_Similarity::encodeNorm()) +     * +     * @var array +     */ +    private $_norms; + + +    /** +     * '.fdx'  file - Stored Fields, the field index. +     * +     * @var Zend_Search_Lucene_Storage_File +     */ +    private $_fdxFile; + +    /** +     * '.fdt'  file - Stored Fields, the field data. +     * +     * @var Zend_Search_Lucene_Storage_File +     */ +    private $_fdtFile; + + +    /** +     * Object constructor. 
+     * +     * @param Zend_Search_Lucene_Storage_Directory $directory +     * @param string $name +     */ +    public function __construct($directory, $name) +    { +        $this->_directory = $directory; +        $this->_name      = $name; +        $this->_docCount  = 0; + +        $this->_fields   = array(); +        $this->_termDocs = array(); +        $this->_files    = array(); +        $this->_norms    = array(); + +        $this->_fdxFile = null; +        $this->_fdtFile = null; +    } + + +    /** +     * Add field to the segment +     * +     * @param Zend_Search_Lucene_Field $field +     */ +    private function _addFieldInfo(Zend_Search_Lucene_Field $field) +    { +        if (!isset($this->_fields[$field->name])) { +            $this->_fields[$field->name] = +                                new Zend_Search_Lucene_Index_FieldInfo($field->name, +                                                                       $field->isIndexed, +                                                                       count($this->_fields), +                                                                       $field->storeTermVector); +        } else { +            $this->_fields[$field->name]->isIndexed       |= $field->isIndexed; +            $this->_fields[$field->name]->storeTermVector |= $field->storeTermVector; +        } +    } + + +    /** +     * Adds a document to this segment. 
+     * +     * @param Zend_Search_Lucene_Document $document +     * @throws Zend_Search_Lucene_Exception +     */ +    public function addDocument(Zend_Search_Lucene_Document $document) +    { +        $storedFields = array(); + +        foreach ($document->getFieldNames() as $fieldName) { +            $field = $document->getField($fieldName); +            $this->_addFieldInfo($field); + +            if ($field->storeTermVector) { +                /** +                 * @todo term vector storing support +                 */ +                throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.'); +            } + +            if ($field->isIndexed) { +                if ($field->isTokenized) { +                    $tokenList = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($field->stringValue); +                } else { +                    $tokenList = array(); +                    $tokenList[] = new Zend_Search_Lucene_Analysis_Token($field->stringValue, 0, strlen($field->stringValue)); +                } + +                $position = 0; +                foreach ($tokenList as $token) { +                    $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name); +                    $termKey = $term->key(); + +                    if (!isset($this->_termDictionary[$termKey])) { +                        // New term +                        $this->_termDictionary[$termKey] = $term; +                        $this->_termDocs[$termKey] = array(); +                        $this->_termDocs[$termKey][$this->_docCount] = array(); +                    } else if (!isset($this->_termDocs[$termKey][$this->_docCount])) { +                        // Existing term, but new term entry +                        $this->_termDocs[$termKey][$this->_docCount] = array(); +                    } +                    $position += $token->getPositionIncrement(); +                    
$this->_termDocs[$termKey][$this->_docCount][] = $position; +                } +            } + +            if ($field->isStored) { +                $storedFields[] = $field; +            } +        } + +        // NOTE(review): a document without stored fields gets no '.fdx' entry at all - confirm readers tolerate that +        if (count($storedFields) != 0) { +            if (!isset($this->_fdxFile)) { +                $this->_fdxFile = $this->_directory->createFile($this->_name . '.fdx'); +                $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt'); + +                $this->_files[] = $this->_name . '.fdx'; +                $this->_files[] = $this->_name . '.fdt'; +            } + +            // '.fdx' records the offset in '.fdt' at which this document's stored fields begin +            $this->_fdxFile->writeLong($this->_fdtFile->tell()); + +            $this->_fdtFile->writeVInt(count($storedFields)); +            foreach ($storedFields as $field) { +                $this->_fdtFile->writeVInt($this->_fields[$field->name]->number); +                // Each flag must be parenthesised: "|" binds tighter than "?:", so the +                // unparenthesised form collapsed to 0x02 for every tokenized field. +                $this->_fdtFile->writeByte(($field->isTokenized ? 0x01 : 0x00) | +                                           ($field->isBinary ?    0x02 : 0x00) | +                                           0x00 /* 0x04 - third bit, compressed (ZLIB) */ ); +                if ($field->isBinary) { +                    $this->_fdtFile->writeVInt(strlen($field->stringValue)); +                    $this->_fdtFile->writeBytes($field->stringValue); +                } else { +                    $this->_fdtFile->writeString($field->stringValue); +                } +            } +        } + +        $this->_docCount++; +    } + + +    /** +     * Dump Field Info (.fnm) segment file +     */ +    private function _dumpFNM() +    { +        $fnmFile = $this->_directory->createFile($this->_name . '.fnm'); +        $fnmFile->writeVInt(count($this->_fields)); + +        foreach ($this->_fields as $field) { +            $fnmFile->writeString($field->name); +            $fnmFile->writeByte(($field->isIndexed       ? 0x01 : 0x00) | +                                ($field->storeTermVector ? 
0x02 : 0x00) | +// not supported yet            0x04 /* term positions are stored with the term vectors */ | +// not supported yet            0x08 /* term offsets are stored with the term vectors */   | +/* not supported yet */         0x10 /* norms are omitted for the indexed field */ +                               ); +        } + +        $this->_files[] = $this->_name . '.fnm'; +    } + + +    /** +     * Dump Term Dictionary segment file entry. +     * Used to write entry to .tis or .tii files +     * +     * The term text is written as the length of the prefix shared with the +     * previous term (0 if there is none or the field differs) followed by the +     * remaining suffix. $prevTerm and $prevTermInfo are passed by reference and +     * are set to $term and $termInfo before returning. +     * +     * @param Zend_Search_Lucene_Storage_File $dicFile +     * @param Zend_Search_Lucene_Index_Term $prevTerm +     * @param Zend_Search_Lucene_Index_Term $term +     * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo +     * @param Zend_Search_Lucene_Index_TermInfo $termInfo +     */ +    private function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile, +                                        &$prevTerm,     Zend_Search_Lucene_Index_Term     $term, +                                        &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo) +    { +        if (isset($prevTerm) && $prevTerm->field == $term->field) { +            // Length of the common prefix; [] offset syntax ({} offsets were removed in PHP 8) +            $prefixLength = 0; +            while ($prefixLength < strlen($prevTerm->text) && +                   $prefixLength < strlen($term->text) && +                   $prevTerm->text[$prefixLength] == $term->text[$prefixLength] +                  ) { +                $prefixLength++; +            } +            // Write prefix length +            $dicFile->writeVInt($prefixLength); +            // Write suffix +            $dicFile->writeString( substr($term->text, $prefixLength) ); +        } else { +            // Write prefix length +            $dicFile->writeVInt(0); +            // Write suffix +            $dicFile->writeString($term->text); +        } +        // Write field number +        $dicFile->writeVInt($term->field); +        // DocFreq (the count of documents which contain the term) +        
$dicFile->writeVInt($termInfo->docFreq); + +        $prevTerm = $term; + +        if (!isset($prevTermInfo)) { +            // Write FreqDelta +            $dicFile->writeVInt($termInfo->freqPointer); +            // Write ProxDelta +            $dicFile->writeVInt($termInfo->proxPointer); +        } else { +            // Write FreqDelta +            $dicFile->writeVInt($termInfo->freqPointer - $prevTermInfo->freqPointer); +            // Write ProxDelta +            $dicFile->writeVInt($termInfo->proxPointer - $prevTermInfo->proxPointer); +        } +        // Write SkipOffset - it's not 0 when $termInfo->docFreq > self::$skipInterval +        if ($termInfo->skipOffset != 0) { +            $dicFile->writeVInt($termInfo->skipOffset); +        } + +        $prevTermInfo = $termInfo; +    } + +    /** +     * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files +     */ +    private function _dumpDictionary() +    { +        $tisFile = $this->_directory->createFile($this->_name . '.tis'); +        $tisFile->writeInt((int)0xFFFFFFFE); +        $tisFile->writeLong(count($this->_termDictionary)); +        $tisFile->writeInt(self::$indexInterval); +        $tisFile->writeInt(self::$skipInterval); + +        $tiiFile = $this->_directory->createFile($this->_name . '.tii'); +        $tiiFile->writeInt((int)0xFFFFFFFE); +        $tiiFile->writeLong((int)((count($this->_termDictionary) - 1)/self::$indexInterval) + 1); +        $tiiFile->writeInt(self::$indexInterval); +        $tiiFile->writeInt(self::$skipInterval); + +        $frqFile = $this->_directory->createFile($this->_name . '.frq'); +        $prxFile = $this->_directory->createFile($this->_name . 
'.prx'); + +        $termKeys = array_keys($this->_termDictionary); +        sort($termKeys, SORT_STRING); + +        $termCount = 0; + +        $prevTerm     = null; +        $prevTermInfo = null; +        $prevIndexTerm     = null; +        $prevIndexTermInfo = null; +        $prevIndexPosition = 0; + +        foreach ($termKeys as $termId) { +            $freqPointer = $frqFile->tell(); +            $proxPointer = $prxFile->tell(); + +            $prevDoc = 0; +            foreach ($this->_termDocs[$termId] as $docId => $termPositions) { +                $docDelta = ($docId - $prevDoc)*2; +                $prevDoc = $docId; +                if (count($termPositions) > 1) { +                    $frqFile->writeVInt($docDelta); +                    $frqFile->writeVInt(count($termPositions)); +                } else { +                    $frqFile->writeVInt($docDelta + 1); +                } + +                $prevPosition = 0; +                foreach ($termPositions as $position) { +                    $prxFile->writeVInt($position - $prevPosition); +                    $prevPosition = $position; +                } +            } + +            if (count($this->_termDocs[$termId]) >= self::$skipInterval) { +                /** +                 * @todo Write Skip Data to a freq file. 
+                 * It's not used now, but must be implemented to be compatible with Lucene +                 */ +                $skipOffset = $frqFile->tell() - $freqPointer; +            } else { +                $skipOffset = 0; +            } + +            $term = new Zend_Search_Lucene_Index_Term($this->_termDictionary[$termId]->text, +                                                      $this->_fields[$this->_termDictionary[$termId]->field]->number); +            $termInfo = new Zend_Search_Lucene_Index_TermInfo(count($this->_termDocs[$termId]), +                                            $freqPointer, $proxPointer, $skipOffset); + +            $this->_dumpTermDictEntry($tisFile, $prevTerm, $term, $prevTermInfo, $termInfo); + +            if ($termCount % self::$indexInterval == 0) { +                $this->_dumpTermDictEntry($tiiFile, $prevIndexTerm, $term, $prevIndexTermInfo, $termInfo); + +                $indexPosition = $tisFile->tell(); +                $tiiFile->writeVInt($indexPosition - $prevIndexPosition); +                $prevIndexPosition = $indexPosition; +            } +            $termCount++; +        } + +        $this->_files[] = $this->_name . '.tis'; +        $this->_files[] = $this->_name . '.tii'; +        $this->_files[] = $this->_name . '.frq'; +        $this->_files[] = $this->_name . '.prx'; +    } + + +    /** +     * Generate compound index file +     */ +    private function _generateCFS() +    { +        $cfsFile = $this->_directory->createFile($this->_name . 
'.cfs'); +        $cfsFile->writeVInt(count($this->_files)); + +        $dataOffsetPointers = array(); +        foreach ($this->_files as $fileName) { +            $dataOffsetPointers[$fileName] = $cfsFile->tell(); +            $cfsFile->writeLong(0); // write dummy data +            $cfsFile->writeString($fileName); +        } + +        foreach ($this->_files as $fileName) { +            // Get actual data offset +            $dataOffset = $cfsFile->tell(); +            // Seek to the data offset pointer +            $cfsFile->seek($dataOffsetPointers[$fileName]); +            // Write actual data offset value +            $cfsFile->writeLong($dataOffset); +            // Seek back to the end of file +            $cfsFile->seek($dataOffset); + +            $dataFile = $this->_directory->getFileObject($fileName); +            $cfsFile->writeBytes($dataFile->readBytes($this->_directory->fileLength($fileName))); + +            $this->_directory->deleteFile($fileName); +        } +    } + + +    /** +     * Close segment, write it to disk and return segment info +     * +     * @return Zend_Search_Lucene_Index_SegmentInfo +     */ +    public function close() +    { +        if ($this->_docCount == 0) { +            return null; +        } + +        $this->_dumpFNM(); +        $this->_dumpDictionary(); + +        $this->_generateCFS(); + +        return new Zend_Search_Lucene_Index_SegmentInfo($this->_name, +                                                        $this->_docCount, +                                                        $this->_directory); +    } + +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/Term.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/Term.php new file mode 100644 index 00000000..e30ce587 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/Term.php @@ -0,0 +1,70 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * 
license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package    Zend_Search_Lucene + * @subpackage Index + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * A Term represents a word from text.  This is the unit of search.  It is + * composed of two elements, the text of the word, as a string, and the name of + * the field that the text occured in, an interned string. + * + * Note that terms may represent more than words from text fields, but also + * things like dates, email addresses, urls, etc. + * + * @package    Zend_Search_Lucene + * @subpackage Index + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Index_Term +{ +    /** +     * Field name or field number (depending from context) +     * +     * @var mixed +     */ +    public $field; + +    /** +     * Term value +     * +     * @var string +     */ +    public $text; + + +    /** +     * @todo docblock +     */ +    public function __construct( $text, $field = 'contents' ) +    { +        $this->field = $field; +        $this->text = $text; +    } + + +    /** +     * @todo docblock +     */ +    public function key() +    { +        return $this->field . chr(0) . 
$this->text; +    } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/TermInfo.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/TermInfo.php new file mode 100644 index 00000000..ddef721d --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/TermInfo.php @@ -0,0 +1,77 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package    Zend_Search_Lucene + * @subpackage Index + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * A Zend_Search_Lucene_Index_TermInfo represents a record of information stored for a term. + * + * @package    Zend_Search_Lucene + * @subpackage Index + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Index_TermInfo +{ +    /** +     * The number of documents which contain the term. +     * +     * @var integer +     */ +    public $docFreq; + +    /** +     * Data offset in a Frequencies file. +     * +     * @var integer +     */ +    public $freqPointer; + +    /** +     * Data offset in a Positions file. +     * +     * @var integer +     */ +    public $proxPointer; + +    /** +     * SkipData offset in a Frequencies file. 
+     * +     * @var integer +     */ +    public $skipOffset; + +    /** +     * Term offset of the _next_ term in a TermDictionary file. +     * Used only for Term Index +     * +     * @var integer +     */ +    public $indexPointer; + +    public function __construct($docFreq, $freqPointer, $proxPointer, $skipOffset, $indexPointer = null) +    { +        $this->docFreq      = $docFreq; +        $this->freqPointer  = $freqPointer; +        $this->proxPointer  = $proxPointer; +        $this->skipOffset   = $skipOffset; +        $this->indexPointer = $indexPointer; +    } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Index/Writer.php b/buildscripts/texbuilder/Zend/Search/Lucene/Index/Writer.php new file mode 100644 index 00000000..da4af000 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Index/Writer.php @@ -0,0 +1,308 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package    Zend_Search_Lucene + * @subpackage Index + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Index_SegmentWriter */ +require_once 'Zend/Search/Lucene/Index/SegmentWriter.php'; + +/** Zend_Search_Lucene_Index_SegmentInfo */ +require_once 'Zend/Search/Lucene/Index/SegmentInfo.php'; + + +/** + * @package    Zend_Search_Lucene + * @subpackage Index + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Index_Writer +{ +    /** +     * @todo Implement segment merger +     * @todo Implement mergeFactor, minMergeDocs, maxMergeDocs usage. +     * @todo Implement Analyzer substitution +     * @todo Implement Zend_Search_Lucene_Storage_DirectoryRAM and Zend_Search_Lucene_Storage_FileRAM to use it for +     *       temporary index files +     * @todo Directory lock processing +     */ + +    /** +     * File system adapter. +     * +     * @var Zend_Search_Lucene_Storage_Directory +     */ +    private $_directory = null; + + +    /** +     * Index version +     * Counts how often the index has been changed by adding or deleting docs +     * +     * @var integer +     */ +    private $_version; + +    /** +     * Segment name counter. +     * Used to name new segments. +     * +     * @var integer +     */ +    private $_segmentNameCounter; + +    /** +     * Number of the segments in the index +     * +     * @var integer +     */ +    private $_segments; + +    /** +     * Determines how often segment indices +     * are merged by addDocument(). +     * +     * @var integer +     */ +    public $mergeFactor; + +    /** +     * Determines the minimal number of documents required before +     * the buffered in-memory documents are merged and a new Segment +     * is created. +     * +     * @var integer +     */ +    public $minMergeDocs; + +    /** +     * Determines the largest number of documents ever merged by addDocument(). 
+     * +     * @var integer +     */ +    public $maxMergeDocs; + +    /** +     * List of the segments, created by index writer +     * Array of Zend_Search_Lucene_Index_SegmentInfo objects +     * +     * @var array +     */ +    private $_newSegments; + +    /** +     * Current segment to add documents +     * +     * @var Zend_Search_Lucene_Index_SegmentWriter +     */ +    private $_currentSegment; + +    /** +     * Opens the index for writing +     * +     * IndexWriter constructor needs Directory as a parameter. It should be +     * a string with a path to the index folder or a Directory object. +     * Second constructor parameter create is optional - true to create the +     * index or overwrite the existing one. +     * +     * @param Zend_Search_Lucene_Storage_Directory $directory +     * @param boolean $create +     */ +    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $create = false) +    { +        $this->_directory = $directory; + +        if ($create) { +            foreach ($this->_directory->fileList() as $file) { +                if ($file == 'deletable' || +                    $file == 'segments'  || +                    substr($file, strlen($file)-4) == '.cfs') { +                        $this->_directory->deleteFile($file); +                    } +            } +            $segmentsFile = $this->_directory->createFile('segments'); +            $segmentsFile->writeInt((int)0xFFFFFFFF); +            // write version +            $segmentsFile->writeLong(0); +            // write name counter +            $segmentsFile->writeInt(0); +            // write segment counter +            $segmentsFile->writeInt(0); + +            $deletableFile = $this->_directory->createFile('deletable'); +            // write counter +            $deletableFile->writeInt(0); + +            $this->_version            = 0; +            $this->_segmentNameCounter = 0; +            $this->_segments           = 0; +        } else { +    
        $segmentsFile = $this->_directory->getFileObject('segments'); +            $format = $segmentsFile->readInt(); +            if ($format != (int)0xFFFFFFFF) { +                throw new Zend_Search_Lucene_Exception('Wrong segments file format'); +            } + +            // read version +            $this->_version            = $segmentsFile->readLong(); +            // read counter +            $this->_segmentNameCounter = $segmentsFile->readInt(); +            // read segment counter +            $this->_segments           = $segmentsFile->readInt(); +        } + +        $this->_newSegments = array(); +        $this->_currentSegment = null; +    } + +    /** +     * Adds a document to this index. +     * +     * @param Zend_Search_Lucene_Document $document +     */ +    public function addDocument(Zend_Search_Lucene_Document $document) +    { +        if ($this->_currentSegment === null) { +            $this->_currentSegment = +                new Zend_Search_Lucene_Index_SegmentWriter($this->_directory, $this->_newSegmentName()); +        } +        $this->_currentSegment->addDocument($document); +        $this->_version++; +    } + + + +    /** +     * Update segments file by adding current segment to a list +     * @todo !!!!!Finish the implementation +     * +     * @throws Zend_Search_Lucene_Exception +     */ +    private function _updateSegments() +    { +        $segmentsFile   = $this->_directory->getFileObject('segments'); +        $newSegmentFile = $this->_directory->createFile('segments.new'); + +        $newSegmentFile->writeInt((int)0xFFFFFFFF); +        $newSegmentFile->writeLong($this->_version); +        $newSegmentFile->writeInt($this->_segmentNameCounter); +        $newSegmentFile->writeInt($this->_segments + count($this->_newSegments)); + +        $segmentsFile->seek(20); +        $newSegmentFile->writeBytes($segmentsFile->readBytes($this->_directory->fileLength('segments') - 20)); + +        foreach ($this->_newSegments as 
$segmentName => $segmentInfo) { +            $newSegmentFile->writeString($segmentName); +            $newSegmentFile->writeInt($segmentInfo->count()); +        } + +        $this->_directory->renameFile('segments.new', 'segments'); +    } + + +    /** +     * Commit current changes +     * returns array of new segments +     * +     * @return array +     */ +    public function commit() +    { +        if ($this->_currentSegment !== null) { +            $newSegment = $this->_currentSegment->close(); +            if ($newSegment !== null) { +                $this->_newSegments[$newSegment->getName()] = $newSegment; +            } +            $this->_currentSegment = null; +        } + +        if (count($this->_newSegments) != 0) { +            $this->_updateSegments(); +        } + +        $result = $this->_newSegments; +        $this->_newSegments = array(); + +        return $result; +    } + + +    /** +     * Merges the provided indexes into this index. +     * +     * @param array $readers +     * @return void +     */ +    public function addIndexes($readers) +    { +        /** +         * @todo implementation +         */ +    } + + +    /** +     * Returns the number of documents currently in this index. +     * +     * @return integer +     */ +    public function docCount($readers) +    { +        /** +         * @todo implementation +         */ +    } + + +    /** +     * Flushes all changes to an index and closes all associated files. +     * +     */ +    public function close() +    { +        /** +         * @todo implementation +         */ +    } + + +    /** +     * Merges all segments together into a single segment, optimizing +     * an index for search. 
+     * +     * return void +     */ +    public function optimize() +    { +        /** +         * @todo implementation +         */ +    } + +    /** +     * Get name for new segment +     * +     * @return string +     */ +    private function _newSegmentName() +    { +        return '_' . base_convert($this->_segmentNameCounter++, 10, 36); +    } + +} diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query.php new file mode 100644 index 00000000..dd8698e8 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query.php @@ -0,0 +1,98 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package    Zend_Search_Lucene + * @subpackage Search + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * @package    Zend_Search_Lucene + * @subpackage Search + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +abstract class Zend_Search_Lucene_Search_Query +{ + +    /** +     * query boost factor +     * +     * @var float +     */ +    private $_boost = 1.0; + +    /** +     * Query weight +     * +     * @var Zend_Search_Lucene_Search_Weight +     */ +    protected $_weight; + + +    /** +     * Gets the boost for this clause.  
Documents matching +     * this clause will (in addition to the normal weightings) have their score +     * multiplied by boost.   The boost is 1.0 by default. +     * +     * @return float +     */ +    public function getBoost() +    { +        return $this->_boost; +    } + +    /** +     * Sets the boost for this query clause to $boost. +     * +     * @param float $boost +     */ +    public function setBoost($boost) +    { +        $this->_boost = $boost; +    } + +    /** +     * Score specified document +     * +     * @param integer $docId +     * @param Zend_Search_Lucene $reader +     * @return float +     */ +    abstract public function score($docId, $reader); + +    /** +     * Constructs an appropriate Weight implementation for this query. +     * +     * @param Zend_Search_Lucene $reader +     * @return Zend_Search_Lucene_Search_Weight +     */ +    abstract protected function _createWeight($reader); + +    /** +     * Constructs and initializes a Weight for this query (created via _createWeight(), then normalized by the query norm). +     * +     * @param Zend_Search_Lucene $reader +     */ +    protected function _initWeight($reader) +    { +        $this->_weight = $this->_createWeight($reader); +        $sum = $this->_weight->sumOfSquaredWeights(); +        $queryNorm = $reader->getSimilarity()->queryNorm($sum); +        $this->_weight->normalize($queryNorm); +    } + +}
\ No newline at end of file diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/MultiTerm.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/MultiTerm.php new file mode 100644 index 00000000..4a99c0f7 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/MultiTerm.php @@ -0,0 +1,437 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package    Zend_Search_Lucene + * @subpackage Search + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Search_Query */ +require_once 'Zend/Search/Lucene/Search/Query.php'; + +/** Zend_Search_Lucene_Search_Weight_MultiTerm */ +require_once 'Zend/Search/Lucene/Search/Weight/MultiTerm.php'; + + +/** + * @package    Zend_Search_Lucene + * @subpackage Search + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Search_Query +{ + +    /** +     * Terms to find. +     * Array of Zend_Search_Lucene_Index_Term +     * +     * @var array +     */ +    private $_terms = array(); + +    /** +     * Term signs. +     * If true then term is required. +     * If false then term is prohibited. 
+     * If null then term is neither prohibited, nor required +     * +     * If array is null then all terms are required +     * +     * @var array +     */ + +    private $_signs = array(); + +    /** +     * Result vector. +     * Bitset or array of document IDs +     * (depending from Bitset extension availability). +     * +     * @var mixed +     */ +    private $_resVector = null; + +    /** +     * Terms positions vectors. +     * Array of Arrays: +     * term1Id => (docId => array( pos1, pos2, ... ), ...) +     * term2Id => (docId => array( pos1, pos2, ... ), ...) +     * +     * @var array +     */ +    private $_termsPositions = array(); + + +    /** +     * A score factor based on the fraction of all query terms +     * that a document contains. +     * float for conjunction queries +     * array of float for non conjunction queries +     * +     * @var mixed +     */ +    private $_coord = null; + + +    /** +     * Terms weights +     * array of Zend_Search_Lucene_Search_Weight +     * +     * @var array +     */ +    private $_weights = array(); + + +    /** +     * Class constructor.  Create a new multi-term query object. +     * +     * @param array $terms    Array of Zend_Search_Lucene_Index_Term objects +     * @param array $signs    Array of signs.  Sign is boolean|null. +     * @return void +     */ +    public function __construct($terms = null, $signs = null) +    { +        /** +         * @todo Check contents of $terms and $signs before adding them. 
+         */ +        if (is_array($terms)) { +            $this->_terms = $terms; + +            $this->_signs = null; +            // Check if all terms are required +            if (is_array($signs)) { +                foreach ($signs as $sign ) { +                    if ($sign !== true) { +                        $this->_signs = $signs; +                        break; +                    } +                } +            } +        } +    } + + +    /** +     * Add a $term (Zend_Search_Lucene_Index_Term) to this query. +     * +     * The sign is specified as: +     *     TRUE  - term is required +     *     FALSE - term is prohibited +     *     NULL  - term is neither prohibited, nor required +     * +     * @param  Zend_Search_Lucene_Index_Term $term +     * @param  boolean|null $sign +     * @return void +     */ +    public function addTerm(Zend_Search_Lucene_Index_Term $term, $sign=null) { +        /** +         * @todo This is not good.  Sometimes $this->_signs is an array, sometimes +         * it is null, even when there are terms.  It will be changed so that +         * it is always an array.
+         */ +        // A null sign list means every term added so far is required. Expand it +        // (keyed like $this->_terms) before the first non-required term is appended. +        if ($this->_signs === null && $sign !== true) { +            $this->_signs = array(); +            foreach ($this->_terms as $termId => $existingTerm) { +                $this->_signs[$termId] = true; +            } +        } + +        $this->_terms[] = $term; +        if ($this->_signs !== null) { +            $this->_signs[] = $sign; +        } +    } + + +    /** +     * Returns query term +     * +     * @return array +     */ +    public function getTerms() +    { +        return $this->_terms; +    } + + +    /** +     * Return terms signs +     * +     * @return array +     */ +    public function getSigns() +    { +        return $this->_signs; +    } + + +    /** +     * Set weight for specified term +     * +     * @param integer $num +     * @param Zend_Search_Lucene_Search_Weight_Term $weight +     */ +    public function setWeight($num, $weight) +    { +        $this->_weights[$num] = $weight; +    } + + +    /** +     * Constructs an appropriate Weight implementation for this query. 
+     * +     * @param Zend_Search_Lucene $reader +     * @return Zend_Search_Lucene_Search_Weight +     */ +    protected function _createWeight($reader) +    { +        return new Zend_Search_Lucene_Search_Weight_MultiTerm($this, $reader); +    } + + +    /** +     * Calculate result vector for Conjunction query +     * (like '+something +another') +     * +     * @param Zend_Search_Lucene $reader +     */ +    private function _calculateConjunctionResult($reader) +    { +        if (extension_loaded('bitset')) { +            foreach( $this->_terms as $termId=>$term ) { +                if($this->_resVector === null) { +                    $this->_resVector = bitset_from_array($reader->termDocs($term)); +                } else { +                    $this->_resVector = bitset_intersection( +                                $this->_resVector, +                                bitset_from_array($reader->termDocs($term)) ); +                } + +                $this->_termsPositions[$termId] = $reader->termPositions($term); +            } +        } else { +            foreach( $this->_terms as $termId=>$term ) { +                if($this->_resVector === null) { +                    $this->_resVector = array_flip($reader->termDocs($term)); +                } else { +                    $termDocs = array_flip($reader->termDocs($term)); +                    foreach($this->_resVector as $key=>$value) { +                        if (!isset( $termDocs[$key] )) { +                            unset( $this->_resVector[$key] ); +                        } +                    } +                } + +                $this->_termsPositions[$termId] = $reader->termPositions($term); +            } +        } +    } + + +    /** +     * Calculate result vector for non Conjunction query +     * (like '+something -another') +     * +     * @param Zend_Search_Lucene $reader +     */ +    private function _calculateNonConjunctionResult($reader) +    { +        if 
(extension_loaded('bitset')) { +            $required   = null; +            $neither    = bitset_empty(); +            $prohibited = bitset_empty(); + +            foreach ($this->_terms as $termId => $term) { +                $termDocs = bitset_from_array($reader->termDocs($term)); + +                if ($this->_signs[$termId] === true) { +                    // required +                    if ($required !== null) { +                        $required = bitset_intersection($required, $termDocs); +                    } else { +                        $required = $termDocs; +                    } +                } elseif ($this->_signs[$termId] === false) { +                    // prohibited +                    $prohibited = bitset_union($prohibited, $termDocs); +                } else { +                    // neither required, nor prohibited +                    $neither = bitset_union($neither, $termDocs); +                } + +                $this->_termsPositions[$termId] = $reader->termPositions($term); +            } + +            if ($required === null) { +                $required = $neither; +            } +            $this->_resVector = bitset_intersection( $required, +                                                     bitset_invert($prohibited, $reader->count()) ); +        } else { +            $required   = null; +            $neither    = array(); +            $prohibited = array(); + +            foreach ($this->_terms as $termId => $term) { +                $termDocs = array_flip($reader->termDocs($term)); + +                if ($this->_signs[$termId] === true) { +                    // required +                    if ($required !== null) { +                        // substitute for bitset_intersection +                        foreach ($required as $key => $value) { +                            if (!isset( $termDocs[$key] )) { +                                unset($required[$key]); +                            } +                        } 
+                    } else { +                        $required = $termDocs; +                    } +                } elseif ($this->_signs[$termId] === false) { +                    // prohibited +                    // substitute for bitset_union +                    foreach ($termDocs as $key => $value) { +                        $prohibited[$key] = $value; +                    } +                } else { +                    // neither required, nor prohibited +                    // substitute for bitset_union +                    foreach ($termDocs as $key => $value) { +                        $neither[$key] = $value; +                    } +                } + +                $this->_termsPositions[$termId] = $reader->termPositions($term); +            } + +            if ($required === null) { +                $required = $neither; +            } + +            foreach ($required as $key=>$value) { +                if (isset( $prohibited[$key] )) { +                    unset($required[$key]); +                } +            } +            $this->_resVector = $required; +        } +    } + + +    /** +     * Score calculator for conjunction queries (all terms are required) +     * +     * @param integer $docId +     * @param Zend_Search_Lucene $reader +     * @return float +     */ +    public function _conjunctionScore($docId, $reader) +    { +        if ($this->_coord === null) { +            $this->_coord = $reader->getSimilarity()->coord(count($this->_terms), +                                                            count($this->_terms) ); +        } + +        $score = 0.0; + +        foreach ($this->_terms as $termId=>$term) { +            $score += $reader->getSimilarity()->tf(count($this->_termsPositions[$termId][$docId]) ) * +                      $this->_weights[$termId]->getValue() * +                      $reader->norm($docId, $term->field); +        } + +        return $score * $this->_coord; +    } + + +    /** +     * Score calculator 
for non conjunction queries (not all terms are required) +     * +     * @param integer $docId +     * @param Zend_Search_Lucene $reader +     * @return float +     */ +    public function _nonConjunctionScore($docId, $reader) +    { +        if ($this->_coord === null) { +            $this->_coord = array(); + +            $maxCoord = 0; +            foreach ($this->_signs as $sign) { +                if ($sign !== false /* not prohibited */) { +                    $maxCoord++; +                } +            } + +            for ($count = 0; $count <= $maxCoord; $count++) { +                $this->_coord[$count] = $reader->getSimilarity()->coord($count, $maxCoord); +            } +        } + +        $score = 0.0; +        $matchedTerms = 0; +        foreach ($this->_terms as $termId=>$term) { +            // Check if term is +            if ($this->_signs[$termId] !== false &&            // not prohibited +                isset($this->_termsPositions[$termId][$docId]) // matched +               ) { +                $matchedTerms++; +                $score += +                      $reader->getSimilarity()->tf(count($this->_termsPositions[$termId][$docId]) ) * +                      $this->_weights[$termId]->getValue() * +                      $reader->norm($docId, $term->field); +            } +        } + +        return $score * $this->_coord[$matchedTerms]; +    } + +    /** +     * Score specified document +     * +     * @param integer $docId +     * @param Zend_Search_Lucene $reader +     * @return float +     */ +    public function score($docId, $reader) +    { +        if($this->_resVector === null) { +            if ($this->_signs === null) { +                $this->_calculateConjunctionResult($reader); +            } else { +                $this->_calculateNonConjunctionResult($reader); +            } + +            $this->_initWeight($reader); +        } + +        if ( (extension_loaded('bitset')) ? 
+                bitset_in($this->_resVector, $docId) : +                isset($this->_resVector[$docId])  ) { +            if ($this->_signs === null) { +                return $this->_conjunctionScore($docId, $reader); +            } else { +                return $this->_nonConjunctionScore($docId, $reader); +            } +        } else { +            return 0; +        } +    } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/Phrase.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/Phrase.php new file mode 100644 index 00000000..3e52666b --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/Phrase.php @@ -0,0 +1,424 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package    Zend_Search_Lucene + * @subpackage Search + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * Zend_Search_Lucene_Search_Query + */ +require_once 'Zend/Search/Lucene/Search/Query.php'; + +/** + * Zend_Search_Lucene_Search_Weight_MultiTerm + */ +require_once 'Zend/Search/Lucene/Search/Weight/Phrase.php'; + + +/** + * A Query that matches documents containing a particular sequence of terms. + * + * @package    Zend_Search_Lucene + * @subpackage Search + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. 
(http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Search_Query_Phrase extends Zend_Search_Lucene_Search_Query +{ +    /** +     * Terms to find. +     * Array of Zend_Search_Lucene_Index_Term objects. +     * +     * @var array +     */ +    private $_terms; + +    /** +     * Term positions (relative positions of terms within the phrase). +     * Array of integers +     * +     * @var array +     */ +    private $_offsets; + +    /** +     * Sets the number of other words permitted between words in query phrase. +     * If zero, then this is an exact phrase search.  For larger values this works +     * like a WITHIN or NEAR operator. +     * +     * The slop is in fact an edit-distance, where the units correspond to +     * moves of terms in the query phrase out of position.  For example, to switch +     * the order of two words requires two moves (the first move places the words +     * atop one another), so to permit re-orderings of phrases, the slop must be +     * at least two. +     * More exact matches are scored higher than sloppier matches, thus search +     * results are sorted by exactness. +     * +     * The slop is zero by default, requiring exact matches. +     * +     * @var integer +     */ +    private $_slop; + +    /** +     * Result vector. +     * Bitset or array of document IDs +     * (depending on Bitset extension availability). +     * +     * @var mixed +     */ +    private $_resVector = null; + +    /** +     * Terms positions vectors. +     * Array of Arrays: +     * term1Id => (docId => array( pos1, pos2, ... ), ...) +     * term2Id => (docId => array( pos1, pos2, ... ), ...) +     * +     * @var array +     */ +    private $_termsPositions = array(); + +    /** +     * Class constructor.  Create a new phrase query. +     * +     * @param array  $terms    Terms to search Array of strings. 
+     * @param array  $offsets  Relative term positions. Array of integers. +     * @param string $field    Field to search. +     * @throws Zend_Search_Lucene_Exception +     */ +    public function __construct($terms = null, $offsets = null, $field = null) +    { +        $this->_slop = 0; + +        if (is_array($terms)) { +            $this->_terms = array(); +            foreach ($terms as $termId => $termText) { +                $this->_terms[$termId] = ($field !== null)? new Zend_Search_Lucene_Index_Term($termText, $field): +                                                            new Zend_Search_Lucene_Index_Term($termText); +            } +        } else if ($terms === null) { +            $this->_terms = array(); +        } else { +            throw new Zend_Search_Lucene_Exception('terms argument must be array of strings or null'); +        } + +        if (is_array($offsets)) { +            if (count($this->_terms) != count($offsets)) { +                throw new Zend_Search_Lucene_Exception('terms and offsets arguments must have the same size.'); +            } +            $this->_offsets = $offsets; +        } else if ($offsets === null) { +            $this->_offsets = array(); +            foreach ($this->_terms as $termId => $term) { +                $position = count($this->_offsets); +                $this->_offsets[$termId] = $position; +            } +        } else { +            throw new Zend_Search_Lucene_Exception('offsets argument must be array of integers or null'); +        } +    } + +    /** +     * Set slop +     * +     * @param integer $slop +     */ +    public function setSlop($slop) +    { +        $this->_slop = $slop; +    } + + +    /** +     * Get slop +     * +     * @return integer +     */ +    public function getSlop() +    { +        return $this->_slop; +    } + + +    /** +     * Adds a term to the end of the query phrase. 
+     * The relative position of the term is specified explicitly or the one immediately +     * after the last term added. +     * +     * @param Zend_Search_Lucene_Index_Term $term +     * @param integer $position +     */ +    public function addTerm(Zend_Search_Lucene_Index_Term $term, $position = null) { +        if ((count($this->_terms) != 0)&&(end($this->_terms)->field != $term->field)) { +            throw new Zend_Search_Lucene_Exception('All phrase terms must be in the same field: ' . +                                                   $term->field . ':' . $term->text); +        } + +        $this->_terms[] = $term; +        if ($position !== null) { +            $this->_offsets[] = $position; +        } else if (count($this->_offsets) != 0) { +            $this->_offsets[] = end($this->_offsets) + 1; +        } else { +            $this->_offsets[] = 0; +        } +    } + + +    /** +     * Returns query term +     * +     * @return array +     */ +    public function getTerms() +    { +        return $this->_terms; +    } + + +    /** +     * Set weight for specified term +     * +     * @param integer $num +     * @param Zend_Search_Lucene_Search_Weight_Term $weight +     */ +    public function setWeight($num, $weight) +    { +        $this->_weights[$num] = $weight; +    } + + +    /** +     * Constructs an appropriate Weight implementation for this query. 
+     * +     * @param Zend_Search_Lucene $reader +     * @return Zend_Search_Lucene_Search_Weight +     */ +    protected function _createWeight($reader) +    { +        return new Zend_Search_Lucene_Search_Weight_Phrase($this, $reader); +    } + + +    /** +     * Calculate result vector +     * +     * @param Zend_Search_Lucene $reader +     */ +    private function _calculateResult($reader) +    { +        if (extension_loaded('bitset')) { +            foreach( $this->_terms as $termId=>$term ) { +                if($this->_resVector === null) { +                    $this->_resVector = bitset_from_array($reader->termDocs($term)); +                } else { +                    $this->_resVector = bitset_intersection( +                                $this->_resVector, +                                bitset_from_array($reader->termDocs($term)) ); +                } + +                $this->_termsPositions[$termId] = $reader->termPositions($term); +            } +        } else { +            foreach( $this->_terms as $termId=>$term ) { +                if($this->_resVector === null) { +                    $this->_resVector = array_flip($reader->termDocs($term)); +                } else { +                    $termDocs = array_flip($reader->termDocs($term)); +                    foreach($this->_resVector as $key=>$value) { +                        if (!isset( $termDocs[$key] )) { +                            unset( $this->_resVector[$key] ); +                        } +                    } +                } + +                $this->_termsPositions[$termId] = $reader->termPositions($term); +            } +        } +    } + + +    /** +     * Score calculator for exact phrase queries (terms sequence is fixed) +     * +     * @param integer $docId +     * @return float +     */ +    public function _exactPhraseFreq($docId) +    { +        $freq = 0; + +        // Term Id with lowest cardinality +        $lowCardTermId = null; + +        // Calculate 
$lowCardTermId +        foreach ($this->_terms as $termId => $term) { +            if ($lowCardTermId === null || +                count($this->_termsPositions[$termId][$docId]) < +                count($this->_termsPositions[$lowCardTermId][$docId]) ) { +                    $lowCardTermId = $termId; +                } +        } + +        // Walk through positions of the term with lowest cardinality +        foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos) { +            // We expect phrase to be found +            $freq++; + +            // Walk through other terms +            foreach ($this->_terms as $termId => $term) { +                if ($termId != $lowCardTermId) { +                    $expectedPosition = $lowCardPos + +                                            ($this->_offsets[$termId] - +                                             $this->_offsets[$lowCardTermId]); + +                    if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) { +                        $freq--;  // Phrase wasn't found. +                        break; +                    } +                } +            } +        } + +        return $freq; +    } + +    /** +     * Score calculator for sloppy phrase queries (terms sequence is fixed) +     * +     * @param integer $docId +     * @param Zend_Search_Lucene $reader +     * @return float +     */ +    public function _sloppyPhraseFreq($docId, Zend_Search_Lucene $reader) +    { +        $freq = 0; + +        $phraseQueue = array(); +        $phraseQueue[0] = array(); // empty phrase +        $lastTerm = null; + +        // Walk through the terms to create phrases. +        foreach ($this->_terms as $termId => $term) { +            $queueSize = count($phraseQueue); +            $firstPass = true; + +            // Walk through the term positions. +            // Each term position produces a set of phrases. 
+            foreach ($this->_termsPositions[$termId][$docId] as $termPosition ) { +                if ($firstPass) { +                    for ($count = 0; $count < $queueSize; $count++) { +                        $phraseQueue[$count][$termId] = $termPosition; +                    } +                } else { +                    for ($count = 0; $count < $queueSize; $count++) { +                        if ($lastTerm !== null && +                            abs( $termPosition - $phraseQueue[$count][$lastTerm] - +                                 ($this->_offsets[$termId] - $this->_offsets[$lastTerm])) > $this->_slop) { +                            continue; +                        } + +                        $newPhraseId = count($phraseQueue); +                        $phraseQueue[$newPhraseId]          = $phraseQueue[$count]; +                        $phraseQueue[$newPhraseId][$termId] = $termPosition; +                    } + +                } + +                $firstPass = false; +            } +            $lastTerm = $termId; +        } + + +        foreach ($phraseQueue as $phrasePos) { +            $minDistance = null; + +            for ($shift = -$this->_slop; $shift <= $this->_slop; $shift++) { +                $distance = 0; +                $start = reset($phrasePos) - reset($this->_offsets) + $shift; + +                foreach ($this->_terms as $termId => $term) { +                    $distance += abs($phrasePos[$termId] - $this->_offsets[$termId] - $start); + +                    if($distance > $this->_slop) { +                        break; +                    } +                } + +                if ($minDistance === null || $distance < $minDistance) { +                    $minDistance = $distance; +                } +            } + +            if ($minDistance <= $this->_slop) { +                $freq += $reader->getSimilarity()->sloppyFreq($minDistance); +            } +        } + +        return $freq; +    } + + +    /** +     * Score 
specified document +     * +     * @param integer $docId +     * @param Zend_Search_Lucene $reader +     * @return float +     */ +    public function score($docId, $reader) +    { +        // optimize zero-term case +        if (count($this->_terms) == 0) { +            return 0; +        } + +        if($this->_resVector === null) { +            $this->_calculateResult($reader); +            $this->_initWeight($reader); +        } + +        if ( (extension_loaded('bitset')) ? +                bitset_in($this->_resVector, $docId) : +                isset($this->_resVector[$docId])  ) { +            if ($this->_slop == 0) { +                $freq = $this->_exactPhraseFreq($docId); +            } else { +                $freq = $this->_sloppyPhraseFreq($docId, $reader); +            } + +/* +            return $reader->getSimilarity()->tf($freq) * +                   $this->_weight->getValue() * +                   $reader->norm($docId, reset($this->_terms)->field); +*/ +            if ($freq != 0) { +                $tf = $reader->getSimilarity()->tf($freq); +                $weight = $this->_weight->getValue(); +                $norm = $reader->norm($docId, reset($this->_terms)->field); + +                return $tf*$weight*$norm; +            } +        } +        // Document is not in the result vector, or the phrase does not occur in it: always return a numeric zero, never null. +        return 0; +    } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/Term.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/Term.php new file mode 100644 index 00000000..d622f845 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Query/Term.php @@ -0,0 +1,126 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. 
/**
 * Query that scores documents against a single index term.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Search_Query_Term extends Zend_Search_Lucene_Search_Query
{
    /**
     * Term to find.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_term;

    /**
     * Term sign.
     * If true then term is required
     * If false then term is prohibited.
     *
     * @var bool
     */
    private $_sign;

    /**
     * Documents vector.
     * Bitset or array keyed by document ID
     * (depending on Bitset extension availability).
     *
     * @var mixed
     */
    private $_docVector = null;

    /**
     * Term positions vector.
     * Array: docId => array( pos1, pos2, ... )
     *
     * @var array
     */
    private $_termPositions;


    /**
     * Zend_Search_Lucene_Search_Query_Term constructor
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param boolean $sign
     */
    public function __construct($term, $sign = true)
    {
        $this->_term = $term;
        $this->_sign = $sign;
    }


    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * @param Zend_Search_Lucene $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    protected function _createWeight($reader)
    {
        return new Zend_Search_Lucene_Search_Weight_Term($this->_term, $this, $reader);
    }

    /**
     * Score specified document.
     *
     * The document vector, the term positions and the weight are loaded
     * from the reader on the first call and reused afterwards. Returns 0
     * when the term is prohibited or the document does not contain it.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene $reader
     * @return float
     */
    public function score($docId, $reader)
    {
        $hasBitset = extension_loaded('bitset');

        if ($this->_docVector === null) {
            $docIds = $reader->termDocs($this->_term);

            // Bitset when available, otherwise docId => index map for isset() lookups.
            $this->_docVector = $hasBitset
                ? bitset_from_array($docIds)
                : array_flip($docIds);

            $this->_termPositions = $reader->termPositions($this->_term);
            $this->_initWeight($reader);
        }

        if ($hasBitset) {
            $match = bitset_in($this->_docVector, $docId);
        } else {
            $match = isset($this->_docVector[$docId]);
        }

        if (!$this->_sign || !$match) {
            return 0;
        }

        // Term frequency is the number of recorded positions in this document.
        $tf     = $reader->getSimilarity()->tf(count($this->_termPositions[$docId]));
        $weight = $this->_weight->getValue();
        $norm   = $reader->norm($docId, $this->_term->field);

        return $tf * $weight * $norm;
    }
}
/**
 * A single search hit: document number, score, and lazy access to the
 * document stored in the index that produced the hit.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Search_QueryHit
{
    /**
     * Object handle of the index
     * @var Zend_Search_Lucene
     */
    protected $_index = null;

    /**
     * Object handle of the document associated with this hit
     * (populated on first call to getDocument())
     * @var Zend_Search_Lucene_Document
     */
    protected $_document = null;

    /**
     * Number of the document in the index
     * @var integer
     */
    public $id;

    /**
     * Score of the hit
     * @var float
     */
    public $score;


    /**
     * Constructor - keeps a handle on the Zend_Search_Lucene index that
     * produced the hit so the document can be fetched from it later.
     *
     * @param Zend_Search_Lucene $index
     */
    public function __construct(Zend_Search_Lucene $index)
    {
        $this->_index = $index;
    }


    /**
     * Convenience accessor: reading an undeclared property on the hit
     * returns the value of the document field with that name.
     *
     * @param string $offset
     * @return string
     */
    public function __get($offset)
    {
        $document = $this->getDocument();

        return $document->getFieldValue($offset);
    }


    /**
     * Return the document object for this hit, loading it from the index
     * by $this->id if it has not been loaded yet.
     *
     * @return Zend_Search_Lucene_Document
     */
    public function getDocument()
    {
        if ($this->_document instanceof Zend_Search_Lucene_Document) {
            return $this->_document;
        }

        $this->_document = $this->_index->getDocument($this->id);

        return $this->_document;
    }


    /**
     * Return the index object for this hit
     *
     * @return Zend_Search_Lucene
     */
    public function getIndex()
    {
        return $this->_index;
    }
}
/**
 * Turns a query string into a Term or MultiTerm query object.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Search_QueryParser
{

    /**
     * Parses a query string, returning a Zend_Search_Lucene_Search_Query.
     *
     * Words without a 'field:' prefix are searched in the 'contents' field.
     * A '+' or '-' sign applies to the word that follows it (true = required,
     * false = prohibited, null = no sign given).
     *
     * @param string $strQuery
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception on an empty query, a query without
     *         words, a dangling sign operator or unbalanced parentheses
     */
    public static function parse($strQuery)
    {
        $tokens = new Zend_Search_Lucene_Search_QueryTokenizer($strQuery);

        // Empty query
        if (!$tokens->count()) {
            throw new Zend_Search_Lucene_Exception('Syntax error: query string cannot be empty.');
        }

        // Term query
        if ($tokens->count() == 1) {
            if ($tokens->current()->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD) {
                return new Zend_Search_Lucene_Search_Query_Term(new Zend_Search_Lucene_Index_Term($tokens->current()->text, 'contents'));
            } else {
                throw new Zend_Search_Lucene_Exception('Syntax error: query string must contain at least one word.');
            }
        }


        /**
         * MultiTerm Query
         *
         * Process each token that was returned by the tokenizer.
         */
        $terms = array();
        $signs = array();
        $prevToken = null;
        $openBrackets = 0;
        $field = 'contents';
        foreach ($tokens as $token) {
            switch ($token->type) {
                case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD:
                    $terms[] = new Zend_Search_Lucene_Index_Term($token->text, $field);
                    // A field prefix only applies to the word right after it.
                    $field = 'contents';
                    if ($prevToken !== null &&
                        $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) {
                        $signs[] = ($prevToken->text == "+");
                    } else {
                        $signs[] = null;
                    }
                    break;
                case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN:
                    if ($prevToken !== null &&
                        $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) {
                        throw new Zend_Search_Lucene_Exception('Syntax error: sign operator must be followed by a word.');
                    }
                    break;
                case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_FIELD:
                    $field = $token->text;
                    // Keep the previous token as $prevToken so that a sign in
                    // front of 'field:word' is still seen by the word.
                    $token = $prevToken;
                    break;
                case Zend_Search_Lucene_Search_QueryToken::TOKTYPE_BRACKET:
                    if ($token->text == '(') {
                        $openBrackets++;
                    } else {
                        $openBrackets--;
                        // A closing bracket before its opening one must not be
                        // cancelled out by a later '(' (e.g. ") (").
                        if ($openBrackets < 0) {
                            throw new Zend_Search_Lucene_Exception('Syntax Error: mismatched parentheses, closing bracket without matching opening.');
                        }
                    }
                    break;
            }
            $prevToken = $token;
        }

        // Finish up parsing: the query must not end with a sign operator.
        // $prevToken is still null when the query consisted of field tokens only.
        if ($prevToken !== null &&
            $prevToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN) {
            throw new Zend_Search_Lucene_Exception('Syntax Error: sign operator must be followed by a word.');
        }

        // Finish up parsing: check that every opening bracket has a matching closing bracket.
        if ($openBrackets != 0) {
            throw new Zend_Search_Lucene_Exception('Syntax Error: mismatched parentheses, every opening must have closing.');
        }

        switch (count($terms)) {
            case 0:
                throw new Zend_Search_Lucene_Exception('Syntax error: bad term count.');
            case 1:
                return new Zend_Search_Lucene_Search_Query_Term($terms[0], $signs[0] !== false);
            default:
                return new Zend_Search_Lucene_Search_Query_MultiTerm($terms, $signs);
        }
    }

}
/**
 * A single token of a query string: a type constant plus the token text.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Search_QueryToken
{
    /**
     * Token type Word.
     */
    const TOKTYPE_WORD = 0;

    /**
     * Token type Field.
     * Field indicator in 'field:word' pair
     */
    const TOKTYPE_FIELD = 1;

    /**
     * Token type Sign.
     * '+' (required) or '-' (absentee) sign
     */
    const TOKTYPE_SIGN = 2;

    /**
     * Token type Bracket.
     * '(' or ')'
     */
    const TOKTYPE_BRACKET = 3;


    /**
     * Token type.
     *
     * @var integer
     */
    public $type;

    /**
     * Token text.
     *
     * @var string
     */
    public $text;


    /**
     * QueryToken constructor needs token type and token text as parameters.
     *
     * @param integer $tokType one of the TOKTYPE_* constants
     * @param string  $tokText non-empty token text
     * @throws Zend_Search_Lucene_Exception if the type is not a TOKTYPE_*
     *         value or the text is empty
     */
    public function __construct($tokType, $tokText)
    {
        $validTypes = array(
            self::TOKTYPE_WORD,
            self::TOKTYPE_FIELD,
            self::TOKTYPE_SIGN,
            self::TOKTYPE_BRACKET,
        );

        // A plain switch() compares loosely, so null, false and (before PHP 8)
        // any non-numeric string were silently accepted as TOKTYPE_WORD (0).
        // Requiring a numeric value first closes that hole while still
        // accepting numeric strings such as "2", as before.
        if (!is_numeric($tokType) || !in_array($tokType, $validTypes)) {
            throw new Zend_Search_Lucene_Exception("Unrecognized token type \"$tokType\".");
        }

        if (!strlen($tokText)) {
            throw new Zend_Search_Lucene_Exception('Token text must be supplied.');
        }

        $this->type = $tokType;
        $this->text = $tokText;
    }
}
/**
 * Splits a query string into QueryToken objects and exposes them through
 * the Iterator interface.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */
class Zend_Search_Lucene_Search_QueryTokenizer implements Iterator
{
    /**
     * inputString tokens.
     *
     * @var array
     */
    protected $_tokens = array();

    /**
     * tokens pointer.
     *
     * @var integer
     */
    protected $_currToken = 0;


    /**
     * QueryTokenizer constructor needs query string as a parameter.
     *
     * Runs of characters accepted by ctype_alnum() become WORD tokens,
     * '+' / '-' become SIGN tokens, '(' / ')' become BRACKET tokens, and a
     * ':' turns the preceding WORD token into a FIELD token. Every other
     * character only terminates the current word.
     *
     * @param string $inputString
     * @throws Zend_Search_Lucene_Exception if the query string is empty
     */
    public function __construct($inputString)
    {
        if (!strlen($inputString)) {
            throw new Zend_Search_Lucene_Exception('Cannot tokenize empty query string.');
        }

        $length       = strlen($inputString);
        $currentToken = '';
        for ($count = 0; $count < $length; $count++) {
            // Square brackets: the $str{$i} form is deprecated since PHP 7.4
            // and a parse error since PHP 8.0.
            $char = $inputString[$count];

            if (ctype_alnum($char)) {
                $currentToken .= $char;
                continue;
            }

            // Previous token is finished
            if (strlen($currentToken)) {
                $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(
                    Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD,
                    $currentToken
                );
                $currentToken = '';
            }

            if ($char === '+' || $char === '-') {
                $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(
                    Zend_Search_Lucene_Search_QueryToken::TOKTYPE_SIGN,
                    $char
                );
            } elseif ($char === '(' || $char === ')') {
                $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(
                    Zend_Search_Lucene_Search_QueryToken::TOKTYPE_BRACKET,
                    $char
                );
            } elseif ($char === ':' && $this->count()) {
                // 'field:word' - retag the word just emitted as a field name.
                $lastToken = $this->_tokens[count($this->_tokens) - 1];
                if ($lastToken->type == Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD) {
                    $lastToken->type = Zend_Search_Lucene_Search_QueryToken::TOKTYPE_FIELD;
                }
            }
        }

        // Flush the word that runs up to the end of the string.
        if (strlen($currentToken)) {
            $this->_tokens[] = new Zend_Search_Lucene_Search_QueryToken(
                Zend_Search_Lucene_Search_QueryToken::TOKTYPE_WORD,
                $currentToken
            );
        }
    }


    /**
     * Returns number of tokens
     *
     * @return integer
     */
    public function count()
    {
        return count($this->_tokens);
    }


    /**
     * Returns TRUE if a token exists at the current position.
     *
     * @return boolean
     */
    public function valid()
    {
        return $this->_currToken < $this->count();
    }


    /**
     * Resets token stream to the first token.
     *
     * @return void
     */
    public function rewind()
    {
        $this->_currToken = 0;
    }


    /**
     * Returns the token at the current position or FALSE if
     * the position does not contain a valid token.
     *
     * @return mixed
     */
    public function current()
    {
        return $this->valid() ? $this->_tokens[$this->_currToken] : false;
    }


    /**
     * Advances to the next token.
     *
     * @return integer the new position in the token stream
     */
    public function next()
    {
        return ++$this->_currToken;
    }


    /**
     * Return the position of the current token.
     *
     * @return integer
     */
    public function key()
    {
        return $this->_currToken;
    }

}
+     * +     * @var Zend_Search_Lucene_Search_Similarity +     */ +    static private $_defaultImpl; + +    /** +     * Cache of decoded bytes. +     * Array of floats +     * +     * @var array +     */ +    static private $_normTable = array( 0   => 0.0, +                                        1   => 5.820766E-10, +                                        2   => 6.9849193E-10, +                                        3   => 8.1490725E-10, +                                        4   => 9.313226E-10, +                                        5   => 1.1641532E-9, +                                        6   => 1.3969839E-9, +                                        7   => 1.6298145E-9, +                                        8   => 1.8626451E-9, +                                        9   => 2.3283064E-9, +                                        10  => 2.7939677E-9, +                                        11  => 3.259629E-9, +                                        12  => 3.7252903E-9, +                                        13  => 4.656613E-9, +                                        14  => 5.5879354E-9, +                                        15  => 6.519258E-9, +                                        16  => 7.4505806E-9, +                                        17  => 9.313226E-9, +                                        18  => 1.1175871E-8, +                                        19  => 1.3038516E-8, +                                        20  => 1.4901161E-8, +                                        21  => 1.8626451E-8, +                                        22  => 2.2351742E-8, +                                        23  => 2.6077032E-8, +                                        24  => 2.9802322E-8, +                                        25  => 3.7252903E-8, +                                        26  => 4.4703484E-8, +                                        27  => 5.2154064E-8, +                                        28  => 5.9604645E-8, +       
                                 29  => 7.4505806E-8, +                                        30  => 8.940697E-8, +                                        31  => 1.0430813E-7, +                                        32  => 1.1920929E-7, +                                        33  => 1.4901161E-7, +                                        34  => 1.7881393E-7, +                                        35  => 2.0861626E-7, +                                        36  => 2.3841858E-7, +                                        37  => 2.9802322E-7, +                                        38  => 3.5762787E-7, +                                        39  => 4.172325E-7, +                                        40  => 4.7683716E-7, +                                        41  => 5.9604645E-7, +                                        42  => 7.1525574E-7, +                                        43  => 8.34465E-7, +                                        44  => 9.536743E-7, +                                        45  => 1.1920929E-6, +                                        46  => 1.4305115E-6, +                                        47  => 1.66893E-6, +                                        48  => 1.9073486E-6, +                                        49  => 2.3841858E-6, +                                        50  => 2.861023E-6, +                                        51  => 3.33786E-6, +                                        52  => 3.8146973E-6, +                                        53  => 4.7683716E-6, +                                        54  => 5.722046E-6, +                                        55  => 6.67572E-6, +                                        56  => 7.6293945E-6, +                                        57  => 9.536743E-6, +                                        58  => 1.1444092E-5, +                                        59  => 1.335144E-5, +                                        60  => 1.5258789E-5, +                                      
  61  => 1.9073486E-5, +                                        62  => 2.2888184E-5, +                                        63  => 2.670288E-5, +                                        64  => 3.0517578E-5, +                                        65  => 3.8146973E-5, +                                        66  => 4.5776367E-5, +                                        67  => 5.340576E-5, +                                        68  => 6.1035156E-5, +                                        69  => 7.6293945E-5, +                                        70  => 9.1552734E-5, +                                        71  => 1.0681152E-4, +                                        72  => 1.2207031E-4, +                                        73  => 1.5258789E-4, +                                        74  => 1.8310547E-4, +                                        75  => 2.1362305E-4, +                                        76  => 2.4414062E-4, +                                        77  => 3.0517578E-4, +                                        78  => 3.6621094E-4, +                                        79  => 4.272461E-4, +                                        80  => 4.8828125E-4, +                                        81  => 6.1035156E-4, +                                        82  => 7.324219E-4, +                                        83  => 8.544922E-4, +                                        84  => 9.765625E-4, +                                        85  => 0.0012207031, +                                        86  => 0.0014648438, +                                        87  => 0.0017089844, +                                        88  => 0.001953125, +                                        89  => 0.0024414062, +                                        90  => 0.0029296875, +                                        91  => 0.0034179688, +                                        92  => 0.00390625, +                                        93  => 0.0048828125, + 
                                       94  => 0.005859375, +                                        95  => 0.0068359375, +                                        96  => 0.0078125, +                                        97  => 0.009765625, +                                        98  => 0.01171875, +                                        99  => 0.013671875, +                                        100 => 0.015625, +                                        101 => 0.01953125, +                                        102 => 0.0234375, +                                        103 => 0.02734375, +                                        104 => 0.03125, +                                        105 => 0.0390625, +                                        106 => 0.046875, +                                        107 => 0.0546875, +                                        108 => 0.0625, +                                        109 => 0.078125, +                                        110 => 0.09375, +                                        111 => 0.109375, +                                        112 => 0.125, +                                        113 => 0.15625, +                                        114 => 0.1875, +                                        115 => 0.21875, +                                        116 => 0.25, +                                        117 => 0.3125, +                                        118 => 0.375, +                                        119 => 0.4375, +                                        120 => 0.5, +                                        121 => 0.625, +                                        122 => 0.75, +                                        123 => 0.875, +                                        124 => 1.0, +                                        125 => 1.25, +                                        126 => 1.5, +                                        127 => 1.75, +                                        128 => 2.0, +        
                                129 => 2.5, +                                        130 => 3.0, +                                        131 => 3.5, +                                        132 => 4.0, +                                        133 => 5.0, +                                        134 => 6.0, +                                        135 => 7.0, +                                        136 => 8.0, +                                        137 => 10.0, +                                        138 => 12.0, +                                        139 => 14.0, +                                        140 => 16.0, +                                        141 => 20.0, +                                        142 => 24.0, +                                        143 => 28.0, +                                        144 => 32.0, +                                        145 => 40.0, +                                        146 => 48.0, +                                        147 => 56.0, +                                        148 => 64.0, +                                        149 => 80.0, +                                        150 => 96.0, +                                        151 => 112.0, +                                        152 => 128.0, +                                        153 => 160.0, +                                        154 => 192.0, +                                        155 => 224.0, +                                        156 => 256.0, +                                        157 => 320.0, +                                        158 => 384.0, +                                        159 => 448.0, +                                        160 => 512.0, +                                        161 => 640.0, +                                        162 => 768.0, +                                        163 => 896.0, +                                        164 => 1024.0, +                                        165 => 1280.0, + 
                                       166 => 1536.0, +                                        167 => 1792.0, +                                        168 => 2048.0, +                                        169 => 2560.0, +                                        170 => 3072.0, +                                        171 => 3584.0, +                                        172 => 4096.0, +                                        173 => 5120.0, +                                        174 => 6144.0, +                                        175 => 7168.0, +                                        176 => 8192.0, +                                        177 => 10240.0, +                                        178 => 12288.0, +                                        179 => 14336.0, +                                        180 => 16384.0, +                                        181 => 20480.0, +                                        182 => 24576.0, +                                        183 => 28672.0, +                                        184 => 32768.0, +                                        185 => 40960.0, +                                        186 => 49152.0, +                                        187 => 57344.0, +                                        188 => 65536.0, +                                        189 => 81920.0, +                                        190 => 98304.0, +                                        191 => 114688.0, +                                        192 => 131072.0, +                                        193 => 163840.0, +                                        194 => 196608.0, +                                        195 => 229376.0, +                                        196 => 262144.0, +                                        197 => 327680.0, +                                        198 => 393216.0, +                                        199 => 458752.0, +                                        200 => 524288.0, +       
                                 201 => 655360.0, +                                        202 => 786432.0, +                                        203 => 917504.0, +                                        204 => 1048576.0, +                                        205 => 1310720.0, +                                        206 => 1572864.0, +                                        207 => 1835008.0, +                                        208 => 2097152.0, +                                        209 => 2621440.0, +                                        210 => 3145728.0, +                                        211 => 3670016.0, +                                        212 => 4194304.0, +                                        213 => 5242880.0, +                                        214 => 6291456.0, +                                        215 => 7340032.0, +                                        216 => 8388608.0, +                                        217 => 1.048576E7, +                                        218 => 1.2582912E7, +                                        219 => 1.4680064E7, +                                        220 => 1.6777216E7, +                                        221 => 2.097152E7, +                                        222 => 2.5165824E7, +                                        223 => 2.9360128E7, +                                        224 => 3.3554432E7, +                                        225 => 4.194304E7, +                                        226 => 5.0331648E7, +                                        227 => 5.8720256E7, +                                        228 => 6.7108864E7, +                                        229 => 8.388608E7, +                                        230 => 1.00663296E8, +                                        231 => 1.17440512E8, +                                        232 => 1.34217728E8, +                                        233 => 1.6777216E8, +                              
          234 => 2.01326592E8,
+                                        235 => 2.34881024E8,
+                                        236 => 2.68435456E8,
+                                        237 => 3.3554432E8,
+                                        238 => 4.02653184E8,
+                                        239 => 4.69762048E8,
+                                        240 => 5.3687091E8,
+                                        241 => 6.7108864E8,
+                                        242 => 8.0530637E8,
+                                        243 => 9.395241E8,
+                                        244 => 1.07374182E9,
+                                        245 => 1.34217728E9,
+                                        246 => 1.61061274E9,
+                                        247 => 1.87904819E9,
+                                        248 => 2.14748365E9,
+                                        249 => 2.68435456E9,
+                                        250 => 3.22122547E9,
+                                        251 => 3.75809638E9,
+                                        252 => 4.2949673E9,
+                                        253 => 5.3687091E9,
+                                        254 => 6.4424509E9,
+                                        255 => 7.5161928E9 );
+
+
+    /**
+     * Set the default Similarity implementation used by indexing and search
+     * code.
+     *
+     * @param Zend_Search_Lucene_Search_Similarity $similarity
+     * @return void
+     */
+    static public function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
+    {
+        self::$_defaultImpl = $similarity;
+    }
+
+
+    /**
+     * Return the default Similarity implementation used by indexing and search
+     * code.
+     *
+     * @return Zend_Search_Lucene_Search_Similarity
+     */
+    static public function getDefault()
+    {
+        // Reuse the implementation that is already registered, if any.
+        if (self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
+            return self::$_defaultImpl;
+        }
+
+        // Nothing usable registered yet: fall back to the stock implementation.
+        self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();
+
+        return self::$_defaultImpl;
+    }
+
+
+    /**
+     * Computes the normalization value for a field given the total number of
+     * terms contained in a field.  These values, together with field boosts, are
+     * stored in an index and multiplied into scores for hits on each field by the
+     * search code.
+     *
+     * Matches in longer fields are less precise, so implementations of this
+     * method usually return smaller values when 'numTokens' is large,
+     * and larger values when 'numTokens' is small.
+     *
+     * Note that these values are computed under
+     * IndexWriter::addDocument(Document) and then stored using
+     * encodeNorm(float).  Thus they have limited precision, and documents
+     * must be re-indexed if this method is altered.
+     *
+     * fieldName - name of field
+     * numTokens - the total number of tokens contained in fields named
+     *             'fieldName' of 'doc'.
+     * Returns a normalization factor for hits on this field of this document
+     *
+     * @param string $fieldName
+     * @param integer $numTokens
+     * @return float
+     */
+    abstract public function lengthNorm($fieldName, $numTokens);
+
+    /**
+     * Computes the normalization value for a query given the sum of the squared
+     * weights of each of the query terms.  This value is then multiplied into the
+     * weight of each query term.
+     *
+     * This does not affect ranking, but rather just attempts to make scores
+     * from different queries comparable.
+     *
+     * sumOfSquaredWeights - the sum of the squares of query term weights
+     * Returns a normalization factor for query weights
+     *
+     * @param float $sumOfSquaredWeights
+     * @return float
+     */
+    abstract public function queryNorm($sumOfSquaredWeights);
+
+
+    /**
+     * Decodes a normalization factor stored in an index.
+     *
+     * Only the low eight bits of $byte are used as the index into the
+     * norm table.
+     *
+     * @param integer $byte
+     * @return float
+     */
+    static public function decodeNorm($byte)
+    {
+        return self::$_normTable[$byte & 0xFF];
+    }
+
+
+    /**
+     * Encodes a normalization factor for storage in an index.
+     *
+     * The encoding uses a five-bit exponent and three-bit mantissa, thus
+     * representing values from around 7x10^9 to 2x10^-9 with about one
+     * significant decimal digit of accuracy.  Zero is also represented.
+     * Negative numbers are rounded up to zero.  Values too large to represent
+     * are rounded down to the largest representable value.  Positive values too
+     * small to represent are rounded up to the smallest positive representable
+     * value.
+     *
+     * @param float $f
+     * @return integer
+     */
+    static public function encodeNorm($f)
+    {
+        return self::_floatToByte($f);
+    }
+
+    /**
+     * Float to byte conversion
+     *
+     * Binary-searches the norm table for $f and returns the index of the
+     * closest entry.  Zero and negative input is encoded as 0; strictly
+     * positive input is never encoded as 0, so a very small norm is not
+     * silently turned into zero.
+     *
+     * @param float $f
+     * @return integer index into the norm table (0..255)
+     */
+    static private function _floatToByte($f)
+    {
+        // round negatives up to zero
+        if ($f <= 0.0) {
+            return 0;
+        }
+
+        // binary search for $f; relies on the table being sorted ascending
+        $lowIndex  = 0;
+        $highIndex = 255;
+        while ($highIndex >= $lowIndex) {
+            $mid   = ($highIndex + $lowIndex) >> 1;
+            $delta = $f - self::$_normTable[$mid];
+
+            if ($delta < 0) {
+                $highIndex = $mid-1;
+            } elseif ($delta > 0) {
+                $lowIndex  = $mid+1;
+            } else {
+                return $mid; // We got it!
+            }
+        }
+
+        // No exact match: $highIndex is now the last entry that is below $f.
+        // A positive value smaller than every positive entry has to round up
+        // to the smallest positive entry instead of collapsing to 0.
+        // NOTE(review): assumes entry 0 is 0.0 and entry 1 is the smallest
+        // positive value - the head of the table is outside this view; confirm.
+        if ($highIndex < 1) {
+            return 1;
+        }
+
+        // round to closest value
+        if ($highIndex != 255 &&
+            $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex+1] - $f ) {
+            return $highIndex + 1;
+        }
+
+        return $highIndex;
+    }
+
+
+    /**
+     * Computes a score factor based on a term or phrase's frequency in a
+     * document.  This value is multiplied by the idf(Term, Searcher)
+     * factor for each term in the query and these products are then summed to
+     * form the initial score for a document.
+     *
+     * Terms and phrases repeated in a document indicate the topic of the
+     * document, so implementations of this method usually return larger values
+     * when 'freq' is large, and smaller values when 'freq'
+     * is small.
+     *
+     * freq - the frequency of a term within a document
+     * Returns a score factor based on a term's within-document frequency
+     *
+     * Zend_Search_Lucene_Search_Similarity_Default implements this as
+     * 'sqrt(freq)'.
+     *
+     * @param float $freq
+     * @return float
+     */
+    abstract public function tf($freq);
+
+    /**
+     * Computes the amount of a sloppy phrase match, based on an edit distance.
+     * This value is summed for each sloppy phrase match in a document to form
+     * the frequency that is passed to tf().
+     *
+     * A phrase match with a small edit distance to a document passage more
+     * closely matches the document, so implementations of this method usually
+     * return larger values when the edit distance is small and smaller values
+     * when it is large.
+     *
+     * distance - the edit distance of this sloppy phrase match
+     * Returns the frequency increment for this match
+     *
+     * Zend_Search_Lucene_Search_Similarity_Default implements this as
+     * '1/(distance + 1)'.
+     *
+     * @param integer $distance
+     * @return float
+     */
+    abstract public function sloppyFreq($distance);
+
+
+    /**
+     * Computes a score factor for a simple term or a phrase.
+     *
+     * For a single term this is
+     *   idfFreq($reader->docFreq($term), $reader->count());
+     * for an array of terms the per-term factors are added up.
+     *
+     * input - the term in question or array of terms
+     * reader - reader of the document collection being searched
+     * Returns a score factor for the term
+     *
+     * @param mixed $input
+     * @param Zend_Search_Lucene $reader
+     * @return float
+     */
+    public function idf($input, $reader)
+    {
+        // Single term: its factor is the result.
+        if (!is_array($input)) {
+            return $this->idfFreq($reader->docFreq($input), $reader->count());
+        }
+
+        // Array of terms: accumulate the factor of every term.
+        $total = 0.0;
+        foreach ($input as $phraseTerm) {
+            $total += $this->idfFreq($reader->docFreq($phraseTerm), $reader->count());
+        }
+
+        return $total;
+    }
+
+    /**
+     * Computes a score factor based on a term's document frequency (the number
+     * of documents which contain the term).  This value is multiplied by the
+     * tf() factor for each term in the query and these products are
+     * then summed to form the initial score for a document.
+     *
+     * Terms that occur in fewer documents are better indicators of topic, so
+     * implementations of this method usually return larger values for rare terms,
+     * and smaller values for common terms.
+     *
+     * docFreq - the number of documents which contain the term
+     * numDocs - the total number of documents in the collection
+     * Returns a score factor based on the term's document frequency
+     *
+     * @param integer $docFreq
+     * @param integer $numDocs
+     * @return float
+     */
+    abstract public function idfFreq($docFreq, $numDocs);
+
+    /**
+     * Computes a score factor based on the fraction of all query terms that a
+     * document contains.  This value is multiplied into scores.
+     *
+     * The presence of a large portion of the query terms indicates a better
+     * match with the query, so implementations of this method usually return
+     * larger values when the ratio between these parameters is large and smaller
+     * values when the ratio between them is small.
+     *
+     * overlap - the number of query terms matched in the document
+     * maxOverlap - the total number of terms in the query
+     * Returns a score factor based on term overlap with the query
+     *
+     * @param integer $overlap
+     * @param integer $maxOverlap
+     * @return float
+     */
+    abstract public function coord($overlap, $maxOverlap);
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Similarity/Default.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Similarity/Default.php
new file mode 100644
index 00000000..1551d8bd
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Similarity/Default.php
@@ -0,0 +1,99 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package    Zend_Search_Lucene
+ * @subpackage Search
+ * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/**
+ * @package    Zend_Search_Lucene
+ * @subpackage Search
+ * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc.
(http://www.zend.com)
+ * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+class Zend_Search_Lucene_Search_Similarity_Default extends Zend_Search_Lucene_Search_Similarity
+{
+
+    /**
+     * Length normalization, computed as '1/sqrt(numTerms)'.
+     * $fieldName does not influence the result.
+     *
+     * @param string $fieldName
+     * @param integer $numTerms
+     * @return float
+     */
+    public function lengthNorm($fieldName, $numTerms)
+    {
+        $root = sqrt($numTerms);
+
+        return 1.0 / $root;
+    }
+
+    /**
+     * Query normalization, computed as '1/sqrt(sumOfSquaredWeights)'.
+     *
+     * @param float $sumOfSquaredWeights
+     * @return float
+     */
+    public function queryNorm($sumOfSquaredWeights)
+    {
+        $root = sqrt($sumOfSquaredWeights);
+
+        return 1.0 / $root;
+    }
+
+    /**
+     * Term frequency factor, computed as 'sqrt(freq)'.
+     *
+     * @param float $freq
+     * @return float
+     */
+    public function tf($freq)
+    {
+        $factor = sqrt($freq);
+
+        return $factor;
+    }
+
+    /**
+     * Sloppy phrase frequency, computed as '1/(distance + 1)'.
+     *
+     * @param integer $distance
+     * @return float
+     */
+    public function sloppyFreq($distance)
+    {
+        $denominator = $distance + 1;
+
+        return 1.0 / $denominator;
+    }
+
+    /**
+     * Inverse document frequency, computed as 'log(numDocs/(docFreq+1)) + 1'.
+     *
+     * @param integer $docFreq
+     * @param integer $numDocs
+     * @return float
+     */
+    public function idfFreq($docFreq, $numDocs)
+    {
+        $ratio = $numDocs / (float) ($docFreq + 1);
+
+        return log($ratio) + 1.0;
+    }
+
+    /**
+     * Implemented as 'overlap/maxOverlap'.
+     *
+     * @param integer $overlap
+     * @param integer $maxOverlap
+     * @return float
+     */
+    public function coord($overlap, $maxOverlap)
+    {
+        return $overlap/(float)$maxOverlap;
+    }
+}
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight.php
new file mode 100644
index 00000000..2d5b7a72
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight.php
@@ -0,0 +1,59 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package    Zend_Search_Lucene
+ * @subpackage Search
+ * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/**
+ * Calculate query weights and build query scorers.
+ *
+ * A Weight is constructed by a query via Query->createWeight().
+ * The sumOfSquaredWeights() method is then called on the top-level
+ * query to compute the query normalization factor Similarity->queryNorm(float).
+ * This factor is then passed to normalize(float).  At this point the weighting
+ * is complete.
+ *
+ * @package    Zend_Search_Lucene
+ * @subpackage Search
+ * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+abstract class Zend_Search_Lucene_Search_Weight
+{
+    /**
+     * The weight for this query.
+     *
+     * @return float
+     */
+    abstract public function getValue();
+
+    /**
+     * The sum of squared weights of contained query clauses.
+     *
+     * @return float
+     */
+    abstract public function sumOfSquaredWeights();
+
+    /**
+     * Assigns the query normalization factor to this weight.
+     *
+     * @param float $norm
+     */
+    abstract public function normalize($norm);
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/MultiTerm.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/MultiTerm.php
new file mode 100644
index 00000000..69528ba4
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/MultiTerm.php
@@ -0,0 +1,133 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package    Zend_Search_Lucene
+ * @subpackage Search
+ * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/** Zend_Search_Lucene_Search_Weight */
+require_once 'Zend/Search/Lucene/Search/Weight.php';
+
+
+/**
+ * @package    Zend_Search_Lucene
+ * @subpackage Search
+ * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+class Zend_Search_Lucene_Search_Weight_MultiTerm extends Zend_Search_Lucene_Search_Weight
+{
+    /**
+     * IndexReader.
+     *
+     * @var Zend_Search_Lucene
+     */
+    private $_reader;
+
+    /**
+     * The query that this concerns.
+     *
+     * @var Zend_Search_Lucene_Search_Query_MultiTerm
+     */
+    private $_query;
+
+    /**
+     * Query terms weights
+     * Array of Zend_Search_Lucene_Search_Weight_Term, keyed like the query terms
+     *
+     * @var array
+     */
+    private $_weights;
+
+
+    /**
+     * Zend_Search_Lucene_Search_Weight_MultiTerm constructor
+     *
+     * Builds a term weight for every query term whose sign is unset or true
+     * and registers that weight with the query.
+     *
+     * @param Zend_Search_Lucene_Search_Query_MultiTerm $query  the query that this concerns
+     * @param Zend_Search_Lucene $reader  index reader
+     */
+    public function __construct($query, $reader)
+    {
+        $this->_reader  = $reader;
+        $this->_query   = $query;
+        $this->_weights = array();
+
+        $signs = $query->getSigns();
+
+        foreach ($query->getTerms() as $position => $queryTerm) {
+            // Skip terms whose sign is set and false
+            // (presumably prohibited '-term' entries - confirm against the query class).
+            if ($signs !== null && $signs[$position] !== null && !$signs[$position]) {
+                continue;
+            }
+
+            $termWeight = new Zend_Search_Lucene_Search_Weight_Term($queryTerm, $query, $reader);
+            $this->_weights[$position] = $termWeight;
+            $query->setWeight($position, $termWeight);
+        }
+    }
+
+
+    /**
+     * The weight for this query, which is the boost of the query itself.
+     *
+     * @return float
+     */
+    public function getValue()
+    {
+        $boost = $this->_query->getBoost();
+
+        return $boost;
+    }
+
+
+    /**
+     * The sum of squared weights of contained query clauses.
+     *
+     * Sums the squared weights of all term weights and scales the sum by the
+     * squared query boost.  A sum of zero is reported as 1.0.
+     *
+     * @return float
+     */
+    public function sumOfSquaredWeights()
+    {
+        // add up the contribution of every term weight
+        $total = 0;
+        foreach ($this->_weights as $termWeight) {
+            $total += $termWeight->sumOfSquaredWeights();
+        }
+
+        // apply the query boost, squared
+        $boostSquared = $this->_query->getBoost() * $this->_query->getBoost();
+        $total *= $boostSquared;
+
+        // empty query (like '-something -another'): report 1.0 instead of zero
+        return ($total == 0) ? 1.0 : $total;
+    }
+
+
+    /**
+     * Assigns the query normalization factor to this weight by passing the
+     * boosted factor on to every term weight.
+     *
+     * @param float $queryNorm
+     */
+    public function normalize($queryNorm)
+    {
+        // the query boost is folded into the factor handed to the terms
+        $boostedNorm = $queryNorm * $this->_query->getBoost();
+
+        foreach ($this->_weights as $termWeight) {
+            $termWeight->normalize($boostedNorm);
+        }
+    }
+}
+
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/Phrase.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/Phrase.php
new file mode 100644
index 00000000..77e94f28
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/Phrase.php
@@ -0,0 +1,138 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package    Zend_Search_Lucene
+ * @subpackage Search
+ * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc.
(http://www.zend.com)
+ * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/**
+ * Zend_Search_Lucene_Search_Weight
+ */
+require_once 'Zend/Search/Lucene/Search/Weight.php';
+
+/**
+ * @package    Zend_Search_Lucene
+ * @subpackage Search
+ * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+class Zend_Search_Lucene_Search_Weight_Phrase extends Zend_Search_Lucene_Search_Weight
+{
+    /**
+     * IndexReader.
+     *
+     * @var Zend_Search_Lucene
+     */
+    private $_reader;
+
+    /**
+     * The query that this concerns.
+     *
+     * @var Zend_Search_Lucene_Search_Query_Phrase
+     */
+    private $_query;
+
+    /**
+     * Weight value; assigned by normalize() as query weight * idf.
+     *
+     * @var float
+     */
+    private $_value;
+
+    /**
+     * Score factor (idf of the query terms); assigned by sumOfSquaredWeights().
+     *
+     * @var float
+     */
+    private $_idf;
+
+    /**
+     * Normalization factor; the value last passed to normalize().
+     *
+     * @var float
+     */
+    private $_queryNorm;
+
+
+    /**
+     * Query weight; idf * query boost, multiplied by the query norm in normalize().
+     *
+     * @var float
+     */
+    private $_queryWeight;
+
+
+    /**
+     * Zend_Search_Lucene_Search_Weight_Phrase constructor
+     *
+     * Only stores its arguments; the weight values are computed later by
+     * sumOfSquaredWeights() and normalize().
+     *
+     * @param Zend_Search_Lucene_Search_Query_Phrase $query
+     * @param Zend_Search_Lucene $reader
+     */
+    public function __construct(Zend_Search_Lucene_Search_Query_Phrase $query, Zend_Search_Lucene $reader)
+    {
+        $this->_query  = $query;
+        $this->_reader = $reader;
+    }
+
+
+    /**
+     * The weight for this query; not set until normalize() has been called.
+     *
+     * @return float
+     */
+    public function getValue()
+    {
+        return $this->_value;
+    }
+
+
+    /**
+     * The sum of squared weights of contained query clauses.
+     *
+     * Computes and stores the idf of the query terms and the query weight
+     * (idf * boost), then returns the square of that query weight.
+     *
+     * @return float
+     */
+    public function sumOfSquaredWeights()
+    {
+        // idf of all phrase terms, as computed by the reader's similarity
+        $similarity = $this->_reader->getSimilarity();
+        $this->_idf = $similarity->idf($this->_query->getTerms(), $this->_reader);
+
+        // query weight = idf * boost
+        $boost = $this->_query->getBoost();
+        $this->_queryWeight = $this->_idf * $boost;
+
+        // squared query weight
+        $weight = $this->_queryWeight;
+
+        return $weight * $weight;
+    }
+
+
+    /**
+     * Assigns the query normalization factor to this weight and derives the
+     * final weight value from it.
+     *
+     * @param float $queryNorm
+     */
+    public function normalize($queryNorm)
+    {
+        $this->_queryNorm = $queryNorm;
+
+        // scale the query weight by the normalization factor
+        $this->_queryWeight = $this->_queryWeight * $queryNorm;
+
+        // final value: normalized query weight times idf
+        $this->_value = $this->_queryWeight * $this->_idf;
+    }
+}
+
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/Term.php b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/Term.php
new file mode 100644
index 00000000..3e6102f3
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Search/Weight/Term.php
@@ -0,0 +1,144 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package    Zend_Search_Lucene
+ * @subpackage Search
+ * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc.
(http://www.zend.com)
+ * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/** Zend_Search_Lucene_Search_Weight */
+require_once 'Zend/Search/Lucene/Search/Weight.php';
+
+
+/**
+ * @package    Zend_Search_Lucene
+ * @subpackage Search
+ * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+class Zend_Search_Lucene_Search_Weight_Term extends Zend_Search_Lucene_Search_Weight
+{
+    /**
+     * IndexReader.
+     *
+     * @var Zend_Search_Lucene
+     */
+    private $_reader;
+
+    /**
+     * Term
+     *
+     * @var Zend_Search_Lucene_Index_Term
+     */
+    private $_term;
+
+    /**
+     * The query that this concerns.
+     *
+     * @var Zend_Search_Lucene_Search_Query
+     */
+    private $_query;
+
+    /**
+     * Weight value; assigned by normalize() as query weight * idf.
+     *
+     * @var float
+     */
+    private $_value;
+
+    /**
+     * Score factor (idf of the term); assigned by sumOfSquaredWeights().
+     *
+     * @var float
+     */
+    private $_idf;
+
+    /**
+     * Normalization factor; the value last passed to normalize().
+     *
+     * @var float
+     */
+    private $_queryNorm;
+
+
+    /**
+     * Query weight; idf * query boost, multiplied by the query norm in normalize().
+     *
+     * @var float
+     */
+    private $_queryWeight;
+
+
+    /**
+     * Zend_Search_Lucene_Search_Weight_Term constructor
+     *
+     * Only stores its arguments; the weight values are computed later by
+     * sumOfSquaredWeights() and normalize().
+     *
+     * @param Zend_Search_Lucene_Index_Term $term  the term this weight is for
+     * @param Zend_Search_Lucene_Search_Query $query  the query that this concerns
+     * @param Zend_Search_Lucene $reader  index reader
+     */
+    public function __construct($term, $query, $reader)
+    {
+        $this->_term   = $term;
+        $this->_query  = $query;
+        $this->_reader = $reader;
+    }
+
+
+    /**
+     * The weight for this query; not set until normalize() has been called.
+     *
+     * @return float
+     */
+    public function getValue()
+    {
+        return $this->_value;
+    }
+
+
+    /**
+     * The sum of squared weights of contained query clauses.
+     *
+     * Computes and stores the idf of the term and the query weight
+     * (idf * boost), then returns the square of that query weight.
+     *
+     * @return float
+     */
+    public function sumOfSquaredWeights()
+    {
+        // idf of the term, as computed by the reader's similarity
+        $similarity = $this->_reader->getSimilarity();
+        $this->_idf = $similarity->idf($this->_term, $this->_reader);
+
+        // query weight = idf * boost
+        $boost = $this->_query->getBoost();
+        $this->_queryWeight = $this->_idf * $boost;
+
+        // squared query weight
+        $weight = $this->_queryWeight;
+
+        return $weight * $weight;
+    }
+
+
+    /**
+     * Assigns the query normalization factor to this weight and derives the
+     * final weight value from it.
+     *
+     * @param float $queryNorm
+     */
+    public function normalize($queryNorm)
+    {
+        $this->_queryNorm = $queryNorm;
+
+        // scale the query weight by the normalization factor
+        $this->_queryWeight = $this->_queryWeight * $queryNorm;
+
+        // final value: normalized query weight times idf
+        $this->_value = $this->_queryWeight * $this->_idf;
+    }
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Storage/Directory.php b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/Directory.php
new file mode 100644
index 00000000..48114a76
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/Directory.php
@@ -0,0 +1,118 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package    Zend_Search_Lucene
+ * @subpackage Storage
+ * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/**
+ * @package    Zend_Search_Lucene
+ * @subpackage Storage
+ * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc.
(http://www.zend.com)
+ * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+abstract class Zend_Search_Lucene_Storage_Directory
+{
+
+    /**
+     * Closes the store.
+     *
+     * @return void
+     */
+    abstract public function close();
+
+    /**
+     * Returns an array of strings, one for each file in the directory.
+     *
+     * @return array
+     */
+    abstract public function fileList();
+
+    /**
+     * Creates a new, empty file in the directory with the given $filename.
+     *
+     * @param string $filename
+     * @return Zend_Search_Lucene_Storage_File
+     */
+    abstract public function createFile($filename);
+
+
+    /**
+     * Removes the existing file $filename from the directory.
+     *
+     * @param string $filename
+     * @return void
+     */
+    abstract public function deleteFile($filename);
+
+
+    /**
+     * Returns true if a file with the given $filename exists.
+     *
+     * @param string $filename
+     * @return boolean
+     */
+    abstract public function fileExists($filename);
+
+
+    /**
+     * Returns the length of the file $filename in the directory.
+     *
+     * @param string $filename
+     * @return integer
+     */
+    abstract public function fileLength($filename);
+
+
+    /**
+     * Returns the UNIX timestamp at which $filename was last modified.
+     *
+     * @param string $filename
+     * @return integer
+     */
+    abstract public function fileModified($filename);
+
+
+    /**
+     * Renames an existing file in the directory.
+     *
+     * @param string $from
+     * @param string $to
+     * @return void
+     */
+    abstract public function renameFile($from, $to);
+
+
+    /**
+     * Sets the modified time of $filename to now.
+     *
+     * @param string $filename
+     * @return void
+     */
+    abstract public function touchFile($filename);
+
+
+    /**
+     * Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory.
+     *
+     * @param string $filename
+     * @return Zend_Search_Lucene_Storage_File
+     */
+    abstract public function getFileObject($filename);
+
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Storage/Directory/Filesystem.php b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/Directory/Filesystem.php
new file mode 100644
index 00000000..73d10659
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/Directory/Filesystem.php
@@ -0,0 +1,269 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package    Zend_Search_Lucene
+ * @subpackage Storage
+ * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/** Zend_Search_Lucene_Storage_Directory */
+require_once 'Zend/Search/Lucene/Storage/Directory.php';
+
+/** Zend_Search_Lucene_Storage_File_Filesystem */
+require_once 'Zend/Search/Lucene/Storage/File/Filesystem.php';
+
+
+/**
+ * FileSystem implementation of Directory abstraction.
+ *
+ * Files are opened as Zend_Search_Lucene_Storage_File_Filesystem objects,
+ * which are cached per file name and closed again by close().
+ *
+ * @package    Zend_Search_Lucene
+ * @subpackage Storage
+ * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc.
(http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +class Zend_Search_Lucene_Storage_Directory_Filesystem extends Zend_Search_Lucene_Storage_Directory +{ +    /** +     * Filesystem path to the directory +     * +     * @var string +     */ +    private $_dirPath = null; + +    /** +     * Cache for Zend_Search_Lucene_Storage_File_Filesystem objects +     * Array: filename => Zend_Search_Lucene_Storage_File object +     * +     * @var array +     * @throws Zend_Search_Lucene_Exception +     */ +    private $_fileHandlers; + + +    /** +     * Utility function to recursive directory creation +     * +     * @param string $dir +     * @param integer $mode +     * @param boolean $recursive +     * @return boolean +     */ + +    static public function mkdirs($dir, $mode = 0777, $recursive = true) +    { +        if (is_null($dir) || $dir === '') { +            return false; +        } +        if (is_dir($dir) || $dir === '/') { +            return true; +        } +        if (self::mkdirs(dirname($dir), $mode, $recursive)) { +            return mkdir($dir, $mode); +        } +        return false; +    } + + +    /** +     * Object constructor +     * Checks if $path is a directory or tries to create it. +     * +     * @param string $path +     * @throws Zend_Search_Lucene_Exception +     */ +    public function __construct($path) +    { +        if (!is_dir($path)) { +            if (file_exists($path)) { +                throw new Zend_Search_Lucene_Exception('Path exists, but it\'s not a directory'); +            } else { +                if (!self::mkdirs($path)) { +                    throw new Zend_Search_Lucene_Exception("Can't create directory '$path'."); +                } +            } +        } +        $this->_dirPath = $path; +        $this->_fileHandlers = array(); +    } + + +    /** +     * Closes the store. 
+     * +     * @return void +     */ +    public function close() +    { +        foreach ($this->_fileHandlers as $fileObject) { +            $fileObject->close(); +        } + +        unset($this->_fileHandlers); +    } + + +    /** +     * Returns an array of strings, one for each file in the directory. +     * +     * @return array +     */ +    public function fileList() +    { +        $result = array(); + +        $dirContent = opendir( $this->_dirPath ); +        while ($file = readdir($dirContent)) { +            if (($file == '..')||($file == '.'))   continue; + +            $fullName = $this->_dirPath . '/' . $file; + +            if( !is_dir($this->_dirPath . '/' . $file) ) { +                $result[] = $file; +            } +        } + +        return $result; +    } + +    /** +     * Creates a new, empty file in the directory with the given $filename. +     * +     * @param string $filename +     * @return Zend_Search_Lucene_Storage_File +     */ +    public function createFile($filename) +    { +        if (isset($this->_fileHandlers[$filename])) { +            $this->_fileHandlers[$filename]->close(); +        } +        unset($this->_fileHandlers[$filename]); +        $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($this->_dirPath . '/' . $filename, 'w+b'); +        return $this->_fileHandlers[$filename]; +    } + + +    /** +     * Removes an existing $filename in the directory. +     * +     * @param string $filename +     * @return void +     */ +    public function deleteFile($filename) +    { +        if (isset($this->_fileHandlers[$filename])) { +            $this->_fileHandlers[$filename]->close(); +        } +        unset($this->_fileHandlers[$filename]); +        unlink($this->_dirPath .'/'. $filename); +    } + + +    /** +     * Returns true if a file with the given $filename exists. 
+     * +     * @param string $filename +     * @return boolean +     */ +    public function fileExists($filename) +    { +        return file_exists($this->_dirPath .'/'. $filename); +    } + + +    /** +     * Returns the length of a $filename in the directory. +     * +     * @param string $filename +     * @return integer +     */ +    public function fileLength($filename) +    { +        if (isset( $this->_fileHandlers[$filename] )) { +            return $this->_fileHandlers[$filename]->size(); +        } +        return filesize($this->_dirPath .'/'. $filename); +    } + + +    /** +     * Returns the UNIX timestamp $filename was last modified. +     * +     * @param string $filename +     * @return integer +     */ +    public function fileModified($filename) +    { +        return filemtime($this->_dirPath .'/'. $filename); +    } + + +    /** +     * Renames an existing file in the directory. +     * +     * @param string $from +     * @param string $to +     * @return void +     */ +    public function renameFile($from, $to) +    { +        if ($this->_fileHandlers[$from] !== null) { +            $this->_fileHandlers[$from]->close(); +        } +        unset($this->_fileHandlers[$from]); + +        if ($this->_fileHandlers[$to] !== null) { +            $this->_fileHandlers[$to]->close(); +        } +        unset($this->_fileHandlers[$to]); + +        if (file_exists($this->_dirPath . '/' . $to)) { +            unlink($this->_dirPath . '/' . $to); +        } + +        return @rename($this->_dirPath . '/' . $from, $this->_dirPath . '/' . $to); +    } + + +    /** +     * Sets the modified time of $filename to now. +     * +     * @param string $filename +     * @return void +     */ +    public function touchFile($filename) +    { +        return touch($this->_dirPath .'/'. $filename); +    } + + +    /** +     * Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory. 
+     * +     * @param string $filename +     * @return Zend_Search_Lucene_Storage_File +     */ +    public function getFileObject($filename) +    { +        if (isset( $this->_fileHandlers[$filename] )) { +            $this->_fileHandlers[$filename]->seek(0); +            return $this->_fileHandlers[$filename]; +        } + +        $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($this->_dirPath . '/' . $filename, 'rb'); +        return $this->_fileHandlers[$filename]; +    } +} + diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Storage/File.php b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/File.php new file mode 100644 index 00000000..f62af33a --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/File.php @@ -0,0 +1,376 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package    Zend_Search_Lucene + * @subpackage Storage + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** + * @package    Zend_Search_Lucene + * @subpackage Storage + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +abstract class Zend_Search_Lucene_Storage_File +{ + +    /** +     * Class constructor.  Open the file. 
+     */ +    abstract public function __construct($filename, $mode='r'); + + +    /** +     * Reads $length number of bytes at the current position in the +     * file and advances the file pointer. +     * +     * @param integer $length +     * @return string +     */ +    abstract protected function _fread($length=1); + + +    /** +     * Sets the file position indicator and advances the file pointer. +     * The new position, measured in bytes from the beginning of the file, +     * is obtained by adding offset to the position specified by whence, +     * whose values are defined as follows: +     * SEEK_SET - Set position equal to offset bytes. +     * SEEK_CUR - Set position to current location plus offset. +     * SEEK_END - Set position to end-of-file plus offset. (To move to +     * a position before the end-of-file, you need to pass a negative value +     * in offset.) +     * Upon success, returns 0; otherwise, returns -1 +     * +     * @param integer $offset +     * @param integer $whence +     * @return integer +     */ +    abstract public function seek($offset, $whence=SEEK_SET); + +    /** +     * Get file position. +     * +     * @return integer +     */ +    abstract public function tell(); + +    /** +     * Writes $length number of bytes (all, if $length===null) to the end +     * of the file. +     * +     * @param string $data +     * @param integer $length +     */ +    abstract protected function _fwrite($data, $length=null); + + +    /** +     * Reads a byte from the current position in the file +     * and advances the file pointer. +     * +     * @return integer +     */ +    public function readByte() +    { +        return ord($this->_fread(1)); +    } + +    /** +     * Writes a byte to the end of the file. 
+     * +     * @param integer $byte +     */ +    public function writeByte($byte) +    { +        return $this->_fwrite(chr($byte), 1); +    } + +    /** +     * Read num bytes from the current position in the file +     * and advances the file pointer. +     * +     * @param integer $num +     * @return string +     */ +    public function readBytes($num) +    { +        return $this->_fread($num); +    } + +    /** +     * Writes num bytes of data (all, if $num===null) to the end +     * of the file. +     * +     * @param string $data +     * @param integer $num +     */ +    public function writeBytes($data, $num=null) +    { +        $this->_fwrite($data, $num); +    } + + +    /** +     * Reads an integer from the current position in the file +     * and advances the file pointer. +     * +     * @return integer +     */ +    public function readInt() +    { +        $str = $this->_fread(4); + +        return  ord($str{0}) << 24 | +                ord($str{1}) << 16 | +                ord($str{2}) << 8  | +                ord($str{3}); +    } + + +    /** +     * Writes an integer to the end of file. +     * +     * @param integer $value +     */ +    public function writeInt($value) +    { +        settype($value, 'integer'); +        $this->_fwrite( chr($value>>24 & 0xFF) . +                        chr($value>>16 & 0xFF) . +                        chr($value>>8  & 0xFF) . +                        chr($value     & 0xFF),   4  ); +    } + + +    /** +     * Returns a long integer from the current position in the file +     * and advances the file pointer. +     * +     * @return integer +     */ +    public function readLong() +    { +        $str = $this->_fread(8); + +        /** +         * PHP uses long as largest integer. fseek() uses long for offset. +         * long has 4 bytes in a lot of systems. 4 bytes are discarded to prevent +         * conversion to float. 
+         * So, largest index segment file is 2Gb +         */ +        return  /* ord($str{0}) << 56  | */ +                /* ord($str{1}) << 48  | */ +                /* ord($str{2}) << 40  | */ +                /* ord($str{3}) << 32  | */ +                ord($str{4}) << 24  | +                ord($str{5}) << 16  | +                ord($str{6}) << 8   | +                ord($str{7}); +    } + +    /** +     * Writes long integer to the end of file +     * +     * @param integer $value +     */ +    public function writeLong($value) +    { +        /** +         * PHP uses long as largest integer. fseek() uses long for offset. +         * long has 4 bytes in a lot of systems. 4 bytes are discarded to prevent +         * conversion to float. +         * So, largest index segment file is 2Gb +         */ +        settype($value, 'integer'); +        $this->_fwrite( "\x00\x00\x00\x00"     . +                        chr($value>>24 & 0xFF) . +                        chr($value>>16 & 0xFF) . +                        chr($value>>8  & 0xFF) . +                        chr($value     & 0xFF),   8  ); +    } + + + +    /** +     * Returns a variable-length integer from the current +     * position in the file and advances the file pointer. +     * +     * @return integer +     */ +    public function readVInt() +    { +        $nextByte = ord($this->_fread(1)); +        $val = $nextByte & 0x7F; + +        for ($shift=7; ($nextByte & 0x80) != 0; $shift += 7) { +            $nextByte = ord($this->_fread(1)); +            $val |= ($nextByte & 0x7F) << $shift; +        } +        return $val; +    } + +    /** +     * Writes a variable-length integer to the end of file. 
+     * +     * @param integer $value +     */ +    public function writeVInt($value) +    { +        settype($value, 'integer'); +        while ($value > 0x7F) { +            $this->_fwrite(chr( ($value & 0x7F)|0x80 )); +            $value >>= 7; +        } +        $this->_fwrite(chr($value)); +    } + + +    /** +     * Reads a string from the current position in the file +     * and advances the file pointer. +     * +     * @return string +     */ +    public function readString() +    { +        $strlen = $this->readVInt(); +        if ($strlen == 0) { +            return ''; +        } else { +            /** +             * This implementation supports only Basic Multilingual Plane +             * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support +             * "supplementary characters" (characters whose code points are +             * greater than 0xFFFF) +             * Java 2 represents these characters as a pair of char (16-bit) +             * values, the first from the high-surrogates range (0xD800-0xDBFF), +             * the second from the low-surrogates range (0xDC00-0xDFFF). Then +             * they are encoded as usual UTF-8 characters in six bytes. +             * Standard UTF-8 representation uses four bytes for supplementary +             * characters. +             */ + +            $str_val = $this->_fread($strlen); + +            for ($count = 0; $count < $strlen; $count++ ) { +                if (( ord($str_val{$count}) & 0xC0 ) == 0xC0) { +                    $addBytes = 1; +                    if (ord($str_val{$count}) & 0x20 ) { +                        $addBytes++; + +                        // Never used. 
Java2 doesn't encode strings in four bytes +                        if (ord($str_val{$count}) & 0x10 ) { +                            $addBytes++; +                        } +                    } +                    $str_val .= $this->_fread($addBytes); +                    $strlen += $addBytes; + +                    // Check for null character. Java2 encodes null character +                    // in two bytes. +                    if (ord($str_val{$count})   == 0xC0 && +                        ord($str_val{$count+1}) == 0x80   ) { +                        $str_val{$count} = 0; +                        $str_val = substr($str_val,0,$count+1) +                                 . substr($str_val,$count+2); +                    } +                    $count += $addBytes; +                } +            } + +            return $str_val; +        } +    } + +    /** +     * Writes a string to the end of file. +     * +     * @param string $str +     * @throws Zend_Search_Lucene_Exception +     */ +    public function writeString($str) +    { +        /** +         * This implementation supports only Basic Multilingual Plane +         * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support +         * "supplementary characters" (characters whose code points are +         * greater than 0xFFFF) +         * Java 2 represents these characters as a pair of char (16-bit) +         * values, the first from the high-surrogates range (0xD800-0xDBFF), +         * the second from the low-surrogates range (0xDC00-0xDFFF). Then +         * they are encoded as usual UTF-8 characters in six bytes. +         * Standard UTF-8 representation uses four bytes for supplementary +         * characters. 
+         */ + +        // convert input to a string before iterating string characters +        settype($str, 'string'); + +        $chars = $strlen = strlen($str); +        $containNullChars = false; + +        for ($count = 0; $count < $strlen; $count++ ) { +            /** +             * String is already in Java 2 representation. +             * We should only calculate actual string length and replace +             * \x00 by \xC0\x80 +             */ +            if ((ord($str{$count}) & 0xC0) == 0xC0) { +                $addBytes = 1; +                if (ord($str{$count}) & 0x20 ) { +                    $addBytes++; + +                    // Never used. Java2 doesn't encode strings in four bytes +                    // and we dont't support non-BMP characters +                    if (ord($str{$count}) & 0x10 ) { +                        $addBytes++; +                    } +                } +                $chars -= $addBytes; + +                if (ord($str{$count}) == 0 ) { +                    $containNullChars = true; +                } +                $count += $addBytes; +            } +        } + +        if ($chars < 0) { +            throw new Zend_Search_Lucene_Exception('Invalid UTF-8 string'); +        } + +        $this->writeVInt($chars); +        if ($containNullChars) { +            $this->_fwrite(str_replace($str, "\x00", "\xC0\x80")); +        } else { +            $this->_fwrite($str); +        } +    } + + +    /** +     * Reads binary data from the current position in the file +     * and advances the file pointer. +     * +     * @return string +     */ +    public function readBinary() +    { +        return $this->_fread($this->readVInt()); +    } +}
\ No newline at end of file diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Storage/File/Filesystem.php b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/File/Filesystem.php new file mode 100644 index 00000000..fc6adcf5 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/Lucene/Storage/File/Filesystem.php @@ -0,0 +1,170 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package    Zend_Search_Lucene + * @subpackage Storage + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Storage_File */ +require_once 'Zend/Search/Lucene/Storage/File.php'; + +/** Zend_Search_Lucene_Exception */ +require_once 'Zend/Search/Lucene/Exception.php'; + + +/** + * @package    Zend_Search_Lucene + * @subpackage Storage + * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + * + */ +class Zend_Search_Lucene_Storage_File_Filesystem extends Zend_Search_Lucene_Storage_File +{ +    /** +     * Resource of the open file +     * +     * @var resource +     */ +    private $_fileHandle; + + +    /** +     * Class constructor.  Open the file. 
+     * +     * @param string $filename +     * @param string $mode +     */ +    public function __construct($filename, $mode='rb') +    { +        global $php_errormsg; + +        $trackErrors = ini_get( "track_errors"); +        ini_set('track_errors', '1'); + +        $this->_fileHandle = @fopen($filename, $mode); + +        if ($this->_fileHandle===false) { +            ini_set('track_errors', $trackErrors); +            throw new Zend_Search_Lucene_Exception($php_errormsg); +        } + +        ini_set('track_errors', $trackErrors); +    } + + +    /** +     * Sets the file position indicator and advances the file pointer. +     * The new position, measured in bytes from the beginning of the file, +     * is obtained by adding offset to the position specified by whence, +     * whose values are defined as follows: +     * SEEK_SET - Set position equal to offset bytes. +     * SEEK_CUR - Set position to current location plus offset. +     * SEEK_END - Set position to end-of-file plus offset. (To move to +     * a position before the end-of-file, you need to pass a negative value +     * in offset.) +     * Upon success, returns 0; otherwise, returns -1 +     * +     * @param integer $offset +     * @param integer $whence +     * @return integer +     */ +    public function seek($offset, $whence=SEEK_SET) +    { +        return fseek($this->_fileHandle, $offset, $whence); +    } + + +    /** +     * Get file position. 
+     * +     * @return integer +     */ +    public function tell() +    { +        return ftell($this->_fileHandle); +    } + + +    /** +     * Close File object +     */ +    public function close() +    { +        if ($this->_fileHandle !== null ) { +            @fclose($this->_fileHandle); +            $this->_fileHandle = null; +        } +    } + +    /** +     * Get the size of the already opened file +     * +     * @return integer +     */ +    public function size() +    { +        $position = ftell($this->_fileHandle); +        fseek($this->_fileHandle, 0, SEEK_END); +        $size = ftell($this->_fileHandle); +        fseek($this->_fileHandle,$position); + +        return $size; +    } + +    /** +     * Read a $length bytes from the file and advance the file pointer. +     * +     * @param integer $length +     * @return string +     */ +    protected function _fread($length=1) +    { +        if ($length == 0) { +            return ''; +        } + +        if ($length < 1024) { +            return fread($this->_fileHandle, $length); +        } + +        $data = ''; +        while ( $length > 0 && ($nextBlock = fread($this->_fileHandle, $length)) != false ) { +            $data .= $nextBlock; +            $length -= strlen($nextBlock); +        } +        return $data; +    } + + +    /** +     * Writes $length number of bytes (all, if $length===null) to the end +     * of the file. 
+     * +     * @param string $data +     * @param integer $length +     */ +    protected function _fwrite($data, $length=null) +    { +        if ($length === null ) { +            fwrite($this->_fileHandle, $data); +        } else { +            fwrite($this->_fileHandle, $data, $length); +        } +    } +} + diff --git a/buildscripts/texbuilder/Zend/Search/TODO.txt b/buildscripts/texbuilder/Zend/Search/TODO.txt new file mode 100644 index 00000000..06f7b487 --- /dev/null +++ b/buildscripts/texbuilder/Zend/Search/TODO.txt @@ -0,0 +1,14 @@ +@todo + +- Improve API: fix ZSearchMultiTermQuery($terms, $signs); + +- Analysis and indexing engine + +- Additional queries: phrase, wildcard, proximity, and range + +- Better class-level docblocks (most functions okay) + +- Some Windows issues(?) during indexing + +- Finish renaming classes to PEAR-like conventions + diff --git a/buildscripts/texbuilder/build.php b/buildscripts/texbuilder/build.php index 4e2d8be6..20418248 100644 --- a/buildscripts/texbuilder/build.php +++ b/buildscripts/texbuilder/build.php @@ -10,6 +10,10 @@ $mainTexFile = dirname(__FILE__).'/prado3_quick_start.tex';  //page root location
  $base = realpath(dirname(__FILE__).'/../../demos/quickstart/protected/pages/');
 +//search index data directory
 +//NOTE: realpath() returns false when the path does not exist, in which
 +//case $index_dir is false rather than a path string
 +$index_dir = realpath(dirname(__FILE__).'/../../demos/quickstart/protected/index/data');
 +
 +
  //list page into chapters
  $pages['Getting Started'] = array(
  	'GettingStarted/Introduction.page',
 @@ -94,6 +98,11 @@ $pages['Advanced Topics'] = array(  	'Advanced/Error.page',
  	'Advanced/Performance.page');
 +//chapter "Client-side Scripting": its sections, in the order listed
 +$pages['Client-side Scripting'] = array(
 +	'Advanced/Scripts.page',
 +	'Advanced/Scripts1.page',
 +	'Advanced/Scripts2.page',
 +	'Advanced/Scripts3.page');
  //-------------- END CONFIG ------------------
 @@ -273,12 +282,51 @@ function get_section_label($section)  	return '\hypertarget{'.str_replace('/', '.', $section).'}{}';
  }
 +
 +/**
 + * Gives every attribute-less <h1>, <h2> and <h3> tag in $content a numeric id.
 + *
 + * The global $header_count is reset to $count*100 and incremented once per
 + * replaced tag, so the first id handed out is $count*100 + 1. All <h1> tags
 + * are numbered first, then all <h2>, then all <h3>. Tags that already carry
 + * attributes do not match and are left untouched.
 + *
 + * @param string $content page markup
 + * @param integer $count page number used as the id base
 + * @return string markup with ids added
 + */
 +function set_header_id($content, $count)
 +{
 +	global $header_count;
 +	$header_count = $count*100;
 +	// each tag name doubles as the name of its callback function
 +	foreach(array('h1', 'h2', 'h3') as $tag)
 +		$content = preg_replace_callback('/<'.$tag.'>/', $tag, $content);
 +	return $content;
 +}
 +
 +/**
 + * preg_replace_callback handler: returns an opening <h1> tag whose id is the
 + * next value of the global $header_count.
 + *
 + * @param array $matches regex match data (not used)
 + * @return string
 + */
 +function h1($matches)
 +{
 +	global $header_count;
 +	$header_count++;
 +	return '<h1 id="'.$header_count.'">';
 +}
 +
 +/**
 + * preg_replace_callback handler: returns an opening <h2> tag whose id is the
 + * next value of the global $header_count.
 + *
 + * @param array $matches regex match data (not used)
 + * @return string
 + */
 +function h2($matches)
 +{
 +	global $header_count;
 +	$header_count++;
 +	return '<h2 id="'.$header_count.'">';
 +}
 +
 +/**
 + * preg_replace_callback handler: returns an opening <h3> tag whose id is the
 + * next value of the global $header_count.
 + *
 + * @param array $matches regex match data (not used)
 + * @return string
 + */
 +function h3($matches)
 +{
 +	global $header_count;
 +	$header_count++;
 +	return '<h3 id="'.$header_count.'">';
 +}
 +
 +$header_count = 0;
 +
  //--------------- BEGIN PROCESSING -------------------
 +
 +//--------------- Indexer -------------------
 +
 +//quickstart_index is declared in create_index.php; it is constructed with
 +//the index data directory and receives one add() call per page in the
 +//loop below, followed by a single commit()
 +require_once('create_index.php');
 +$indexer = new quickstart_index($index_dir);
 +
  // ---------------- Create the Tex files ---------
  $count = 1;
 +$j = 1;
  $current_path = '';
  echo "Compiling .page files to Latex files\n\n";
 +
  foreach($pages as $chapter => $sections)
  {
  	$content = '\chapter{'.$chapter.'}'.get_chapter_label($chapter);
 @@ -289,8 +337,16 @@ foreach($pages as $chapter => $sections)  		echo "    Adding $section\n";
  		$page = $base.'/'.$section;
  		$current_path = $page;
 +		
 +		//add id to <h1>, <h2>, <h3> and write the page back to disk.
 +		//The page HTML is kept in its own variable: $content accumulates the
 +		//chapter's Tex output and must not be overwritten here.
 +		$page_html = set_header_id(file_get_contents($page),$j++);
 +		file_put_contents($page, $page_html);
 +		
  		$content .= get_section_label($section);
 -		$content .= parse_html($page,file_get_contents($page));
 +		$file_content = file_get_contents($page);
 +		$tex = parse_html($page,$file_content);
 +		$content .= $tex;
 +		$indexer->add($file_content,$section, filemtime($page));
  	}
  	//var_dump($content);
 @@ -299,6 +355,7 @@ foreach($pages as $chapter => $sections)  	echo "\n";
  }
 +$indexer->commit();
  if($argc <= 1 && $count > 1)
  {
 diff --git a/buildscripts/texbuilder/create_index.php b/buildscripts/texbuilder/create_index.php new file mode 100644 index 00000000..db72c453 --- /dev/null +++ b/buildscripts/texbuilder/create_index.php @@ -0,0 +1,87 @@ +<?php
 +
 +// Create quickstart search index
 +require_once (dirname(__FILE__).'/Zend/Search/Lucene.php');
 +
 +/**
 + * Builds the quickstart search index: every page is split at its headings and
 + * one Zend_Search_Lucene_Document is stored per heading section.
 + */
 +class quickstart_index
 +{
 +	/** @var Zend_Search_Lucene the index being written */
 +	private $_index;
 +	/** @var string index location, kept for the message printed by commit() */
 +	private $_dir;
 +
 +	/**
 +	 * Opens the index at $index_file and announces the build on stdout.
 +	 *
 +	 * @param string $index_file index location handed to Zend_Search_Lucene
 +	 */
 +	public function __construct($index_file)
 +	{
 +		// NOTE(review): the second argument is presumably the "create a new
 +		// index" flag - confirm against Zend_Search_Lucene's constructor
 +		$this->_index = new Zend_Search_Lucene($index_file, true);
 +		$this->_dir = $index_file;
 +		echo "Building search index...\n";
 +	}
 +
 +	/**
 +	 * Adds one document per heading section of a page to the index.
 +	 *
 +	 * @param string $content raw page markup
 +	 * @param string $section page path relative to the pages root, e.g. 'Advanced/Scripts.page'
 +	 * @param integer $mtime modification time stored with every document
 +	 * @return void
 +	 */
 +	public function add($content, $section, $mtime)
 +	{
 +		// the page part of the link is identical for every heading, so build
 +		// it once: path separators become dots and '.page' is dropped
 +		$page_link = "index.php?page=".preg_replace('/\/|\\\/', '.', $section);
 +		$page_link = str_replace('.page', '', $page_link);
 +
 +		foreach($this->split_headings($content) as $headers)
 +		{
 +			$doc = new Zend_Search_Lucene_Document();
 +			$link = $page_link.'#'.$headers['section'];
 +
 +			//unsearchable text
 +			$doc->addField(Zend_Search_Lucene_Field::UnIndexed('link', $link));
 +			$doc->addField(Zend_Search_Lucene_Field::UnIndexed('mtime', $mtime));
 +			$doc->addField(Zend_Search_Lucene_Field::UnIndexed('title', $headers['title']));
 +			$doc->addField(Zend_Search_Lucene_Field::UnIndexed('text', $headers['content']));
 +
 +			//searchable text
 +			$doc->addField(Zend_Search_Lucene_Field::Keyword('page', strtolower($headers['title'])));
 +			$body = strtolower($this->sanitize($headers['content']));
 +			$doc->addField(Zend_Search_Lucene_Field::Unstored('contents',$body));
 +			$this->_index->addDocument($doc);
 +		}
 +	}
 +
 +	/**
 +	 * Strips tags from $input and converts the remainder to HTML entities.
 +	 *
 +	 * Declared public explicitly; it was implicitly public before.
 +	 *
 +	 * @param string $input
 +	 * @return string
 +	 */
 +	public function sanitize($input)
 +	{
 +		return htmlentities(strip_tags( $input ));
 +	}
 +
 +	/**
 +	 * Returns the underlying index object.
 +	 *
 +	 * @return Zend_Search_Lucene
 +	 */
 +	public function index()
 +	{
 +		return $this->_index;
 +	}
 +
 +	/**
 +	 * Splits page markup into heading sections.
 +	 *
 +	 * Text before the first heading is discarded. Each returned element is an
 +	 * array with the keys 'title' (heading text), 'section' (the first
 +	 * double-quoted attribute value of the heading, or '' when it has none)
 +	 * and 'content' (sanitized text up to the next heading).
 +	 *
 +	 * @param string $html page markup
 +	 * @return array list of array('title' => ..., 'section' => ..., 'content' => ...)
 +	 */
 +	protected function split_headings($html)
 +	{
 +		$html = preg_replace('/<\/?com:TContent[^<]*>/', '', $html);
 +
 +		// unwrap <b>, <i>, <tt> (in that order); a heading that still contains
 +		// a tag would fail the [^<]* heading patterns below
 +		foreach(array('b', 'i', 'tt') as $tag)
 +			$html = preg_replace('/<'.$tag.'>([^<]*)<\/'.$tag.'>/', '$1', $html);
 +
 +		// rewrite <h1>/<h2>/<h3> to a common <hh> marker, keeping attributes
 +		foreach(array('h1', 'h2', 'h3') as $tag)
 +			$html = preg_replace('/<'.$tag.'([^>]*)>([^<]*)<\/'.$tag.'>/', '<hh$1>$2</hh>', $html);
 +
 +		// the same pattern is used to split the text and to collect the
 +		// headings, so $sections[$i] is the text following heading $i-1
 +		$pattern = '/<hh([^>]*)>([^<]+)<\/hh>/';
 +		$sections = preg_split($pattern, $html, -1);
 +		$headers = array();
 +		preg_match_all($pattern, $html, $headers);
 +
 +		$contents = array();
 +		for($i = 1, $t = count($sections); $i < $t; $i++)
 +		{
 +			// a heading without a quoted attribute gets an empty anchor
 +			// instead of triggering an undefined-offset read
 +			$sec = array();
 +			$id = preg_match('/"([^"]*)"/', $headers[1][$i-1], $sec) ? $sec[1] : '';
 +
 +			$contents[] = array(
 +				'title' => trim($this->sanitize($headers[2][$i-1])),
 +				'section' => $id,
 +				'content' => trim($this->sanitize($sections[$i])),
 +			);
 +		}
 +
 +		return $contents;
 +	}
 +
 +	/**
 +	 * Commits the index and prints the document count and index location.
 +	 *
 +	 * @return void
 +	 */
 +	public function commit()
 +	{
 +		$this->_index->commit();
 +		$count = $this->_index->count();
 +		echo "\nSaving search index ({$count}) to {$this->_dir}\n\n";
 +	}
 +}
 +?>
\ No newline at end of file diff --git a/buildscripts/texbuilder/prado3_quick_start.tex b/buildscripts/texbuilder/prado3_quick_start.tex index a4105685..4fc1bbd6 100644 --- a/buildscripts/texbuilder/prado3_quick_start.tex +++ b/buildscripts/texbuilder/prado3_quick_start.tex @@ -114,5 +114,6 @@ OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  \include{ch7}
  \include{ch8}
  \include{ch9}
 +\include{ch10}
  \end{document}
 | 
