diff options
Diffstat (limited to 'demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php')
-rw-r--r-- | demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php | 551 |
1 files changed, 551 insertions, 0 deletions
diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php new file mode 100644 index 00000000..8b758213 --- /dev/null +++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php @@ -0,0 +1,551 @@ +<?php +/** + * Zend Framework + * + * LICENSE + * + * This source file is subject to version 1.0 of the Zend Framework + * license, that is bundled with this package in the file LICENSE, and + * is available through the world-wide-web at the following URL: + * http://www.zend.com/license/framework/1_0.txt. If you did not receive + * a copy of the Zend Framework license and are unable to obtain it + * through the world-wide-web, please send a note to license@zend.com + * so we can mail you a copy immediately. + * + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ + + +/** Zend_Search_Lucene_Search_Similarity_Default */ +require_once 'Zend/Search/Lucene/Search/Similarity/Default.php'; + + +/** + * @package Zend_Search_Lucene + * @subpackage Search + * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com) + * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0 + */ +abstract class Zend_Search_Lucene_Search_Similarity +{ + /** + * The Similarity implementation used by default. + * + * @var Zend_Search_Lucene_Search_Similarity + */ + static private $_defaultImpl; + + /** + * Cache of decoded bytes. + * Array of floats + * + * @var array + */ + static private $_normTable = array( 0 => 0.0, + 1 => 5.820766E-10, + 2 => 6.9849193E-10, + 3 => 8.1490725E-10, + 4 => 9.313226E-10, + 5 => 1.1641532E-9, + 6 => 1.3969839E-9, + 7 => 1.6298145E-9, + 8 => 1.8626451E-9, + 9 => 2.3283064E-9, + 10 => 2.7939677E-9, + 11 => 3.259629E-9, + 12 => 3.7252903E-9, + 13 => 4.656613E-9, + 14 => 5.5879354E-9, + 15 => 6.519258E-9, + 16 => 7.4505806E-9, + 17 => 9.313226E-9, + 18 => 1.1175871E-8, + 19 => 1.3038516E-8, + 20 => 1.4901161E-8, + 21 => 1.8626451E-8, + 22 => 2.2351742E-8, + 23 => 2.6077032E-8, + 24 => 2.9802322E-8, + 25 => 3.7252903E-8, + 26 => 4.4703484E-8, + 27 => 5.2154064E-8, + 28 => 5.9604645E-8, + 29 => 7.4505806E-8, + 30 => 8.940697E-8, + 31 => 1.0430813E-7, + 32 => 1.1920929E-7, + 33 => 1.4901161E-7, + 34 => 1.7881393E-7, + 35 => 2.0861626E-7, + 36 => 2.3841858E-7, + 37 => 2.9802322E-7, + 38 => 3.5762787E-7, + 39 => 4.172325E-7, + 40 => 4.7683716E-7, + 41 => 5.9604645E-7, + 42 => 7.1525574E-7, + 43 => 8.34465E-7, + 44 => 9.536743E-7, + 45 => 1.1920929E-6, + 46 => 1.4305115E-6, + 47 => 1.66893E-6, + 48 => 1.9073486E-6, + 49 => 2.3841858E-6, + 50 => 2.861023E-6, + 51 => 3.33786E-6, + 52 => 3.8146973E-6, + 53 => 4.7683716E-6, + 54 => 5.722046E-6, + 55 => 6.67572E-6, + 56 => 7.6293945E-6, + 57 => 9.536743E-6, + 58 => 1.1444092E-5, + 59 => 1.335144E-5, + 60 => 1.5258789E-5, + 61 => 1.9073486E-5, + 62 => 2.2888184E-5, + 63 => 2.670288E-5, + 64 => 3.0517578E-5, + 65 => 3.8146973E-5, + 66 => 4.5776367E-5, + 67 => 5.340576E-5, + 68 => 6.1035156E-5, + 69 => 7.6293945E-5, + 70 => 9.1552734E-5, + 71 => 1.0681152E-4, + 72 => 1.2207031E-4, + 73 => 1.5258789E-4, + 74 => 1.8310547E-4, + 75 => 2.1362305E-4, + 76 => 2.4414062E-4, + 77 => 3.0517578E-4, + 78 => 3.6621094E-4, + 79 => 4.272461E-4, + 80 => 4.8828125E-4, + 81 => 6.1035156E-4, + 82 => 7.324219E-4, + 83 => 8.544922E-4, + 84 => 9.765625E-4, + 85 => 0.0012207031, + 86 => 0.0014648438, + 87 => 0.0017089844, + 88 => 0.001953125, + 89 => 0.0024414062, + 90 => 0.0029296875, + 91 => 0.0034179688, + 92 => 0.00390625, + 93 => 0.0048828125, + 94 => 0.005859375, + 95 => 0.0068359375, + 96 => 0.0078125, + 97 => 0.009765625, + 98 => 0.01171875, + 99 => 0.013671875, + 100 => 0.015625, + 101 => 0.01953125, + 102 => 0.0234375, + 103 => 0.02734375, + 104 => 0.03125, + 105 => 0.0390625, + 106 => 0.046875, + 107 => 0.0546875, + 108 => 0.0625, + 109 => 0.078125, + 110 => 0.09375, + 111 => 0.109375, + 112 => 0.125, + 113 => 0.15625, + 114 => 0.1875, + 115 => 0.21875, + 116 => 0.25, + 117 => 0.3125, + 118 => 0.375, + 119 => 0.4375, + 120 => 0.5, + 121 => 0.625, + 122 => 0.75, + 123 => 0.875, + 124 => 1.0, + 125 => 1.25, + 126 => 1.5, + 127 => 1.75, + 128 => 2.0, + 129 => 2.5, + 130 => 3.0, + 131 => 3.5, + 132 => 4.0, + 133 => 5.0, + 134 => 6.0, + 135 => 7.0, + 136 => 8.0, + 137 => 10.0, + 138 => 12.0, + 139 => 14.0, + 140 => 16.0, + 141 => 20.0, + 142 => 24.0, + 143 => 28.0, + 144 => 32.0, + 145 => 40.0, + 146 => 48.0, + 147 => 56.0, + 148 => 64.0, + 149 => 80.0, + 150 => 96.0, + 151 => 112.0, + 152 => 128.0, + 153 => 160.0, + 154 => 192.0, + 155 => 224.0, + 156 => 256.0, + 157 => 320.0, + 158 => 384.0, + 159 => 448.0, + 160 => 512.0, + 161 => 640.0, + 162 => 768.0, + 163 => 896.0, + 164 => 1024.0, + 165 => 1280.0, + 166 => 1536.0, + 167 => 1792.0, + 168 => 2048.0, + 169 => 2560.0, + 170 => 3072.0, + 171 => 3584.0, + 172 => 4096.0, + 173 => 5120.0, + 174 => 6144.0, + 175 => 7168.0, + 176 => 8192.0, + 177 => 10240.0, + 178 => 12288.0, + 179 => 14336.0, + 180 => 16384.0, + 181 => 20480.0, + 182 => 24576.0, + 183 => 28672.0, + 184 => 32768.0, + 185 => 40960.0, + 186 => 49152.0, + 187 => 57344.0, + 188 => 65536.0, + 189 => 81920.0, + 190 => 98304.0, + 191 => 114688.0, + 192 => 131072.0, + 193 => 163840.0, + 194 => 196608.0, + 195 => 229376.0, + 196 => 262144.0, + 197 => 327680.0, + 198 => 393216.0, + 199 => 458752.0, + 200 => 524288.0, + 201 => 655360.0, + 202 => 786432.0, + 203 => 917504.0, + 204 => 1048576.0, + 205 => 1310720.0, + 206 => 1572864.0, + 207 => 1835008.0, + 208 => 2097152.0, + 209 => 2621440.0, + 210 => 3145728.0, + 211 => 3670016.0, + 212 => 4194304.0, + 213 => 5242880.0, + 214 => 6291456.0, + 215 => 7340032.0, + 216 => 8388608.0, + 217 => 1.048576E7, + 218 => 1.2582912E7, + 219 => 1.4680064E7, + 220 => 1.6777216E7, + 221 => 2.097152E7, + 222 => 2.5165824E7, + 223 => 2.9360128E7, + 224 => 3.3554432E7, + 225 => 4.194304E7, + 226 => 5.0331648E7, + 227 => 5.8720256E7, + 228 => 6.7108864E7, + 229 => 8.388608E7, + 230 => 1.00663296E8, + 231 => 1.17440512E8, + 232 => 1.34217728E8, + 233 => 1.6777216E8, + 234 => 2.01326592E8, + 235 => 2.34881024E8, + 236 => 2.68435456E8, + 237 => 3.3554432E8, + 238 => 4.02653184E8, + 239 => 4.69762048E8, + 240 => 5.3687091E8, + 241 => 6.7108864E8, + 242 => 8.0530637E8, + 243 => 9.395241E8, + 244 => 1.07374182E9, + 245 => 1.34217728E9, + 246 => 1.61061274E9, + 247 => 1.87904819E9, + 248 => 2.14748365E9, + 249 => 2.68435456E9, + 250 => 3.22122547E9, + 251 => 3.75809638E9, + 252 => 4.2949673E9, + 253 => 5.3687091E9, + 254 => 6.4424509E9, + 255 => 7.5161928E9 ); + + + /** + * Set the default Similarity implementation used by indexing and search + * code. + * + * @param Zend_Search_Lucene_Search_Similarity $similarity + */ + static public function setDefault(Zend_Search_Lucene_Search_Similarity $similarity) + { + self::$_defaultImpl = $similarity; + } + + + /** + * Return the default Similarity implementation used by indexing and search + * code. + * + * @return Zend_Search_Lucene_Search_Similarity + */ + static public function getDefault() + { + if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) { + self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default(); + } + + return self::$_defaultImpl; + } + + + /** + * Computes the normalization value for a field given the total number of + * terms contained in a field. These values, together with field boosts, are + * stored in an index and multipled into scores for hits on each field by the + * search code. + * + * Matches in longer fields are less precise, so implemenations of this + * method usually return smaller values when 'numTokens' is large, + * and larger values when 'numTokens' is small. + * + * That these values are computed under + * IndexWriter::addDocument(Document) and stored then using + * encodeNorm(float). Thus they have limited precision, and documents + * must be re-indexed if this method is altered. + * + * fieldName - name of field + * numTokens - the total number of tokens contained in fields named + * 'fieldName' of 'doc'. + * Returns a normalization factor for hits on this field of this document + * + * @param string $fieldName + * @param integer $numTokens + * @return float + */ + abstract public function lengthNorm($fieldName, $numTokens); + + /** + * Computes the normalization value for a query given the sum of the squared + * weights of each of the query terms. This value is then multipled into the + * weight of each query term. + * + * This does not affect ranking, but rather just attempts to make scores + * from different queries comparable. + * + * sumOfSquaredWeights - the sum of the squares of query term weights + * Returns a normalization factor for query weights + * + * @param float $sumOfSquaredWeights + * @return float + */ + abstract public function queryNorm($sumOfSquaredWeights); + + + /** + * Decodes a normalization factor stored in an index. + * + * @param integer $byte + * @return float + */ + static public function decodeNorm($byte) + { + return self::$_normTable[$byte & 0xFF]; + } + + + /** + * Encodes a normalization factor for storage in an index. + * + * The encoding uses a five-bit exponent and three-bit mantissa, thus + * representing values from around 7x10^9 to 2x10^-9 with about one + * significant decimal digit of accuracy. Zero is also represented. + * Negative numbers are rounded up to zero. Values too large to represent + * are rounded down to the largest representable value. Positive values too + * small to represent are rounded up to the smallest positive representable + * value. + * + * @param float $f + * @return integer + */ + static function encodeNorm($f) + { + return self::_floatToByte($f); + } + + /** + * Float to byte conversion + * + * @param integer $b + * @return float + */ + static private function _floatToByte($f) + { + // round negatives up to zero + if ($f <= 0.0) { + return 0; + } + + // search for appropriate value + $lowIndex = 0; + $highIndex = 255; + while ($highIndex >= $lowIndex) { + // $mid = ($highIndex - $lowIndex)/2; + $mid = ($highIndex + $lowIndex) >> 1; + $delta = $f - self::$_normTable[$mid]; + + if ($delta < 0) { + $highIndex = $mid-1; + } elseif ($delta > 0) { + $lowIndex = $mid+1; + } else { + return $mid; // We got it! + } + } + + // round to closest value + if ($highIndex != 255 && + $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex+1] - $f ) { + return $highIndex + 1; + } else { + return $highIndex; + } + } + + + /** + * Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the idf(Term, Searcher) + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + * Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when 'freq' is large, and smaller values when 'freq' + * is small. + * + * freq - the frequency of a term within a document + * Returns a score factor based on a term's within-document frequency + * + * @param float $freq + * @return float + */ + abstract public function tf($freq); + + /** + * Computes the amount of a sloppy phrase match, based on an edit distance. + * This value is summed for each sloppy phrase match in a document to form + * the frequency that is passed to tf(float). + * + * A phrase match with a small edit distance to a document passage more + * closely matches the document, so implementations of this method usually + * return larger values when the edit distance is small and smaller values + * when it is large. + * + * distance - the edit distance of this sloppy phrase match + * Returns the frequency increment for this match + * + * @param integer $distance + * @return float + */ + abstract public function sloppyFreq($distance); + + + /** + * Computes a score factor for a simple term or a phrase. + * + * The default implementation is: + * return idfFreq(searcher.docFreq(term), searcher.maxDoc()); + * + * input - the term in question or array of terms + * reader - reader the document collection being searched + * Returns a score factor for the term + * + * @param mixed $input + * @param Zend_Search_Lucene $reader + * @return a score factor for the term + */ + public function idf($input, $reader) + { + if (!is_array($input)) { + return $this->idfFreq($reader->docFreq($input), $reader->count()); + } else { + $idf = 0.0; + foreach ($input as $term) { + $idf += $this->idfFreq($reader->docFreq($term), $reader->count()); + } + return $idf; + } + } + + /** + * Computes a score factor based on a term's document frequency (the number + * of documents which contain the term). This value is multiplied by the + * tf(int) factor for each term in the query and these products are + * then summed to form the initial score for a document. + * + * Terms that occur in fewer documents are better indicators of topic, so + * implemenations of this method usually return larger values for rare terms, + * and smaller values for common terms. + * + * docFreq - the number of documents which contain the term + * numDocs - the total number of documents in the collection + * Returns a score factor based on the term's document frequency + * + * @param integer $docFreq + * @param integer $numDocs + * @return float + */ + abstract public function idfFreq($docFreq, $numDocs); + + /** + * Computes a score factor based on the fraction of all query terms that a + * document contains. This value is multiplied into scores. + * + * The presence of a large portion of the query terms indicates a better + * match with the query, so implemenations of this method usually return + * larger values when the ratio between these parameters is large and smaller + * values when the ratio between them is small. + * + * overlap - the number of query terms matched in the document + * maxOverlap - the total number of terms in the query + * Returns a score factor based on term overlap with the query + * + * @param integer $overlap + * @param integer $maxOverlap + * @return float + */ + abstract public function coord($overlap, $maxOverlap); +} + |