summaryrefslogtreecommitdiff
path: root/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php
diff options
context:
space:
mode:
Diffstat (limited to 'demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php')
-rw-r--r--demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php551
1 files changed, 551 insertions, 0 deletions
diff --git a/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php
new file mode 100644
index 00000000..8b758213
--- /dev/null
+++ b/demos/quickstart/protected/index/Zend/Search/Lucene/Search/Similarity.php
@@ -0,0 +1,551 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Search
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/** Zend_Search_Lucene_Search_Similarity_Default */
+require_once 'Zend/Search/Lucene/Search/Similarity/Default.php';
+
+
+/**
+ * @package Zend_Search_Lucene
+ * @subpackage Search
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+abstract class Zend_Search_Lucene_Search_Similarity
+{
+ /**
+ * The Similarity implementation used by default.
+ *
+ * Assigned by setDefault(). When it does not hold a
+ * Zend_Search_Lucene_Search_Similarity instance, getDefault() stores a new
+ * Zend_Search_Lucene_Search_Similarity_Default here before returning it.
+ *
+ * @var Zend_Search_Lucene_Search_Similarity
+ */
+ static private $_defaultImpl;
+
+ /**
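+ * Lookup table of decoded norm values.
+ * Maps each encoded norm byte (0..255) to the float it represents, in
+ * ascending order; read by decodeNorm() and searched by _floatToByte().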
+ *
+ * @var array
+ */
+ static private $_normTable = array( 0 => 0.0,
+ 1 => 5.820766E-10,
+ 2 => 6.9849193E-10,
+ 3 => 8.1490725E-10,
+ 4 => 9.313226E-10,
+ 5 => 1.1641532E-9,
+ 6 => 1.3969839E-9,
+ 7 => 1.6298145E-9,
+ 8 => 1.8626451E-9,
+ 9 => 2.3283064E-9,
+ 10 => 2.7939677E-9,
+ 11 => 3.259629E-9,
+ 12 => 3.7252903E-9,
+ 13 => 4.656613E-9,
+ 14 => 5.5879354E-9,
+ 15 => 6.519258E-9,
+ 16 => 7.4505806E-9,
+ 17 => 9.313226E-9,
+ 18 => 1.1175871E-8,
+ 19 => 1.3038516E-8,
+ 20 => 1.4901161E-8,
+ 21 => 1.8626451E-8,
+ 22 => 2.2351742E-8,
+ 23 => 2.6077032E-8,
+ 24 => 2.9802322E-8,
+ 25 => 3.7252903E-8,
+ 26 => 4.4703484E-8,
+ 27 => 5.2154064E-8,
+ 28 => 5.9604645E-8,
+ 29 => 7.4505806E-8,
+ 30 => 8.940697E-8,
+ 31 => 1.0430813E-7,
+ 32 => 1.1920929E-7,
+ 33 => 1.4901161E-7,
+ 34 => 1.7881393E-7,
+ 35 => 2.0861626E-7,
+ 36 => 2.3841858E-7,
+ 37 => 2.9802322E-7,
+ 38 => 3.5762787E-7,
+ 39 => 4.172325E-7,
+ 40 => 4.7683716E-7,
+ 41 => 5.9604645E-7,
+ 42 => 7.1525574E-7,
+ 43 => 8.34465E-7,
+ 44 => 9.536743E-7,
+ 45 => 1.1920929E-6,
+ 46 => 1.4305115E-6,
+ 47 => 1.66893E-6,
+ 48 => 1.9073486E-6,
+ 49 => 2.3841858E-6,
+ 50 => 2.861023E-6,
+ 51 => 3.33786E-6,
+ 52 => 3.8146973E-6,
+ 53 => 4.7683716E-6,
+ 54 => 5.722046E-6,
+ 55 => 6.67572E-6,
+ 56 => 7.6293945E-6,
+ 57 => 9.536743E-6,
+ 58 => 1.1444092E-5,
+ 59 => 1.335144E-5,
+ 60 => 1.5258789E-5,
+ 61 => 1.9073486E-5,
+ 62 => 2.2888184E-5,
+ 63 => 2.670288E-5,
+ 64 => 3.0517578E-5,
+ 65 => 3.8146973E-5,
+ 66 => 4.5776367E-5,
+ 67 => 5.340576E-5,
+ 68 => 6.1035156E-5,
+ 69 => 7.6293945E-5,
+ 70 => 9.1552734E-5,
+ 71 => 1.0681152E-4,
+ 72 => 1.2207031E-4,
+ 73 => 1.5258789E-4,
+ 74 => 1.8310547E-4,
+ 75 => 2.1362305E-4,
+ 76 => 2.4414062E-4,
+ 77 => 3.0517578E-4,
+ 78 => 3.6621094E-4,
+ 79 => 4.272461E-4,
+ 80 => 4.8828125E-4,
+ 81 => 6.1035156E-4,
+ 82 => 7.324219E-4,
+ 83 => 8.544922E-4,
+ 84 => 9.765625E-4,
+ 85 => 0.0012207031,
+ 86 => 0.0014648438,
+ 87 => 0.0017089844,
+ 88 => 0.001953125,
+ 89 => 0.0024414062,
+ 90 => 0.0029296875,
+ 91 => 0.0034179688,
+ 92 => 0.00390625,
+ 93 => 0.0048828125,
+ 94 => 0.005859375,
+ 95 => 0.0068359375,
+ 96 => 0.0078125,
+ 97 => 0.009765625,
+ 98 => 0.01171875,
+ 99 => 0.013671875,
+ 100 => 0.015625,
+ 101 => 0.01953125,
+ 102 => 0.0234375,
+ 103 => 0.02734375,
+ 104 => 0.03125,
+ 105 => 0.0390625,
+ 106 => 0.046875,
+ 107 => 0.0546875,
+ 108 => 0.0625,
+ 109 => 0.078125,
+ 110 => 0.09375,
+ 111 => 0.109375,
+ 112 => 0.125,
+ 113 => 0.15625,
+ 114 => 0.1875,
+ 115 => 0.21875,
+ 116 => 0.25,
+ 117 => 0.3125,
+ 118 => 0.375,
+ 119 => 0.4375,
+ 120 => 0.5,
+ 121 => 0.625,
+ 122 => 0.75,
+ 123 => 0.875,
+ 124 => 1.0,
+ 125 => 1.25,
+ 126 => 1.5,
+ 127 => 1.75,
+ 128 => 2.0,
+ 129 => 2.5,
+ 130 => 3.0,
+ 131 => 3.5,
+ 132 => 4.0,
+ 133 => 5.0,
+ 134 => 6.0,
+ 135 => 7.0,
+ 136 => 8.0,
+ 137 => 10.0,
+ 138 => 12.0,
+ 139 => 14.0,
+ 140 => 16.0,
+ 141 => 20.0,
+ 142 => 24.0,
+ 143 => 28.0,
+ 144 => 32.0,
+ 145 => 40.0,
+ 146 => 48.0,
+ 147 => 56.0,
+ 148 => 64.0,
+ 149 => 80.0,
+ 150 => 96.0,
+ 151 => 112.0,
+ 152 => 128.0,
+ 153 => 160.0,
+ 154 => 192.0,
+ 155 => 224.0,
+ 156 => 256.0,
+ 157 => 320.0,
+ 158 => 384.0,
+ 159 => 448.0,
+ 160 => 512.0,
+ 161 => 640.0,
+ 162 => 768.0,
+ 163 => 896.0,
+ 164 => 1024.0,
+ 165 => 1280.0,
+ 166 => 1536.0,
+ 167 => 1792.0,
+ 168 => 2048.0,
+ 169 => 2560.0,
+ 170 => 3072.0,
+ 171 => 3584.0,
+ 172 => 4096.0,
+ 173 => 5120.0,
+ 174 => 6144.0,
+ 175 => 7168.0,
+ 176 => 8192.0,
+ 177 => 10240.0,
+ 178 => 12288.0,
+ 179 => 14336.0,
+ 180 => 16384.0,
+ 181 => 20480.0,
+ 182 => 24576.0,
+ 183 => 28672.0,
+ 184 => 32768.0,
+ 185 => 40960.0,
+ 186 => 49152.0,
+ 187 => 57344.0,
+ 188 => 65536.0,
+ 189 => 81920.0,
+ 190 => 98304.0,
+ 191 => 114688.0,
+ 192 => 131072.0,
+ 193 => 163840.0,
+ 194 => 196608.0,
+ 195 => 229376.0,
+ 196 => 262144.0,
+ 197 => 327680.0,
+ 198 => 393216.0,
+ 199 => 458752.0,
+ 200 => 524288.0,
+ 201 => 655360.0,
+ 202 => 786432.0,
+ 203 => 917504.0,
+ 204 => 1048576.0,
+ 205 => 1310720.0,
+ 206 => 1572864.0,
+ 207 => 1835008.0,
+ 208 => 2097152.0,
+ 209 => 2621440.0,
+ 210 => 3145728.0,
+ 211 => 3670016.0,
+ 212 => 4194304.0,
+ 213 => 5242880.0,
+ 214 => 6291456.0,
+ 215 => 7340032.0,
+ 216 => 8388608.0,
+ 217 => 1.048576E7,
+ 218 => 1.2582912E7,
+ 219 => 1.4680064E7,
+ 220 => 1.6777216E7,
+ 221 => 2.097152E7,
+ 222 => 2.5165824E7,
+ 223 => 2.9360128E7,
+ 224 => 3.3554432E7,
+ 225 => 4.194304E7,
+ 226 => 5.0331648E7,
+ 227 => 5.8720256E7,
+ 228 => 6.7108864E7,
+ 229 => 8.388608E7,
+ 230 => 1.00663296E8,
+ 231 => 1.17440512E8,
+ 232 => 1.34217728E8,
+ 233 => 1.6777216E8,
+ 234 => 2.01326592E8,
+ 235 => 2.34881024E8,
+ 236 => 2.68435456E8,
+ 237 => 3.3554432E8,
+ 238 => 4.02653184E8,
+ 239 => 4.69762048E8,
+ 240 => 5.3687091E8,
+ 241 => 6.7108864E8,
+ 242 => 8.0530637E8,
+ 243 => 9.395241E8,
+ 244 => 1.07374182E9,
+ 245 => 1.34217728E9,
+ 246 => 1.61061274E9,
+ 247 => 1.87904819E9,
+ 248 => 2.14748365E9,
+ 249 => 2.68435456E9,
+ 250 => 3.22122547E9,
+ 251 => 3.75809638E9,
+ 252 => 4.2949673E9,
+ 253 => 5.3687091E9,
+ 254 => 6.4424509E9,
+ 255 => 7.5161928E9 );
+
+
+ /**
+ * Install the Similarity implementation that getDefault() will hand out.
+ *
+ * @param Zend_Search_Lucene_Search_Similarity $similarity implementation to store as the default
+ */
+ public static function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
+ {
+     self::$_defaultImpl = $similarity;
+ }
+
+
+ /**
+ * Fetch the default Similarity implementation.
+ *
+ * If no Similarity instance is currently stored, a
+ * Zend_Search_Lucene_Search_Similarity_Default is created, remembered as the
+ * default and returned.
+ *
+ * @return Zend_Search_Lucene_Search_Similarity
+ */
+ public static function getDefault()
+ {
+     // Fast path: a default has already been installed or created.
+     if (self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
+         return self::$_defaultImpl;
+     }
+
+     self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();
+
+     return self::$_defaultImpl;
+ }
+
+
+ /**
+ * Computes the normalization value for a field given the total number of
+ * terms contained in the field. These values, together with field boosts, are
+ * stored in an index and multiplied into scores for hits on each field by the
+ * search code.
+ *
+ * Matches in longer fields are less precise, so implementations of this
+ * method usually return smaller values when 'numTokens' is large,
+ * and larger values when 'numTokens' is small.
+ *
+ * Note that these values are computed under
+ * IndexWriter::addDocument(Document) and then stored using
+ * encodeNorm(float). Thus they have limited precision, and documents
+ * must be re-indexed if this method is altered.
+ *
+ * fieldName - name of field
+ * numTokens - the total number of tokens contained in fields named
+ * 'fieldName' of 'doc'.
+ * Returns a normalization factor for hits on this field of this document
+ *
+ * @param string $fieldName
+ * @param integer $numTokens
+ * @return float
+ */
+ abstract public function lengthNorm($fieldName, $numTokens);
+
+ /**
+ * Computes the normalization value for a query given the sum of the squared
+ * weights of each of the query terms. This value is then multiplied into the
+ * weight of each query term.
+ *
+ * This does not affect ranking, but rather just attempts to make scores
+ * from different queries comparable.
+ *
+ * sumOfSquaredWeights - the sum of the squares of query term weights
+ * Returns a normalization factor for query weights
+ *
+ * @param float $sumOfSquaredWeights
+ * @return float
+ */
+ abstract public function queryNorm($sumOfSquaredWeights);
+
+
+ /**
+ * Looks up the normalization factor that an encoded norm byte stands for.
+ *
+ * Only the low eight bits of $byte are used to index the norm table.
+ *
+ * @param integer $byte encoded norm as stored in an index
+ * @return float decoded normalization factor
+ */
+ public static function decodeNorm($byte)
+ {
+     $tableIndex = $byte & 0xFF;
+
+     return self::$_normTable[$tableIndex];
+ }
+
+
+ /**
+ * Encodes a normalization factor into a single byte value for storage in
+ * an index.
+ *
+ * The result is an index (0..255) into the norm table, so decodeNorm() of
+ * the returned value yields the table entry chosen for $f. The conversion,
+ * including all rounding rules, is performed by _floatToByte().
+ *
+ * @param float $f normalization factor to encode
+ * @return integer encoded norm
+ */
+ public static function encodeNorm($f)
+ {
+     $encoded = self::_floatToByte($f);
+
+     return $encoded;
+ }
+
+ /**
+ * Float to byte conversion.
+ *
+ * Picks the index of the $_normTable entry used to represent $f:
+ * - values less than or equal to zero map to 0;
+ * - a value equal to a table entry maps to that entry's index;
+ * - positive values below the smallest positive entry map to 1, so that a
+ *   non-zero norm is never stored as zero;
+ * - values above the largest entry map to 255;
+ * - any other value maps to the closer of its two neighbouring entries
+ *   (the lower one on a tie).
+ *
+ * @param float $f value to encode
+ * @return integer index into $_normTable, in the range 0..255
+ */
+ static private function _floatToByte($f)
+ {
+     // round negatives (and zero) up to zero
+     if ($f <= 0.0) {
+         return 0;
+     }
+
+     // binary search; relies on $_normTable being sorted in ascending order
+     $lowIndex = 0;
+     $highIndex = 255;
+     while ($highIndex >= $lowIndex) {
+         $mid = ($highIndex + $lowIndex) >> 1;
+         $delta = $f - self::$_normTable[$mid];
+
+         if ($delta < 0) {
+             $highIndex = $mid - 1;
+         } elseif ($delta > 0) {
+             $lowIndex = $mid + 1;
+         } else {
+             return $mid; // We got it!
+         }
+     }
+
+     // No exact match: $highIndex now points at the last entry smaller than $f.
+
+     // $f lies between zero and the smallest positive entry. Rounding to the
+     // nearest entry could pick index 0 and silently turn a positive norm
+     // into zero, so always round up here.
+     if ($highIndex < 1) {
+         return 1;
+     }
+
+     // $f is larger than every entry: clamp to the largest one.
+     if ($highIndex == 255) {
+         return 255;
+     }
+
+     // round to closest value
+     $distanceBelow = $f - self::$_normTable[$highIndex];
+     $distanceAbove = self::$_normTable[$highIndex + 1] - $f;
+     if ($distanceBelow > $distanceAbove) {
+         return $highIndex + 1;
+     }
+
+     return $highIndex;
+ }
+
+
+ /**
+ * Computes a score factor based on a term or phrase's frequency in a
+ * document. This value is multiplied by the idf() factor for each term in
+ * the query and these products are then summed to form the initial score
+ * for a document.
+ *
+ * Terms and phrases repeated in a document indicate the topic of the
+ * document, so implementations of this method usually return larger values
+ * when 'freq' is large, and smaller values when 'freq'
+ * is small.
+ *
+ * @param float $freq the frequency of a term within a document
+ * @return float a score factor based on a term's within-document frequency
+ */
+ abstract public function tf($freq);
+
+ /**
+ * Computes the amount of a sloppy phrase match, based on an edit distance.
+ * This value is summed for each sloppy phrase match in a document to form
+ * the frequency that is passed to tf().
+ *
+ * A phrase match with a small edit distance to a document passage more
+ * closely matches the document, so implementations of this method usually
+ * return larger values when the edit distance is small and smaller values
+ * when it is large.
+ *
+ * @param integer $distance the edit distance of this sloppy phrase match
+ * @return float the frequency increment for this match
+ */
+ abstract public function sloppyFreq($distance);
+
+
+ /**
+ * Computes a score factor for a simple term or a phrase.
+ *
+ * For a single term the result is
+ *   idfFreq($reader->docFreq($term), $reader->count());
+ * for an array of terms it is the sum of that value over every term
+ * (0.0 for an empty array).
+ *
+ * @param mixed $input the term in question, or an array of terms
+ * @param Zend_Search_Lucene $reader the document collection being searched
+ * @return float a score factor for the term(s)
+ */
+ public function idf($input, $reader)
+ {
+     // The collection size does not depend on the term, so ask for it once
+     // instead of once per term (assumes count() is stable during the call).
+     $numDocs = $reader->count();
+
+     if (!is_array($input)) {
+         return $this->idfFreq($reader->docFreq($input), $numDocs);
+     }
+
+     $idf = 0.0;
+     foreach ($input as $term) {
+         $idf += $this->idfFreq($reader->docFreq($term), $numDocs);
+     }
+
+     return $idf;
+ }
+
+ /**
+ * Computes a score factor based on a term's document frequency (the number
+ * of documents which contain the term). This value is multiplied by the
+ * tf() factor for each term in the query and these products are
+ * then summed to form the initial score for a document.
+ *
+ * Terms that occur in fewer documents are better indicators of topic, so
+ * implementations of this method usually return larger values for rare terms,
+ * and smaller values for common terms.
+ *
+ * @param integer $docFreq the number of documents which contain the term
+ * @param integer $numDocs the total number of documents in the collection
+ * @return float a score factor based on the term's document frequency
+ */
+ abstract public function idfFreq($docFreq, $numDocs);
+
+ /**
+ * Computes a score factor based on the fraction of all query terms that a
+ * document contains. This value is multiplied into scores.
+ *
+ * The presence of a large portion of the query terms indicates a better
+ * match with the query, so implementations of this method usually return
+ * larger values when the ratio between these parameters is large and smaller
+ * values when the ratio between them is small.
+ *
+ * @param integer $overlap the number of query terms matched in the document
+ * @param integer $maxOverlap the total number of terms in the query
+ * @return float a score factor based on term overlap with the query
+ */
+ abstract public function coord($overlap, $maxOverlap);
+}
+