summaryrefslogtreecommitdiff
path: root/buildscripts/texbuilder/Zend/Search/Lucene/Analysis
diff options
context:
space:
mode:
Diffstat (limited to 'buildscripts/texbuilder/Zend/Search/Lucene/Analysis')
-rw-r--r--buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer.php94
-rw-r--r--buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common.php73
-rw-r--r--buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php76
-rw-r--r--buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php43
-rw-r--r--buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Token.php170
-rw-r--r--buildscripts/texbuilder/Zend/Search/Lucene/Analysis/TokenFilter.php45
-rw-r--r--buildscripts/texbuilder/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php55
7 files changed, 556 insertions, 0 deletions
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer.php
new file mode 100644
index 00000000..8e234c16
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer.php
@@ -0,0 +1,94 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/** Zend_Search_Lucene_Analysis_Token */
+require_once 'Zend/Search/Lucene/Analysis/Token.php';
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
+require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
+require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
+
+
+
+/**
+ * An Analyzer is used to analyze text.
+ * It thus represents a policy for extracting index terms from text.
+ *
+ * Note:
+ * The Lucene Java implementation is stream-oriented, which lets it work
+ * efficiently with huge documents (more than 20Mb).
+ * The engine itself, however, is not designed for such documents.
+ * Thus the Zend_Search_Lucene analysis API works with data strings and sets (arrays).
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+abstract class Zend_Search_Lucene_Analysis_Analyzer
+{
+    /**
+     * Analyzer instance handed out by getDefault().
+     *
+     * Stays unset until setDefault() stores an instance or getDefault()
+     * creates the fallback one.
+     *
+     * @var Zend_Search_Lucene_Analysis_Analyzer
+     */
+    private static $_defaultImpl;
+
+    /**
+     * Split the given text into terms.
+     *
+     * @param string $data  Text to analyze
+     * @return array  Array of Zend_Search_Lucene_Analysis_Token objects
+     */
+    abstract public function tokenize($data);
+
+
+    /**
+     * Install the Analyzer that getDefault() returns from now on.
+     *
+     * @param Zend_Search_Lucene_Analysis_Analyzer $analyzer
+     */
+    public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
+    {
+        self::$_defaultImpl = $analyzer;
+    }
+
+
+    /**
+     * Fetch the default Analyzer.
+     *
+     * When no Analyzer has been stored yet, a case-insensitive text analyzer
+     * is created, remembered and returned.
+     *
+     * @return Zend_Search_Lucene_Analysis_Analyzer
+     */
+    public static function getDefault()
+    {
+        // Fast path: an analyzer has already been installed or created
+        if (self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) {
+            return self::$_defaultImpl;
+        }
+
+        self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
+
+        return self::$_defaultImpl;
+    }
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common.php
new file mode 100644
index 00000000..5c61e5b5
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common.php
@@ -0,0 +1,73 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/** Zend_Search_Lucene_Analysis_Analyzer */
+require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
+
+
+/**
+ * Common implementation of the Zend_Search_Lucene_Analysis_Analyzer interface.
+ * There are several standard subclasses provided by the Zend_Search_Lucene/Analysis
+ * subpackage: Zend_Search_Lucene_Analysis_Analyzer_Common_Text, ZSearchHTMLAnalyzer, ZSearchXMLAnalyzer.
+ *
+ * @todo ZSearchHTMLAnalyzer and ZSearchXMLAnalyzer implementation
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+abstract class Zend_Search_Lucene_Analysis_Analyzer_Common extends Zend_Search_Lucene_Analysis_Analyzer
+{
+    /**
+     * The set of Token filters applied to the Token stream, in the order
+     * they were added.
+     * Array of Zend_Search_Lucene_Analysis_TokenFilter objects.
+     *
+     * @var array
+     */
+    private $_filters = array();
+
+    /**
+     * Add Token filter to the Analyzer.
+     *
+     * Filters are applied by normalize() in the order they were added.
+     *
+     * @param Zend_Search_Lucene_Analysis_TokenFilter $filter
+     */
+    public function addFilter(Zend_Search_Lucene_Analysis_TokenFilter $filter)
+    {
+        $this->_filters[] = $filter;
+    }
+
+    /**
+     * Apply filters to the token.
+     *
+     * Each filter receives the result of the previous one. As soon as a filter
+     * returns null (its way of removing the token) the chain stops and null is
+     * returned.
+     *
+     * @param Zend_Search_Lucene_Analysis_Token $token
+     * @return Zend_Search_Lucene_Analysis_Token|null  Normalized token, or null if a filter removed it
+     */
+    public function normalize(Zend_Search_Lucene_Analysis_Token $token)
+    {
+        foreach ($this->_filters as $filter) {
+            $token = $filter->normalize($token);
+
+            // A removed token must not reach the next filter: its normalize()
+            // parameter is type-hinted and does not accept null.
+            if ($token === null) {
+                return null;
+            }
+        }
+
+        return $token;
+    }
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php
new file mode 100644
index 00000000..2a80c1f8
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php
@@ -0,0 +1,76 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common */
+require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
+
+
+/**
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+class Zend_Search_Lucene_Analysis_Analyzer_Common_Text extends Zend_Search_Lucene_Analysis_Analyzer_Common
+{
+    /**
+     * Tokenize text to terms.
+     *
+     * A term is a maximal run of bytes for which ctype_alpha() is true; every
+     * other byte (digits, punctuation, white space, ...) acts as a separator.
+     * Offsets stored in the tokens are byte offsets into $data, the end offset
+     * being one past the last byte of the term.
+     *
+     * Every token is passed through normalize(); tokens for which normalize()
+     * returns null (removed by a filter) are left out of the result.
+     *
+     * @param string $data
+     * @return array  Array of Zend_Search_Lucene_Analysis_Token objects
+     */
+    public function tokenize($data)
+    {
+        $tokenStream = array();
+
+        // The input does not change while scanning, so measure it once
+        $length   = strlen($data);
+        $position = 0;
+
+        while ($position < $length) {
+            // skip separators (anything that is not a letter)
+            while ($position < $length && !ctype_alpha($data[$position])) {
+                $position++;
+            }
+
+            $termStartPosition = $position;
+
+            // read token
+            while ($position < $length && ctype_alpha($data[$position])) {
+                $position++;
+            }
+
+            // Empty token, end of stream.
+            if ($position == $termStartPosition) {
+                break;
+            }
+
+            $token = new Zend_Search_Lucene_Analysis_Token(
+                substr($data, $termStartPosition, $position - $termStartPosition),
+                $termStartPosition,
+                $position
+            );
+
+            $token = $this->normalize($token);
+
+            // null means a filter removed the token; keep the stream free of nulls
+            if ($token !== null) {
+                $tokenStream[] = $token;
+            }
+        }
+
+        return $tokenStream;
+    }
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php
new file mode 100644
index 00000000..d77e38d5
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php
@@ -0,0 +1,43 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
+require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
+
+/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCase */
+require_once 'Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php';
+
+
+/**
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+class Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_Text
+{
+    /**
+     * Object constructor.
+     *
+     * Registers a lower-case Token filter on the new analyzer.
+     */
+    public function __construct()
+    {
+        $lowerCaseFilter = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCase();
+
+        $this->addFilter($lowerCaseFilter);
+    }
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Token.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Token.php
new file mode 100644
index 00000000..a60d5d96
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/Token.php
@@ -0,0 +1,170 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/**
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+class Zend_Search_Lucene_Analysis_Token
+{
+    /**
+     * Term text carried by this token.
+     *
+     * @var string
+     */
+    private $_termText;
+
+    /**
+     * Offset of the token's first character in the source text.
+     *
+     * @var integer
+     */
+    private $_startOffset;
+
+    /**
+     * Offset just past the token's last character in the source text.
+     *
+     * @var integer
+     */
+    private $_endOffset;
+
+    /**
+     * Lexical type of the token.
+     *
+     * @var string
+     */
+    private $_type;
+
+    /**
+     * Position of this token relative to the previous Token.
+     *
+     * The constructor sets it to one.
+     *
+     * Typical uses of other values:
+     *
+     * Zero puts several terms at the same position. This helps when, e.g.,
+     * a word has multiple stems: phrase searches containing either stem
+     * will match. Only the first stem keeps an increment of one, all further
+     * stems get zero. Repeating a token with a zero increment can also be
+     * used to boost the scores of matches on that token.
+     *
+     * Values above one inhibit exact phrase matches. For example, a stop
+     * word filter that removes stop words may set the increment of each
+     * following non-stop word to the number of stop words removed before
+     * it; exact phrase queries then only match when the terms occur with
+     * no intervening stop words.
+     *
+     * @var integer
+     */
+    private $_positionIncrement;
+
+
+    /**
+     * Object constructor
+     *
+     * The position increment of a new token is one.
+     *
+     * @param string  $text   Term text
+     * @param integer $start  Start offset in the source text
+     * @param integer $end    End offset in the source text
+     * @param string  $type   Lexical type
+     */
+    public function __construct($text, $start, $end, $type = 'word' )
+    {
+        $this->_positionIncrement = 1;
+
+        $this->_termText    = $text;
+        $this->_type        = $type;
+        $this->_startOffset = $start;
+        $this->_endOffset   = $end;
+    }
+
+
+    /**
+     * Term text of this Token.
+     *
+     * @return string
+     */
+    public function getTermText()
+    {
+        return $this->_termText;
+    }
+
+    /**
+     * Starting offset of this Token: the position of the first character
+     * corresponding to this token in the source text.
+     *
+     * Note:
+     * getEndOffset() - getStartOffset() is not guaranteed to equal
+     * strlen(Zend_Search_Lucene_Analysis_Token::getTermText()), because a stemmer
+     * or some other filter may have altered the term text.
+     *
+     * @return integer
+     */
+    public function getStartOffset()
+    {
+        return $this->_startOffset;
+    }
+
+    /**
+     * Ending offset of this Token: one greater than the position of the
+     * last character corresponding to this token in the source text.
+     *
+     * @return integer
+     */
+    public function getEndOffset()
+    {
+        return $this->_endOffset;
+    }
+
+    /**
+     * Lexical type of this Token. It is 'word' unless another type was
+     * passed to the constructor.
+     *
+     * @return string
+     */
+    public function getType()
+    {
+        return $this->_type;
+    }
+
+    /**
+     * Position increment of this Token.
+     *
+     * @return integer
+     */
+    public function getPositionIncrement()
+    {
+        return $this->_positionIncrement;
+    }
+
+    /**
+     * Replace the position increment of this Token.
+     *
+     * @param integer $positionIncrement
+     */
+    public function setPositionIncrement($positionIncrement)
+    {
+        $this->_positionIncrement = $positionIncrement;
+    }
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/TokenFilter.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/TokenFilter.php
new file mode 100644
index 00000000..9ea5125f
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/TokenFilter.php
@@ -0,0 +1,45 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/** Zend_Search_Lucene_Analysis_Token */
+require_once 'Zend/Search/Lucene/Analysis/Token.php';
+
+
+/**
+ * Token filter converts (normalizes) a Token or removes it from a token stream.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+abstract class Zend_Search_Lucene_Analysis_TokenFilter
+{
+    /**
+     * Turn the source Token into its normalized form.
+     *
+     * An implementation signals that the Token has to be removed from the
+     * token stream by returning null.
+     *
+     * @param Zend_Search_Lucene_Analysis_Token $srcToken  Token to process
+     * @return Zend_Search_Lucene_Analysis_Token|null  Normalized Token, or null to remove it
+     */
+    abstract public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken);
+}
+
diff --git a/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php
new file mode 100644
index 00000000..53585e21
--- /dev/null
+++ b/buildscripts/texbuilder/Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php
@@ -0,0 +1,55 @@
+<?php
+/**
+ * Zend Framework
+ *
+ * LICENSE
+ *
+ * This source file is subject to version 1.0 of the Zend Framework
+ * license, that is bundled with this package in the file LICENSE, and
+ * is available through the world-wide-web at the following URL:
+ * http://www.zend.com/license/framework/1_0.txt. If you did not receive
+ * a copy of the Zend Framework license and are unable to obtain it
+ * through the world-wide-web, please send a note to license@zend.com
+ * so we can mail you a copy immediately.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+
+/** Zend_Search_Lucene_Analysis_TokenFilter */
+require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php';
+
+
+/**
+ * Lower case Token filter.
+ *
+ * @package Zend_Search_Lucene
+ * @subpackage Analysis
+ * @copyright Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
+ */
+
+class Zend_Search_Lucene_Analysis_TokenFilter_LowerCase extends Zend_Search_Lucene_Analysis_TokenFilter
+{
+    /**
+     * Build a lower-cased copy of the source Token.
+     *
+     * The term text is run through strtolower(); offsets, lexical type and
+     * position increment are copied unchanged. The source Token itself is
+     * not modified, and this filter never returns null.
+     *
+     * @param Zend_Search_Lucene_Analysis_Token $srcToken
+     * @return Zend_Search_Lucene_Analysis_Token
+     */
+    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
+    {
+        $loweredText = strtolower($srcToken->getTermText());
+
+        $result = new Zend_Search_Lucene_Analysis_Token(
+            $loweredText,
+            $srcToken->getStartOffset(),
+            $srcToken->getEndOffset(),
+            $srcToken->getType()
+        );
+
+        // The constructor resets the increment to one, so carry it over explicitly
+        $result->setPositionIncrement($srcToken->getPositionIncrement());
+
+        return $result;
+    }
+}
+