summaryrefslogtreecommitdiff
path: root/demos/quickstart/protected/index/Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php
blob: 2a80c1f80c794f103795d0c44b5fc4c7f5d8b7c3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to version 1.0 of the Zend Framework
 * license, that is bundled with this package in the file LICENSE, and
 * is available through the world-wide-web at the following URL:
 * http://www.zend.com/license/framework/1_0.txt. If you did not receive
 * a copy of the Zend Framework license and are unable to obtain it
 * through the world-wide-web, please send a note to license@zend.com
 * so we can mail you a copy immediately.
 *
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */


/** Zend_Search_Lucene_Analysis_Analyzer_Common */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';


/**
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2006 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://www.zend.com/license/framework/1_0.txt Zend Framework License version 1.0
 */

class Zend_Search_Lucene_Analysis_Analyzer_Common_Text extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Tokenize text to a terms
     * Returns array of Zend_Search_Lucene_Analysis_Token objects
     *
     * @param string $data
     * @return array
     */
    public function tokenize($data)
    {
        $tokenStream = array();

        $position = 0;
        while ($position < strlen($data)) {
            // skip white space
            while ($position < strlen($data) && !ctype_alpha( $data{$position} )) {
                $position++;
            }

            $termStartPosition = $position;

            // read token
            while ($position < strlen($data) && ctype_alpha( $data{$position} )) {
                $position++;
            }

            // Empty token, end of stream.
            if ($position == $termStartPosition) {
                break;
            }

            $token = new Zend_Search_Lucene_Analysis_Token(substr($data,
                                             $termStartPosition,
                                             $position-$termStartPosition),
                                      $termStartPosition,
                                      $position);
            $tokenStream[] = $this->normalize($token);
        }

        return $tokenStream;
    }
}