diff options
-rw-r--r-- | app/Core/Filter/Lexer.php | 15 | ||||
-rw-r--r-- | tests/units/Core/Filter/LexerTest.php | 24 |
2 files changed, 32 insertions, 7 deletions
diff --git a/app/Core/Filter/Lexer.php b/app/Core/Filter/Lexer.php index 8a7a68b1..fa5b8d2d 100644 --- a/app/Core/Filter/Lexer.php +++ b/app/Core/Filter/Lexer.php @@ -25,13 +25,13 @@ class Lexer * @var array */ private $tokenMap = array( - "/^(\s+)/" => 'T_WHITESPACE', + '/^(\s+)/' => 'T_WHITESPACE', '/^([<=>]{0,2}[0-9]{4}-[0-9]{2}-[0-9]{2})/' => 'T_STRING', - '/^([<=>]{1,2}\w+)/' => 'T_STRING', + '/^([<=>]{1,2}\w+)/u' => 'T_STRING', '/^([<=>]{1,2}".+")/' => 'T_STRING', '/^("(.+)")/' => 'T_STRING', - "/^(\w+)/" => 'T_STRING', - "/^(#\d+)/" => 'T_STRING', + '/^(\w+)/u' => 'T_STRING', + '/^(#\d+)/' => 'T_STRING', ); /** @@ -80,9 +80,10 @@ class Lexer { $tokens = array(); $this->offset = 0; + $input_length = mb_strlen($input, 'UTF-8'); - while (isset($input[$this->offset])) { - $result = $this->match(substr($input, $this->offset)); + while ($this->offset < $input_length) { + $result = $this->match(mb_substr($input, $this->offset, $input_length, 'UTF-8')); if ($result === false) { return array(); @@ -105,7 +106,7 @@ class Lexer { foreach ($this->tokenMap as $pattern => $name) { if (preg_match($pattern, $string, $matches)) { - $this->offset += strlen($matches[1]); + $this->offset += mb_strlen($matches[1], 'UTF-8'); return array( 'match' => str_replace('"', '', $matches[1]), diff --git a/tests/units/Core/Filter/LexerTest.php b/tests/units/Core/Filter/LexerTest.php index d405e9df..c72231c4 100644 --- a/tests/units/Core/Filter/LexerTest.php +++ b/tests/units/Core/Filter/LexerTest.php @@ -178,4 +178,28 @@ class LexerTest extends Base $this->assertSame($expected, $lexer->tokenize('date:<=2016-01-01 something else')); } + + public function testTokenizeWithUtf8Letters() + { + $lexer = new Lexer(); + $lexer->setDefaultToken('myDefaultToken'); + + $expected = array( + 'myDefaultToken' => array('àa éçùe'), + ); + + $this->assertSame($expected, $lexer->tokenize('àa éçùe')); + } + + public function testTokenizeWithUtf8Numbers() + { + $lexer = new Lexer(); + $lexer->setDefaultToken('myDefaultToken'); + + $expected = array( + 'myDefaultToken' => array('६Δↈ五一'), + ); + + $this->assertSame($expected, $lexer->tokenize('६Δↈ五一')); + } } |