summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- app/Core/Filter/Lexer.php | 15
-rw-r--r-- tests/units/Core/Filter/LexerTest.php | 24
2 files changed, 32 insertions, 7 deletions
diff --git a/app/Core/Filter/Lexer.php b/app/Core/Filter/Lexer.php
index 8a7a68b1..fa5b8d2d 100644
--- a/app/Core/Filter/Lexer.php
+++ b/app/Core/Filter/Lexer.php
@@ -25,13 +25,13 @@ class Lexer
* @var array
*/
private $tokenMap = array(
- "/^(\s+)/" => 'T_WHITESPACE',
+ '/^(\s+)/' => 'T_WHITESPACE',
'/^([<=>]{0,2}[0-9]{4}-[0-9]{2}-[0-9]{2})/' => 'T_STRING',
- '/^([<=>]{1,2}\w+)/' => 'T_STRING',
+ '/^([<=>]{1,2}\w+)/u' => 'T_STRING',
'/^([<=>]{1,2}".+")/' => 'T_STRING',
'/^("(.+)")/' => 'T_STRING',
- "/^(\w+)/" => 'T_STRING',
- "/^(#\d+)/" => 'T_STRING',
+ '/^(\w+)/u' => 'T_STRING',
+ '/^(#\d+)/' => 'T_STRING',
);
/**
@@ -80,9 +80,10 @@ class Lexer
{
$tokens = array();
$this->offset = 0;
+ $input_length = mb_strlen($input, 'UTF-8');
- while (isset($input[$this->offset])) {
- $result = $this->match(substr($input, $this->offset));
+ while ($this->offset < $input_length) {
+ $result = $this->match(mb_substr($input, $this->offset, $input_length, 'UTF-8'));
if ($result === false) {
return array();
@@ -105,7 +106,7 @@ class Lexer
{
foreach ($this->tokenMap as $pattern => $name) {
if (preg_match($pattern, $string, $matches)) {
- $this->offset += strlen($matches[1]);
+ $this->offset += mb_strlen($matches[1], 'UTF-8');
return array(
'match' => str_replace('"', '', $matches[1]),
diff --git a/tests/units/Core/Filter/LexerTest.php b/tests/units/Core/Filter/LexerTest.php
index d405e9df..c72231c4 100644
--- a/tests/units/Core/Filter/LexerTest.php
+++ b/tests/units/Core/Filter/LexerTest.php
@@ -178,4 +178,28 @@ class LexerTest extends Base
$this->assertSame($expected, $lexer->tokenize('date:<=2016-01-01 something else'));
}
+
+ public function testTokenizeWithUtf8Letters()
+ {
+ $lexer = new Lexer();
+ $lexer->setDefaultToken('myDefaultToken');
+
+ $expected = array(
+ 'myDefaultToken' => array('àa éçùe'),
+ );
+
+ $this->assertSame($expected, $lexer->tokenize('àa éçùe'));
+ }
+
+ public function testTokenizeWithUtf8Numbers()
+ {
+ $lexer = new Lexer();
+ $lexer->setDefaultToken('myDefaultToken');
+
+ $expected = array(
+ 'myDefaultToken' => array('६Δↈ五一'),
+ );
+
+ $this->assertSame($expected, $lexer->tokenize('६Δↈ五一'));
+ }
}