summaryrefslogtreecommitdiff
path: root/lib/querypath/src/QueryPath/CSS/Scanner.php
blob: 3513a0b23375d9569625fe7c17a0470a789b5f25 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
<?php
/** @file
 * The scanner.
 */
namespace QueryPath\CSS;
/**
 * Scanner for CSS selector parsing.
 *
 * This provides a simple scanner for traversing an input stream.
 *
 * @ingroup querypath_css
 */
final class Scanner {
  /**
   * The input stream being scanned.
   *
   * The scanner reads its position property and calls its peek(),
   * isEmpty() and consume() methods.
   */
  public $is = NULL;

  /**
   * The text of the current token.
   *
   * Set by nextToken(): a single space for any whitespace character,
   * otherwise the character that was consumed. It is left unchanged when
   * the end of the input is reached.
   */
  public $value = NULL;

  /**
   * The id of the current token (one of the Token constants), or FALSE
   * once the end of the input has been reached. NULL until nextToken()
   * has been called for the first time.
   */
  public $token = NULL;

  /**
   * Set to TRUE the first time nextToken() reports the end of the input.
   * A further call to nextToken() on an empty stream then throws instead
   * of letting a caller loop forever.
   */
  public $recurse = FALSE;

  /**
   * Number of times nextToken() has been called. Used in the message of
   * the exception thrown when the end of input is read twice.
   */
  public $it = 0;

  /**
   * Given a new input stream, tokenize the CSS selector string.
   * @see InputStream
   * @param InputStream $in
   *  An input stream to be scanned.
   */
  public function __construct(InputStream $in) {
    $this->is = $in;
  }

  /**
   * Return the position of the reader in the string.
   *
   * @return mixed
   *  The value of the input stream's position property.
   */
  public function position() {
    return $this->is->position;
  }

  /**
   * See the next char without removing it from the stack.
   *
   * @return string
   *  Whatever the input stream's peek() returns for the next character.
   */
  public function peek() {
    return $this->is->peek();
  }

  /**
   * Get the next token in the input stream.
   *
   * This consumes one character from the stream and sets the current
   * token ($this->token) and its text ($this->value) accordingly.
   *
   * @return int
   *  Returns an int value corresponding to one of the Token constants,
   *  or FALSE if the end of the string is reached. (Remember to use
   *  strong equality checking on FALSE, since 0 is a valid token id.)
   * @throws \QueryPath\Exception
   *  If called again on an empty stream after the end of the input has
   *  already been reported once.
   * @throws ParseException
   *  If the character is neither a known token nor in the range of
   *  characters accepted inside strings.
   */
  public function nextToken() {
    ++$this->it;

    if ($this->is->isEmpty()) {
      // The end of input may be reported exactly once. A second request
      // means the caller is not honoring the FALSE return value.
      if ($this->recurse) {
        throw new \QueryPath\Exception("Recursion error detected at iteration " . $this->it . '.');
      }
      $this->recurse = TRUE;
      $this->token = FALSE;
      return FALSE;
    }

    $ch = $this->is->consume();

    if (ctype_space($ch)) {
      // Collapse all whitespace to a single space.
      $this->value = ' ';
      $this->token = Token::white;
      return $this->token;
    }

    if (ctype_alnum($ch) || $ch == '-' || $ch == '_') {
      // A name character. Case is preserved.
      $this->value = $ch;
      $this->token = Token::char;
      return $this->token;
    }

    $this->value = $ch;

    // -1 is the "not recognized yet" sentinel.
    $tok = -1;
    switch ($ch) {
      case '*':
        $tok = Token::star;
        break;
      case '>':
        $tok = Token::rangle;
        break;
      case '.':
        $tok = Token::dot;
        break;
      case '#':
        $tok = Token::octo;
        break;
      case '[':
        $tok = Token::lsquare;
        break;
      case ']':
        $tok = Token::rsquare;
        break;
      case ':':
        $tok = Token::colon;
        break;
      case '(':
        $tok = Token::lparen;
        break;
      case ')':
        $tok = Token::rparen;
        break;
      case '+':
        $tok = Token::plus;
        break;
      case '~':
        $tok = Token::tilde;
        break;
      case '=':
        $tok = Token::eq;
        break;
      case '|':
        $tok = Token::pipe;
        break;
      case ',':
        $tok = Token::comma;
        break;
      case '"':
        $tok = Token::quote;
        break;
      case "'":
        $tok = Token::squote;
        break;
      case '\\':
        $tok = Token::bslash;
        break;
      case '^':
        $tok = Token::carat;
        break;
      case '$':
        $tok = Token::dollar;
        break;
      case '@':
        $tok = Token::at;
        break;
    }

    // Catch all characters that are legal within strings.
    if ($tok === -1) {
      // TODO: This should be UTF-8 compatible, but PHP doesn't
      // have a native UTF-8 string. Should we use external
      // mbstring library?
      $ord = ord($ch);
      // Printable ASCII plus the extended (high-bit) byte range are
      // accepted inside strings; control characters and DEL are not.
      if (($ord >= 32 && $ord <= 126) || ($ord >= 128 && $ord <= 255)) {
        $tok = Token::stringLegal;
      }
      else {
        throw new ParseException('Illegal character found in stream: ' . $ord);
      }
    }

    $this->token = $tok;
    return $tok;
  }

  /**
   * Get a name string from the input stream.
   * A name string must be composed of
   * only characters defined in Token:char: -_a-zA-Z0-9
   *
   * Starts at the current token and advances the scanner until the
   * current token is no longer a Token::char.
   *
   * @return string
   *  The accumulated name, or an empty string if the current token is
   *  not a Token::char.
   */
  public function getNameString() {
    $buf = '';
    while ($this->token === Token::char) {
      $buf .= $this->value;
      $this->nextToken();
    }
    return $buf;
  }

  /**
   * This gets a string with any legal 'string' characters.
   * See CSS Selectors specification, section 11, for the
   * definition of string.
   *
   * This will check for string1, string2, and the case where a
   * string is unquoted (Oddly absent from the "official" grammar,
   * though such strings are present as examples in the spec.)
   *
   * Note:
   * Though the grammar supplied by CSS 3 Selectors section 11 does not
   * address the contents of a pseudo-class value, the spec itself indicates
   * that a pseudo-class value is a "value between parenthesis" [6.6]. The
   * examples given use URLs among other things, making them closer to the
   * definition of 'string' than to 'name'. So we handle them here as strings.
   *
   * The current token must be the opening double quote, single quote or
   * left parenthesis. On return the scanner is positioned on the token
   * following the closing delimiter (or at the end of the input if the
   * string was never closed).
   *
   * @return string|null
   *  The string contents with delimiters and escaping backslashes
   *  removed, or NULL if the current token is not an opening delimiter.
   */
  public function getQuotedString() {
    return $this->readDelimitedString(FALSE);
  }

  /**
   * Get the contents inside of a pseudoClass().
   *
   * Works like getQuotedString(), except that a left parenthesis inside
   * the value opens a nested group. The nested group is read up to its
   * matching right parenthesis and included in the result, parentheses
   * and all, so values such as ":nth-child(2) a" survive intact.
   *
   * @return string|null
   *  The pseudo-class value with the outer delimiters and escaping
   *  backslashes removed, or NULL if the current token is not an opening
   *  quote or left parenthesis.
   */
  public function getPseudoClassString() {
    return $this->readDelimitedString(TRUE);
  }

  /**
   * Read a delimited run of characters starting at the current token.
   *
   * Shared implementation of getQuotedString() and getPseudoClassString().
   *
   * @param bool $allowNested
   *  If TRUE, an unescaped left parenthesis starts a nested group that is
   *  read recursively and appended wrapped in "(" and ")".
   * @return string|null
   *  The contents between the delimiters, or NULL if the current token is
   *  not a double quote, single quote or left parenthesis.
   */
  private function readDelimitedString($allowNested) {
    $atOpener = $this->token === Token::quote
      || $this->token === Token::squote
      || $this->token === Token::lparen;
    if (!$atOpener) {
      return NULL;
    }

    // A parenthesis is closed by its counterpart; a quote by itself.
    $end = ($this->token === Token::lparen) ? Token::rparen : $this->token;
    $buf = '';
    $escape = FALSE;

    // Skip the opening quote/paren.
    $this->nextToken();

    // The second conjunct is probably not necessary.
    while ($this->token !== FALSE && $this->token > -1) {
      if ($this->token === Token::bslash && !$escape) {
        // Turn on escaping. The backslash itself is dropped.
        $escape = TRUE;
      }
      elseif ($escape) {
        // The escaped character is taken literally; turn off escaping.
        $buf .= $this->value;
        $escape = FALSE;
      }
      elseif ($allowNested && $this->token === Token::lparen) {
        $buf .= '(' . $this->readDelimitedString(TRUE) . ')';
        // The nested read has already moved past its closing parenthesis,
        // so the current token is still unprocessed. Re-examine it rather
        // than advancing again, which would silently drop it.
        continue;
      }
      elseif ($this->token === $end) {
        // At end of string; skip the closing delimiter and stop.
        $this->nextToken();
        break;
      }
      else {
        // Append char.
        $buf .= $this->value;
      }
      $this->nextToken();
    }

    return $buf;
  }

}