diff options
Diffstat (limited to 'lib/querypath/CssParser.php')
-rw-r--r-- | lib/querypath/CssParser.php | 1108 |
1 files changed, 1108 insertions, 0 deletions
diff --git a/lib/querypath/CssParser.php b/lib/querypath/CssParser.php new file mode 100644 index 0000000..2ef2802 --- /dev/null +++ b/lib/querypath/CssParser.php @@ -0,0 +1,1108 @@ +<?php +/** @file + * CSS selector parsing classes. + * + * This file contains the tools necessary for parsing CSS 3 selectors. + * In the future it may be expanded to handle all of CSS 3. + * + * The parser contained herein is has an event-based API. Implementors should + * begin by implementing the {@link CssEventHandler} interface. For an example + * of how this is done, see {@link CssEventHandler.php}. + * + * @author M Butcher <matt@aleph-null.tv> + * @license http://opensource.org/licenses/lgpl-2.1.php The GNU Lesser GPL (LGPL) or an MIT-like license. + */ + +/** @addtogroup querypath_css CSS Parsing + * QueryPath includes a CSS 3 Selector parser. + * + * + * Typically the parser is not accessed directly. Most developers will use it indirectly from + * qp(), htmlqp(), or one of the methods on a QueryPath object. + * + * This parser is modular and is not tied to QueryPath, so you can use it in your + * own (non-QueryPath) projects if you wish. To dive in, start with CssEventHandler, the + * event interface that works like a SAX API for CSS selectors. If you want to check out + * the details, check out the parser (CssParser), scanner (CssScanner), and token list (CssToken). + */ + +require_once 'CssEventHandler.php'; + + +/** + * An event handler for handling CSS 3 Selector parsing. + * + * This provides a standard interface for CSS 3 Selector event handling. As the + * parser parses a selector, it will fire events. Implementations of CssEventHandler + * can then handle the events. + * + * This library is inspired by the SAX2 API for parsing XML. Each component of a + * selector fires an event, passing the necessary data on to the event handler. + * + * @ingroup querypath_css + */ +interface CssEventHandler { + /** The is-exactly (=) operator. */ + const isExactly = 0; // = + /** The contains-with-space operator (~=). */ + const containsWithSpace = 1; // ~= + /** The contains-with-hyphen operator (!=). */ + const containsWithHyphen = 2; // |= + /** The contains-in-string operator (*=). */ + const containsInString = 3; // *= + /** The begins-with operator (^=). */ + const beginsWith = 4; // ^= + /** The ends-with operator ($=). */ + const endsWith = 5; // $= + /** The any-element operator (*). */ + const anyElement = '*'; + + /** + * This event is fired when a CSS ID is encountered. + * An ID begins with an octothorp: #name. + * + * @param string $id + * The ID passed in. + */ + public function elementID($id); // #name + /** + * Handle an element name. + * Example: name + * @param string $name + * The name of the element. + */ + public function element($name); // name + /** + * Handle a namespaced element name. + * example: namespace|name + * @param string $name + * The tag name. + * @param string $namespace + * The namespace identifier (Not the URI) + */ + public function elementNS($name, $namespace = NULL); + /** + * Handle an any-element (*) operator. + * Example: * + */ + public function anyElement(); // * + /** + * Handle an any-element operator that is constrained to a namespace. + * Example: ns|* + * @param string $ns + * The namespace identifier (not the URI). + */ + public function anyElementInNS($ns); // ns|* + /** + * Handle a CSS class selector. + * Example: .name + * @param string $name + * The name of the class. + */ + public function elementClass($name); // .name + /** + * Handle an attribute selector. + * Example: [name=attr] + * Example: [name~=attr] + * @param string $name + * The attribute name. + * @param string $value + * The value of the attribute, if given. + * @param int $operation + * The operation to be used for matching. See {@link CssEventHandler} + * constants for a list of supported operations. + */ + public function attribute($name, $value = NULL, $operation = CssEventHandler::isExactly); // [name=attr] + /** + * Handle an attribute selector bound to a specific namespace. + * Example: [ns|name=attr] + * Example: [ns|name~=attr] + * @param string $name + * The attribute name. + * @param string $ns + * The namespace identifier (not the URI). + * @param string $value + * The value of the attribute, if given. + * @param int $operation + * The operation to be used for matching. See {@link CssEventHandler} + * constants for a list of supported operations. + */ + public function attributeNS($name, $ns, $value = NULL, $operation = CssEventHandler::isExactly); + /** + * Handle a pseudo-class. + * Example: :name(value) + * @param string $name + * The pseudo-class name. + * @param string $value + * The value, if one is found. + */ + public function pseudoClass($name, $value = NULL); //:name(value) + /** + * Handle a pseudo-element. + * Example: ::name + * @param string $name + * The pseudo-element name. + */ + public function pseudoElement($name); // ::name + /** + * Handle a direct descendant combinator. + * Example: > + */ + public function directDescendant(); // > + /** + * Handle a adjacent combinator. + * Example: + + */ + public function adjacent(); // + + /** + * Handle an another-selector combinator. + * Example: , + */ + public function anotherSelector(); // , + /** + * Handle a sibling combinator. + * Example: ~ + */ + public function sibling(); // ~ combinator + /** + * Handle an any-descendant combinator. + * Example: ' ' + */ + public function anyDescendant(); // ' ' (space) operator. + +} + +/** + * Tokens for CSS. + * This class defines the recognized tokens for the parser, and also + * provides utility functions for error reporting. + * + * @ingroup querypath_css + */ +final class CssToken { + const char = 0; + const star = 1; + const rangle = 2; + const dot = 3; + const octo = 4; + const rsquare = 5; + const lsquare = 6; + const colon = 7; + const rparen = 8; + const lparen = 9; + const plus = 10; + const tilde = 11; + const eq = 12; + const pipe = 13; + const comma = 14; + const white = 15; + const quote = 16; + const squote = 17; + const bslash = 18; + const carat = 19; + const dollar = 20; + const at = 21; // This is not in the spec. Apparently, old broken CSS uses it. + + // In legal range for string. + const stringLegal = 99; + + /** + * Get a name for a given constant. Used for error handling. + */ + static function name($const_int) { + $a = array('character', 'star', 'right angle bracket', + 'dot', 'octothorp', 'right square bracket', 'left square bracket', + 'colon', 'right parenthesis', 'left parenthesis', 'plus', 'tilde', + 'equals', 'vertical bar', 'comma', 'space', 'quote', 'single quote', + 'backslash', 'carat', 'dollar', 'at'); + if (isset($a[$const_int]) && is_numeric($const_int)) { + return $a[$const_int]; + } + elseif ($const_int == 99) { + return 'a legal non-alphanumeric character'; + } + elseif ($const_int == FALSE) { + return 'end of file'; + } + return sprintf('illegal character (%s)', $const_int); + } +} + +/** + * Parse a CSS selector. + * + * In CSS, a selector is used to identify which element or elements + * in a DOM are being selected for the application of a particular style. + * Effectively, selectors function as a query language for a structured + * document -- almost always HTML or XML. + * + * This class provides an event-based parser for CSS selectors. It can be + * used, for example, as a basis for writing a DOM query engine based on + * CSS. + * + * @ingroup querypath_css + */ +class CssParser { + protected $scanner = NULL; + protected $buffer = ''; + protected $handler = NULL; + protected $strict = FALSE; + + protected $DEBUG = FALSE; + + /** + * Construct a new CSS parser object. This will attempt to + * parse the string as a CSS selector. As it parses, it will + * send events to the CssEventHandler implementation. + */ + public function __construct($string, CssEventHandler $handler) { + $this->originalString = $string; + $is = new CssInputStream($string); + $this->scanner = new CssScanner($is); + $this->handler = $handler; + } + + /** + * Parse the selector. + * + * This begins an event-based parsing process that will + * fire events as the selector is handled. A CssEventHandler + * implementation will be responsible for handling the events. + * @throws CssParseException + */ + public function parse() { + + $this->scanner->nextToken(); + while ($this->scanner->token !== FALSE) { + // Primitive recursion detection. + $position = $this->scanner->position(); + + if ($this->DEBUG) { + print "PARSE " . $this->scanner->token. "\n"; + } + $this->selector(); + + $finalPosition = $this->scanner->position(); + + if ($this->scanner->token !== FALSE && $finalPosition == $position) { + // If we get here, then the scanner did not pop a single character + // off of the input stream during a full run of the parser, which + // means that the current input does not match any recognizable + // pattern. + throw new CssParseException('CSS selector is not well formed.'); + } + + } + + } + + /** + * A restricted parser that can only parse simple selectors. + * The pseudoClass handler for this parser will throw an + * exception if it encounters a pseudo-element or the + * negation pseudo-class. + * + * @deprecated This is not used anywhere in QueryPath and + * may be removed. + *//* + public function parseSimpleSelector() { + while ($this->scanner->token !== FALSE) { + if ($this->DEBUG) print "SIMPLE SELECTOR\n"; + $this->allElements(); + $this->elementName(); + $this->elementClass(); + $this->elementID(); + $this->pseudoClass(TRUE); // Operate in restricted mode. + $this->attribute(); + + // TODO: Need to add failure conditions here. + } + }*/ + + /** + * Handle an entire CSS selector. + */ + private function selector() { + if ($this->DEBUG) print "SELECTOR{$this->scanner->position()}\n"; + $this->consumeWhitespace(); // Remove leading whitespace + $this->simpleSelectors(); + $this->combinator(); + } + + /** + * Consume whitespace and return a count of the number of whitespace consumed. + */ + private function consumeWhitespace() { + if ($this->DEBUG) print "CONSUME WHITESPACE\n"; + $white = 0; + while ($this->scanner->token == CssToken::white) { + $this->scanner->nextToken(); + ++$white; + } + return $white; + } + + /** + * Handle one of the five combinators: '>', '+', ' ', '~', and ','. + * This will call the appropriate event handlers. + * @see CssEventHandler::directDescendant(), + * @see CssEventHandler::adjacent(), + * @see CssEventHandler::anyDescendant(), + * @see CssEventHandler::anotherSelector(). + */ + private function combinator() { + if ($this->DEBUG) print "COMBINATOR\n"; + /* + * Problem: ' ' and ' > ' are both valid combinators. + * So we have to track whitespace consumption to see + * if we are hitting the ' ' combinator or if the + * selector just has whitespace padding another combinator. + */ + + // Flag to indicate that post-checks need doing + $inCombinator = FALSE; + $white = $this->consumeWhitespace(); + $t = $this->scanner->token; + + if ($t == CssToken::rangle) { + $this->handler->directDescendant(); + $this->scanner->nextToken(); + $inCombinator = TRUE; + //$this->simpleSelectors(); + } + elseif ($t == CssToken::plus) { + $this->handler->adjacent(); + $this->scanner->nextToken(); + $inCombinator = TRUE; + //$this->simpleSelectors(); + } + elseif ($t == CssToken::comma) { + $this->handler->anotherSelector(); + $this->scanner->nextToken(); + $inCombinator = TRUE; + //$this->scanner->selectors(); + } + elseif ($t == CssToken::tilde) { + $this->handler->sibling(); + $this->scanner->nextToken(); + $inCombinator = TRUE; + } + + // Check that we don't get two combinators in a row. + if ($inCombinator) { + $white = 0; + if ($this->DEBUG) print "COMBINATOR: " . CssToken::name($t) . "\n"; + $this->consumeWhitespace(); + if ($this->isCombinator($this->scanner->token)) { + throw new CssParseException("Illegal combinator: Cannot have two combinators in sequence."); + } + } + // Check to see if we have whitespace combinator: + elseif ($white > 0) { + if ($this->DEBUG) print "COMBINATOR: any descendant\n"; + $inCombinator = TRUE; + $this->handler->anyDescendant(); + } + else { + if ($this->DEBUG) print "COMBINATOR: no combinator found.\n"; + } + } + + /** + * Check if the token is a combinator. + */ + private function isCombinator($tok) { + $combinators = array(CssToken::plus, CssToken::rangle, CssToken::comma, CssToken::tilde); + return in_array($tok, $combinators); + } + + /** + * Handle a simple selector. + */ + private function simpleSelectors() { + if ($this->DEBUG) print "SIMPLE SELECTOR\n"; + $this->allElements(); + $this->elementName(); + $this->elementClass(); + $this->elementID(); + $this->pseudoClass(); + $this->attribute(); + } + + /** + * Handles CSS ID selectors. + * This will call CssEventHandler::elementID(). + */ + private function elementID() { + if ($this->DEBUG) print "ELEMENT ID\n"; + if ($this->scanner->token == CssToken::octo) { + $this->scanner->nextToken(); + if ($this->scanner->token !== CssToken::char) { + throw new CssParseException("Expected string after #"); + } + $id = $this->scanner->getNameString(); + $this->handler->elementID($id); + } + } + + /** + * Handles CSS class selectors. + * This will call the CssEventHandler::elementClass() method. + */ + private function elementClass() { + if ($this->DEBUG) print "ELEMENT CLASS\n"; + if ($this->scanner->token == CssToken::dot) { + $this->scanner->nextToken(); + $this->consumeWhitespace(); // We're very fault tolerent. This should prob through error. + $cssClass = $this->scanner->getNameString(); + $this->handler->elementClass($cssClass); + } + } + + /** + * Handle a pseudo-class and pseudo-element. + * + * CSS 3 selectors support separate pseudo-elements, using :: instead + * of : for separator. This is now supported, and calls the pseudoElement + * handler, CssEventHandler::pseudoElement(). + * + * This will call CssEventHandler::pseudoClass() when a + * pseudo-class is parsed. + */ + private function pseudoClass($restricted = FALSE) { + if ($this->DEBUG) print "PSEUDO-CLASS\n"; + if ($this->scanner->token == CssToken::colon) { + + // Check for CSS 3 pseudo element: + $isPseudoElement = FALSE; + if ($this->scanner->nextToken() === CssToken::colon) { + $isPseudoElement = TRUE; + $this->scanner->nextToken(); + } + + $name = $this->scanner->getNameString(); + if ($restricted && $name == 'not') { + throw new CssParseException("The 'not' pseudo-class is illegal in this context."); + } + + $value = NULL; + if ($this->scanner->token == CssToken::lparen) { + if ($isPseudoElement) { + throw new CssParseException("Illegal left paren. Pseudo-Element cannot have arguments."); + } + $value = $this->pseudoClassValue(); + } + + // FIXME: This should throw errors when pseudo element has values. + if ($isPseudoElement) { + if ($restricted) { + throw new CssParseException("Pseudo-Elements are illegal in this context."); + } + $this->handler->pseudoElement($name); + $this->consumeWhitespace(); + + // Per the spec, pseudo-elements must be the last items in a selector, so we + // check to make sure that we are either at the end of the stream or that a + // new selector is starting. Only one pseudo-element is allowed per selector. + if ($this->scanner->token !== FALSE && $this->scanner->token !== CssToken::comma) { + throw new CssParseException("A Pseudo-Element must be the last item in a selector."); + } + } + else { + $this->handler->pseudoClass($name, $value); + } + } + } + + /** + * Get the value of a pseudo-classes. + * + * @return string + * Returns the value found from a pseudo-class. + * + * @todo Pseudoclasses can be passed pseudo-elements and + * other pseudo-classes as values, which means :pseudo(::pseudo) + * is legal. + */ + private function pseudoClassValue() { + if ($this->scanner->token == CssToken::lparen) { + $buf = ''; + + // For now, just leave pseudoClass value vague. + /* + // We have to peek to see if next char is a colon because + // pseudo-classes and pseudo-elements are legal strings here. + print $this->scanner->peek(); + if ($this->scanner->peek() == ':') { + print "Is pseudo\n"; + $this->scanner->nextToken(); + + // Pseudo class + if ($this->scanner->token == CssToken::colon) { + $buf .= ':'; + $this->scanner->nextToken(); + // Pseudo element + if ($this->scanner->token == CssToken::colon) { + $buf .= ':'; + $this->scanner->nextToken(); + } + // Ident + $buf .= $this->scanner->getNameString(); + } + } + else { + print "fetching string.\n"; + $buf .= $this->scanner->getQuotedString(); + if ($this->scanner->token != CssToken::rparen) { + $this->throwError(CssToken::rparen, $this->scanner->token); + } + $this->scanner->nextToken(); + } + return $buf; + */ + $buf .= $this->scanner->getQuotedString(); + return $buf; + } + } + + /** + * Handle element names. + * This will call the CssEventHandler::elementName(). + * + * This handles: + * <code> + * name (CssEventHandler::element()) + * |name (CssEventHandler::element()) + * ns|name (CssEventHandler::elementNS()) + * ns|* (CssEventHandler::elementNS()) + * </code> + */ + private function elementName() { + if ($this->DEBUG) print "ELEMENT NAME\n"; + if ($this->scanner->token === CssToken::pipe) { + // We have '|name', which is equiv to 'name' + $this->scanner->nextToken(); + $this->consumeWhitespace(); + $elementName = $this->scanner->getNameString(); + $this->handler->element($elementName); + } + elseif ($this->scanner->token === CssToken::char) { + $elementName = $this->scanner->getNameString(); + if ($this->scanner->token == CssToken::pipe) { + // Get ns|name + $elementNS = $elementName; + $this->scanner->nextToken(); + $this->consumeWhitespace(); + if ($this->scanner->token === CssToken::star) { + // We have ns|* + $this->handler->anyElementInNS($elementNS); + $this->scanner->nextToken(); + } + elseif ($this->scanner->token !== CssToken::char) { + $this->throwError(CssToken::char, $this->scanner->token); + } + else { + $elementName = $this->scanner->getNameString(); + // We have ns|name + $this->handler->elementNS($elementName, $elementNS); + } + + } + else { + $this->handler->element($elementName); + } + } + } + + /** + * Check for all elements designators. Due to the new CSS 3 namespace + * support, this is slightly more complicated, now, as it handles + * the *|name and *|* cases as well as *. + * + * Calls CssEventHandler::anyElement() or CssEventHandler::elementName(). + */ + private function allElements() { + if ($this->scanner->token === CssToken::star) { + $this->scanner->nextToken(); + if ($this->scanner->token === CssToken::pipe) { + $this->scanner->nextToken(); + if ($this->scanner->token === CssToken::star) { + // We got *|*. According to spec, this requires + // that the element has a namespace, so we pass it on + // to the handler: + $this->scanner->nextToken(); + $this->handler->anyElementInNS('*'); + } + else { + // We got *|name, which means the name MUST be in a namespce, + // so we pass this off to elementNameNS(). + $name = $this->scanner->getNameString(); + $this->handler->elementNS($name, '*'); + } + } + else { + $this->handler->anyElement(); + } + } + } + + /** + * Handler an attribute. + * An attribute can be in one of two forms: + * <code>[attrName]</code> + * or + * <code>[attrName="AttrValue"]</code> + * + * This may call the following event handlers: CssEventHandler::attribute(). + */ + private function attribute() { + if($this->scanner->token == CssToken::lsquare) { + $attrVal = $op = $ns = NULL; + + $this->scanner->nextToken(); + $this->consumeWhitespace(); + + if ($this->scanner->token === CssToken::at) { + if ($this->strict) { + throw new CssParseException('The @ is illegal in attributes.'); + } + else { + $this->scanner->nextToken(); + $this->consumeWhitespace(); + } + } + + if ($this->scanner->token === CssToken::star) { + // Global namespace... requires that attr be prefixed, + // so we pass this on to a namespace handler. + $ns = '*'; + $this->scanner->nextToken(); + } + if ($this->scanner->token === CssToken::pipe) { + // Skip this. It's a global namespace. + $this->scanner->nextToken(); + $this->consumeWhitespace(); + } + + $attrName = $this->scanner->getNameString(); + $this->consumeWhitespace(); + + // Check for namespace attribute: ns|attr. We have to peek() to make + // sure that we haven't hit the |= operator, which looks the same. + if ($this->scanner->token === CssToken::pipe && $this->scanner->peek() !== '=') { + // We have a namespaced attribute. + $ns = $attrName; + $this->scanner->nextToken(); + $attrName = $this->scanner->getNameString(); + $this->consumeWhitespace(); + } + + // Note: We require that operators do not have spaces + // between characters, e.g. ~= , not ~ =. + + // Get the operator: + switch ($this->scanner->token) { + case CssToken::eq: + $this->consumeWhitespace(); + $op = CssEventHandler::isExactly; + break; + case CssToken::tilde: + if ($this->scanner->nextToken() !== CssToken::eq) { + $this->throwError(CssToken::eq, $this->scanner->token); + } + $op = CssEventHandler::containsWithSpace; + break; + case CssToken::pipe: + if ($this->scanner->nextToken() !== CssToken::eq) { + $this->throwError(CssToken::eq, $this->scanner->token); + } + $op = CssEventHandler::containsWithHyphen; + break; + case CssToken::star: + if ($this->scanner->nextToken() !== CssToken::eq) { + $this->throwError(CssToken::eq, $this->scanner->token); + } + $op = CssEventHandler::containsInString; + break; + case CssToken::dollar; + if ($this->scanner->nextToken() !== CssToken::eq) { + $this->throwError(CssToken::eq, $this->scanner->token); + } + $op = CssEventHandler::endsWith; + break; + case CssToken::carat: + if ($this->scanner->nextToken() !== CssToken::eq) { + $this->throwError(CssToken::eq, $this->scanner->token); + } + $op = CssEventHandler::beginsWith; + break; + } + + if (isset($op)) { + // Consume '=' and go on. + $this->scanner->nextToken(); + $this->consumeWhitespace(); + + // So... here we have a problem. The grammer suggests that the + // value here is String1 or String2, both of which are enclosed + // in quotes of some sort, and both of which allow lots of special + // characters. But the spec itself includes examples like this: + // [lang=fr] + // So some bareword support is assumed. To get around this, we assume + // that bare words follow the NAME rules, while quoted strings follow + // the String1/String2 rules. + + if ($this->scanner->token === CssToken::quote || $this->scanner->token === CssToken::squote) { + $attrVal = $this->scanner->getQuotedString(); + } + else { + $attrVal = $this->scanner->getNameString(); + } + + if ($this->DEBUG) { + print "ATTR: $attrVal AND OP: $op\n"; + } + } + + $this->consumeWhitespace(); + + if ($this->scanner->token != CssToken::rsquare) { + $this->throwError(CssToken::rsquare, $this->scanner->token); + } + + if (isset($ns)) { + $this->handler->attributeNS($attrName, $ns, $attrVal, $op); + } + elseif (isset($attrVal)) { + $this->handler->attribute($attrName, $attrVal, $op); + } + else { + $this->handler->attribute($attrName); + } + $this->scanner->nextToken(); + } + } + + /** + * Utility for throwing a consistantly-formatted parse error. + */ + private function throwError($expected, $got) { + $filter = sprintf('Expected %s, got %s', CssToken::name($expected), CssToken::name($got)); + throw new CssParseException($filter); + } + +} + +/** + * Scanner for CSS selector parsing. + * + * This provides a simple scanner for traversing an input stream. + * + * @ingroup querypath_css + */ +final class CssScanner { + var $is = NULL; + public $value = NULL; + public $token = NULL; + + var $recurse = FALSE; + var $it = 0; + + /** + * Given a new input stream, tokenize the CSS selector string. + * @see CssInputStream + * @param CssInputStream $in + * An input stream to be scanned. + */ + public function __construct(CssInputStream $in) { + $this->is = $in; + } + + /** + * Return the position of the reader in the string. + */ + public function position() { + return $this->is->position; + } + + /** + * See the next char without removing it from the stack. + * + * @return char + * Returns the next character on the stack. + */ + public function peek() { + return $this->is->peek(); + } + + /** + * Get the next token in the input stream. + * + * This sets the current token to the value of the next token in + * the stream. + * + * @return int + * Returns an int value corresponding to one of the CssToken constants, + * or FALSE if the end of the string is reached. (Remember to use + * strong equality checking on FALSE, since 0 is a valid token id.) + */ + public function nextToken() { + $tok = -1; + ++$this->it; + if ($this->is->isEmpty()) { + if ($this->recurse) { + throw new Exception("Recursion error detected at iteration " . $this->it . '.'); + exit(); + } + //print "{$this->it}: All done\n"; + $this->recurse = TRUE; + $this->token = FALSE; + return FALSE; + } + $ch = $this->is->consume(); + //print __FUNCTION__ . " Testing $ch.\n"; + if (ctype_space($ch)) { + $this->value = ' '; // Collapse all WS to a space. + $this->token = $tok = CssToken::white; + //$ch = $this->is->consume(); + return $tok; + } + + if (ctype_alnum($ch) || $ch == '-' || $ch == '_') { + // It's a character + $this->value = $ch; //strtolower($ch); + $this->token = $tok = CssToken::char; + return $tok; + } + + $this->value = $ch; + + switch($ch) { + case '*': + $tok = CssToken::star; + break; + case chr(ord('>')): + $tok = CssToken::rangle; + break; + case '.': + $tok = CssToken::dot; + break; + case '#': + $tok = CssToken::octo; + break; + case '[': + $tok = CssToken::lsquare; + break; + case ']': + $tok = CssToken::rsquare; + break; + case ':': + $tok = CssToken::colon; + break; + case '(': + $tok = CssToken::lparen; + break; + case ')': + $tok = CssToken::rparen; + break; + case '+': + $tok = CssToken::plus; + break; + case '~': + $tok = CssToken::tilde; + break; + case '=': + $tok = CssToken::eq; + break; + case '|': + $tok = CssToken::pipe; + break; + case ',': + $tok = CssToken::comma; + break; + case chr(34): + $tok = CssToken::quote; + break; + case "'": + $tok = CssToken::squote; + break; + case '\\': + $tok = CssToken::bslash; + break; + case '^': + $tok = CssToken::carat; + break; + case '$': + $tok = CssToken::dollar; + break; + case '@': + $tok = CssToken::at; + break; + } + + + // Catch all characters that are legal within strings. + if ($tok == -1) { + // TODO: This should be UTF-8 compatible, but PHP doesn't + // have a native UTF-8 string. Should we use external + // mbstring library? + + $ord = ord($ch); + // Characters in this pool are legal for use inside of + // certain strings. Extended ASCII is used here, though I + // Don't know if these are really legal. + if (($ord >= 32 && $ord <= 126) || ($ord >= 128 && $ord <= 255)) { + $tok = CssToken::stringLegal; + } + else { + throw new CSSParseException('Illegal character found in stream: ' . $ord); + } + } + + $this->token = $tok; + return $tok; + } + + /** + * Get a name string from the input stream. + * A name string must be composed of + * only characters defined in CssToken:char: -_a-zA-Z0-9 + */ + public function getNameString() { + $buf = ''; + while ($this->token === CssToken::char) { + $buf .= $this->value; + $this->nextToken(); + //print '_'; + } + return $buf; + } + + /** + * This gets a string with any legal 'string' characters. + * See CSS Selectors specification, section 11, for the + * definition of string. + * + * This will check for string1, string2, and the case where a + * string is unquoted (Oddly absent from the "official" grammar, + * though such strings are present as examples in the spec.) + * + * Note: + * Though the grammar supplied by CSS 3 Selectors section 11 does not + * address the contents of a pseudo-class value, the spec itself indicates + * that a pseudo-class value is a "value between parenthesis" [6.6]. The + * examples given use URLs among other things, making them closer to the + * definition of 'string' than to 'name'. So we handle them here as strings. + */ + public function getQuotedString() { + if ($this->token == CssToken::quote || $this->token == CssToken::squote || $this->token == CssToken::lparen) { + $end = ($this->token == CssToken::lparen) ? CssToken::rparen : $this->token; + $buf = ''; + $escape = FALSE; + + $this->nextToken(); // Skip the opening quote/paren + + // The second conjunct is probably not necessary. + while ($this->token !== FALSE && $this->token > -1) { + //print "Char: $this->value \n"; + if ($this->token == CssToken::bslash && !$escape) { + // XXX: The backslash (\) is removed here. + // Turn on escaping. + //$buf .= $this->value; + $escape = TRUE; + } + elseif ($escape) { + // Turn off escaping + $buf .= $this->value; + $escape = FALSE; + } + elseif ($this->token === $end) { + // At end of string; skip token and break. + $this->nextToken(); + break; + } + else { + // Append char. + $buf .= $this->value; + } + $this->nextToken(); + } + return $buf; + } + } + + /** + * Get a string from the input stream. + * This is a convenience function for getting a string of + * characters that are either alphanumber or whitespace. See + * the CssToken::white and CssToken::char definitions. + * + * @deprecated This is not used anywhere in QueryPath. + *//* + public function getStringPlusWhitespace() { + $buf = ''; + if($this->token === FALSE) {return '';} + while ($this->token === CssToken::char || $this->token == CssToken::white) { + $buf .= $this->value; + $this->nextToken(); + } + return $buf; + }*/ + +} + +/** + * Simple wrapper to turn a string into an input stream. + * This provides a standard interface on top of an array of + * characters. + */ +class CssInputStream { + protected $stream = NULL; + public $position = 0; + /** + * Build a new CSS input stream from a string. + * + * @param string + * String to turn into an input stream. + */ + function __construct($string) { + $this->stream = str_split($string); + } + /** + * Look ahead one character. + * + * @return char + * Returns the next character, but does not remove it from + * the stream. + */ + function peek() { + return $this->stream[0]; + } + /** + * Get the next unconsumed character in the stream. + * This will remove that character from the front of the + * stream and return it. + */ + function consume() { + $ret = array_shift($this->stream); + if (!empty($ret)) { + $this->position++; + } + return $ret; + } + /** + * Check if the stream is empty. + * @return boolean + * Returns TRUE when the stream is empty, FALSE otherwise. + */ + function isEmpty() { + return count($this->stream) == 0; + } +} + +/** + * Exception indicating an error in CSS parsing. + * + * @ingroup querypath_css + */ +class CSSParseException extends EXCEPTION {}
\ No newline at end of file |