From 9a9c04512e5dcb77c7fe5d850e3f2a0250cc160e Mon Sep 17 00:00:00 2001 From: emkael Date: Wed, 18 Jan 2017 20:07:16 +0100 Subject: * Motor Sport Magazine feed provider --- lib/querypath/src/QueryPath/CSS/Parser.php | 575 +++++++++++++++++++++++++++++ 1 file changed, 575 insertions(+) create mode 100644 lib/querypath/src/QueryPath/CSS/Parser.php (limited to 'lib/querypath/src/QueryPath/CSS/Parser.php') diff --git a/lib/querypath/src/QueryPath/CSS/Parser.php b/lib/querypath/src/QueryPath/CSS/Parser.php new file mode 100644 index 0000000..f041612 --- /dev/null +++ b/lib/querypath/src/QueryPath/CSS/Parser.php @@ -0,0 +1,575 @@ +originalString = $string; + $is = new InputStream($string); + $this->scanner = new Scanner($is); + $this->handler = $handler; + } + + /** + * Parse the selector. + * + * This begins an event-based parsing process that will + * fire events as the selector is handled. A EventHandler + * implementation will be responsible for handling the events. + * @throws ParseException + */ + public function parse() { + + $this->scanner->nextToken(); + while ($this->scanner->token !== FALSE) { + // Primitive recursion detection. + $position = $this->scanner->position(); + + if ($this->DEBUG) { + print "PARSE " . $this->scanner->token. "\n"; + } + $this->selector(); + + $finalPosition = $this->scanner->position(); + + if ($this->scanner->token !== FALSE && $finalPosition == $position) { + // If we get here, then the scanner did not pop a single character + // off of the input stream during a full run of the parser, which + // means that the current input does not match any recognizable + // pattern. + throw new ParseException('CSS selector is not well formed.'); + } + + } + + } + + /** + * A restricted parser that can only parse simple selectors. + * The pseudoClass handler for this parser will throw an + * exception if it encounters a pseudo-element or the + * negation pseudo-class. + * + * @deprecated This is not used anywhere in QueryPath and + * may be removed. + *//* + public function parseSimpleSelector() { + while ($this->scanner->token !== FALSE) { + if ($this->DEBUG) print "SIMPLE SELECTOR\n"; + $this->allElements(); + $this->elementName(); + $this->elementClass(); + $this->elementID(); + $this->pseudoClass(TRUE); // Operate in restricted mode. + $this->attribute(); + + // TODO: Need to add failure conditions here. + } + }*/ + + /** + * Handle an entire CSS selector. + */ + private function selector() { + if ($this->DEBUG) print "SELECTOR{$this->scanner->position()}\n"; + $this->consumeWhitespace(); // Remove leading whitespace + $this->simpleSelectors(); + $this->combinator(); + } + + /** + * Consume whitespace and return a count of the number of whitespace consumed. + */ + private function consumeWhitespace() { + if ($this->DEBUG) print "CONSUME WHITESPACE\n"; + $white = 0; + while ($this->scanner->token == Token::white) { + $this->scanner->nextToken(); + ++$white; + } + return $white; + } + + /** + * Handle one of the five combinators: '>', '+', ' ', '~', and ','. + * This will call the appropriate event handlers. + * @see EventHandler::directDescendant(), + * @see EventHandler::adjacent(), + * @see EventHandler::anyDescendant(), + * @see EventHandler::anotherSelector(). + */ + private function combinator() { + if ($this->DEBUG) print "COMBINATOR\n"; + /* + * Problem: ' ' and ' > ' are both valid combinators. + * So we have to track whitespace consumption to see + * if we are hitting the ' ' combinator or if the + * selector just has whitespace padding another combinator. + */ + + // Flag to indicate that post-checks need doing + $inCombinator = FALSE; + $white = $this->consumeWhitespace(); + $t = $this->scanner->token; + + if ($t == Token::rangle) { + $this->handler->directDescendant(); + $this->scanner->nextToken(); + $inCombinator = TRUE; + //$this->simpleSelectors(); + } + elseif ($t == Token::plus) { + $this->handler->adjacent(); + $this->scanner->nextToken(); + $inCombinator = TRUE; + //$this->simpleSelectors(); + } + elseif ($t == Token::comma) { + $this->handler->anotherSelector(); + $this->scanner->nextToken(); + $inCombinator = TRUE; + //$this->scanner->selectors(); + } + elseif ($t == Token::tilde) { + $this->handler->sibling(); + $this->scanner->nextToken(); + $inCombinator = TRUE; + } + + // Check that we don't get two combinators in a row. + if ($inCombinator) { + $white = 0; + if ($this->DEBUG) print "COMBINATOR: " . Token::name($t) . "\n"; + $this->consumeWhitespace(); + if ($this->isCombinator($this->scanner->token)) { + throw new ParseException("Illegal combinator: Cannot have two combinators in sequence."); + } + } + // Check to see if we have whitespace combinator: + elseif ($white > 0) { + if ($this->DEBUG) print "COMBINATOR: any descendant\n"; + $inCombinator = TRUE; + $this->handler->anyDescendant(); + } + else { + if ($this->DEBUG) print "COMBINATOR: no combinator found.\n"; + } + } + + /** + * Check if the token is a combinator. + */ + private function isCombinator($tok) { + $combinators = array(Token::plus, Token::rangle, Token::comma, Token::tilde); + return in_array($tok, $combinators); + } + + /** + * Handle a simple selector. + */ + private function simpleSelectors() { + if ($this->DEBUG) print "SIMPLE SELECTOR\n"; + $this->allElements(); + $this->elementName(); + $this->elementClass(); + $this->elementID(); + $this->pseudoClass(); + $this->attribute(); + } + + /** + * Handles CSS ID selectors. + * This will call EventHandler::elementID(). + */ + private function elementID() { + if ($this->DEBUG) print "ELEMENT ID\n"; + if ($this->scanner->token == Token::octo) { + $this->scanner->nextToken(); + if ($this->scanner->token !== Token::char) { + throw new ParseException("Expected string after #"); + } + $id = $this->scanner->getNameString(); + $this->handler->elementID($id); + } + } + + /** + * Handles CSS class selectors. + * This will call the EventHandler::elementClass() method. + */ + private function elementClass() { + if ($this->DEBUG) print "ELEMENT CLASS\n"; + if ($this->scanner->token == Token::dot) { + $this->scanner->nextToken(); + $this->consumeWhitespace(); // We're very fault tolerent. This should prob through error. + $cssClass = $this->scanner->getNameString(); + $this->handler->elementClass($cssClass); + } + } + + /** + * Handle a pseudo-class and pseudo-element. + * + * CSS 3 selectors support separate pseudo-elements, using :: instead + * of : for separator. This is now supported, and calls the pseudoElement + * handler, EventHandler::pseudoElement(). + * + * This will call EventHandler::pseudoClass() when a + * pseudo-class is parsed. + */ + private function pseudoClass($restricted = FALSE) { + if ($this->DEBUG) print "PSEUDO-CLASS\n"; + if ($this->scanner->token == Token::colon) { + + // Check for CSS 3 pseudo element: + $isPseudoElement = FALSE; + if ($this->scanner->nextToken() === Token::colon) { + $isPseudoElement = TRUE; + $this->scanner->nextToken(); + } + + $name = $this->scanner->getNameString(); + if ($restricted && $name == 'not') { + throw new ParseException("The 'not' pseudo-class is illegal in this context."); + } + + $value = NULL; + if ($this->scanner->token == Token::lparen) { + if ($isPseudoElement) { + throw new ParseException("Illegal left paren. Pseudo-Element cannot have arguments."); + } + $value = $this->pseudoClassValue(); + } + + // FIXME: This should throw errors when pseudo element has values. + if ($isPseudoElement) { + if ($restricted) { + throw new ParseException("Pseudo-Elements are illegal in this context."); + } + $this->handler->pseudoElement($name); + $this->consumeWhitespace(); + + // Per the spec, pseudo-elements must be the last items in a selector, so we + // check to make sure that we are either at the end of the stream or that a + // new selector is starting. Only one pseudo-element is allowed per selector. + if ($this->scanner->token !== FALSE && $this->scanner->token !== Token::comma) { + throw new ParseException("A Pseudo-Element must be the last item in a selector."); + } + } + else { + $this->handler->pseudoClass($name, $value); + } + } + } + + /** + * Get the value of a pseudo-classes. + * + * @return string + * Returns the value found from a pseudo-class. + * + * @todo Pseudoclasses can be passed pseudo-elements and + * other pseudo-classes as values, which means :pseudo(::pseudo) + * is legal. + */ + private function pseudoClassValue() { + if ($this->scanner->token == Token::lparen) { + $buf = ''; + + // For now, just leave pseudoClass value vague. + /* + // We have to peek to see if next char is a colon because + // pseudo-classes and pseudo-elements are legal strings here. + print $this->scanner->peek(); + if ($this->scanner->peek() == ':') { + print "Is pseudo\n"; + $this->scanner->nextToken(); + + // Pseudo class + if ($this->scanner->token == Token::colon) { + $buf .= ':'; + $this->scanner->nextToken(); + // Pseudo element + if ($this->scanner->token == Token::colon) { + $buf .= ':'; + $this->scanner->nextToken(); + } + // Ident + $buf .= $this->scanner->getNameString(); + } + } + else { + print "fetching string.\n"; + $buf .= $this->scanner->getQuotedString(); + if ($this->scanner->token != Token::rparen) { + $this->throwError(Token::rparen, $this->scanner->token); + } + $this->scanner->nextToken(); + } + return $buf; + */ + //$buf .= $this->scanner->getQuotedString(); + $buf .= $this->scanner->getPseudoClassString(); + return $buf; + } + } + + /** + * Handle element names. + * This will call the EventHandler::elementName(). + * + * This handles: + * + * name (EventHandler::element()) + * |name (EventHandler::element()) + * ns|name (EventHandler::elementNS()) + * ns|* (EventHandler::elementNS()) + * + */ + private function elementName() { + if ($this->DEBUG) print "ELEMENT NAME\n"; + if ($this->scanner->token === Token::pipe) { + // We have '|name', which is equiv to 'name' + $this->scanner->nextToken(); + $this->consumeWhitespace(); + $elementName = $this->scanner->getNameString(); + $this->handler->element($elementName); + } + elseif ($this->scanner->token === Token::char) { + $elementName = $this->scanner->getNameString(); + if ($this->scanner->token == Token::pipe) { + // Get ns|name + $elementNS = $elementName; + $this->scanner->nextToken(); + $this->consumeWhitespace(); + if ($this->scanner->token === Token::star) { + // We have ns|* + $this->handler->anyElementInNS($elementNS); + $this->scanner->nextToken(); + } + elseif ($this->scanner->token !== Token::char) { + $this->throwError(Token::char, $this->scanner->token); + } + else { + $elementName = $this->scanner->getNameString(); + // We have ns|name + $this->handler->elementNS($elementName, $elementNS); + } + + } + else { + $this->handler->element($elementName); + } + } + } + + /** + * Check for all elements designators. Due to the new CSS 3 namespace + * support, this is slightly more complicated, now, as it handles + * the *|name and *|* cases as well as *. + * + * Calls EventHandler::anyElement() or EventHandler::elementName(). + */ + private function allElements() { + if ($this->scanner->token === Token::star) { + $this->scanner->nextToken(); + if ($this->scanner->token === Token::pipe) { + $this->scanner->nextToken(); + if ($this->scanner->token === Token::star) { + // We got *|*. According to spec, this requires + // that the element has a namespace, so we pass it on + // to the handler: + $this->scanner->nextToken(); + $this->handler->anyElementInNS('*'); + } + else { + // We got *|name, which means the name MUST be in a namespce, + // so we pass this off to elementNameNS(). + $name = $this->scanner->getNameString(); + $this->handler->elementNS($name, '*'); + } + } + else { + $this->handler->anyElement(); + } + } + } + + /** + * Handler an attribute. + * An attribute can be in one of two forms: + * [attrName] + * or + * [attrName="AttrValue"] + * + * This may call the following event handlers: EventHandler::attribute(). + */ + private function attribute() { + if($this->scanner->token == Token::lsquare) { + $attrVal = $op = $ns = NULL; + + $this->scanner->nextToken(); + $this->consumeWhitespace(); + + if ($this->scanner->token === Token::at) { + if ($this->strict) { + throw new ParseException('The @ is illegal in attributes.'); + } + else { + $this->scanner->nextToken(); + $this->consumeWhitespace(); + } + } + + if ($this->scanner->token === Token::star) { + // Global namespace... requires that attr be prefixed, + // so we pass this on to a namespace handler. + $ns = '*'; + $this->scanner->nextToken(); + } + if ($this->scanner->token === Token::pipe) { + // Skip this. It's a global namespace. + $this->scanner->nextToken(); + $this->consumeWhitespace(); + } + + $attrName = $this->scanner->getNameString(); + $this->consumeWhitespace(); + + // Check for namespace attribute: ns|attr. We have to peek() to make + // sure that we haven't hit the |= operator, which looks the same. + if ($this->scanner->token === Token::pipe && $this->scanner->peek() !== '=') { + // We have a namespaced attribute. + $ns = $attrName; + $this->scanner->nextToken(); + $attrName = $this->scanner->getNameString(); + $this->consumeWhitespace(); + } + + // Note: We require that operators do not have spaces + // between characters, e.g. ~= , not ~ =. + + // Get the operator: + switch ($this->scanner->token) { + case Token::eq: + $this->consumeWhitespace(); + $op = EventHandler::isExactly; + break; + case Token::tilde: + if ($this->scanner->nextToken() !== Token::eq) { + $this->throwError(Token::eq, $this->scanner->token); + } + $op = EventHandler::containsWithSpace; + break; + case Token::pipe: + if ($this->scanner->nextToken() !== Token::eq) { + $this->throwError(Token::eq, $this->scanner->token); + } + $op = EventHandler::containsWithHyphen; + break; + case Token::star: + if ($this->scanner->nextToken() !== Token::eq) { + $this->throwError(Token::eq, $this->scanner->token); + } + $op = EventHandler::containsInString; + break; + case Token::dollar; + if ($this->scanner->nextToken() !== Token::eq) { + $this->throwError(Token::eq, $this->scanner->token); + } + $op = EventHandler::endsWith; + break; + case Token::carat: + if ($this->scanner->nextToken() !== Token::eq) { + $this->throwError(Token::eq, $this->scanner->token); + } + $op = EventHandler::beginsWith; + break; + } + + if (isset($op)) { + // Consume '=' and go on. + $this->scanner->nextToken(); + $this->consumeWhitespace(); + + // So... here we have a problem. The grammer suggests that the + // value here is String1 or String2, both of which are enclosed + // in quotes of some sort, and both of which allow lots of special + // characters. But the spec itself includes examples like this: + // [lang=fr] + // So some bareword support is assumed. To get around this, we assume + // that bare words follow the NAME rules, while quoted strings follow + // the String1/String2 rules. + + if ($this->scanner->token === Token::quote || $this->scanner->token === Token::squote) { + $attrVal = $this->scanner->getQuotedString(); + } + else { + $attrVal = $this->scanner->getNameString(); + } + + if ($this->DEBUG) { + print "ATTR: $attrVal AND OP: $op\n"; + } + } + + $this->consumeWhitespace(); + + if ($this->scanner->token != Token::rsquare) { + $this->throwError(Token::rsquare, $this->scanner->token); + } + + if (isset($ns)) { + $this->handler->attributeNS($attrName, $ns, $attrVal, $op); + } + elseif (isset($attrVal)) { + $this->handler->attribute($attrName, $attrVal, $op); + } + else { + $this->handler->attribute($attrName); + } + $this->scanner->nextToken(); + } + } + + /** + * Utility for throwing a consistantly-formatted parse error. + */ + private function throwError($expected, $got) { + $filter = sprintf('Expected %s, got %s', Token::name($expected), Token::name($got)); + throw new ParseException($filter); + } + +} + -- cgit v1.2.3