From 51609351f2c4b5082b7e6f0744cd3811c325303f Mon Sep 17 00:00:00 2001 From: emkael Date: Tue, 11 Oct 2016 14:01:29 +0200 Subject: * initial template --- lib/querypath/CssEventHandler.php | 1432 +++++++++++ lib/querypath/CssParser.php | 1108 +++++++++ lib/querypath/Extension/QPDB.php | 711 ++++++ lib/querypath/Extension/QPList.php | 213 ++ lib/querypath/Extension/QPTPL.php | 275 ++ lib/querypath/Extension/QPXML.php | 209 ++ lib/querypath/Extension/QPXSL.php | 75 + lib/querypath/QueryPath.php | 4543 ++++++++++++++++++++++++++++++++++ lib/querypath/QueryPathExtension.php | 195 ++ 9 files changed, 8761 insertions(+) create mode 100644 lib/querypath/CssEventHandler.php create mode 100644 lib/querypath/CssParser.php create mode 100644 lib/querypath/Extension/QPDB.php create mode 100644 lib/querypath/Extension/QPList.php create mode 100644 lib/querypath/Extension/QPTPL.php create mode 100644 lib/querypath/Extension/QPXML.php create mode 100644 lib/querypath/Extension/QPXSL.php create mode 100644 lib/querypath/QueryPath.php create mode 100644 lib/querypath/QueryPathExtension.php (limited to 'lib/querypath') diff --git a/lib/querypath/CssEventHandler.php b/lib/querypath/CssEventHandler.php new file mode 100644 index 0000000..7236f01 --- /dev/null +++ b/lib/querypath/CssEventHandler.php @@ -0,0 +1,1432 @@ +stdClass objects with a text property (QP > 1.3) + * instead of elements. + * - The pseudo-classes first-of-type, nth-of-type and last-of-type may or may + * not conform to the specification. The spec is unclear. + * - pseudo-class filters of the form -an+b do not function as described in the + * specification. However, they do behave the same way here as they do in + * jQuery. + * - This library DOES provide XML namespace aware tools. Selectors can use + * namespaces to increase specificity. + * - This library does nothing with the CSS 3 Selector specificity rating. 
Of
 * course specificity is preserved (to the best of our abilities), but there
 * is no calculation done.
 *
 * For detailed examples of how the code works and what selectors are supported,
 * see the CssEventTests file, which contains the unit tests used for
 * testing this implementation.
 *
 * @author M Butcher
 * @license http://opensource.org/licenses/lgpl-2.1.php LGPL (The GNU Lesser GPL) or an MIT-like license.
 */

/**
 * Require the parser library.
 */
require_once 'CssParser.php';

/**
 * Handler that tracks progress of a query through a DOM.
 *
 * The main idea is that we keep a copy of the tree, and then use an
 * SplObjectStorage to keep track of matches. To handle a list of selectors
 * (using the comma separator), we have to track both the currently
 * progressing match and the previously matched elements.
 *
 * To use this handler:
 * @code
 * $filter = '#id'; // Some CSS selector
 * $handler = new QueryPathCssEventHandler($dom);
 * $parser = new CssParser($filter, $handler);
 * $parser->parse();
 * $matches = $handler->getMatches();
 * @endcode
 *
 * $matches will be an SplObjectStorage holding zero or more matched nodes.
 * The same steps are wrapped by {@link find()}.
 *
 * @ingroup querypath_css
 */
class QueryPathCssEventHandler implements CssEventHandler {
  protected $dom = NULL; // Always points to the top level.
  protected $matches = NULL; // The matches
  protected $alreadyMatched = NULL; // Matches found before current selector.
  protected $findAnyElement = TRUE;


  /**
   * Create a new event handler.
   *
   * @param mixed $dom
   *  The search context. Accepted values:
   *  - an array or SplObjectStorage of nodes (non-element entries are
   *    ignored; the first element becomes the top-level node),
   *  - a DOMDocument (its document element is used),
   *  - a DOMElement (used directly),
   *  - a DOMNodeList (its element nodes are used; the top-level "node" is
   *    then an array of those elements).
   * @throws Exception
   *  If $dom is none of the supported types.
   */
  public function __construct($dom) {
    $this->alreadyMatched = new SplObjectStorage();
    $matches = new SplObjectStorage();

    // Array or storage of DOMElements.
    if (is_array($dom) || $dom instanceof SplObjectStorage) {
      foreach ($dom as $item) {
        if ($item instanceof DOMNode && $item->nodeType == XML_ELEMENT_NODE) {
          $matches->attach($item);
        }
      }
      if ($matches->count() > 0) {
        // rewind() is required before current() returns the first entry.
        $matches->rewind();
        $this->dom = $matches->current();
      }
      else {
        $this->dom = NULL;
      }
    }
    // DOM Document -- we get the root element.
    elseif ($dom instanceof DOMDocument) {
      $this->dom = $dom->documentElement;
      // An empty document has no root element; attaching NULL would fail.
      if ($this->dom !== NULL) {
        $matches->attach($this->dom);
      }
    }
    // DOM Element -- we use this directly.
    elseif ($dom instanceof DOMElement) {
      $this->dom = $dom;
      $matches->attach($dom);
    }
    // NodeList -- we keep the element nodes, both as matches and as an
    // array that serves as the top-level context.
    elseif ($dom instanceof DOMNodeList) {
      $elements = array();
      foreach ($dom as $item) {
        if ($item->nodeType == XML_ELEMENT_NODE) {
          $matches->attach($item);
          $elements[] = $item;
        }
      }
      $this->dom = $elements;
    }
    // FIXME: Handle SimpleXML!
    // Uh-oh... we don't support anything else.
    else {
      // get_class() only accepts objects; fall back to gettype() for scalars.
      $type = is_object($dom) ? get_class($dom) : gettype($dom);
      throw new Exception("Unhandled type: " . $type);
    }
    $this->matches = $matches;
  }

  /**
   * Generic finding method.
   *
   * This is the primary searching method used throughout QueryPath.
   *
   * @param string $filter
   *  A valid CSS 3 filter.
   * @return QueryPathCssEventHandler
   *  Returns itself.
   */
  public function find($filter) {
    $parser = new CssParser($filter, $this);
    $parser->parse();
    return $this;
  }

  /**
   * Get the elements that match the evaluated selector.
   *
   * This should be called after the filter has been parsed.
   *
   * @return SplObjectStorage
   *  A new storage containing the matches of all previously completed
   *  selectors (see anotherSelector()) followed by the matches of the
   *  current selector. Entries are almost always {@link DOMElement} objects.
   */
  public function getMatches() {
    $result = new SplObjectStorage();
    foreach ($this->alreadyMatched as $m) {
      $result->attach($m);
    }
    foreach ($this->matches as $m) {
      $result->attach($m);
    }
    return $result;
  }

  /**
   * Find any element with the ID that matches $id.
+ * + * If this finds an ID, it will immediately quit. Essentially, it doesn't + * enforce ID uniqueness, but it assumes it. + * + * @param $id + * String ID for an element. + */ + public function elementID($id) { + $found = new SplObjectStorage(); + $matches = $this->candidateList(); + foreach ($matches as $item) { + // Check if any of the current items has the desired ID. + if ($item->hasAttribute('id') && $item->getAttribute('id') === $id) { + $found->attach($item); + break; + } + } + $this->matches = $found; + $this->findAnyElement = FALSE; + } + + // Inherited + public function element($name) { + $matches = $this->candidateList(); + $this->findAnyElement = FALSE; + $found = new SplObjectStorage(); + foreach ($matches as $item) { + // Should the existing item be included? + // In some cases (e.g. element is root element) + // it definitely should. But what about other cases? + if ($item->tagName == $name) { + $found->attach($item); + } + // Search for matching kids. + //$nl = $item->getElementsByTagName($name); + //$found = array_merge($found, $this->nodeListToArray($nl)); + } + + $this->matches = $found; + } + + // Inherited + public function elementNS($lname, $namespace = NULL) { + $this->findAnyElement = FALSE; + $found = new SplObjectStorage(); + $matches = $this->candidateList(); + foreach ($matches as $item) { + // Looking up NS URI only works if the XMLNS attributes are declared + // at a level equal to or above the searching doc. Normalizing a doc + // should fix this, but it doesn't. So we have to use a fallback + // detection scheme which basically searches by lname and then + // does a post hoc check on the tagname. + + //$nsuri = $item->lookupNamespaceURI($namespace); + $nsuri = $this->dom->lookupNamespaceURI($namespace); + + // XXX: Presumably the base item needs to be checked. 
Spec isn't + // too clear, but there are three possibilities: + // - base should always be checked (what we do here) + // - base should never be checked (only children) + // - base should only be checked if it is the root node + if ($item instanceof DOMNode + && $item->namespaceURI == $nsuri + && $lname == $item->localName) { + $found->attach($item); + } + + if (!empty($nsuri)) { + $nl = $item->getElementsByTagNameNS($nsuri, $lname); + // If something is found, merge them: + //if (!empty($nl)) $found = array_merge($found, $this->nodeListToArray($nl)); + if (!empty($nl)) $this->attachNodeList($nl, $found); + } + else { + //$nl = $item->getElementsByTagName($namespace . ':' . $lname); + $nl = $item->getElementsByTagName($lname); + $tagname = $namespace . ':' . $lname; + $nsmatches = array(); + foreach ($nl as $node) { + if ($node->tagName == $tagname) { + //$nsmatches[] = $node; + $found->attach($node); + } + } + // If something is found, merge them: + //if (!empty($nsmatches)) $found = array_merge($found, $nsmatches); + } + } + $this->matches = $found; + } + + public function anyElement() { + $found = new SplObjectStorage(); + //$this->findAnyElement = TRUE; + $matches = $this->candidateList(); + foreach ($matches as $item) { + $found->attach($item); // Add self + // See issue #20 or section 6.2 of this: + // http://www.w3.org/TR/2009/PR-css3-selectors-20091215/#universal-selector + //$nl = $item->getElementsByTagName('*'); + //$this->attachNodeList($nl, $found); + } + + $this->matches = $found; + $this->findAnyElement = FALSE; + } + public function anyElementInNS($ns) { + //$this->findAnyElement = TRUE; + $nsuri = $this->dom->lookupNamespaceURI($ns); + $found = new SplObjectStorage(); + if (!empty($nsuri)) { + $matches = $this->candidateList(); + foreach ($matches as $item) { + if ($item instanceOf DOMNode && $nsuri == $item->namespaceURI) { + $found->attach($item); + } + } + } + $this->matches = $found;//UniqueElementList::get($found); + $this->findAnyElement = 
FALSE; + } + public function elementClass($name) { + + $found = new SplObjectStorage(); + $matches = $this->candidateList(); + foreach ($matches as $item) { + if ($item->hasAttribute('class')) { + $classes = explode(' ', $item->getAttribute('class')); + if (in_array($name, $classes)) $found->attach($item); + } + } + + $this->matches = $found;//UniqueElementList::get($found); + $this->findAnyElement = FALSE; + } + + public function attribute($name, $value = NULL, $operation = CssEventHandler::isExactly) { + $found = new SplObjectStorage(); + $matches = $this->candidateList(); + foreach ($matches as $item) { + if ($item->hasAttribute($name)) { + if (isset($value)) { + // If a value exists, then we need a match. + if($this->attrValMatches($value, $item->getAttribute($name), $operation)) { + $found->attach($item); + } + } + else { + // If no value exists, then we consider it a match. + $found->attach($item); + } + } + } + $this->matches = $found; //UniqueElementList::get($found); + $this->findAnyElement = FALSE; + } + + /** + * Helper function to find all elements with exact matches. + * + * @deprecated All use cases seem to be covered by attribute(). + */ + protected function searchForAttr($name, $value = NULL) { + $found = new SplObjectStorage(); + $matches = $this->candidateList(); + foreach ($matches as $candidate) { + if ($candidate->hasAttribute($name)) { + // If value is required, match that, too. + if (isset($value) && $value == $candidate->getAttribute($name)) { + $found->attach($candidate); + } + // Otherwise, it's a match on name alone. + else { + $found->attach($candidate); + } + } + } + + $this->matches = $found; + } + + public function attributeNS($lname, $ns, $value = NULL, $operation = CssEventHandler::isExactly) { + $matches = $this->candidateList(); + $found = new SplObjectStorage(); + if (count($matches) == 0) { + $this->matches = $found; + return; + } + + // Get the namespace URI for the given label. 
+ //$uri = $matches[0]->lookupNamespaceURI($ns); + $matches->rewind(); + $e = $matches->current(); + $uri = $e->lookupNamespaceURI($ns); + + foreach ($matches as $item) { + //foreach ($item->attributes as $attr) { + // print "$attr->prefix:$attr->localName ($attr->namespaceURI), Value: $attr->nodeValue\n"; + //} + if ($item->hasAttributeNS($uri, $lname)) { + if (isset($value)) { + if ($this->attrValMatches($value, $item->getAttributeNS($uri, $lname), $operation)) { + $found->attach($item); + } + } + else { + $found->attach($item); + } + } + } + $this->matches = $found; + $this->findAnyElement = FALSE; + } + + /** + * This also supports the following nonstandard pseudo classes: + * - :x-reset/:x-root (reset to the main item passed into the constructor. Less drastic than :root) + * - :odd/:even (shorthand for :nth-child(odd)/:nth-child(even)) + */ + public function pseudoClass($name, $value = NULL) { + $name = strtolower($name); + // Need to handle known pseudoclasses. + switch($name) { + case 'visited': + case 'hover': + case 'active': + case 'focus': + case 'animated': // Last 3 are from jQuery + case 'visible': + case 'hidden': + // These require a UA, which we don't have. + case 'target': + // This requires a location URL, which we don't have. + $this->matches = new SplObjectStorage(); + break; + case 'indeterminate': + // The assumption is that there is a UA and the format is HTML. + // I don't know if this should is useful without a UA. + throw new NotImplementedException(":indeterminate is not implemented."); + break; + case 'lang': + // No value = exception. 
+ if (!isset($value)) { + throw new NotImplementedException("No handler for lang pseudoclass without value."); + } + $this->lang($value); + break; + case 'link': + $this->searchForAttr('href'); + break; + case 'root': + $found = new SplObjectStorage(); + if (empty($this->dom)) { + $this->matches = $found; + } + elseif (is_array($this->dom)) { + $found->attach($this->dom[0]->ownerDocument->documentElement); + $this->matches = $found; + } + elseif ($this->dom instanceof DOMNode) { + $found->attach($this->dom->ownerDocument->documentElement); + $this->matches = $found; + } + elseif ($this->dom instanceof DOMNodeList && $this->dom->length > 0) { + $found->attach($this->dom->item(0)->ownerDocument->documentElement); + $this->matches = $found; + } + else { + // Hopefully we never get here: + $found->attach($this->dom); + $this->matches = $found; + } + break; + + // NON-STANDARD extensions for reseting to the "top" items set in + // the constructor. + case 'x-root': + case 'x-reset': + $this->matches = new SplObjectStorage(); + $this->matches->attach($this->dom); + break; + + // NON-STANDARD extensions for simple support of even and odd. These + // are supported by jQuery, FF, and other user agents. + case 'even': + $this->nthChild(2, 0); + break; + case 'odd': + $this->nthChild(2, 1); + break; + + // Standard child-checking items. 
+ case 'nth-child': + list($aVal, $bVal) = $this->parseAnB($value); + $this->nthChild($aVal, $bVal); + break; + case 'nth-last-child': + list($aVal, $bVal) = $this->parseAnB($value); + $this->nthLastChild($aVal, $bVal); + break; + case 'nth-of-type': + list($aVal, $bVal) = $this->parseAnB($value); + $this->nthOfTypeChild($aVal, $bVal, FALSE); + break; + case 'nth-last-of-type': + list($aVal, $bVal) = $this->parseAnB($value); + $this->nthLastOfTypeChild($aVal, $bVal); + break; + case 'first-child': + $this->nthChild(0, 1); + break; + case 'last-child': + $this->nthLastChild(0, 1); + break; + case 'first-of-type': + $this->firstOfType(); + break; + case 'last-of-type': + $this->lastOfType(); + break; + case 'only-child': + $this->onlyChild(); + break; + case 'only-of-type': + $this->onlyOfType(); + break; + case 'empty': + $this->emptyElement(); + break; + case 'not': + if (empty($value)) { + throw new CssParseException(":not() requires a value."); + } + $this->not($value); + break; + // Additional pseudo-classes defined in jQuery: + case 'lt': + case 'gt': + case 'nth': + case 'eq': + case 'first': + case 'last': + //case 'even': + //case 'odd': + $this->getByPosition($name, $value); + break; + case 'parent': + $matches = $this->candidateList(); + $found = new SplObjectStorage(); + foreach ($matches as $match) { + if (!empty($match->firstChild)) { + $found->attach($match); + } + } + $this->matches = $found; + break; + + case 'enabled': + case 'disabled': + case 'checked': + $this->attribute($name); + break; + case 'text': + case 'radio': + case 'checkbox': + case 'file': + case 'password': + case 'submit': + case 'image': + case 'reset': + case 'button': + $this->attribute('type', $name); + break; + + case 'header': + $matches = $this->candidateList(); + $found = new SplObjectStorage(); + foreach ($matches as $item) { + $tag = $item->tagName; + $f = strtolower(substr($tag, 0, 1)); + if ($f == 'h' && strlen($tag) == 2 && ctype_digit(substr($tag, 1, 1))) { + 
$found->attach($item); + } + } + $this->matches = $found; + break; + case 'has': + $this->has($value); + break; + // Contains == text matches. + // In QP 2.1, this was changed. + case 'contains': + $value = $this->removeQuotes($value); + + $matches = $this->candidateList(); + $found = new SplObjectStorage(); + foreach ($matches as $item) { + if (strpos($item->textContent, $value) !== FALSE) { + $found->attach($item); + } + } + $this->matches = $found; + break; + + // Since QP 2.1 + case 'contains-exactly': + $value = $this->removeQuotes($value); + + $matches = $this->candidateList(); + $found = new SplObjectStorage(); + foreach ($matches as $item) { + if ($item->textContent == $value) { + $found->attach($item); + } + } + $this->matches = $found; + break; + default: + throw new CssParseException("Unknown Pseudo-Class: " . $name); + } + $this->findAnyElement = FALSE; + } + + /** + * Remove leading and trailing quotes. + */ + private function removeQuotes($str) { + $f = substr($str, 0, 1); + $l = substr($str, -1); + if ($f === $l && ($f == '"' || $f == "'")) { + $str = substr($str, 1, -1); + } + return $str; + } + + /** + * Pseudo-class handler for a variety of jQuery pseudo-classes. + * Handles lt, gt, eq, nth, first, last pseudo-classes. + */ + private function getByPosition($operator, $pos) { + $matches = $this->candidateList(); + $found = new SplObjectStorage(); + if ($matches->count() == 0) { + return; + } + + switch ($operator) { + case 'nth': + case 'eq': + if ($matches->count() >= $pos) { + //$found[] = $matches[$pos -1]; + foreach ($matches as $match) { + // CSS is 1-based, so we pre-increment. + if ($matches->key() + 1 == $pos) { + $found->attach($match); + break; + } + } + } + break; + case 'first': + if ($matches->count() > 0) { + $matches->rewind(); // This is necessary to init. + $found->attach($matches->current()); + } + break; + case 'last': + if ($matches->count() > 0) { + + // Spin through iterator. 
+ foreach ($matches as $item) {}; + + $found->attach($item); + } + break; + // case 'even': + // for ($i = 1; $i <= count($matches); ++$i) { + // if ($i % 2 == 0) { + // $found[] = $matches[$i]; + // } + // } + // break; + // case 'odd': + // for ($i = 1; $i <= count($matches); ++$i) { + // if ($i % 2 == 0) { + // $found[] = $matches[$i]; + // } + // } + // break; + case 'lt': + $i = 0; + foreach ($matches as $item) { + if (++$i < $pos) { + $found->attach($item); + } + } + break; + case 'gt': + $i = 0; + foreach ($matches as $item) { + if (++$i > $pos) { + $found->attach($item); + } + } + break; + } + + $this->matches = $found; + } + + /** + * Parse an an+b rule for CSS pseudo-classes. + * @param $rule + * Some rule in the an+b format. + * @return + * Array (list($aVal, $bVal)) of the two values. + * @throws CssParseException + * If the rule does not follow conventions. + */ + protected function parseAnB($rule) { + if ($rule == 'even') { + return array(2, 0); + } + elseif ($rule == 'odd') { + return array(2, 1); + } + elseif ($rule == 'n') { + return array(1, 0); + } + elseif (is_numeric($rule)) { + return array(0, (int)$rule); + } + + $rule = explode('n', $rule); + if (count($rule) == 0) { + throw new CssParseException("nth-child value is invalid."); + } + + // Each of these is legal: 1, -1, and -. '-' is shorthand for -1. + $aVal = trim($rule[0]); + $aVal = ($aVal == '-') ? -1 : (int)$aVal; + + $bVal = !empty($rule[1]) ? (int)trim($rule[1]) : 0; + return array($aVal, $bVal); + } + + /** + * Pseudo-class handler for nth-child and all related pseudo-classes. + * + * @param int $groupSize + * The size of the group (in an+b, this is a). + * @param int $elementInGroup + * The offset in a group. (in an+b this is b). + * @param boolean $lastChild + * Whether counting should begin with the last child. By default, this is false. + * Pseudo-classes that start with the last-child can set this to true. 
+ */ + protected function nthChild($groupSize, $elementInGroup, $lastChild = FALSE) { + // EXPERIMENTAL: New in Quark. This should be substantially faster + // than the old (jQuery-ish) version. It still has E_STRICT violations + // though. + $parents = new SplObjectStorage(); + $matches = new SplObjectStorage(); + + $i = 0; + foreach ($this->matches as $item) { + $parent = $item->parentNode; + + // Build up an array of all of children of this parent, and store the + // index of each element for reference later. We only need to do this + // once per parent, though. + if (!$parents->contains($parent)) { + + $c = 0; + foreach ($parent->childNodes as $child) { + // We only want nodes, and if this call is preceded by an element + // selector, we only want to match elements with the same tag name. + // !!! This last part is a grey area in the CSS 3 Selector spec. It seems + // necessary to make the implementation match the examples in the spec. However, + // jQuery 1.2 does not do this. + if ($child->nodeType == XML_ELEMENT_NODE && ($this->findAnyElement || $child->tagName == $item->tagName)) { + // This may break E_STRICT. + $child->nodeIndex = ++$c; + } + } + // This may break E_STRICT. + $parent->numElements = $c; + $parents->attach($parent); + } + + // If we are looking for the last child, we count from the end of a list. + // Note that we add 1 because CSS indices begin at 1, not 0. + if ($lastChild) { + $indexToMatch = $item->parentNode->numElements - $item->nodeIndex + 1; + } + // Otherwise we count from the beginning of the list. + else { + $indexToMatch = $item->nodeIndex; + } + + // If group size is 0, then we return element at the right index. + if ($groupSize == 0) { + if ($indexToMatch == $elementInGroup) + $matches->attach($item); + } + // If group size != 0, then we grab nth element from group offset by + // element in group. 
+ else { + if (($indexToMatch - $elementInGroup) % $groupSize == 0 + && ($indexToMatch - $elementInGroup) / $groupSize >= 0) { + $matches->attach($item); + } + } + + // Iterate. + ++$i; + } + $this->matches = $matches; + } + + /** + * Reverse a set of matches. + * + * This is now necessary because internal matches are no longer represented + * as arrays. + * @since QueryPath 2.0 + *//* + private function reverseMatches() { + // Reverse the candidate list. There must be a better way of doing + // this. + $arr = array(); + foreach ($this->matches as $m) array_unshift($arr, $m); + + $this->found = new SplObjectStorage(); + foreach ($arr as $item) $this->found->attach($item); + }*/ + + /** + * Pseudo-class handler for :nth-last-child and related pseudo-classes. + */ + protected function nthLastChild($groupSize, $elementInGroup) { + // New in Quark. + $this->nthChild($groupSize, $elementInGroup, TRUE); + } + + /** + * Get a list of peer elements. + * If $requireSameTag is TRUE, then only peer elements with the same + * tagname as the given element will be returned. + * + * @param $element + * A DomElement. + * @param $requireSameTag + * Boolean flag indicating whether all matches should have the same + * element name (tagName) as $element. + * @return + * Array of peer elements. + *//* + protected function listPeerElements($element, $requireSameTag = FALSE) { + $peers = array(); + $parent = $element->parentNode; + foreach ($parent->childNodes as $node) { + if ($node->nodeType == XML_ELEMENT_NODE) { + if ($requireSameTag) { + // Need to make sure that the tag matches: + if ($element->tagName == $node->tagName) { + $peers[] = $node; + } + } + else { + $peers[] = $node; + } + } + } + return $peers; + } + */ + /** + * Get the nth child (by index) from matching candidates. + * + * This is used by pseudo-class handlers. 
+ */ + /* + protected function childAtIndex($index, $tagName = NULL) { + $restrictToElement = !$this->findAnyElement; + $matches = $this->candidateList(); + $defaultTagName = $tagName; + + // XXX: Added in Quark: I believe this should return an empty + // match set if no child was found tat the index. + $this->matches = new SplObjectStorage(); + + foreach ($matches as $item) { + $parent = $item->parentNode; + + // If a default tag name is supplied, we always use it. + if (!empty($defaultTagName)) { + $tagName = $defaultTagName; + } + // If we are inside of an element selector, we use the + // tag name of the given elements. + elseif ($restrictToElement) { + $tagName = $item->tagName; + } + // Otherwise, we skip the tag name match. + else { + $tagName = NULL; + } + + // Loop through all children looking for matches. + $i = 0; + foreach ($parent->childNodes as $child) { + if ($child->nodeType !== XML_ELEMENT_NODE) { + break; // Skip non-elements + } + + // If type is set, then we do type comparison + if (!empty($tagName)) { + // Check whether tag name matches the type. + if ($child->tagName == $tagName) { + // See if this is the index we are looking for. + if ($i == $index) { + //$this->matches = new SplObjectStorage(); + $this->matches->attach($child); + return; + } + // If it's not the one we are looking for, increment. + ++$i; + } + } + // We don't care about type. Any tagName will match. + else { + if ($i == $index) { + $this->matches->attach($child); + return; + } + ++$i; + } + } // End foreach + } + + }*/ + + /** + * Pseudo-class handler for nth-of-type-child. + * Not implemented. + */ + protected function nthOfTypeChild($groupSize, $elementInGroup, $lastChild) { + // EXPERIMENTAL: New in Quark. This should be substantially faster + // than the old (jQuery-ish) version. It still has E_STRICT violations + // though. 
+ $parents = new SplObjectStorage(); + $matches = new SplObjectStorage(); + + $i = 0; + foreach ($this->matches as $item) { + $parent = $item->parentNode; + + // Build up an array of all of children of this parent, and store the + // index of each element for reference later. We only need to do this + // once per parent, though. + if (!$parents->contains($parent)) { + + $c = 0; + foreach ($parent->childNodes as $child) { + // This doesn't totally make sense, since the CSS 3 spec does not require that + // this pseudo-class be adjoined to an element (e.g. ' :nth-of-type' is allowed). + if ($child->nodeType == XML_ELEMENT_NODE && $child->tagName == $item->tagName) { + // This may break E_STRICT. + $child->nodeIndex = ++$c; + } + } + // This may break E_STRICT. + $parent->numElements = $c; + $parents->attach($parent); + } + + // If we are looking for the last child, we count from the end of a list. + // Note that we add 1 because CSS indices begin at 1, not 0. + if ($lastChild) { + $indexToMatch = $item->parentNode->numElements - $item->nodeIndex + 1; + } + // Otherwise we count from the beginning of the list. + else { + $indexToMatch = $item->nodeIndex; + } + + // If group size is 0, then we return element at the right index. + if ($groupSize == 0) { + if ($indexToMatch == $elementInGroup) + $matches->attach($item); + } + // If group size != 0, then we grab nth element from group offset by + // element in group. + else { + if (($indexToMatch - $elementInGroup) % $groupSize == 0 + && ($indexToMatch - $elementInGroup) / $groupSize >= 0) { + $matches->attach($item); + } + } + + // Iterate. + ++$i; + } + $this->matches = $matches; + } + + /** + * Pseudo-class handler for nth-last-of-type-child. + * Not implemented. 
+ */ + protected function nthLastOfTypeChild($groupSize, $elementInGroup) { + $this->nthOfTypeChild($groupSize, $elementInGroup, TRUE); + } + + /** + * Pseudo-class handler for :lang + */ + protected function lang($value) { + // TODO: This checks for cases where an explicit language is + // set. The spec seems to indicate that an element should inherit + // language from the parent... but this is unclear. + $operator = (strpos($value, '-') !== FALSE) ? self::isExactly : self::containsWithHyphen; + + $orig = $this->matches; + $origDepth = $this->findAnyElement; + + // Do first pass: attributes in default namespace + $this->attribute('lang', $value, $operator); + $lang = $this->matches; // Temp array for merging. + + // Reset + $this->matches = $orig; + $this->findAnyElement = $origDepth; + + // Do second pass: attributes in 'xml' namespace. + $this->attributeNS('lang', 'xml', $value, $operator); + + + // Merge results. + // FIXME: Note that we lose natural ordering in + // the document because we search for xml:lang separately + // from lang. + foreach ($this->matches as $added) $lang->attach($added); + $this->matches = $lang; + } + + /** + * Pseudo-class handler for :not(filter). + * + * This does not follow the specification in the following way: The CSS 3 + * selector spec says the value of not() must be a simple selector. This + * function allows complex selectors. + * + * @param string $filter + * A CSS selector. + */ + protected function not($filter) { + $matches = $this->candidateList(); + //$found = array(); + $found = new SplObjectStorage(); + foreach ($matches as $item) { + $handler = new QueryPathCssEventHandler($item); + $not_these = $handler->find($filter)->getMatches(); + if ($not_these->count() == 0) { + $found->attach($item); + } + } + // No need to check for unique elements, since the list + // we began from already had no duplicates. + $this->matches = $found; + } + + /** + * Pseudo-class handler for :has(filter). 
* This can also be used as a general filtering routine.
   */
  public function has($filter) {
    $matches = $this->candidateList();
    $found = new SplObjectStorage();
    foreach ($matches as $item) {
      // Evaluate the filter in a fresh handler rooted at this candidate; the
      // candidate is kept when that handler reports at least one match.
      $handler = new QueryPathCssEventHandler($item);
      $these = $handler->find($filter)->getMatches();
      if (count($these) > 0) {
        $found->attach($item);
      }
    }
    $this->matches = $found;
    return $this;
  }

  /**
   * Pseudo-class handler for :first-of-type.
   *
   * For every candidate, the first element child of the candidate's parent
   * that has the same tag name is added to the result set.
   */
  protected function firstOfType() {
    $matches = $this->candidateList();
    $found = new SplObjectStorage();
    foreach ($matches as $item) {
      $type = $item->tagName;
      $parent = $item->parentNode;
      foreach ($parent->childNodes as $kid) {
        if ($kid->nodeType == XML_ELEMENT_NODE && $kid->tagName == $type) {
          // SplObjectStorage ignores duplicate attachments, so no
          // contains() check is needed.
          $found->attach($kid);
          break;
        }
      }
    }
    $this->matches = $found;
  }

  /**
   * Pseudo-class handler for :last-of-type.
   *
   * For every candidate, the last element child of the candidate's parent
   * that has the same tag name is added to the result set.
   */
  protected function lastOfType() {
    $matches = $this->candidateList();
    $found = new SplObjectStorage();
    foreach ($matches as $item) {
      $type = $item->tagName;
      $parent = $item->parentNode;
      // Walk the child list backwards so the first hit is the last of its type.
      for ($i = $parent->childNodes->length - 1; $i >= 0; --$i) {
        $kid = $parent->childNodes->item($i);
        if ($kid->nodeType == XML_ELEMENT_NODE && $kid->tagName == $type) {
          $found->attach($kid);
          break;
        }
      }
    }
    $this->matches = $found;
  }

  /**
   * Pseudo-class handler for :only-child.
   *
   * Keeps a candidate only when it is the sole element child of its parent.
   * Non-element children (text, comments) are not counted.
   */
  protected function onlyChild() {
    $matches = $this->candidateList();
    $found = new SplObjectStorage();
    foreach ($matches as $item) {
      $parent = $item->parentNode;
      $kids = array();
      foreach ($parent->childNodes as $kid) {
        if ($kid->nodeType == XML_ELEMENT_NODE) {
          $kids[] = $kid;
        }
      }
      // There should be only one child element, and
      // it should be the one being tested.
      if (count($kids) == 1 && $kids[0] === $item) {
        $found->attach($kids[0]);
      }
    }
    $this->matches = $found;
  }

  /**
   * Pseudo-class handler for :empty.
   *
   * Keeps candidates that have neither element nor text children.
   */
  protected function emptyElement() {
    $found = new SplObjectStorage();
    $matches = $this->candidateList();
    foreach ($matches as $item) {
      $empty = TRUE;
      foreach ($item->childNodes as $kid) {
        // From the spec: Elements and Text nodes are the only ones to
        // affect emptiness.
        if ($kid->nodeType == XML_ELEMENT_NODE || $kid->nodeType == XML_TEXT_NODE) {
          $empty = FALSE;
          break;
        }
      }
      if ($empty) {
        $found->attach($item);
      }
    }
    $this->matches = $found;
  }

  /**
   * Pseudo-class handler for :only-of-type.
   *
   * Keeps a candidate when no other element child of its parent shares its
   * tag name.
   */
  protected function onlyOfType() {
    $matches = $this->candidateList();
    $found = new SplObjectStorage();
    foreach ($matches as $item) {
      $parent = $item->parentNode;
      if (!$parent) {
        // Without a parent there is no sibling list to inspect. The previous
        // code detected this case but then dereferenced the missing parent
        // anyway; skip the candidate instead.
        continue;
      }
      $onlyOfType = TRUE;

      // See if any peers are of the same type.
      foreach ($parent->childNodes as $kid) {
        if ($kid->nodeType == XML_ELEMENT_NODE
          && $kid->tagName == $item->tagName
          && $kid !== $item) {
          $onlyOfType = FALSE;
          break;
        }
      }

      // If no others were found, attach this one.
      if ($onlyOfType) {
        $found->attach($item);
      }
    }
    $this->matches = $found;
  }

  /**
   * Check for attr value matches based on an operation.
   *
   * Matching is case sensitive and is done on the string form of both
   * arguments, so numeric-looking values such as "1e1" and "10" are never
   * treated as equal.
   *
   * @param string $needle
   *  The value given in the selector.
   * @param string $haystack
   *  The attribute value found on the element.
   * @param int $operation
   *  One of the CssEventHandler operator constants.
   * @return boolean
   *  TRUE if the haystack satisfies the operation for the needle. FALSE
   *  otherwise, including for an unknown operation.
   */
  protected function attrValMatches($needle, $haystack, $operation) {
    $needle = (string) $needle;
    $haystack = (string) $haystack;

    // A value shorter than the needle can never satisfy any operator.
    if (strlen($haystack) < strlen($needle)) return FALSE;

    // According to the spec:
    // "The case-sensitivity of attribute names in selectors depends on the document language."
    // (6.3.2)
    // We assume case sensitivity.
    switch ($operation) {
      case CssEventHandler::isExactly:
        return $needle === $haystack;
      case CssEventHandler::containsWithSpace:
        return in_array($needle, explode(' ', $haystack), TRUE);
      case CssEventHandler::containsWithHyphen:
        return in_array($needle, explode('-', $haystack), TRUE);
      case CssEventHandler::containsInString:
        return strpos($haystack, $needle) !== FALSE;
      case CssEventHandler::beginsWith:
        return strpos($haystack, $needle) === 0;
      case CssEventHandler::endsWith:
        // Plain suffix comparison. The needle used to be spliced unescaped
        // into a regular expression, which broke on characters such as
        // '.', '/' or '('. An empty needle matches, as it did before.
        $length = strlen($needle);
        return $length === 0 || substr($haystack, -$length) === $needle;
    }
    return FALSE; // Shouldn't be able to get here.
  }

  /**
   * As the spec mentions, these must be at the end of a selector or
   * else they will cause errors. Most selectors return elements. Pseudo-elements
   * do not.
   *
   * For 'first-line' and 'first-letter' the matches are replaced with
   * stdClass objects (one per contributing element) carrying a textContent
   * property. Names other than the ones handled below leave the matches
   * untouched.
   *
   * @param string $name
   *  The pseudo-element name.
   * @throws NotImplementedException
   *  For 'before', 'after' and 'selection'.
   */
  public function pseudoElement($name) {
    switch ($name) {
      case 'first-line':
        $matches = $this->candidateList();
        $found = new SplObjectStorage();
        foreach ($matches as $item) {
          $lines = explode("\n", $item->textContent);
          $line = trim($lines[0]);
          // Compare against '' rather than using empty(), which would also
          // reject a line consisting of "0".
          if ($line !== '') {
            // A fresh object per element: SplObjectStorage stores each object
            // once, so a shared instance would collapse all results into one.
            $o = new stdClass();
            $o->textContent = $line;
            $found->attach($o);
          }
        }
        $this->matches = $found;
        break;
      case 'first-letter':
        $matches = $this->candidateList();
        $found = new SplObjectStorage();
        foreach ($matches as $item) {
          $str = $item->textContent;
          if ($str !== '') {
            $o = new stdClass();
            // NOTE(review): substr() is byte-wise, so this yields the first
            // byte rather than the first character of multi-byte text.
            $o->textContent = substr($str, 0, 1);
            $found->attach($o);
          }
        }
        $this->matches = $found;
        break;
      case 'before':
      case 'after':
        // There is nothing in a DOM to return for the before and after
        // selectors.
      case 'selection':
        // With no user agent, we don't have a concept of user selection.
        throw new NotImplementedException("The $name pseudo-element is not implemented.");
    }
    $this->findAnyElement = FALSE;
  }

  /**
   * Handle the direct descendant combinator (>).
   *
   * Replaces the current matches with their element children.
   */
  public function directDescendant() {
    $this->findAnyElement = FALSE;

    $kids = new SplObjectStorage();
    foreach ($this->matches as $item) {
      foreach ($item->childNodes as $kidNode) {
        if ($kidNode->nodeType == XML_ELEMENT_NODE) {
          $kids->attach($kidNode);
        }
      }
    }
    $this->matches = $kids;
  }

  /**
   * Handle the adjacent combinator (+).
   *
   * For each current match, this selects the nearest following sibling that
   * is an element. Non-element siblings in between (text, whitespace,
   * comments) are skipped, so in <a/>FOO<b/> the b element is treated as
   * adjacent to a.
   *
   * If whitespace-only text nodes are not wanted in the DOM at all, pass the
   * appropriate flag to the parser. Example:
   * <code>
   * $doc = new DomDocument();
   * $doc->loadXML('<foo/> <bar/>', LIBXML_NOBLANKS);
   * </code>
   */
  public function adjacent() {
    $this->findAnyElement = FALSE;
    $found = new SplObjectStorage();
    foreach ($this->matches as $item) {
      // Scan forward until the first element sibling, if there is one.
      for ($node = $item->nextSibling; $node !== NULL; $node = $node->nextSibling) {
        if ($node->nodeType === XML_ELEMENT_NODE) {
          $found->attach($node);
          break;
        }
      }
    }
    $this->matches = $found;
  }

  /**
   * Handle the another-selector combinator (,).
   *
   * Moves the current matches into the already-matched buffer and restarts
   * matching from the top of the tree.
   */
  public function anotherSelector() {
    // Copy old matches into buffer.
    foreach ($this->matches as $item) {
      $this->alreadyMatched->attach($item);
    }

    // Start over at the top of the tree.
    $this->findAnyElement = TRUE; // Reset depth flag.
    $this->matches = new SplObjectStorage();
    $this->matches->attach($this->dom);
  }

  /**
   * Handle the sibling combinator (~).
   *
   * Replaces the current matches with every element that follows one of them
   * as a sibling. Preceding siblings are not included. When there are no
   * current matches, nothing is changed.
   */
  public function sibling() {
    $this->findAnyElement = FALSE;

    if ($this->matches->count() > 0) {
      $sibs = new SplObjectStorage();
      foreach ($this->matches as $item) {
        for ($node = $item->nextSibling; $node !== NULL; $node = $node->nextSibling) {
          if ($node->nodeType === XML_ELEMENT_NODE) {
            $sibs->attach($node);
          }
        }
      }
      $this->matches = $sibs;
    }
  }

  /**
   * Get any descendant.
   *
   * Replaces the current matches with all of their descendant elements.
   */
  public function anyDescendant() {
    $found = new SplObjectStorage();
    foreach ($this->matches as $item) {
      $kids = $item->getElementsByTagName('*');
      $this->attachNodeList($kids, $found);
    }
    $this->matches = $found;

    // Set depth flag:
    $this->findAnyElement = TRUE;
  }

  /**
   * Determine what candidates are in the current scope.
   *
   * This is a utility method that gets the list of elements
   * that should be evaluated in the context. If $this->findAnyElement
   * is TRUE, this will return a list of every element that appears in
   * the subtree of $this->matches. Otherwise, it will just return
   * $this->matches.
   */
  private function candidateList() {
    if ($this->findAnyElement) {
      return $this->getAllCandidates($this->matches);
    }
    return $this->matches;
  }

  /**
   * Get a list of all of the candidate elements.
*
   * This is used when $this->findAnyElement is TRUE.
   * @param $elements
   *  A list of current elements (usually $this->matches).
   *
   * @return
   *  A list of all candidate elements.
   */
  private function getAllCandidates($elements) {
    $candidates = new SplObjectStorage();
    foreach ($elements as $element) {
      // The element itself is a candidate, followed by every element below it.
      $candidates->attach($element);
      $descendants = $element->getElementsByTagName('*');
      $this->attachNodeList($descendants, $candidates);
    }
    return $candidates;
  }

  /**
   * Attach all nodes in a node list to the given SplObjectStorage.
   */
  public function attachNodeList(DOMNodeList $nodeList, SplObjectStorage $splos) {
    foreach ($nodeList as $node) {
      $splos->attach($node);
    }
  }

}

/**
 * Exception thrown for unimplemented CSS.
 *
 * This is thrown in cases where some feature is expected, but the current
 * implementation does not support that feature.
 *
 * @ingroup querypath_css
 */
class NotImplementedException extends Exception {}
diff --git a/lib/querypath/CssParser.php b/lib/querypath/CssParser.php
new file mode 100644
index 0000000..2ef2802
--- /dev/null
+++ b/lib/querypath/CssParser.php
@@ -0,0 +1,1108 @@

 * @license http://opensource.org/licenses/lgpl-2.1.php The GNU Lesser GPL (LGPL) or an MIT-like license.
 */

/** @addtogroup querypath_css CSS Parsing
 * QueryPath includes a CSS 3 Selector parser.
 *
 *
 * Typically the parser is not accessed directly. Most developers will use it indirectly from
 * qp(), htmlqp(), or one of the methods on a QueryPath object.
 *
 * This parser is modular and is not tied to QueryPath, so you can use it in your
 * own (non-QueryPath) projects if you wish. To dive in, start with CssEventHandler, the
 * event interface that works like a SAX API for CSS selectors.
* If you want to check out
 * the details, check out the parser (CssParser), scanner (CssScanner), and token list (CssToken).
 */

require_once 'CssEventHandler.php';


/**
 * An event handler for handling CSS 3 Selector parsing.
 *
 * This provides a standard interface for CSS 3 Selector event handling. As the
 * parser parses a selector, it will fire events. Implementations of CssEventHandler
 * can then handle the events.
 *
 * This library is inspired by the SAX2 API for parsing XML. Each component of a
 * selector fires an event, passing the necessary data on to the event handler.
 *
 * @ingroup querypath_css
 */
interface CssEventHandler {
  /** The is-exactly (=) operator. */
  const isExactly = 0; // =
  /** The contains-with-space operator (~=). */
  const containsWithSpace = 1; // ~=
  /** The contains-with-hyphen operator (|=). */
  const containsWithHyphen = 2; // |=
  /** The contains-in-string operator (*=). */
  const containsInString = 3; // *=
  /** The begins-with operator (^=). */
  const beginsWith = 4; // ^=
  /** The ends-with operator ($=). */
  const endsWith = 5; // $=
  /** The any-element operator (*). */
  const anyElement = '*';

  /**
   * This event is fired when a CSS ID is encountered.
   * An ID begins with an octothorp: #name.
   *
   * @param string $id
   *  The ID passed in.
   */
  public function elementID($id); // #name
  /**
   * Handle an element name.
   * Example: name
   * @param string $name
   *  The name of the element.
   */
  public function element($name); // name
  /**
   * Handle a namespaced element name.
   * example: namespace|name
   * @param string $name
   *  The tag name.
   * @param string $namespace
   *  The namespace identifier (Not the URI)
   */
  public function elementNS($name, $namespace = NULL);
  /**
   * Handle an any-element (*) operator.
   * Example: *
   */
  public function anyElement(); // *
  /**
   * Handle an any-element operator that is constrained to a namespace.
   * Example: ns|*
   * @param string $ns
   *  The namespace identifier (not the URI).
   */
  public function anyElementInNS($ns); // ns|*
  /**
   * Handle a CSS class selector.
   * Example: .name
   * @param string $name
   *  The name of the class.
   */
  public function elementClass($name); // .name
  /**
   * Handle an attribute selector.
   * Example: [name=attr]
   * Example: [name~=attr]
   * @param string $name
   *  The attribute name.
   * @param string $value
   *  The value of the attribute, if given.
   * @param int $operation
   *  The operation to be used for matching. See {@link CssEventHandler}
   *  constants for a list of supported operations.
   */
  public function attribute($name, $value = NULL, $operation = CssEventHandler::isExactly); // [name=attr]
  /**
   * Handle an attribute selector bound to a specific namespace.
   * Example: [ns|name=attr]
   * Example: [ns|name~=attr]
   * @param string $name
   *  The attribute name.
   * @param string $ns
   *  The namespace identifier (not the URI).
   * @param string $value
   *  The value of the attribute, if given.
   * @param int $operation
   *  The operation to be used for matching. See {@link CssEventHandler}
   *  constants for a list of supported operations.
   */
  public function attributeNS($name, $ns, $value = NULL, $operation = CssEventHandler::isExactly);
  /**
   * Handle a pseudo-class.
   * Example: :name(value)
   * @param string $name
   *  The pseudo-class name.
   * @param string $value
   *  The value, if one is found.
   */
  public function pseudoClass($name, $value = NULL); //:name(value)
  /**
   * Handle a pseudo-element.
   * Example: ::name
   * @param string $name
   *  The pseudo-element name.
   */
  public function pseudoElement($name); // ::name
  /**
   * Handle a direct descendant combinator.
   * Example: >
   */
  public function directDescendant(); // >
  /**
   * Handle an adjacent combinator.
   * Example: +
   */
  public function adjacent(); // +
  /**
   * Handle an another-selector combinator.
   * Example: ,
   */
  public function anotherSelector(); // ,
  /**
   * Handle a sibling combinator.
   * Example: ~
   */
  public function sibling(); // ~ combinator
  /**
   * Handle an any-descendant combinator.
   * Example: ' '
   */
  public function anyDescendant(); // ' ' (space) operator.

}

/**
 * Tokens for CSS.
 * This class defines the recognized tokens for the parser, and also
 * provides utility functions for error reporting.
 *
 * @ingroup querypath_css
 */
final class CssToken {
  const char = 0;
  const star = 1;
  const rangle = 2;
  const dot = 3;
  const octo = 4;
  const rsquare = 5;
  const lsquare = 6;
  const colon = 7;
  const rparen = 8;
  const lparen = 9;
  const plus = 10;
  const tilde = 11;
  const eq = 12;
  const pipe = 13;
  const comma = 14;
  const white = 15;
  const quote = 16;
  const squote = 17;
  const bslash = 18;
  const carat = 19;
  const dollar = 20;
  const at = 21; // This is not in the spec. Apparently, old broken CSS uses it.

  // In legal range for string.
  const stringLegal = 99;

  /**
   * Get a name for a given constant. Used for error handling.
   *
   * @param mixed $const_int
   *  One of the CssToken constants, or FALSE for the end of the input.
   * @return string
   *  A human-readable name for the token.
   */
  public static function name($const_int) {
    // The list is indexed by token constant (0 through 21).
    $a = array('character', 'star', 'right angle bracket',
      'dot', 'octothorp', 'right square bracket', 'left square bracket',
      'colon', 'right parenthesis', 'left parenthesis', 'plus', 'tilde',
      'equals', 'vertical bar', 'comma', 'space', 'quote', 'single quote',
      'backslash', 'carat', 'dollar', 'at');
    // The is_numeric() check keeps FALSE (end of input) from being used as
    // array offset 0 and reported as 'character'.
    if (isset($a[$const_int]) && is_numeric($const_int)) {
      return $a[$const_int];
    }
    elseif ($const_int == 99) {
      return 'a legal non-alphanumeric character';
    }
    elseif ($const_int == FALSE) {
      return 'end of file';
    }
    return sprintf('illegal character (%s)', $const_int);
  }
}

/**
 * Parse a CSS selector.
 *
 * In CSS, a selector is used to identify which element or elements
 * in a DOM are being selected for the application of a particular style.
* Effectively, selectors function as a query language for a structured
 * document -- almost always HTML or XML.
 *
 * This class provides an event-based parser for CSS selectors. It can be
 * used, for example, as a basis for writing a DOM query engine based on
 * CSS.
 *
 * @ingroup querypath_css
 */
class CssParser {
  protected $scanner = NULL;
  protected $buffer = '';
  protected $handler = NULL;
  protected $strict = FALSE;

  protected $DEBUG = FALSE;

  /**
   * The selector string handed to the constructor.
   *
   * Declared explicitly (and public, which is what the previously
   * undeclared, dynamically created property was) so that constructing a
   * parser does not rely on dynamic property creation.
   */
  public $originalString = NULL;

  /**
   * Construct a new CSS parser object.
   *
   * This only prepares the scanner for the given string. Parsing, and the
   * events sent to the CssEventHandler implementation, happen in parse().
   *
   * @param string $string
   *  The CSS selector to parse.
   * @param CssEventHandler $handler
   *  The handler that will receive the parsing events.
   */
  public function __construct($string, CssEventHandler $handler) {
    $this->originalString = $string;
    $is = new CssInputStream($string);
    $this->scanner = new CssScanner($is);
    $this->handler = $handler;
  }

  /**
   * Parse the selector.
   *
   * This begins an event-based parsing process that will
   * fire events as the selector is handled. A CssEventHandler
   * implementation will be responsible for handling the events.
   * @throws CssParseException
   */
  public function parse() {

    $this->scanner->nextToken();
    while ($this->scanner->token !== FALSE) {
      // Primitive recursion detection.
      $position = $this->scanner->position();

      if ($this->DEBUG) {
        print "PARSE " . $this->scanner->token. "\n";
      }
      $this->selector();

      $finalPosition = $this->scanner->position();

      if ($this->scanner->token !== FALSE && $finalPosition == $position) {
        // If we get here, then the scanner did not pop a single character
        // off of the input stream during a full run of the parser, which
        // means that the current input does not match any recognizable
        // pattern.
        throw new CssParseException('CSS selector is not well formed.');
      }

    }

  }

  /**
   * Handle an entire CSS selector.
   */
  private function selector() {
    if ($this->DEBUG) print "SELECTOR{$this->scanner->position()}\n";
    $this->consumeWhitespace(); // Remove leading whitespace
    $this->simpleSelectors();
    $this->combinator();
  }

  /**
   * Consume whitespace and return a count of the number of whitespace consumed.
   */
  private function consumeWhitespace() {
    if ($this->DEBUG) print "CONSUME WHITESPACE\n";
    $white = 0;
    while ($this->scanner->token == CssToken::white) {
      $this->scanner->nextToken();
      ++$white;
    }
    return $white;
  }

  /**
   * Handle one of the five combinators: '>', '+', ' ', '~', and ','.
   * This will call the appropriate event handlers.
   * @see CssEventHandler::directDescendant(),
   * @see CssEventHandler::adjacent(),
   * @see CssEventHandler::anyDescendant(),
   * @see CssEventHandler::anotherSelector().
   * @throws CssParseException
   *  When two combinators appear in sequence.
   */
  private function combinator() {
    if ($this->DEBUG) print "COMBINATOR\n";
    /*
     * Problem: ' ' and ' > ' are both valid combinators.
     * So we have to track whitespace consumption to see
     * if we are hitting the ' ' combinator or if the
     * selector just has whitespace padding another combinator.
     */
    $white = $this->consumeWhitespace();
    $t = $this->scanner->token;

    // Set when an explicit (non-whitespace) combinator was handled.
    $inCombinator = TRUE;
    if ($t === CssToken::rangle) {
      $this->handler->directDescendant();
    }
    elseif ($t === CssToken::plus) {
      $this->handler->adjacent();
    }
    elseif ($t === CssToken::comma) {
      $this->handler->anotherSelector();
    }
    elseif ($t === CssToken::tilde) {
      $this->handler->sibling();
    }
    else {
      $inCombinator = FALSE;
    }

    if ($inCombinator) {
      // Step past the combinator character itself.
      $this->scanner->nextToken();
      if ($this->DEBUG) print "COMBINATOR: " . CssToken::name($t) . "\n";
      // Check that we don't get two combinators in a row.
      $this->consumeWhitespace();
      if ($this->isCombinator($this->scanner->token)) {
        throw new CssParseException("Illegal combinator: Cannot have two combinators in sequence.");
      }
    }
    // Check to see if we have whitespace combinator:
    elseif ($white > 0) {
      if ($this->DEBUG) print "COMBINATOR: any descendant\n";
      $this->handler->anyDescendant();
    }
    else {
      if ($this->DEBUG) print "COMBINATOR: no combinator found.\n";
    }
  }

  /**
   * Check if the token is a combinator.
   *
   * Whitespace is not reported as a combinator here; only '+', '>', ','
   * and '~' are.
   */
  private function isCombinator($tok) {
    $combinators = array(CssToken::plus, CssToken::rangle, CssToken::comma, CssToken::tilde);
    // Strict comparison: FALSE (end of input) must never match a token id.
    return in_array($tok, $combinators, TRUE);
  }

  /**
   * Handle a simple selector.
   *
   * Each sub-parser is tried once, in this fixed order, and does nothing
   * when the current token is not one it recognizes.
   */
  private function simpleSelectors() {
    if ($this->DEBUG) print "SIMPLE SELECTOR\n";
    $this->allElements();
    $this->elementName();
    $this->elementClass();
    $this->elementID();
    $this->pseudoClass();
    $this->attribute();
  }

  /**
   * Handles CSS ID selectors.
   * This will call CssEventHandler::elementID().
*/
  private function elementID() {
    if ($this->DEBUG) print "ELEMENT ID\n";
    if ($this->scanner->token == CssToken::octo) {
      $this->scanner->nextToken();
      if ($this->scanner->token !== CssToken::char) {
        throw new CssParseException("Expected string after #");
      }
      $id = $this->scanner->getNameString();
      $this->handler->elementID($id);
    }
  }

  /**
   * Handles CSS class selectors.
   * This will call the CssEventHandler::elementClass() method.
   */
  private function elementClass() {
    if ($this->DEBUG) print "ELEMENT CLASS\n";
    if ($this->scanner->token == CssToken::dot) {
      $this->scanner->nextToken();
      // We are deliberately fault tolerant here: whitespace after the dot is
      // skipped, although strictly it should probably be an error.
      $this->consumeWhitespace();
      $cssClass = $this->scanner->getNameString();
      $this->handler->elementClass($cssClass);
    }
  }

  /**
   * Handle a pseudo-class and pseudo-element.
   *
   * CSS 3 selectors support separate pseudo-elements, using :: instead
   * of : for separator. This is now supported, and calls the pseudoElement
   * handler, CssEventHandler::pseudoElement().
   *
   * This will call CssEventHandler::pseudoClass() when a
   * pseudo-class is parsed.
   *
   * @param boolean $restricted
   *  When TRUE, the 'not' pseudo-class and all pseudo-elements are rejected.
   * @throws CssParseException
   *  For an illegal construct: an argument list on a pseudo-element, a
   *  pseudo-element that is not the last item of its selector, or a
   *  construct forbidden in restricted mode.
   */
  private function pseudoClass($restricted = FALSE) {
    if ($this->DEBUG) print "PSEUDO-CLASS\n";
    if ($this->scanner->token == CssToken::colon) {

      // Check for CSS 3 pseudo element:
      $isPseudoElement = FALSE;
      if ($this->scanner->nextToken() === CssToken::colon) {
        $isPseudoElement = TRUE;
        $this->scanner->nextToken();
      }

      $name = $this->scanner->getNameString();
      if ($restricted && $name == 'not') {
        throw new CssParseException("The 'not' pseudo-class is illegal in this context.");
      }

      $value = NULL;
      if ($this->scanner->token == CssToken::lparen) {
        if ($isPseudoElement) {
          throw new CssParseException("Illegal left paren. Pseudo-Element cannot have arguments.");
        }
        $value = $this->pseudoClassValue();
      }

      if ($isPseudoElement) {
        if ($restricted) {
          throw new CssParseException("Pseudo-Elements are illegal in this context.");
        }
        $this->handler->pseudoElement($name);
        $this->consumeWhitespace();

        // Per the spec, pseudo-elements must be the last items in a selector, so we
        // check to make sure that we are either at the end of the stream or that a
        // new selector is starting. Only one pseudo-element is allowed per selector.
        if ($this->scanner->token !== FALSE && $this->scanner->token !== CssToken::comma) {
          throw new CssParseException("A Pseudo-Element must be the last item in a selector.");
        }
      }
      else {
        $this->handler->pseudoClass($name, $value);
      }
    }
  }

  /**
   * Get the value of a pseudo-class.
   *
   * The value is read as a loosely defined string between the parentheses;
   * it is not parsed any further.
   *
   * @return string
   *  Returns the value found from a pseudo-class. Nothing is returned when
   *  the current token is not a left parenthesis.
   *
   * @todo Pseudoclasses can be passed pseudo-elements and
   *   other pseudo-classes as values, which means :pseudo(::pseudo)
   *   is legal.
   */
  private function pseudoClassValue() {
    if ($this->scanner->token == CssToken::lparen) {
      // For now, just leave pseudoClass value vague.
      $buf = '';
      $buf .= $this->scanner->getQuotedString();
      return $buf;
    }
  }

  /**
   * Handle element names.
   *
   * This handles:
   * <code>
   *  name (CssEventHandler::element())
   *  |name (CssEventHandler::element())
   *  ns|name (CssEventHandler::elementNS())
   *  ns|* (CssEventHandler::anyElementInNS())
   * </code>
   */
  private function elementName() {
    if ($this->DEBUG) print "ELEMENT NAME\n";
    if ($this->scanner->token === CssToken::pipe) {
      // We have '|name', which is equiv to 'name'
      $this->scanner->nextToken();
      $this->consumeWhitespace();
      $elementName = $this->scanner->getNameString();
      $this->handler->element($elementName);
    }
    elseif ($this->scanner->token === CssToken::char) {
      $elementName = $this->scanner->getNameString();
      if ($this->scanner->token == CssToken::pipe) {
        // Get ns|name
        $elementNS = $elementName;
        $this->scanner->nextToken();
        $this->consumeWhitespace();
        if ($this->scanner->token === CssToken::star) {
          // We have ns|*
          $this->handler->anyElementInNS($elementNS);
          $this->scanner->nextToken();
        }
        elseif ($this->scanner->token !== CssToken::char) {
          $this->throwError(CssToken::char, $this->scanner->token);
        }
        else {
          $elementName = $this->scanner->getNameString();
          // We have ns|name
          $this->handler->elementNS($elementName, $elementNS);
        }

      }
      else {
        $this->handler->element($elementName);
      }
    }
  }

  /**
   * Check for all elements designators. Due to the new CSS 3 namespace
   * support, this is slightly more complicated, now, as it handles
   * the *|name and *|* cases as well as *.
   *
   * Calls CssEventHandler::anyElement(), CssEventHandler::anyElementInNS()
   * or CssEventHandler::elementNS().
   */
  private function allElements() {
    if ($this->scanner->token === CssToken::star) {
      $this->scanner->nextToken();
      if ($this->scanner->token === CssToken::pipe) {
        $this->scanner->nextToken();
        if ($this->scanner->token === CssToken::star) {
          // We got *|*. According to spec, this requires
          // that the element has a namespace, so we pass it on
          // to the handler:
          $this->scanner->nextToken();
          $this->handler->anyElementInNS('*');
        }
        else {
          // We got *|name, which means the name MUST be in a namespace,
          // so we pass this off to elementNS().
          $name = $this->scanner->getNameString();
          $this->handler->elementNS($name, '*');
        }
      }
      else {
        $this->handler->anyElement();
      }
    }
  }

  /**
   * Handle an attribute.
   * An attribute can be in one of two forms:
   * <code>[attrName]</code>
   * or
   * <code>[attrName="AttrValue"]</code>
   *
   * This may call the following event handlers: CssEventHandler::attribute()
   * and, for namespaced attributes, CssEventHandler::attributeNS().
   *
   * @throws CssParseException
   *  When the attribute selector is malformed.
   */
  private function attribute() {
    if($this->scanner->token == CssToken::lsquare) {
      $attrVal = $op = $ns = NULL;

      $this->scanner->nextToken();
      $this->consumeWhitespace();

      if ($this->scanner->token === CssToken::at) {
        if ($this->strict) {
          throw new CssParseException('The @ is illegal in attributes.');
        }
        else {
          $this->scanner->nextToken();
          $this->consumeWhitespace();
        }
      }

      if ($this->scanner->token === CssToken::star) {
        // Global namespace... requires that attr be prefixed,
        // so we pass this on to a namespace handler.
        $ns = '*';
        $this->scanner->nextToken();
      }
      if ($this->scanner->token === CssToken::pipe) {
        // Skip this. It's a global namespace.
        $this->scanner->nextToken();
        $this->consumeWhitespace();
      }

      $attrName = $this->scanner->getNameString();
      $this->consumeWhitespace();

      // Check for namespace attribute: ns|attr. We have to peek() to make
      // sure that we haven't hit the |= operator, which looks the same.
      if ($this->scanner->token === CssToken::pipe && $this->scanner->peek() !== '=') {
        // We have a namespaced attribute.
        $ns = $attrName;
        $this->scanner->nextToken();
        $attrName = $this->scanner->getNameString();
        $this->consumeWhitespace();
      }

      // Note: We require that operators do not have spaces
      // between characters, e.g. ~= , not ~ =.

      // Two-character operators, keyed by the token of their first character.
      $prefixedOperators = array(
        CssToken::tilde => CssEventHandler::containsWithSpace,
        CssToken::pipe => CssEventHandler::containsWithHyphen,
        CssToken::star => CssEventHandler::containsInString,
        CssToken::dollar => CssEventHandler::endsWith,
        CssToken::carat => CssEventHandler::beginsWith,
      );

      // Get the operator:
      $first = $this->scanner->token;
      if ($first === CssToken::eq) {
        $op = CssEventHandler::isExactly;
      }
      elseif ($first !== FALSE && isset($prefixedOperators[$first])) {
        // The prefix character must be followed immediately by '='.
        if ($this->scanner->nextToken() !== CssToken::eq) {
          $this->throwError(CssToken::eq, $this->scanner->token);
        }
        $op = $prefixedOperators[$first];
      }

      // isset() rather than a truth test: isExactly is 0.
      if (isset($op)) {
        // Consume '=' and go on.
        $this->scanner->nextToken();
        $this->consumeWhitespace();

        // So... here we have a problem. The grammar suggests that the
        // value here is String1 or String2, both of which are enclosed
        // in quotes of some sort, and both of which allow lots of special
        // characters. But the spec itself includes examples like this:
        //   [lang=fr]
        // So some bareword support is assumed. To get around this, we assume
        // that bare words follow the NAME rules, while quoted strings follow
        // the String1/String2 rules.

        if ($this->scanner->token === CssToken::quote || $this->scanner->token === CssToken::squote) {
          $attrVal = $this->scanner->getQuotedString();
        }
        else {
          $attrVal = $this->scanner->getNameString();
        }

        if ($this->DEBUG) {
          print "ATTR: $attrVal AND OP: $op\n";
        }
      }

      $this->consumeWhitespace();

      if ($this->scanner->token != CssToken::rsquare) {
        $this->throwError(CssToken::rsquare, $this->scanner->token);
      }

      if (isset($ns)) {
        $this->handler->attributeNS($attrName, $ns, $attrVal, $op);
      }
      elseif (isset($attrVal)) {
        $this->handler->attribute($attrName, $attrVal, $op);
      }
      else {
        $this->handler->attribute($attrName);
      }
      $this->scanner->nextToken();
    }
  }

  /**
   * Utility for throwing a consistently-formatted parse error.
   *
   * @param int $expected
   *  The CssToken constant that was expected.
   * @param mixed $got
   *  The token that was actually found (FALSE at end of input).
   * @throws CssParseException
   *  Always.
   */
  private function throwError($expected, $got) {
    $filter = sprintf('Expected %s, got %s', CssToken::name($expected), CssToken::name($got));
    throw new CssParseException($filter);
  }

}

/**
 * Scanner for CSS selector parsing.
 *
 * This provides a simple scanner for traversing an input stream.
 *
 * @ingroup querypath_css
 */
final class CssScanner {
  /** The CssInputStream being scanned. */
  public $is = NULL;
  /** The character (or collapsed whitespace) behind the current token. */
  public $value = NULL;
  /** The current token: a CssToken constant, or FALSE at end of input. */
  public $token = NULL;

  /** Set once the end of the input has been reported. */
  public $recurse = FALSE;
  /** Number of nextToken() calls made so far. */
  public $it = 0;

  /**
   * Given a new input stream, tokenize the CSS selector string.
   * @see CssInputStream
   * @param CssInputStream $in
   *  An input stream to be scanned.
   */
  public function __construct(CssInputStream $in) {
    $this->is = $in;
  }

  /**
   * Return the position of the reader in the string.
*/
  public function position() {
    return $this->is->position;
  }

  /**
   * See the next char without removing it from the stack.
   *
   * @return char
   *  Returns the next character on the stack.
   */
  public function peek() {
    return $this->is->peek();
  }

  /**
   * Get the next token in the input stream.
   *
   * This sets the current token to the value of the next token in
   * the stream.
   *
   * @return int
   *  Returns an int value corresponding to one of the CssToken constants,
   *  or FALSE if the end of the string is reached. (Remember to use
   *  strong equality checking on FALSE, since 0 is a valid token id.)
   * @throws CssParseException
   *  When a character outside the accepted byte ranges is found.
   * @throws Exception
   *  When called again after the end of the input was already reported.
   */
  public function nextToken() {
    $tok = -1;
    ++$this->it;
    if ($this->is->isEmpty()) {
      // The end of the input is reported exactly once. A second request
      // means a caller is looping without checking for FALSE.
      if ($this->recurse) {
        throw new Exception("Recursion error detected at iteration " . $this->it . '.');
      }
      $this->recurse = TRUE;
      $this->token = FALSE;
      return FALSE;
    }
    $ch = $this->is->consume();
    if (ctype_space($ch)) {
      $this->value = ' '; // Collapse all WS to a space.
      $this->token = $tok = CssToken::white;
      return $tok;
    }

    if (ctype_alnum($ch) || $ch == '-' || $ch == '_') {
      // It's a character
      $this->value = $ch;
      $this->token = $tok = CssToken::char;
      return $tok;
    }

    $this->value = $ch;

    switch($ch) {
      case '*':
        $tok = CssToken::star;
        break;
      case '>':
        $tok = CssToken::rangle;
        break;
      case '.':
        $tok = CssToken::dot;
        break;
      case '#':
        $tok = CssToken::octo;
        break;
      case '[':
        $tok = CssToken::lsquare;
        break;
      case ']':
        $tok = CssToken::rsquare;
        break;
      case ':':
        $tok = CssToken::colon;
        break;
      case '(':
        $tok = CssToken::lparen;
        break;
      case ')':
        $tok = CssToken::rparen;
        break;
      case '+':
        $tok = CssToken::plus;
        break;
      case '~':
        $tok = CssToken::tilde;
        break;
      case '=':
        $tok = CssToken::eq;
        break;
      case '|':
        $tok = CssToken::pipe;
        break;
      case ',':
        $tok = CssToken::comma;
        break;
      case '"':
        $tok = CssToken::quote;
        break;
      case "'":
        $tok = CssToken::squote;
        break;
      case '\\':
        $tok = CssToken::bslash;
        break;
      case '^':
        $tok = CssToken::carat;
        break;
      case '$':
        $tok = CssToken::dollar;
        break;
      case '@':
        $tok = CssToken::at;
        break;
    }


    // Catch all characters that are legal within strings.
    if ($tok == -1) {
      // TODO: This should be UTF-8 compatible, but PHP doesn't
      // have a native UTF-8 string. Should we use external
      // mbstring library?

      $ord = ord($ch);
      // Characters in this pool are legal for use inside of
      // certain strings. Extended ASCII is used here, though I
      // don't know if these are really legal.
      if (($ord >= 32 && $ord <= 126) || ($ord >= 128 && $ord <= 255)) {
        $tok = CssToken::stringLegal;
      }
      else {
        throw new CssParseException('Illegal character found in stream: ' . $ord);
      }
    }

    $this->token = $tok;
    return $tok;
  }

  /**
   * Get a name string from the input stream.
+ * A name string must be composed of + * only characters defined in CssToken:char: -_a-zA-Z0-9 + */ + public function getNameString() { + $buf = ''; + while ($this->token === CssToken::char) { + $buf .= $this->value; + $this->nextToken(); + //print '_'; + } + return $buf; + } + + /** + * This gets a string with any legal 'string' characters. + * See CSS Selectors specification, section 11, for the + * definition of string. + * + * This will check for string1, string2, and the case where a + * string is unquoted (Oddly absent from the "official" grammar, + * though such strings are present as examples in the spec.) + * + * Note: + * Though the grammar supplied by CSS 3 Selectors section 11 does not + * address the contents of a pseudo-class value, the spec itself indicates + * that a pseudo-class value is a "value between parenthesis" [6.6]. The + * examples given use URLs among other things, making them closer to the + * definition of 'string' than to 'name'. So we handle them here as strings. + */ + public function getQuotedString() { + if ($this->token == CssToken::quote || $this->token == CssToken::squote || $this->token == CssToken::lparen) { + $end = ($this->token == CssToken::lparen) ? CssToken::rparen : $this->token; + $buf = ''; + $escape = FALSE; + + $this->nextToken(); // Skip the opening quote/paren + + // The second conjunct is probably not necessary. + while ($this->token !== FALSE && $this->token > -1) { + //print "Char: $this->value \n"; + if ($this->token == CssToken::bslash && !$escape) { + // XXX: The backslash (\) is removed here. + // Turn on escaping. + //$buf .= $this->value; + $escape = TRUE; + } + elseif ($escape) { + // Turn off escaping + $buf .= $this->value; + $escape = FALSE; + } + elseif ($this->token === $end) { + // At end of string; skip token and break. + $this->nextToken(); + break; + } + else { + // Append char. 
+ $buf .= $this->value; + } + $this->nextToken(); + } + return $buf; + } + } + + /** + * Get a string from the input stream. + * This is a convenience function for getting a string of + * characters that are either alphanumber or whitespace. See + * the CssToken::white and CssToken::char definitions. + * + * @deprecated This is not used anywhere in QueryPath. + *//* + public function getStringPlusWhitespace() { + $buf = ''; + if($this->token === FALSE) {return '';} + while ($this->token === CssToken::char || $this->token == CssToken::white) { + $buf .= $this->value; + $this->nextToken(); + } + return $buf; + }*/ + +} + +/** + * Simple wrapper to turn a string into an input stream. + * This provides a standard interface on top of an array of + * characters. + */ +class CssInputStream { + protected $stream = NULL; + public $position = 0; + /** + * Build a new CSS input stream from a string. + * + * @param string + * String to turn into an input stream. + */ + function __construct($string) { + $this->stream = str_split($string); + } + /** + * Look ahead one character. + * + * @return char + * Returns the next character, but does not remove it from + * the stream. + */ + function peek() { + return $this->stream[0]; + } + /** + * Get the next unconsumed character in the stream. + * This will remove that character from the front of the + * stream and return it. + */ + function consume() { + $ret = array_shift($this->stream); + if (!empty($ret)) { + $this->position++; + } + return $ret; + } + /** + * Check if the stream is empty. + * @return boolean + * Returns TRUE when the stream is empty, FALSE otherwise. + */ + function isEmpty() { + return count($this->stream) == 0; + } +} + +/** + * Exception indicating an error in CSS parsing. 
+ * + * @ingroup querypath_css + */ +class CSSParseException extends EXCEPTION {} \ No newline at end of file diff --git a/lib/querypath/Extension/QPDB.php b/lib/querypath/Extension/QPDB.php new file mode 100644 index 0000000..1a41657 --- /dev/null +++ b/lib/querypath/Extension/QPDB.php @@ -0,0 +1,711 @@ +'; + * $qp = qp(QueryPath::HTML_STUB, 'body') // Open a stub HTML doc and select + * ->append('
') + * ->dbInit($this->dsn) + * ->queryInto('SELECT * FROM qpdb_test WHERE 1', array(), $template) + * ->doneWithQuery() + * ->writeHTML(); + * ?> + * @endcode + * + * The code above will take the results of a SQL query and insert them into an + * HTML table. + * + * If you are doing many database operations across multiple QueryPath objects, + * it is better to avoid using {@link QPDB::dbInit()}. Instead, you should + * call the static {@link QPDB::baseDB()} method to configure a single database + * connection that can be shared by all {@link QueryPath} instances. + * + * Thus, we could rewrite the above to look like this: + * @code + * '; + * $qp = qp(QueryPath::HTML_STUB, 'body') // Open a stub HTML doc and select + * ->append('
') + * ->queryInto('SELECT * FROM qpdb_test WHERE 1', array(), $template) + * ->doneWithQuery() + * ->writeHTML(); + * ?> + * @endcode + * + * Note that in this case, the QueryPath object doesn't need to call a method to + * activate the database. There is no call to {@link dbInit()}. Instead, it checks + * the base class to find the shared database. + * + * (Note that if you were to add a dbInit() call to the above, it would create + * a new database connection.) + * + * The result of both of these examples will be identical. + * The output looks something like this: + * + * @code + * + * + * + * + * Untitled + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
Title 0Body 0Footer 0
Title 1Body 1Footer 1
Title 2Body 2Footer 2
Title 3Body 3Footer 3
Title 4Body 4Footer 4
+ * + * + * @endcode + * + * Note how the CSS classes are used to correlate DB table names to template + * locations. + * + * + * @author M Butcher + * @license http://opensource.org/licenses/lgpl-2.1.php LGPL or MIT-like license. + * @see QueryPathExtension + * @see QueryPathExtensionRegistry::extend() + * @see QPDB + */ + +/** + * Provide DB access to a QueryPath object. + * + * This extension provides tools for communicating with a database using the + * QueryPath library. It relies upon PDO for underlying database communication. This + * means that it supports all databases that PDO supports, including MySQL, + * PostgreSQL, and SQLite. + * + * Here is an extended example taken from the unit tests for this library. + * + * Let's say we create a database with code like this: + * @code + *db = new PDO($this->dsn); + * $this->db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); + * $this->db->exec('CREATE TABLE IF NOT EXISTS qpdb_test (colOne, colTwo, colThree)'); + * + * $stmt = $this->db->prepare( + * 'INSERT INTO qpdb_test (colOne, colTwo, colThree) VALUES (:one, :two, :three)' + * ); + * + * for ($i = 0; $i < 5; ++$i) { + * $vals = array(':one' => 'Title ' . $i, ':two' => 'Body ' . $i, ':three' => 'Footer ' . $i); + * $stmt->execute($vals); + * $stmt->closeCursor(); + * } + * } + * ?> + * @endcode + * + * From QueryPath with QPDB, we can now do very elaborate DB chains like this: + * + * @code + * + * ->append('

') // Add

+ * ->children() // Select the

+ * ->dbInit($this->dsn) // Connect to the database + * ->query($sql, $args) // Execute the SQL query + * ->nextRow() // Select a row. By default, no row is selected. + * ->appendColumn('colOne') // Append Row 1, Col 1 (Title 0) + * ->parent() // Go back to the + * ->append('

') // Append a

to the body + * ->find('p') // Find the

we just created. + * ->nextRow() // Advance to row 2 + * ->prependColumn('colTwo') // Get row 2, col 2. (Body 1) + * ->columnAfter('colThree') // Get row 2 col 3. (Footer 1) + * ->doneWithQuery() // Let QueryPath clean up. YOU SHOULD ALWAYS DO THIS. + * ->writeHTML(); // Write the output as HTML. + * ?> + * @endcode + * + * With the code above, we step through the document, selectively building elements + * as we go, and then populating these elements with data from our initial query. + * + * When the last command, {@link QueryPath::writeHTML()}, is run, we will get output + * like this: + * + * @code + * + * + * + * + * Untitled + * + * + *

Title 0

+ *

Body 1

+ * Footer 1 + * + * @endcode + * + * Notice the body section in particular. This is where the data has been + * inserted. + * + * Sometimes you want to do something a lot simpler, like give QueryPath a + * template and have it navigate a query, inserting the data into a template, and + * then inserting the template into the document. This can be done simply with + * the {@link queryInto()} function. + * + * Here's an example from another unit test: + * + * @code + *
  • '; + * $sql = 'SELECT * FROM qpdb_test'; + * $args = array(); + * $qp = qp(QueryPath::HTML_STUB, 'body') + * ->append('