summaryrefslogtreecommitdiff
path: root/lib/querypath/src/QueryPath/CSS/DOMTraverser.php
diff options
context:
space:
mode:
Diffstat (limited to 'lib/querypath/src/QueryPath/CSS/DOMTraverser.php')
-rw-r--r--lib/querypath/src/QueryPath/CSS/DOMTraverser.php775
1 files changed, 775 insertions, 0 deletions
diff --git a/lib/querypath/src/QueryPath/CSS/DOMTraverser.php b/lib/querypath/src/QueryPath/CSS/DOMTraverser.php
new file mode 100644
index 0000000..be8c2af
--- /dev/null
+++ b/lib/querypath/src/QueryPath/CSS/DOMTraverser.php
@@ -0,0 +1,775 @@
+<?php
+/** @file
+ * Traverse a DOM.
+ */
+
+namespace QueryPath\CSS;
+
+use \QueryPath\CSS\DOMTraverser\Util;
+use \QueryPath\CSS\DOMTraverser\PseudoClass;
+use \QueryPath\CSS\DOMTraverser\PseudoElement;
+
+/**
+ * Traverse a DOM, finding matches to the selector.
+ *
+ * This traverses a DOMDocument and attempts to find
+ * matches to the provided selector.
+ *
+ * \b How this works
+ *
+ * This performs a bottom-up search. On the first pass,
+ * it attempts to find all of the matching elements for the
+ * last simple selector in a selector.
+ *
+ * Subsequent passes attempt to eliminate matches from the
+ * initial matching set.
+ *
+ * Example:
+ *
+ * Say we begin with the selector `foo.bar baz`. This is processed
+ * as follows:
+ *
+ * - First, find all baz elements.
+ * - Next, for any baz element that does not have foo as an ancestor,
+ * eliminate it from the matches.
+ * - Finally, for those that have foo as an ancestor, does that foo
+ * also have a class baz? If not, it is removed from the matches.
+ *
+ * \b Extrapolation
+ *
+ * Partial simple selectors are almost always expanded to include an
+ * element.
+ *
+ * Examples:
+ *
+ * - `:first` is expanded to `*:first`
+ * - `.bar` is expanded to `*.bar`.
+ * - `.outer .inner` is expanded to `*.outer *.inner`
+ *
+ * The exception is that IDs are sometimes not expanded, e.g.:
+ *
+ * - `#myElement` does not get expanded
+ * - `#myElement .class` \i may be expanded to `*#myElement *.class`
+ * (which will obviously not perform well).
+ */
+class DOMTraverser implements Traverser {
+
+ protected $matches = array();
+ protected $selector;
+ protected $dom;
+ protected $initialized = TRUE;
+ protected $psHandler;
+ protected $scopeNode;
+
+ /**
+ * Build a new DOMTraverser.
+ *
+ * This requires a DOM-like object or collection of DOM nodes.
+ */
+ public function __construct(\SPLObjectStorage $splos, $initialized = FALSE, $scopeNode = NULL) {
+
+ $this->psHandler = new \QueryPath\CSS\DOMTraverser\PseudoClass();
+ $this->initialized = $initialized;
+
+ // Re-use the initial splos
+ $this->matches = $splos;
+
+ if (count($splos) != 0) {
+ $splos->rewind();
+ $first = $splos->current();
+ if ($first instanceof \DOMDocument) {
+ $this->dom = $first;//->documentElement;
+ }
+ else {
+ $this->dom = $first->ownerDocument;//->documentElement;
+ }
+ if (empty($scopeNode)) {
+ $this->scopeNode = $this->dom->documentElement;
+ }
+ else {
+ $this->scopeNode = $scopeNode;
+ }
+ }
+
+ // This assumes a DOM. Need to also accomodate the case
+ // where we get a set of elements.
+ /*
+ $this->dom = $dom;
+ $this->matches = new \SplObjectStorage();
+ $this->matches->attach($this->dom);
+ */
+ }
+
+ public function debug($msg) {
+ fwrite(STDOUT, PHP_EOL . $msg);
+ }
+
+ /**
+ * Given a selector, find the matches in the given DOM.
+ *
+ * This is the main function for querying the DOM using a CSS
+ * selector.
+ *
+ * @param string $selector
+ * The selector.
+ * @return \SPLObjectStorage
+ * An SPLObjectStorage containing a list of matched
+ * DOMNode objects.
+ */
+ public function find($selector) {
+ // Setup
+ $handler = new Selector();
+ $parser = new Parser($selector, $handler);
+ $parser->parse();
+ $this->selector = $handler;
+
+ //$selector = $handler->toArray();
+ $found = $this->newMatches();
+ foreach ($handler as $selectorGroup) {
+ // fprintf(STDOUT, "Selector group.\n");
+ // Initialize matches if necessary.
+ if ($this->initialized) {
+ $candidates = $this->matches;
+ }
+ else {
+ //if (empty($selectorGroup)) {
+ // fprintf(STDOUT, "%s", print_r($handler->toArray(), TRUE));
+ //}
+ $candidates = $this->initialMatch($selectorGroup[0], $this->matches);
+ //$this->initialized = TRUE;
+ }
+
+ foreach ($candidates as $candidate) {
+ // fprintf(STDOUT, "Testing %s against %s.\n", $candidate->tagName, $selectorGroup[0]);
+ if ($this->matchesSelector($candidate, $selectorGroup)) {
+ // $this->debug('Attaching ' . $candidate->nodeName);
+ $found->attach($candidate);
+ }
+ }
+ }
+ $this->setMatches($found);
+
+
+ return $this;
+ }
+
+ public function matches() {
+ return $this->matches;
+ }
+
+ /**
+ * Check whether the given node matches the given selector.
+ *
+ * A selector is a group of one or more simple selectors combined
+ * by combinators. This determines if a given selector
+ * matches the given node.
+ *
+ * @attention
+ * Evaluation of selectors is done recursively. Thus the length
+ * of the selector is limited to the recursion depth allowed by
+ * the PHP configuration. This should only cause problems for
+ * absolutely huge selectors or for versions of PHP tuned to
+ * strictly limit recursion depth.
+ *
+ * @param object DOMNode
+ * The DOMNode to check.
+ * @param array Selector->toArray()
+ * The Selector to check.
+ * @return boolean
+ * A boolean TRUE if the node matches, false otherwise.
+ */
+ public function matchesSelector($node, $selector) {
+ return $this->matchesSimpleSelector($node, $selector, 0);
+ }
+
+ /**
+ * Performs a match check on a SimpleSelector.
+ *
+ * Where matchesSelector() does a check on an entire selector,
+ * this checks only a simple selector (plus an optional
+ * combinator).
+ *
+ * @param object DOMNode
+ * The DOMNode to check.
+ * @param object SimpleSelector
+ * The Selector to check.
+ * @return boolean
+ * A boolean TRUE if the node matches, false otherwise.
+ */
+ public function matchesSimpleSelector($node, $selectors, $index) {
+ $selector = $selectors[$index];
+ // Note that this will short circuit as soon as one of these
+ // returns FALSE.
+ $result = $this->matchElement($node, $selector->element, $selector->ns)
+ && $this->matchAttributes($node, $selector->attributes)
+ && $this->matchId($node, $selector->id)
+ && $this->matchClasses($node, $selector->classes)
+ && $this->matchPseudoClasses($node, $selector->pseudoClasses)
+ && $this->matchPseudoElements($node, $selector->pseudoElements);
+
+ $isNextRule = isset($selectors[++$index]);
+ // If there is another selector, we process that if there a match
+ // hasn't been found.
+ /*
+ if ($isNextRule && $selectors[$index]->combinator == SimpleSelector::anotherSelector) {
+ // We may need to re-initialize the match set for the next selector.
+ if (!$this->initialized) {
+ $this->initialMatch($selectors[$index]);
+ }
+ if (!$result) fprintf(STDOUT, "Element: %s, Next selector: %s\n", $node->tagName, $selectors[$index]);
+ return $result || $this->matchesSimpleSelector($node, $selectors, $index);
+ }
+ // If we have a match and we have a combinator, we need to
+ // recurse up the tree.
+ else*/if ($isNextRule && $result) {
+ $result = $this->combine($node, $selectors, $index);
+ }
+
+ return $result;
+ }
+
+ /**
+ * Combine the next selector with the given match
+ * using the next combinator.
+ *
+ * If the next selector is combined with another
+ * selector, that will be evaluated too, and so on.
+ * So if this function returns TRUE, it means that all
+ * child selectors are also matches.
+ *
+ * @param DOMNode $node
+ * The DOMNode to test.
+ * @param array $selectors
+ * The array of simple selectors.
+ * @param int $index
+ * The index of the current selector.
+ * @return boolean
+ * TRUE if the next selector(s) match.
+ */
+ public function combine($node, $selectors, $index) {
+ $selector = $selectors[$index];
+ //$this->debug(implode(' ', $selectors));
+ switch ($selector->combinator) {
+ case SimpleSelector::adjacent:
+ return $this->combineAdjacent($node, $selectors, $index);
+ case SimpleSelector::sibling:
+ return $this->combineSibling($node, $selectors, $index);
+ case SimpleSelector::directDescendant:
+ return $this->combineDirectDescendant($node, $selectors, $index);
+ case SimpleSelector::anyDescendant:
+ return $this->combineAnyDescendant($node, $selectors, $index);
+ case SimpleSelector::anotherSelector:
+ // fprintf(STDOUT, "Next selector: %s\n", $selectors[$index]);
+ return $this->matchesSimpleSelector($node, $selectors, $index);
+ ;
+ }
+ return FALSE;
+ }
+
+ /**
+ * Process an Adjacent Sibling.
+ *
+ * The spec does not indicate whether Adjacent should ignore non-Element
+ * nodes, so we choose to ignore them.
+ *
+ * @param DOMNode $node
+ * A DOM Node.
+ * @param array $selectors
+ * The selectors array.
+ * @param int $index
+ * The current index to the operative simple selector in the selectors
+ * array.
+ * @return boolean
+ * TRUE if the combination matches, FALSE otherwise.
+ */
+ public function combineAdjacent($node, $selectors, $index) {
+ while (!empty($node->previousSibling)) {
+ $node = $node->previousSibling;
+ if ($node->nodeType == XML_ELEMENT_NODE) {
+ //$this->debug(sprintf('Testing %s against "%s"', $node->tagName, $selectors[$index]));
+ return $this->matchesSimpleSelector($node, $selectors, $index);
+ }
+ }
+ return FALSE;
+ }
+
+ /**
+ * Check all siblings.
+ *
+ * According to the spec, this only tests elements LEFT of the provided
+ * node.
+ *
+ * @param DOMNode $node
+ * A DOM Node.
+ * @param array $selectors
+ * The selectors array.
+ * @param int $index
+ * The current index to the operative simple selector in the selectors
+ * array.
+ * @return boolean
+ * TRUE if the combination matches, FALSE otherwise.
+ */
+ public function combineSibling($node, $selectors, $index) {
+ while (!empty($node->previousSibling)) {
+ $node = $node->previousSibling;
+ if ($node->nodeType == XML_ELEMENT_NODE && $this->matchesSimpleSelector($node, $selectors, $index)) {
+ return TRUE;
+ }
+ }
+ return FALSE;
+ }
+
+ /**
+ * Handle a Direct Descendant combination.
+ *
+ * Check whether the given node is a rightly-related descendant
+ * of its parent node.
+ *
+ * @param DOMNode $node
+ * A DOM Node.
+ * @param array $selectors
+ * The selectors array.
+ * @param int $index
+ * The current index to the operative simple selector in the selectors
+ * array.
+ * @return boolean
+ * TRUE if the combination matches, FALSE otherwise.
+ */
+ public function combineDirectDescendant($node, $selectors, $index) {
+ $parent = $node->parentNode;
+ if (empty($parent)) {
+ return FALSE;
+ }
+ return $this->matchesSimpleSelector($parent, $selectors, $index);
+ }
+
+ /**
+ * Handle Any Descendant combinations.
+ *
+ * This checks to see if there are any matching routes from the
+ * selector beginning at the present node.
+ *
+ * @param DOMNode $node
+ * A DOM Node.
+ * @param array $selectors
+ * The selectors array.
+ * @param int $index
+ * The current index to the operative simple selector in the selectors
+ * array.
+ * @return boolean
+ * TRUE if the combination matches, FALSE otherwise.
+ */
+ public function combineAnyDescendant($node, $selectors, $index) {
+ while (!empty($node->parentNode)) {
+ $node = $node->parentNode;
+
+ // Catch case where element is child of something
+ // else. This should really only happen with a
+ // document element.
+ if ($node->nodeType != XML_ELEMENT_NODE) {
+ continue;
+ }
+
+ if ($this->matchesSimpleSelector($node, $selectors, $index)) {
+ return TRUE;
+ }
+ }
+ }
+
+ /**
+ * Get the intial match set.
+ *
+ * This should only be executed when not working with
+ * an existing match set.
+ */
+ protected function initialMatch($selector, $matches) {
+ $element = $selector->element;
+
+ // If no element is specified, we have to start with the
+ // entire document.
+ if ($element == NULL) {
+ $element = '*';
+ }
+
+ // fprintf(STDOUT, "Initial match using %s.\n", $selector);
+
+ // We try to do some optimization here to reduce the
+ // number of matches to the bare minimum. This will
+ // reduce the subsequent number of operations that
+ // must be performed in the query.
+
+ // Experimental: ID queries use XPath to match, since
+ // this should give us only a single matched element
+ // to work with.
+ if (/*$element == '*' &&*/ !empty($selector->id)) {
+ // fprintf(STDOUT, "ID Fastrack on %s\n", $selector);
+ $initialMatches = $this->initialMatchOnID($selector, $matches);
+ }
+ // If a namespace is set, find the namespace matches.
+ elseif (!empty($selector->ns)) {
+ $initialMatches = $this->initialMatchOnElementNS($selector, $matches);
+ }
+ // If the element is a wildcard, using class can
+ // substantially reduce the number of elements that
+ // we start with.
+ elseif ($element == '*' && !empty($selector->classes)) {
+ // fprintf(STDOUT, "Class Fastrack on %s\n", $selector);
+ $initialMatches = $this->initialMatchOnClasses($selector, $matches);
+ }
+ else {
+ $initialMatches = $this->initialMatchOnElement($selector, $matches);
+ }
+
+ //fprintf(STDOUT, "Found %d nodes.\n", count($this->matches));
+ return $initialMatches;
+ }
+
+ /**
+ * Shortcut for finding initial match by ID.
+ *
+ * If the element is set to '*' and an ID is
+ * set, then this should be used to find by ID,
+ * which will drastically reduce the amount of
+ * comparison operations done in PHP.
+ *
+ */
+ protected function initialMatchOnID($selector, $matches) {
+ $id = $selector->id;
+ $found = $this->newMatches();
+
+ // Issue #145: DOMXPath will through an exception if the DOM is
+ // not set.
+ if (!($this->dom instanceof \DOMDocument)) {
+ return $found;
+ }
+ $baseQuery = ".//*[@id='{$id}']";
+ $xpath = new \DOMXPath($this->dom);
+
+ // Now we try to find any matching IDs.
+ foreach ($matches as $node) {
+ if ($node->getAttribute('id') == $id) {
+ $found->attach($node);
+ }
+ $nl = $this->initialXpathQuery($xpath, $node, $baseQuery);
+ $this->attachNodeList($nl, $found);
+ }
+ // Unset the ID selector.
+ $selector->id = NULL;
+ return $found;
+ }
+
+ /**
+ * Shortcut for setting the intial match.
+ *
+ * This shortcut should only be used when the initial
+ * element is '*' and there are classes set.
+ *
+ * In any other case, the element finding algo is
+ * faster and should be used instead.
+ */
+ protected function initialMatchOnClasses($selector, $matches) {
+ $found = $this->newMatches();
+
+ // Issue #145: DOMXPath will through an exception if the DOM is
+ // not set.
+ if (!($this->dom instanceof \DOMDocument)) {
+ return $found;
+ }
+ $baseQuery = ".//*[@class]";
+ $xpath = new \DOMXPath($this->dom);
+
+ // Now we try to find any matching IDs.
+ foreach ($matches as $node) {
+ // Refactor me!
+ if ($node->hasAttribute('class')) {
+ $intersect = array_intersect($selector->classes, explode(' ', $node->getAttribute('class')));
+ if (count($intersect) == count($selector->classes)) {
+ $found->attach($node);
+ }
+ }
+
+ $nl = $this->initialXpathQuery($xpath, $node, $baseQuery);
+ foreach ($nl as $node) {
+ $classes = $node->getAttribute('class');
+ $classArray = explode(' ', $classes);
+
+ $intersect = array_intersect($selector->classes, $classArray);
+ if (count($intersect) == count($selector->classes)) {
+ $found->attach($node);
+ }
+ }
+ }
+
+ // Unset the classes selector.
+ $selector->classes = array();
+
+ return $found;
+ }
+
+ /**
+ * Internal xpath query.
+ *
+ * This is optimized for very specific use, and is not a general
+ * purpose function.
+ */
+ private function initialXpathQuery($xpath, $node, $query) {
+ // This works around a bug in which the document element
+ // does not correctly search with the $baseQuery.
+ if ($node->isSameNode($this->dom->documentElement)) {
+ $query = substr($query, 1);
+ }
+
+ return $xpath->query($query, $node);
+ }
+
+ /**
+ * Shortcut for setting the initial match.
+ */
+ protected function initialMatchOnElement($selector, $matches) {
+ $element = $selector->element;
+ if (is_null($element)) {
+ $element = '*';
+ }
+ $found = $this->newMatches();
+ foreach ($matches as $node) {
+ // Capture the case where the initial element is the root element.
+ if ($node->tagName == $element
+ || $element == '*' && $node->parentNode instanceof \DOMDocument) {
+ $found->attach($node);
+ }
+ $nl = $node->getElementsByTagName($element);
+ $this->attachNodeList($nl, $found);
+ }
+
+ $selector->element = NULL;
+ return $found;
+ }
+
+ /**
+ * Get elements and filter by namespace.
+ */
+ protected function initialMatchOnElementNS($selector, $matches) {
+ $ns = $selector->ns;
+
+ $elements = $this->initialMatchOnElement($selector, $matches);
+
+ // "any namespace" matches anything.
+ if ($ns == '*') {
+ return $elements;
+ }
+
+ // Loop through and make a list of items that need to be filtered
+ // out, then filter them. This is required b/c ObjectStorage iterates
+ // wrongly when an item is detached in an access loop.
+ $detach = array();
+ foreach ($elements as $node) {
+ // This lookup must be done PER NODE.
+ $nsuri = $node->lookupNamespaceURI($ns);
+ if (empty($nsuri) || $node->namespaceURI != $nsuri) {
+ $detach[] = $node;
+ }
+ }
+ foreach ($detach as $rem) {
+ $elements->detach($rem);
+ }
+ $selector->ns = NULL;
+ return $elements;
+ }
+
+ /**
+ * Checks to see if the DOMNode matches the given element selector.
+ *
+ * This handles the following cases:
+ *
+ * - element (foo)
+ * - namespaced element (ns|foo)
+ * - namespaced wildcard (ns|*)
+ * - wildcard (* or *|*)
+ */
+ protected function matchElement($node, $element, $ns = NULL) {
+ if (empty($element)) {
+ return TRUE;
+ }
+
+ // Handle namespace.
+ if (!empty($ns) && $ns != '*') {
+ // Check whether we have a matching NS URI.
+ $nsuri = $node->lookupNamespaceURI($ns);
+ if(empty($nsuri) || $node->namespaceURI !== $nsuri) {
+ return FALSE;
+ }
+ }
+
+ // Compare local name to given element name.
+ return $element == '*' || $node->localName == $element;
+ }
+
+ /**
+ * Checks to see if the given DOMNode matches an "any element" (*).
+ *
+ * This does not handle namespaced whildcards.
+ */
+ /*
+ protected function matchAnyElement($node) {
+ $ancestors = $this->ancestors($node);
+
+ return count($ancestors) > 0;
+ }
+ */
+
+ /**
+ * Get a list of ancestors to the present node.
+ */
+ protected function ancestors($node) {
+ $buffer = array();
+ $parent = $node;
+ while (($parent = $parent->parentNode) !== NULL) {
+ $buffer[] = $parent;
+ }
+ return $buffer;
+ }
+
+ /**
+ * Check to see if DOMNode has all of the given attributes.
+ *
+ * This can handle namespaced attributes, including namespace
+ * wildcards.
+ */
+ protected function matchAttributes($node, $attributes) {
+ if (empty($attributes)) {
+ return TRUE;
+ }
+
+ foreach($attributes as $attr) {
+ $val = isset($attr['value']) ? $attr['value'] : NULL;
+
+ // Namespaced attributes.
+ if (isset($attr['ns']) && $attr['ns'] != '*') {
+ $nsuri = $node->lookupNamespaceURI($attr['ns']);
+ if (empty($nsuri) || !$node->hasAttributeNS($nsuri, $attr['name'])) {
+ return FALSE;
+ }
+ $matches = Util::matchesAttributeNS($node, $attr['name'], $nsuri, $val, $attr['op']);
+ }
+ elseif (isset($attr['ns']) && $attr['ns'] == '*' && $node->hasAttributes()) {
+ // Cycle through all of the attributes in the node. Note that
+ // these are DOMAttr objects.
+ $matches = FALSE;
+ $name = $attr['name'];
+ foreach ($node->attributes as $attrNode) {
+ if ($attrNode->localName == $name) {
+ $nsuri = $attrNode->namespaceURI;
+ $matches = Util::matchesAttributeNS($node, $name, $nsuri, $val, $attr['op']);
+ }
+ }
+ }
+ // No namespace.
+ else {
+ $matches = Util::matchesAttribute($node, $attr['name'], $val, $attr['op']);
+ }
+
+ if (!$matches) {
+ return FALSE;
+ }
+ }
+ return TRUE;
+ }
+ /**
+ * Check that the given DOMNode has the given ID.
+ */
+ protected function matchId($node, $id) {
+ if (empty($id)) {
+ return TRUE;
+ }
+ return $node->hasAttribute('id') && $node->getAttribute('id') == $id;
+ }
+ /**
+ * Check that the given DOMNode has all of the given classes.
+ */
+ protected function matchClasses($node, $classes) {
+ if (empty($classes)) {
+ return TRUE;
+ }
+
+ if (!$node->hasAttribute('class')) {
+ return FALSE;
+ }
+
+ $eleClasses = preg_split('/\s+/', $node->getAttribute('class'));
+ if (empty($eleClasses)) {
+ return FALSE;
+ }
+
+ // The intersection should match the given $classes.
+ $missing = array_diff($classes, array_intersect($classes, $eleClasses));
+
+ return count($missing) == 0;
+ }
+ protected function matchPseudoClasses($node, $pseudoClasses) {
+ $ret = TRUE;
+ foreach ($pseudoClasses as $pseudoClass) {
+ $name = $pseudoClass['name'];
+ // Avoid E_STRICT violation.
+ $value = isset($pseudoClass['value']) ? $pseudoClass['value'] : NULL;
+ $ret &= $this->psHandler->elementMatches($name, $node, $this->scopeNode, $value);
+ }
+ return $ret;
+ }
+ /**
+ * Test whether the given node matches the pseudoElements.
+ *
+ * If any pseudo-elements are passed, this will test to see
+ * <i>if conditions obtain that would allow the pseudo-element
+ * to be created</i>. This does not modify the match in any way.
+ */
+ protected function matchPseudoElements($node, $pseudoElements) {
+ if (empty($pseudoElements)) {
+ return TRUE;
+ }
+
+ foreach ($pseudoElements as $pse) {
+ switch ($pse) {
+ case 'first-line':
+ case 'first-letter':
+ case 'before':
+ case 'after':
+ return strlen($node->textContent) > 0;
+ case 'selection':
+ throw new \QueryPath\CSS\NotImplementedException("::$name is not implemented.");
+ }
+ }
+ }
+
+ protected function newMatches() {
+ return new \SplObjectStorage();
+ }
+
+ /**
+ * Get the internal match set.
+ * Internal utility function.
+ */
+ protected function getMatches() {
+ return $this->matches();
+ }
+
+ /**
+ * Set the internal match set.
+ *
+ * Internal utility function.
+ */
+ protected function setMatches($matches) {
+ $this->matches = $matches;
+ }
+
+ /**
+ * Attach all nodes in a node list to the given \SplObjectStorage.
+ */
+ public function attachNodeList(\DOMNodeList $nodeList, \SplObjectStorage $splos) {
+ foreach ($nodeList as $item) $splos->attach($item);
+ }
+
+ public function getDocument() {
+ return $this->dom;
+ }
+
+}