diff options
author | emkael <emkael@tlen.pl> | 2018-11-07 22:08:03 +0100 |
---|---|---|
committer | emkael <emkael@tlen.pl> | 2018-11-07 23:03:32 +0100 |
commit | 5f655f3b9fff2f41754b9612ce178b495910f4d4 (patch) | |
tree | d0e5371d8b7e8a11b03953325bf98005c3fe824b /tdd-simple-html-dom.php | |
parent | f5998c8bad50636ce95567eec4f9f3b837c017cc (diff) |
Moving app to separate directory, scanning for PBN files in a separate directory
Fixes #2
Diffstat (limited to 'tdd-simple-html-dom.php')
-rw-r--r-- | tdd-simple-html-dom.php | 975 |
1 files changed, 0 insertions, 975 deletions
diff --git a/tdd-simple-html-dom.php b/tdd-simple-html-dom.php deleted file mode 100644 index b3b9ddd..0000000 --- a/tdd-simple-html-dom.php +++ /dev/null @@ -1,975 +0,0 @@ -<?php -/******************************************************************************* -Version: 1.11 ($Rev: 175 $) -Website: http://sourceforge.net/projects/simplehtmldom/ -Author: S.C. Chen <me578022@gmail.com> -Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) -Contributions by: - Yousuke Kumakura (Attribute filters) - Vadim Voituk (Negative indexes supports of "find" method) - Antcs (Constructor with automatically load contents either text or file/url) -Licensed under The MIT License -Redistributions of files must retain the above copyright notice. -*******************************************************************************/ - -define('HDOM_TYPE_ELEMENT', 1); -define('HDOM_TYPE_COMMENT', 2); -define('HDOM_TYPE_TEXT', 3); -define('HDOM_TYPE_ENDTAG', 4); -define('HDOM_TYPE_ROOT', 5); -define('HDOM_TYPE_UNKNOWN', 6); -define('HDOM_QUOTE_DOUBLE', 0); -define('HDOM_QUOTE_SINGLE', 1); -define('HDOM_QUOTE_NO', 3); -define('HDOM_INFO_BEGIN', 0); -define('HDOM_INFO_END', 1); -define('HDOM_INFO_QUOTE', 2); -define('HDOM_INFO_SPACE', 3); -define('HDOM_INFO_TEXT', 4); -define('HDOM_INFO_INNER', 5); -define('HDOM_INFO_OUTER', 6); -define('HDOM_INFO_ENDSPACE',7); - -// helper functions -// ----------------------------------------------------------------------------- -// get html dom form file -function file_get_html() { - $dom = new simple_html_dom; - $args = func_get_args(); - $dom->load(call_user_func_array('file_get_contents', $args), true); - return $dom; -} - -// get html dom form string -function str_get_html($str, $lowercase=true) { - $dom = new simple_html_dom; - $dom->load($str, $lowercase); - return $dom; -} - -// dump html dom tree -function dump_html_tree($node, $show_attr=true, $deep=0) { - $lead = str_repeat(' ', $deep); - echo $lead.$node->tag; - if ($show_attr && count($node->attr)>0) { - echo '('; - foreach($node->attr as $k=>$v) - echo "[$k]=>\"".$node->$k.'", '; - echo ')'; - } - echo "\n"; - - foreach($node->nodes as $c) - dump_html_tree($c, $show_attr, $deep+1); -} - -// get dom form file (deprecated) -function file_get_dom() { - $dom = new simple_html_dom; - $args = func_get_args(); - $dom->load(call_user_func_array('file_get_contents', $args), true); - return $dom; -} - -// get dom form string (deprecated) -function str_get_dom($str, $lowercase=true) { - $dom = new simple_html_dom; - $dom->load($str, $lowercase); - return $dom; -} - -// simple html dom node -// ----------------------------------------------------------------------------- -class simple_html_dom_node { - public $nodetype = HDOM_TYPE_TEXT; - public $tag = 'text'; - public $attr = array(); - public $children = array(); - public $nodes = array(); - public $parent = null; - public $_ = array(); - private $dom = null; - - function __construct($dom) { - $this->dom = $dom; - $dom->nodes[] = $this; - } - - function __destruct() { - $this->clear(); - } - - function __toString() { - return $this->outertext(); - } - - // clean up memory due to php5 circular references memory leak... - function clear() { - $this->dom = null; - $this->nodes = null; - $this->parent = null; - $this->children = null; - } - - // dump node's tree - function dump($show_attr=true) { - dump_html_tree($this, $show_attr); - } - - // returns the parent of node - function parent() { - return $this->parent; - } - - // returns children of node - function children($idx=-1) { - if ($idx===-1) return $this->children; - if (isset($this->children[$idx])) return $this->children[$idx]; - return null; - } - - // returns the first child of node - function first_child() { - if (count($this->children)>0) return $this->children[0]; - return null; - } - - // returns the last child of node - function last_child() { - if (($count=count($this->children))>0) return $this->children[$count-1]; - return null; - } - - // returns the next sibling of node - function next_sibling() { - if ($this->parent===null) return null; - $idx = 0; - $count = count($this->parent->children); - while ($idx<$count && $this!==$this->parent->children[$idx]) - ++$idx; - if (++$idx>=$count) return null; - return $this->parent->children[$idx]; - } - - // returns the previous sibling of node - function prev_sibling() { - if ($this->parent===null) return null; - $idx = 0; - $count = count($this->parent->children); - while ($idx<$count && $this!==$this->parent->children[$idx]) - ++$idx; - if (--$idx<0) return null; - return $this->parent->children[$idx]; - } - - // get dom node's inner html - function innertext() { - if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; - if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); - - $ret = ''; - foreach($this->nodes as $n) - $ret .= $n->outertext(); - return $ret; - } - - // get dom node's outer text (with tag) - function outertext() { - if ($this->tag==='root') return $this->innertext(); - - // trigger callback - if ($this->dom->callback!==null) - call_user_func_array($this->dom->callback, array($this)); - - if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER]; - if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); - - // render begin tag - $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); - - // render inner text - if (isset($this->_[HDOM_INFO_INNER])) - $ret .= $this->_[HDOM_INFO_INNER]; - else { - foreach($this->nodes as $n) - $ret .= $n->outertext(); - } - - // render end tag - if(isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0) - $ret .= '</'.$this->tag.'>'; - return $ret; - } - - // get dom node's plain text - function text() { - if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; - switch ($this->nodetype) { - case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); - case HDOM_TYPE_COMMENT: return ''; - case HDOM_TYPE_UNKNOWN: return ''; - } - if (strcasecmp($this->tag, 'script')===0) return ''; - if (strcasecmp($this->tag, 'style')===0) return ''; - - $ret = ''; - foreach($this->nodes as $n) - $ret .= $n->text(); - return $ret; - } - - function xmltext() { - $ret = $this->innertext(); - $ret = str_ireplace('<![CDATA[', '', $ret); - $ret = str_replace(']]>', '', $ret); - return $ret; - } - - // build node's text with tag - function makeup() { - // text, comment, unknown - if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); - - $ret = '<'.$this->tag; - $i = -1; - - foreach($this->attr as $key=>$val) { - ++$i; - - // skip removed attribute - if ($val===null || $val===false) - continue; - - $ret .= $this->_[HDOM_INFO_SPACE][$i][0]; - //no value attr: nowrap, checked selected... - if ($val===true) - $ret .= $key; - else { - switch($this->_[HDOM_INFO_QUOTE][$i]) { - case HDOM_QUOTE_DOUBLE: $quote = '"'; break; - case HDOM_QUOTE_SINGLE: $quote = '\''; break; - default: $quote = ''; - } - $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote; - } - } - $ret = $this->dom->restore_noise($ret); - return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; - } - - // find elements by css selector - function find($selector, $idx=null) { - $selectors = $this->parse_selector($selector); - if (($count=count($selectors))===0) return array(); - $found_keys = array(); - - // find each selector - for ($c=0; $c<$count; ++$c) { - if (($levle=count($selectors[0]))===0) return array(); - if (!isset($this->_[HDOM_INFO_BEGIN])) return array(); - - $head = array($this->_[HDOM_INFO_BEGIN]=>1); - - // handle descendant selectors, no recursive! - for ($l=0; $l<$levle; ++$l) { - $ret = array(); - foreach($head as $k=>$v) { - $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k]; - $n->seek($selectors[$c][$l], $ret); - } - $head = $ret; - } - - foreach($head as $k=>$v) { - if (!isset($found_keys[$k])) - $found_keys[$k] = 1; - } - } - - // sort keys - ksort($found_keys); - - $found = array(); - foreach($found_keys as $k=>$v) - $found[] = $this->dom->nodes[$k]; - - // return nth-element or array - if (is_null($idx)) return $found; - else if ($idx<0) $idx = count($found) + $idx; - return (isset($found[$idx])) ? $found[$idx] : null; - } - - // seek for given conditions - protected function seek($selector, &$ret) { - list($tag, $key, $val, $exp, $no_key) = $selector; - - // xpath index - if ($tag && $key && is_numeric($key)) { - $count = 0; - foreach ($this->children as $c) { - if ($tag==='*' || $tag===$c->tag) { - if (++$count==$key) { - $ret[$c->_[HDOM_INFO_BEGIN]] = 1; - return; - } - } - } - return; - } - - $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0; - if ($end==0) { - $parent = $this->parent; - while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) { - $end -= 1; - $parent = $parent->parent; - } - $end += $parent->_[HDOM_INFO_END]; - } - - for($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) { - $node = $this->dom->nodes[$i]; - $pass = true; - - if ($tag==='*' && !$key) { - if (in_array($node, $this->children, true)) - $ret[$i] = 1; - continue; - } - - // compare tag - if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;} - // compare key - if ($pass && $key) { - if ($no_key) { - if (isset($node->attr[$key])) $pass=false; - } - else if (!isset($node->attr[$key])) $pass=false; - } - // compare value - if ($pass && $key && $val && $val!=='*') { - $check = $this->match($exp, $val, $node->attr[$key]); - // handle multiple class - if (!$check && strcasecmp($key, 'class')===0) { - foreach(explode(' ',$node->attr[$key]) as $k) { - $check = $this->match($exp, $val, $k); - if ($check) break; - } - } - if (!$check) $pass = false; - } - if ($pass) $ret[$i] = 1; - unset($node); - } - } - - protected function match($exp, $pattern, $value) { - switch ($exp) { - case '=': - return ($value===$pattern); - case '!=': - return ($value!==$pattern); - case '^=': - return preg_match("/^".preg_quote($pattern,'/')."/", $value); - case '$=': - return preg_match("/".preg_quote($pattern,'/')."$/", $value); - case '*=': - if ($pattern[0]=='/') - return preg_match($pattern, $value); - return preg_match("/".$pattern."/i", $value); - } - return false; - } - - protected function parse_selector($selector_string) { - // pattern of CSS selectors, modified from mootools - $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; - preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER); - $selectors = array(); - $result = array(); - //print_r($matches); - - foreach ($matches as $m) { - $m[0] = trim($m[0]); - if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue; - // for borwser grnreated xpath - if ($m[1]==='tbody') continue; - - list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false); - if(!empty($m[2])) {$key='id'; $val=$m[2];} - if(!empty($m[3])) {$key='class'; $val=$m[3];} - if(!empty($m[4])) {$key=$m[4];} - if(!empty($m[5])) {$exp=$m[5];} - if(!empty($m[6])) {$val=$m[6];} - - // convert to lowercase - if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);} - //elements that do NOT have the specified attribute - if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;} - - $result[] = array($tag, $key, $val, $exp, $no_key); - if (trim($m[7])===',') { - $selectors[] = $result; - $result = array(); - } - } - if (count($result)>0) - $selectors[] = $result; - return $selectors; - } - - function __get($name) { - if (isset($this->attr[$name])) return $this->attr[$name]; - switch($name) { - case 'outertext': return $this->outertext(); - case 'innertext': return $this->innertext(); - case 'plaintext': return $this->text(); - case 'xmltext': return $this->xmltext(); - default: return array_key_exists($name, $this->attr); - } - } - - function __set($name, $value) { - switch($name) { - case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; - case 'innertext': - if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value; - return $this->_[HDOM_INFO_INNER] = $value; - } - if (!isset($this->attr[$name])) { - $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); - $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; - } - $this->attr[$name] = $value; - } - - function __isset($name) { - switch($name) { - case 'outertext': return true; - case 'innertext': return true; - case 'plaintext': return true; - } - //no value attr: nowrap, checked selected... - return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]); - } - - function __unset($name) { - if (isset($this->attr[$name])) - unset($this->attr[$name]); - } - - // camel naming conventions - function getAllAttributes() {return $this->attr;} - function getAttribute($name) {return $this->__get($name);} - function setAttribute($name, $value) {$this->__set($name, $value);} - function hasAttribute($name) {return $this->__isset($name);} - function removeAttribute($name) {$this->__set($name, null);} - function getElementById($id) {return $this->find("#$id", 0);} - function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);} - function getElementByTagName($name) {return $this->find($name, 0);} - function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);} - function parentNode() {return $this->parent();} - function childNodes($idx=-1) {return $this->children($idx);} - function firstChild() {return $this->first_child();} - function lastChild() {return $this->last_child();} - function nextSibling() {return $this->next_sibling();} - function previousSibling() {return $this->prev_sibling();} -} - -// simple html dom parser -// ----------------------------------------------------------------------------- -class simple_html_dom { - public $root = null; - public $nodes = array(); - public $callback = null; - public $lowercase = false; - protected $pos; - protected $doc; - protected $char; - protected $size; - protected $cursor; - protected $parent; - protected $noise = array(); - protected $token_blank = " \t\r\n"; - protected $token_equal = ' =/>'; - protected $token_slash = " />\r\n\t"; - protected $token_attr = ' >'; - // use isset instead of in_array, performance boost about 30%... - protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); - protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); - protected $optional_closing_tags = array( - 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1), - 'th'=>array('th'=>1), - 'td'=>array('td'=>1), - 'li'=>array('li'=>1), - 'dt'=>array('dt'=>1, 'dd'=>1), - 'dd'=>array('dd'=>1, 'dt'=>1), - 'dl'=>array('dd'=>1, 'dt'=>1), - 'p'=>array('p'=>1), - 'nobr'=>array('nobr'=>1), - ); - - function __construct($str=null) { - if ($str) { - if (preg_match("/^http:\/\//i",$str) || is_file($str)) - $this->load_file($str); - else - $this->load($str); - } - } - - function __destruct() { - $this->clear(); - } - - // load html from string - function load($str, $lowercase=true) { - // prepare - $this->prepare($str, $lowercase); - // strip out comments - $this->remove_noise("'<!--(.*?)-->'is"); - // strip out cdata - $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); - // strip out <style> tags - $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"); - $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"); - // strip out <script> tags - $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is"); - $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"); - // strip out preformatted tags - $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"); - // strip out server side scripts - $this->remove_noise("'(<\?)(.*?)(\?>)'s", true); - // strip smarty scripts - $this->remove_noise("'(\{\w)(.*?)(\})'s", true); - - // parsing - while ($this->parse()); - // end - $this->root->_[HDOM_INFO_END] = $this->cursor; - } - - // load html from file - function load_file() { - $args = func_get_args(); - $this->load(call_user_func_array('file_get_contents', $args), true); - } - - // set callback function - function set_callback($function_name) { - $this->callback = $function_name; - } - - // remove callback function - function remove_callback() { - $this->callback = null; - } - - // save dom as string - function save($filepath='') { - $ret = $this->root->innertext(); - if ($filepath!=='') file_put_contents($filepath, $ret); - return $ret; - } - - // find dom node by css selector - function find($selector, $idx=null) { - return $this->root->find($selector, $idx); - } - - // clean up memory due to php5 circular references memory leak... - function clear() { - foreach($this->nodes as $n) {$n->clear(); $n = null;} - if (isset($this->parent)) {$this->parent->clear(); unset($this->parent);} - if (isset($this->root)) {$this->root->clear(); unset($this->root);} - unset($this->doc); - unset($this->noise); - } - - function dump($show_attr=true) { - $this->root->dump($show_attr); - } - - // prepare HTML data and init everything - protected function prepare($str, $lowercase=true) { - $this->clear(); - $this->doc = $str; - $this->pos = 0; - $this->cursor = 1; - $this->noise = array(); - $this->nodes = array(); - $this->lowercase = $lowercase; - $this->root = new simple_html_dom_node($this); - $this->root->tag = 'root'; - $this->root->_[HDOM_INFO_BEGIN] = -1; - $this->root->nodetype = HDOM_TYPE_ROOT; - $this->parent = $this->root; - // set the length of content - $this->size = strlen($str); - if ($this->size>0) $this->char = $this->doc[0]; - } - - // parse html content - protected function parse() { - if (($s = $this->copy_until_char('<'))==='') - return $this->read_tag(); - - // text - $node = new simple_html_dom_node($this); - ++$this->cursor; - $node->_[HDOM_INFO_TEXT] = $s; - $this->link_nodes($node, false); - return true; - } - - // read tag info - protected function read_tag() { - if ($this->char!=='<') { - $this->root->_[HDOM_INFO_END] = $this->cursor; - return false; - } - $begin_tag_pos = $this->pos; - $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - - // end tag - if ($this->char==='/') { - $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - $this->skip($this->token_blank_t); - $tag = $this->copy_until_char('>'); - - // skip attributes in end tag - if (($pos = strpos($tag, ' '))!==false) - $tag = substr($tag, 0, $pos); - - $parent_lower = strtolower($this->parent->tag); - $tag_lower = strtolower($tag); - - if ($parent_lower!==$tag_lower) { - if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) { - $this->parent->_[HDOM_INFO_END] = 0; - $org_parent = $this->parent; - - while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) - $this->parent = $this->parent->parent; - - if (strtolower($this->parent->tag)!==$tag_lower) { - $this->parent = $org_parent; // restore origonal parent - if ($this->parent->parent) $this->parent = $this->parent->parent; - $this->parent->_[HDOM_INFO_END] = $this->cursor; - return $this->as_text_node($tag); - } - } - else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) { - $this->parent->_[HDOM_INFO_END] = 0; - $org_parent = $this->parent; - - while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) - $this->parent = $this->parent->parent; - - if (strtolower($this->parent->tag)!==$tag_lower) { - $this->parent = $org_parent; // restore origonal parent - $this->parent->_[HDOM_INFO_END] = $this->cursor; - return $this->as_text_node($tag); - } - } - else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) { - $this->parent->_[HDOM_INFO_END] = 0; - $this->parent = $this->parent->parent; - } - else - return $this->as_text_node($tag); - } - - $this->parent->_[HDOM_INFO_END] = $this->cursor; - if ($this->parent->parent) $this->parent = $this->parent->parent; - - $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - return true; - } - - $node = new simple_html_dom_node($this); - $node->_[HDOM_INFO_BEGIN] = $this->cursor; - ++$this->cursor; - $tag = $this->copy_until($this->token_slash); - - // doctype, cdata & comments... - if (isset($tag[0]) && $tag[0]==='!') { - $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); - - if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') { - $node->nodetype = HDOM_TYPE_COMMENT; - $node->tag = 'comment'; - } else { - $node->nodetype = HDOM_TYPE_UNKNOWN; - $node->tag = 'unknown'; - } - - if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>'; - $this->link_nodes($node, true); - $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - return true; - } - - // text - if ($pos=strpos($tag, '<')!==false) { - $tag = '<' . substr($tag, 0, -1); - $node->_[HDOM_INFO_TEXT] = $tag; - $this->link_nodes($node, false); - $this->char = $this->doc[--$this->pos]; // prev - return true; - } - - if (!preg_match("/^[\w-:]+$/", $tag)) { - $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>'); - if ($this->char==='<') { - $this->link_nodes($node, false); - return true; - } - - if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>'; - $this->link_nodes($node, false); - $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - return true; - } - - // begin tag - $node->nodetype = HDOM_TYPE_ELEMENT; - $tag_lower = strtolower($tag); - $node->tag = ($this->lowercase) ? $tag_lower : $tag; - - // handle optional closing tags - if (isset($this->optional_closing_tags[$tag_lower]) ) { - while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) { - $this->parent->_[HDOM_INFO_END] = 0; - $this->parent = $this->parent->parent; - } - $node->parent = $this->parent; - } - - $guard = 0; // prevent infinity loop - $space = array($this->copy_skip($this->token_blank), '', ''); - - // attributes - do { - if ($this->char!==null && $space[0]==='') break; - $name = $this->copy_until($this->token_equal); - if($guard===$this->pos) { - $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - continue; - } - $guard = $this->pos; - - // handle endless '<' - if($this->pos>=$this->size-1 && $this->char!=='>') { - $node->nodetype = HDOM_TYPE_TEXT; - $node->_[HDOM_INFO_END] = 0; - $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name; - $node->tag = 'text'; - $this->link_nodes($node, false); - return true; - } - - // handle mismatch '<' - if($this->doc[$this->pos-1]=='<') { - $node->nodetype = HDOM_TYPE_TEXT; - $node->tag = 'text'; - $node->attr = array(); - $node->_[HDOM_INFO_END] = 0; - $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1); - $this->pos -= 2; - $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - $this->link_nodes($node, false); - return true; - } - - if ($name!=='/' && $name!=='') { - $space[1] = $this->copy_skip($this->token_blank); - $name = $this->restore_noise($name); - if ($this->lowercase) $name = strtolower($name); - if ($this->char==='=') { - $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - $this->parse_attr($node, $name, $space); - } - else { - //no value attr: nowrap, checked selected... - $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; - $node->attr[$name] = true; - if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev - } - $node->_[HDOM_INFO_SPACE][] = $space; - $space = array($this->copy_skip($this->token_blank), '', ''); - } - else - break; - } while($this->char!=='>' && $this->char!=='/'); - - $this->link_nodes($node, true); - $node->_[HDOM_INFO_ENDSPACE] = $space[0]; - - // check self closing - if ($this->copy_until_char_escape('>')==='/') { - $node->_[HDOM_INFO_ENDSPACE] .= '/'; - $node->_[HDOM_INFO_END] = 0; - } - else { - // reset parent - if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node; - } - $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - return true; - } - - // parse attributes - protected function parse_attr($node, $name, &$space) { - $space[2] = $this->copy_skip($this->token_blank); - switch($this->char) { - case '"': - $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; - $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('"')); - $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - break; - case '\'': - $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE; - $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('\'')); - $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - break; - default: - $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; - $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr)); - } - } - - // link node's parent - protected function link_nodes(&$node, $is_child) { - $node->parent = $this->parent; - $this->parent->nodes[] = $node; - if ($is_child) - $this->parent->children[] = $node; - } - - // as a text node - protected function as_text_node($tag) { - $node = new simple_html_dom_node($this); - ++$this->cursor; - $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>'; - $this->link_nodes($node, false); - $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - return true; - } - - protected function skip($chars) { - $this->pos += strspn($this->doc, $chars, $this->pos); - $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - } - - protected function copy_skip($chars) { - $pos = $this->pos; - $len = strspn($this->doc, $chars, $pos); - $this->pos += $len; - $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - if ($len===0) return ''; - return substr($this->doc, $pos, $len); - } - - protected function copy_until($chars) { - $pos = $this->pos; - $len = strcspn($this->doc, $chars, $pos); - $this->pos += $len; - $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next - return substr($this->doc, $pos, $len); - } - - protected function copy_until_char($char) { - if ($this->char===null) return ''; - - if (($pos = strpos($this->doc, $char, $this->pos))===false) { - $ret = substr($this->doc, $this->pos, $this->size-$this->pos); - $this->char = null; - $this->pos = $this->size; - return $ret; - } - - if ($pos===$this->pos) return ''; - $pos_old = $this->pos; - $this->char = $this->doc[$pos]; - $this->pos = $pos; - return substr($this->doc, $pos_old, $pos-$pos_old); - } - - protected function copy_until_char_escape($char) { - if ($this->char===null) return ''; - - $start = $this->pos; - while(1) { - if (($pos = strpos($this->doc, $char, $start))===false) { - $ret = substr($this->doc, $this->pos, $this->size-$this->pos); - $this->char = null; - $this->pos = $this->size; - return $ret; - } - - if ($pos===$this->pos) return ''; - - if ($this->doc[$pos-1]==='\\') { - $start = $pos+1; - continue; - } - - $pos_old = $this->pos; - $this->char = $this->doc[$pos]; - $this->pos = $pos; - return substr($this->doc, $pos_old, $pos-$pos_old); - } - } - - // remove noise from html content - protected function remove_noise($pattern, $remove_tag=false) { - $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE); - - for ($i=$count-1; $i>-1; --$i) { - $key = '___noise___'.sprintf('% 3d', count($this->noise)+100); - $idx = ($remove_tag) ? 0 : 1; - $this->noise[$key] = $matches[$i][$idx][0]; - $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); - } - - // reset the length of content - $this->size = strlen($this->doc); - if ($this->size>0) $this->char = $this->doc[0]; - } - - // restore noise to html content - function restore_noise($text) { - while(($pos=strpos($text, '___noise___'))!==false) { - $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13]; - if (isset($this->noise[$key])) - $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+14); - } - return $text; - } - - function __toString() { - return $this->root->innertext(); - } - - function __get($name) { - switch($name) { - case 'outertext': return $this->root->innertext(); - case 'innertext': return $this->root->innertext(); - case 'plaintext': return $this->root->text(); - } - } - - // camel naming conventions - function childNodes($idx=-1) {return $this->root->childNodes($idx);} - function firstChild() {return $this->root->first_child();} - function lastChild() {return $this->root->last_child();} - function getElementById($id) {return $this->find("#$id", 0);} - function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);} - function getElementByTagName($name) {return $this->find($name, 0);} - function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);} - function loadFile() {$args = func_get_args();$this->load(call_user_func_array('file_get_contents', $args), true);} -} -?>
\ No newline at end of file |