summaryrefslogtreecommitdiff
path: root/lib/phptal/PHPTAL/Dom/SaxXmlParser.php
diff options
context:
space:
mode:
Diffstat (limited to 'lib/phptal/PHPTAL/Dom/SaxXmlParser.php')
-rw-r--r--lib/phptal/PHPTAL/Dom/SaxXmlParser.php480
1 files changed, 480 insertions, 0 deletions
diff --git a/lib/phptal/PHPTAL/Dom/SaxXmlParser.php b/lib/phptal/PHPTAL/Dom/SaxXmlParser.php
new file mode 100644
index 0000000..b59a26d
--- /dev/null
+++ b/lib/phptal/PHPTAL/Dom/SaxXmlParser.php
@@ -0,0 +1,480 @@
+<?php
+/**
+ * PHPTAL templating engine
+ *
+ * PHP Version 5
+ *
+ * @category HTML
+ * @package PHPTAL
+ * @author Laurent Bedubourg <lbedubourg@motion-twin.com>
+ * @author Kornel Lesiński <kornel@aardvarkmedia.co.uk>
+ * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
+ * @version SVN: $Id$
+ * @link http://phptal.org/
+ */
+
/**
 * Simple sax like xml parser for PHPTAL
 * ("Dom" in the class name comes from name of the directory, not mode of operation)
 *
 * At the time this parser was created, standard PHP libraries were not suitable
 * (could not retrieve doctypes, xml declaration, problems with comments and CDATA).
 *
 * There are still some problems: XML parsers don't care about exact format of entities
 * or CDATA sections (PHPTAL tries to preserve them),
 * <?php ?> blocks are not allowed in attributes.
 *
 * This parser failed to enforce some XML well-formedness constraints,
 * and there are ill-formed templates "in the wild" because of this.
 *
 * The parser is a byte-by-byte state machine: parseString() walks the source once
 * and reports everything it recognises to the supplied document builder
 * (onElementStart, onElementClose, onElementData, onComment, ...).
 *
 * @package PHPTAL
 * @subpackage Dom
 * @see PHPTAL_DOM_DocumentBuilder
 */
class PHPTAL_Dom_SaxXmlParser
{
    /** @var string name of the source being parsed ("<string>" until a file name is given) */
    private $_file;
    /** @var int current line number, incremented on every "\n" seen by parseString() */
    private $_line;
    private $_source;

    // available parser states
    const ST_ROOT = 0;
    const ST_TEXT = 1;
    const ST_LT = 2;
    const ST_TAG_NAME = 3;
    const ST_TAG_CLOSE = 4;
    const ST_TAG_SINGLE = 5;
    const ST_TAG_ATTRIBUTES = 6;
    const ST_TAG_BETWEEN_ATTRIBUTE = 7;
    const ST_CDATA = 8;
    const ST_COMMENT = 9;
    const ST_DOCTYPE = 10;
    const ST_XMLDEC = 11;
    const ST_PREPROC = 12;
    const ST_ATTR_KEY = 13;
    const ST_ATTR_EQ = 14;
    const ST_ATTR_QUOTE = 15;
    const ST_ATTR_VALUE = 16;

    // UTF-8 byte order mark, skipped when found at the very start of the source
    const BOM_STR = "\xef\xbb\xbf";

    /**
     * Human readable names of the parser states, used in the
     * "Finished document in unexpected state" error message.
     */
    public static $state_names = array(
        self::ST_ROOT => 'root node',
        self::ST_TEXT => 'text',
        self::ST_LT => 'start of tag',
        self::ST_TAG_NAME => 'tag name',
        self::ST_TAG_CLOSE => 'closing tag',
        self::ST_TAG_SINGLE => 'self-closing tag',
        self::ST_TAG_ATTRIBUTES => 'tag',
        self::ST_TAG_BETWEEN_ATTRIBUTE => 'tag attributes',
        self::ST_CDATA => 'CDATA',
        self::ST_COMMENT => 'comment',
        self::ST_DOCTYPE => 'doctype',
        self::ST_XMLDEC => 'XML declaration',
        self::ST_PREPROC => 'preprocessor directive',
        self::ST_ATTR_KEY => 'attribute name',
        self::ST_ATTR_EQ => 'attribute value',
        self::ST_ATTR_QUOTE => 'quoted attribute value',
        self::ST_ATTR_VALUE => 'unquoted attribute value',
    );

    /** @var string encoding name; only 'UTF-8' and 'ISO-8859-1' are validated by checkEncoding() */
    private $input_encoding;

    /**
     * @param string $input_encoding encoding the templates are written in
     */
    public function __construct($input_encoding)
    {
        $this->input_encoding = $input_encoding;
        $this->_file = "<string>";
    }

    /**
     * Reads a template file and parses it with parseString().
     *
     * @param PHPTAL_Dom_DocumentBuilder $builder receives the parse events
     * @param string                     $src     path of the file to parse
     *
     * @return PHPTAL_Dom_DocumentBuilder the builder that was passed in
     *
     * @throws PHPTAL_IOException when the file does not exist or cannot be read
     */
    public function parseFile(PHPTAL_Dom_DocumentBuilder $builder, $src)
    {
        if (!file_exists($src)) {
            throw new PHPTAL_IOException("file $src not found");
        }

        // file_exists() is also true for directories and unreadable files;
        // without this check a failed read would be parsed as an empty document.
        $contents = file_get_contents($src);
        if ($contents === false) {
            throw new PHPTAL_IOException("file $src could not be read");
        }

        return $this->parseString($builder, $contents, $src);
    }

    /**
     * Parses template source and reports its structure to the builder.
     *
     * The source is scanned one byte at a time. $state holds the current
     * parser state (one of the ST_* constants) and $mark the offset at which
     * the construct currently being read (text, tag name, attribute, ...) started.
     *
     * @param PHPTAL_Dom_DocumentBuilder $builder  receives the parse events
     * @param string                     $src      template source
     * @param string                     $filename name used in error messages and setSource() calls
     *
     * @return PHPTAL_Dom_DocumentBuilder the builder that was passed in
     *
     * @throws PHPTAL_TemplateException re-thrown after hintSrcPosition() was given the current file and line
     */
    public function parseString(PHPTAL_Dom_DocumentBuilder $builder, $src, $filename = '<string>')
    {
        try {
            $builder->setEncoding($this->input_encoding);
            $this->_file = $filename;

            $this->_line = 1;
            $state = self::ST_ROOT;
            $mark = 0;
            $len = strlen($src);

            $quoteStyle = '"';
            $tagname = "";
            $attribute = "";
            $attributes = array();

            // true while inside the [...] internal subset of a doctype
            $customDoctype = false;

            $builder->setSource($this->_file, $this->_line);
            $builder->onDocumentStart();

            $i = 0;
            // remove BOM (UTF-8 byte order mark)...
            if (substr($src, 0, 3) === self::BOM_STR) {
                $i = 3;
            }
            for (; $i < $len; $i++) {
                $c = $src[$i]; // Change to substr($src, $i, 1); if you want to use mb_string.func_overload

                if ($c === "\n") $builder->setSource($this->_file, ++$this->_line);

                switch ($state) {
                    case self::ST_ROOT:
                        if ($c === '<') {
                            $mark = $i; // mark tag start
                            $state = self::ST_LT;
                        } elseif (!self::isWhiteChar($c)) {
                            $this->raiseError("Characters found before beginning of the document! (wrap document in < tal:block > to avoid this error)");
                        }
                        break;

                    case self::ST_TEXT:
                        if ($c === '<') {
                            if ($mark != $i) {
                                $builder->onElementData($this->sanitizeEscapedText($this->checkEncoding(substr($src, $mark, $i-$mark))));
                            }
                            $mark = $i;
                            $state = self::ST_LT;
                        }
                        break;

                    case self::ST_LT:
                        // the byte after '<' decides what kind of node starts here
                        if ($c === '/') {
                            $mark = $i+1;
                            $state = self::ST_TAG_CLOSE;
                        } elseif ($c === '?' and strtolower(substr($src, $i, 5)) === '?xml ') {
                            $state = self::ST_XMLDEC;
                        } elseif ($c === '?') {
                            $state = self::ST_PREPROC;
                        } elseif ($c === '!' and substr($src, $i, 3) === '!--') {
                            $state = self::ST_COMMENT;
                        } elseif ($c === '!' and substr($src, $i, 8) === '![CDATA[') {
                            $state = self::ST_CDATA;
                            $mark = $i+8; // past opening tag
                        } elseif ($c === '!' and strtoupper(substr($src, $i, 8)) === '!DOCTYPE') {
                            $state = self::ST_DOCTYPE;
                        } elseif (self::isWhiteChar($c)) {
                            // "< " is not a tag: keep $mark on the '<' so it becomes part of the text
                            $state = self::ST_TEXT;
                        } else {
                            $mark = $i; // mark node name start
                            $attributes = array();
                            $attribute = "";
                            $state = self::ST_TAG_NAME;
                        }
                        break;

                    case self::ST_TAG_NAME:
                        if (self::isWhiteChar($c) || $c === '/' || $c === '>') {
                            $tagname = substr($src, $mark, $i-$mark);
                            if (!$this->isValidQName($tagname)) $this->raiseError("Invalid tag name '$tagname'");

                            if ($c === '/') {
                                $state = self::ST_TAG_SINGLE;
                            } elseif ($c === '>') {
                                $mark = $i+1; // mark text start
                                $state = self::ST_TEXT;
                                $builder->onElementStart($tagname, $attributes);
                            } else /* isWhiteChar */ {
                                $state = self::ST_TAG_ATTRIBUTES;
                            }
                        }
                        break;

                    case self::ST_TAG_CLOSE:
                        if ($c === '>') {
                            $tagname = rtrim(substr($src, $mark, $i-$mark));
                            $builder->onElementClose($tagname);
                            $mark = $i+1; // mark text start
                            $state = self::ST_TEXT;
                        }
                        break;

                    case self::ST_TAG_SINGLE:
                        if ($c !== '>') {
                            $this->raiseError("Expected '/>', but found '/$c' inside tag < $tagname >");
                        }
                        $mark = $i+1; // mark text start
                        $state = self::ST_TEXT;
                        // a self-closing tag is reported as start immediately followed by close
                        $builder->onElementStart($tagname, $attributes);
                        $builder->onElementClose($tagname);
                        break;

                    case self::ST_TAG_BETWEEN_ATTRIBUTE:
                    case self::ST_TAG_ATTRIBUTES:
                        if ($c === '>') {
                            $mark = $i+1; // mark text start
                            $state = self::ST_TEXT;
                            $builder->onElementStart($tagname, $attributes);
                        } elseif ($c === '/') {
                            $state = self::ST_TAG_SINGLE;
                        } elseif (self::isWhiteChar($c)) {
                            $state = self::ST_TAG_ATTRIBUTES;
                        } elseif ($state === self::ST_TAG_ATTRIBUTES && $this->isValidQName($c)) {
                            // a new attribute may only start after whitespace (ST_TAG_ATTRIBUTES),
                            // not directly after the closing quote of the previous one
                            $mark = $i; // mark attribute key start
                            $state = self::ST_ATTR_KEY;
                        } else $this->raiseError("Unexpected character '$c' between attributes of < $tagname >");
                        break;

                    case self::ST_COMMENT:
                        // $mark is on the '<'. The shortest complete comment is "<!---->",
                        // so the closing '>' is at least 6 bytes past $mark; anything shorter
                        // (e.g. "<!--->") would reuse the opening dashes as the closing ones.
                        if ($c === '>' && $i >= $mark+6 && substr($src, $i-2, 2) === '--') {
                            $comment = substr($src, $mark+4, $i-$mark+1-7);

                            if (preg_match('/^-|--|-$/', $comment)) {
                                $this->raiseError("Ill-formed comment. XML comments are not allowed to contain '--' or start/end with '-': ".$comment);
                            }

                            $builder->onComment($this->checkEncoding($comment));
                            $mark = $i+1; // mark text start
                            $state = self::ST_TEXT;
                        }
                        break;

                    case self::ST_CDATA:
                        if ($c === '>' and substr($src, $i-2, 2) === ']]') {
                            $builder->onCDATASection($this->checkEncoding(substr($src, $mark, $i-$mark-2)));
                            $mark = $i+1; // mark text start
                            $state = self::ST_TEXT;
                        }
                        break;

                    case self::ST_XMLDEC:
                        if ($c === '?' && substr($src, $i, 2) === '?>') {
                            $builder->onXmlDecl($this->checkEncoding(substr($src, $mark, $i-$mark+2)));
                            $i++; // skip '>'
                            $mark = $i+1; // mark text start
                            $state = self::ST_TEXT;
                        }
                        break;

                    case self::ST_DOCTYPE:
                        if ($c === '[') {
                            $customDoctype = true;
                        } elseif ($customDoctype && $c === '>' && substr($src, $i-1, 2) === ']>') {
                            // doctype with internal subset ends only at "]>"
                            $customDoctype = false;
                            $builder->onDocType($this->checkEncoding(substr($src, $mark, $i-$mark+1)));
                            $mark = $i+1; // mark text start
                            $state = self::ST_TEXT;
                        } elseif (!$customDoctype && $c === '>') {
                            $customDoctype = false;
                            $builder->onDocType($this->checkEncoding(substr($src, $mark, $i-$mark+1)));
                            $mark = $i+1; // mark text start
                            $state = self::ST_TEXT;
                        }
                        break;

                    case self::ST_PREPROC:
                        if ($c === '>' and substr($src, $i-1, 1) === '?') {
                            $builder->onProcessingInstruction($this->checkEncoding(substr($src, $mark, $i-$mark+1)));
                            $mark = $i+1; // mark text start
                            $state = self::ST_TEXT;
                        }
                        break;

                    case self::ST_ATTR_KEY:
                        if ($c === '=' || self::isWhiteChar($c)) {
                            $attribute = substr($src, $mark, $i-$mark);
                            if (!$this->isValidQName($attribute)) {
                                $this->raiseError("Invalid attribute name '$attribute' in < $tagname >");
                            }
                            if (isset($attributes[$attribute])) {
                                $this->raiseError("Attribute $attribute in < $tagname > is defined more than once");
                            }

                            if ($c === '=') $state = self::ST_ATTR_VALUE;
                            else /* white char */ $state = self::ST_ATTR_EQ;
                        } elseif ($c === '/' || $c === '>') {
                            $attribute = substr($src, $mark, $i-$mark);
                            if (!$this->isValidQName($attribute)) {
                                $this->raiseError("Invalid attribute name '$attribute'");
                            }
                            $this->raiseError("Attribute $attribute does not have value (found end of tag instead of '=')");
                        }
                        break;

                    case self::ST_ATTR_EQ:
                        if ($c === '=') {
                            $state = self::ST_ATTR_VALUE;
                        } elseif (!self::isWhiteChar($c)) {
                            $this->raiseError("Attribute $attribute in < $tagname > does not have value (found character '$c' instead of '=')");
                        }
                        break;

                    case self::ST_ATTR_VALUE:
                        if (self::isWhiteChar($c)) {
                            // whitespace between '=' and the opening quote is skipped
                        } elseif ($c === '"' or $c === '\'') {
                            $quoteStyle = $c;
                            $state = self::ST_ATTR_QUOTE;
                            $mark = $i+1; // mark attribute real value start
                        } else {
                            $this->raiseError("Value of attribute $attribute in < $tagname > is not in quotes (found character '$c' instead of quote)");
                        }
                        break;

                    case self::ST_ATTR_QUOTE:
                        if ($c === $quoteStyle) {
                            $attributes[$attribute] = $this->sanitizeEscapedText($this->checkEncoding(substr($src, $mark, $i-$mark)));

                            // PHPTAL's code generator assumes input is escaped for double-quoted strings. Single-quoted attributes need to be converted.
                            // FIXME: it should be escaped at later stage.
                            $attributes[$attribute] = str_replace('"', "&quot;", $attributes[$attribute]);
                            $state = self::ST_TAG_BETWEEN_ATTRIBUTE;
                        }
                        break;
                }
            }

            if ($state === self::ST_TEXT) { // allows text past root node, which is in violation of XML spec
                if ($i > $mark) {
                    $text = substr($src, $mark, $i-$mark);
                    if (!ctype_space($text)) $this->raiseError("Characters found after end of the root element (wrap document in < tal:block > to avoid this error)");
                }
            } else {
                if ($state === self::ST_ROOT) {
                    $msg = "Document does not have any tags";
                } else {
                    $msg = "Finished document in unexpected state: ".self::$state_names[$state]." is not finished";
                }
                $this->raiseError($msg);
            }

            $builder->onDocumentEnd();
        } catch (PHPTAL_TemplateException $e) {
            $e->hintSrcPosition($this->_file, $this->_line);
            throw $e;
        }
        return $builder;
    }

    /**
     * Checks whether a name is an acceptable tag/attribute name:
     * an optional "prefix:" followed by a local name, each starting with a letter,
     * underscore or non-ASCII byte.
     *
     * The name is also run through checkEncoding(), which raises an error
     * on bytes that are invalid for the input encoding.
     *
     * @param string $name
     *
     * @return bool
     */
    private function isValidQName($name)
    {
        $name = $this->checkEncoding($name);
        return preg_match('/^([a-z_\x80-\xff]+[a-z0-9._\x80-\xff-]*:)?[a-z_\x80-\xff]+[a-z0-9._\x80-\xff-]*$/i', $name) === 1;
    }

    /**
     * Verifies that the string is valid in the input encoding.
     *
     * Only 'UTF-8' and 'ISO-8859-1' are checked; for any other encoding
     * the string is returned untouched.
     *
     * @param string $str
     *
     * @return string the same string, unchanged
     *
     * @throws PHPTAL_ParserException (via raiseError) listing the offending bytes as hex entities
     */
    private function checkEncoding($str)
    {
        if ($str === '') return '';

        if ($this->input_encoding === 'UTF-8') {

            // $match expression below somehow triggers quite deep recurrency and stack overflow in preg
            // to avoid this, check string bit by bit, omitting ASCII fragments.
            if (strlen($str) > 200) {
                // limit -1 means "no limit" (null is deprecated for this parameter)
                $chunks = preg_split('/(?>[\x09\x0A\x0D\x20-\x7F]+)/', $str, -1, PREG_SPLIT_NO_EMPTY);
                foreach ($chunks as $chunk) {
                    // chunks of 200 bytes or more are left unchecked to stay clear of the preg problem above
                    if (strlen($chunk) < 200) {
                        $this->checkEncoding($chunk);
                    }
                }
                return $str;
            }

            // http://www.w3.org/International/questions/qa-forms-utf-8
            $match = '[\x09\x0A\x0D\x20-\x7F]'        // ASCII
               . '|[\xC2-\xDF][\x80-\xBF]'            // non-overlong 2-byte
               . '|\xE0[\xA0-\xBF][\x80-\xBF]'        // excluding overlongs
               . '|[\xE1-\xEC\xEE\xEE][\x80-\xBF]{2}' // straight 3-byte (exclude FFFE and FFFF)
               . '|\xEF[\x80-\xBE][\x80-\xBF]'        // straight 3-byte
               . '|\xEF\xBF[\x80-\xBD]'               // straight 3-byte
               . '|\xED[\x80-\x9F][\x80-\xBF]'        // excluding surrogates
               . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'     // planes 1-3
               . '|[\xF1-\xF3][\x80-\xBF]{3}'         // planes 4-15
               . '|\xF4[\x80-\x8F][\x80-\xBF]{2}';    // plane 16

            if (!preg_match('/^(?:(?>'.$match.'))+$/s', $str)) {
                // with DELIM_CAPTURE the valid runs land on odd indexes,
                // so the even indexes hold the invalid bytes between them
                $res = preg_split('/((?>'.$match.')+)/s', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
                $count = count($res);
                for ($i = 0; $i < $count; $i += 2) {
                    $res[$i] = self::convertBytesToEntities(array(1 => $res[$i]));
                }
                $this->raiseError("Invalid UTF-8 bytes: ".implode('', $res));
            }
        }
        if ($this->input_encoding === 'ISO-8859-1') {

            // http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar
            $forbid = '/((?>[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x84\x86-\x9F]+))/s';

            if (preg_match($forbid, $str)) {
                // __CLASS__ instead of the deprecated 'self' string callable
                $str = preg_replace_callback($forbid, array(__CLASS__, 'convertBytesToEntities'), $str);
                $this->raiseError("Invalid ISO-8859-1 characters: ".$str);
            }
        }

        return $str;
    }

    /**
     * preg callback
     * Changes all bytes to hexadecimal XML entities
     *
     * @param array $m element at index 1 is used for input
     *
     * @return string
     */
    private static function convertBytesToEntities(array $m)
    {
        $bytes = $m[1];
        $out = '';
        $length = strlen($bytes);
        for ($i = 0; $i < $length; $i++) {
            $out .= '&#X'.strtoupper(dechex(ord($bytes[$i]))).';';
        }
        return $out;
    }

    /**
     * This is where this parser deliberately violates XML and is lenient
     * instead of rejecting the input.
     *
     * - &apos; becomes &#39;
     * - <?php ?> (and <?= ?>, <? ?> where PHP enables them) blocks become
     *   a ${structure phptal-internal-php-block:...} expression
     * - '&' that does not start an entity becomes &amp;
     * - '<' becomes &lt; and ']]>' becomes ']]&gt;'
     *
     * @param string $str text node or attribute value as found in the source
     *
     * @return string
     */
    private function sanitizeEscapedText($str)
    {
        $str = str_replace('&apos;', '&#39;', $str); // PHP's html_entity_decode doesn't seem to support that!

        /* <?php ?> blocks can't reliably work in attributes (due to escaping impossible in XML)
           so they have to be converted into special TALES expression
        */
        if (ini_get('short_open_tag')) {
            $types = 'php|=|';
        } elseif (version_compare(PHP_VERSION, '5.4.0', '>=')) {
            // since PHP 5.4 the <?= tag is always available, regardless of short_open_tag
            $types = 'php|=';
        } else {
            $types = 'php';
        }
        $str = preg_replace_callback("/<\?($types)(.*?)\?>/", array(__CLASS__, 'convertPHPBlockToTALES'), $str);

        // corrects all non-entities and neutralizes potentially problematic CDATA end marker
        $str = strtr(preg_replace('/&(?!(?:#x?[a-f0-9]+|[a-z][a-z0-9]*);)/i', '&amp;', $str), array('<'=>'&lt;', ']]>'=>']]&gt;'));

        return $str;
    }

    /**
     * preg callback
     * Turns a matched PHP block into a TALES expression carrying the url-encoded code.
     *
     * @param array $m index 1 is the tag type ('php', '=' or ''), index 2 the code
     *
     * @return string
     */
    private static function convertPHPBlockToTALES($m)
    {
        list(, $type, $code) = $m;
        if ($type === '=') $code = 'echo '.$code;
        return '${structure phptal-internal-php-block:'.rawurlencode($code).'}';
    }

    /**
     * @return string name of the source given to the last parseString()/parseFile() call
     */
    public function getSourceFile()
    {
        return $this->_file;
    }

    /**
     * @return int line number the parser has reached
     */
    public function getLineNumber()
    {
        return $this->_line;
    }

    /**
     * Tells whether the character is one of: space, tab, LF, CR, NUL.
     *
     * @param string $c single character
     *
     * @return bool false for an empty string
     */
    public static function isWhiteChar($c)
    {
        // strpos() with an empty needle warns on PHP < 8 and reports a match on PHP 8
        return $c !== '' && strpos(" \t\n\r\0", $c) !== false;
    }

    /**
     * Aborts parsing with an error located at the current file and line.
     *
     * @param string $errStr error message
     *
     * @throws PHPTAL_ParserException always
     */
    protected function raiseError($errStr)
    {
        throw new PHPTAL_ParserException($errStr, $this->_file, $this->_line);
    }
}