+ * $markdown = new MarkdownParser;
+ * echo $markdown->parse($text);
+ *
+ */
+class MarkdownParser
+{
+ # Regex fragment for bracketed text nested up to $md_nested_brackets_depth
+ # levels. Built once by initialize(); it stays null until then, which is
+ # what __construct() tests to decide whether initialization is needed.
+ private static $md_nested_brackets;
+ # Maps each Markdown-special character to its md5() hash (filled by initialize()).
+ private static $md_escape_table = array();
+ # Same hashes as $md_escape_table, keyed by backslash + character.
+ private static $md_backslash_escape_table = array();
+ # Repeat count used when generating $md_nested_brackets.
+ private static $md_nested_brackets_depth = 6;
+
+ # Appended when closing an empty element such as an image tag.
+ protected $md_empty_element_suffix = " />"; # Change to ">" for HTML output
+ # Indent width: code-block lines need this many leading spaces, and link
+ # definitions may be indented by at most one less.
+ protected $md_tab_width = 4;
+
+ # NOTE(review): not referenced in the visible part of the file; presumably
+ # the current list nesting depth -- confirm against the list-handling code.
+ private $md_list_level = 0;
+ # Lower-cased link id => URL, collected from link definitions; reset by parse().
+ private $md_urls = array();
+ # Lower-cased link id => title, collected from link definitions; reset by parse().
+ private $md_titles = array();
+ # Reset by parse() and consulted by key when paragraphs are formed.
+ # NOTE(review): presumably placeholder key => raw HTML block -- confirm
+ # against _HashHTMLBlocks().
+ private $md_html_blocks = array();
+
+ public function __construct()
+ {
+     # The regex fragment and escape tables are static, so they only need
+     # to be built by the first instance; a non-null $md_nested_brackets
+     # means that has already happened.
+     if (self::$md_nested_brackets !== null) {
+         return;
+     }
+
+     $this->initialize();
+ }
+
+ private function initialize()
+ {
+     # Builds the class-wide static data: the nested-bracket regex fragment
+     # and the two character => md5 hash escape tables.
+
+     # The fragment is an opening piece repeated $depth times followed by a
+     # closing piece repeated the same number of times.
+     $depth = self::$md_nested_brackets_depth;
+     $opening = str_repeat('(?>[^\[\]]+|\[', $depth);
+     $closing = str_repeat('\])*', $depth);
+     self::$md_nested_brackets = $opening . $closing;
+
+     # Characters with special meaning in Markdown, each mapped to its hash.
+     $specials = array(
+         "\\", "`", "*", "_", "{", "}", "[", "]",
+         "(", ")", ">", "#", "+", "-", ".", "!"
+     );
+     $hashes = array();
+     foreach ($specials as $special) {
+         $hashes[$special] = md5($special);
+     }
+     self::$md_escape_table = $hashes;
+
+     # Second table: identical hashes, keyed by the backslash-escaped
+     # form of each character.
+     foreach ($hashes as $special => $digest) {
+         self::$md_backslash_escape_table["\\$special"] = $digest;
+     }
+ }
+
+ public function parse($text)
+ {
+     #
+     # Public entry point: normalizes $text, then runs it through the
+     # conversion stages below and returns the result followed by a
+     # newline. The order of the stages is significant.
+     #
+
+     # Forget link definitions and HTML blocks collected by a previous
+     # call, so several documents can be converted with one instance
+     # without leaking data into each other.
+     $this->md_urls = $this->md_titles = $this->md_html_blocks = array();
+
+     # Line endings: DOS (\r\n) first, then any remaining bare \r, to \n.
+     $text = str_replace("\r", "\n", str_replace("\r\n", "\n", $text));
+
+     # Guarantee the input ends with a blank line.
+     $text = $text . "\n\n";
+
+     # _Detab() is defined elsewhere in the class; per the original note
+     # it converts tabs to spaces.
+     $text = $this->_Detab($text);
+
+     # Empty out lines holding only spaces/tabs, so later patterns can
+     # treat consecutive blank lines as a plain run of \n.
+     $text = preg_replace('/^[ \t]+$/m', '', $text);
+
+     # Remaining stages, each taking and returning the text:
+     # hash block-level HTML, strip link definitions, run the block
+     # gamut, then undo the special-character escaping.
+     $stages = array(
+         '_HashHTMLBlocks',
+         '_StripLinkDefinitions',
+         '_RunBlockGamut',
+         '_UnescapeSpecialChars'
+     );
+     foreach ($stages as $stage) {
+         $text = $this->$stage($text);
+     }
+
+     return $text . "\n";
+ }
+
+
+ private function _StripLinkDefinitions($text) {
+     #
+     # Removes link definitions from $text and returns what is left.
+     # Every definition found is handed to
+     # _StripLinkDefinitions_callback(), which records its URL and
+     # optional title and replaces the definition with an empty string.
+     #
+     # A definition may be indented by up to (tab width - 1) spaces.
+     # The URL may be wrapped in angle brackets; both the opening and the
+     # closing bracket are optional and are kept out of the captured URL.
+     #
+     $less_than_tab = $this->md_tab_width - 1;
+
+     # Link defs are in the form: ^[id]: url "optional title"
+     $text = preg_replace_callback('{
+ ^[ ]{0,'.$less_than_tab.'}\[(.+)\]: # id = $1
+ [ \t]*
+ \n? # maybe *one* newline
+ [ \t]*
+ <?(\S+?)>? # url = $2
+ [ \t]*
+ \n? # maybe one newline
+ [ \t]*
+ (?:
+ (?<=\s) # lookbehind for whitespace
+ ["(]
+ (.+?) # title = $3
+ [")]
+ [ \t]*
+ )? # title is optional
+ (?:\n+|\Z)
+ }xm',
+         array($this,'_StripLinkDefinitions_callback'),
+         $text);
+     return $text;
+ }
+
+ private function _StripLinkDefinitions_callback($matches) {
+     #
+     # preg_replace_callback handler for one link definition.
+     # $matches[1] = id, $matches[2] = URL, $matches[3] = title (optional).
+     # Stores URL and title under the lower-cased id and returns '' so the
+     # definition disappears from the text.
+     #
+     $link_id = strtolower($matches[1]);
+     $this->md_urls[$link_id] = $this->_EncodeAmpsAndAngles($matches[2]);
+     if (isset($matches[3])) {
+         # Titles are later written inside a double-quoted title="..."
+         # attribute, so a literal double quote must become an entity.
+         $this->md_titles[$link_id] = str_replace('"', '&quot;', $matches[3]);
+     }
+     return ''; # String that will replace the block
+ }
+
+
+ private function _HashHTMLBlocks($text) {
+ $less_than_tab = $this->md_tab_width - 1;
+
+ # Hashify HTML blocks:
+ # We only want to do this for block-level HTML tags, such as headers,
+ # lists, and tables. That's because we still want to wrap s around + # "paragraphs" that are wrapped in non-block-level tags, such as anchors, + # phrase emphasis, and spans. The list of tags we're looking for is + # hard-coded: + $block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|'. + 'script|noscript|form|fieldset|iframe|math|ins|del'; + $block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|'. + 'script|noscript|form|fieldset|iframe|math'; + + # First, look for nested blocks, e.g.: + #
tags around block-level tags.
+ $text = $this->_HashHTMLBlocks($text);
+ $text = $this->_FormParagraphs($text);
+
+ return $text;
+ }
+
+
+ private function _RunSpanGamut($text) {
+ #
+ # These are all the transformations that occur *within* block-level
+ # tags like paragraphs, headers, and list items.
+ #
+
+ $text = $this->_DoCodeSpans($text);
+
+ $text = $this->_EscapeSpecialChars($text);
+
+ # Process anchor and image tags. Images must come first,
+ # because ![foo][f] looks like an anchor.
+ $text = $this->_DoImages($text);
+ $text = $this->_DoAnchors($text);
+
+ # Make links out of things like ` Just type tags
+ #
+ # Strip leading and trailing lines:
+ $text = preg_replace(array('/\A\n+/', '/\n+\z/'), '', $text);
+
+ $grafs = preg_split('/\n{2,}/', $text, -1, PREG_SPLIT_NO_EMPTY);
+
+ #
+ # Wrap tags.
+ #
+ foreach ($grafs as $key => $value) {
+ if (!isset( $this->md_html_blocks[$value] )) {
+ $value = $this->_RunSpanGamut($value);
+ $value = preg_replace('/^([ \t]*)/', ' ', $value);
+ $value .= "
md_empty_element_suffix}\n", $text);
+
+ return $text;
+ }
+
+
+ private function _EscapeSpecialChars($text) {
+     #
+     # Splits $text with _TokenizeHTML() and rebuilds it token by token.
+     # Each token is read as [0] => type, [1] => text:
+     #   - 'tag' tokens get their * and _ replaced by hash values;
+     #   - all other tokens go through _EncodeBackslashEscapes().
+     # Returns the rebuilt string.
+     #
+     $tokens = $this->_TokenizeHTML($text);
+
+     $text = ''; # rebuild $text from the tokens
+     # Disabled legacy code, kept commented out for reference:
+     # $in_pre = 0;
+     # $tags_to_skip = "!<(/?)(?:pre|code|kbd|script|math)[\s>]!";
+
+     foreach ($tokens as $cur_token) {
+         if ($cur_token[0] == 'tag') {
+             # Within tags, encode * and _ so they don't conflict
+             # with their use in Markdown for italics and strong.
+             # We're replacing each such character with its
+             # corresponding MD5 checksum value; this is likely
+             # overkill, but it should prevent us from colliding
+             # with the escape values by accident.
+             $cur_token[1] = str_replace(array('*', '_'),
+                 array(self::$md_escape_table['*'], self::$md_escape_table['_']),
+                 $cur_token[1]);
+             $text .= $cur_token[1];
+         } else {
+             $t = $cur_token[1];
+             $t = $this->_EncodeBackslashEscapes($t);
+             $text .= $t;
+         }
+     }
+     return $text;
+ }
+
+
+ private function _DoAnchors($text) {
+     #
+     # Turns Markdown link shortcuts into <a> tags and returns the text.
+     # Two passes, in this order:
+     #   1. reference-style links, handled by _DoAnchors_reference_callback()
+     #   2. inline-style links, handled by _DoAnchors_inline_callback()
+     # Link text may contain nested brackets, as allowed by the
+     # precomputed self::$md_nested_brackets fragment.
+     #
+     #
+     # First, handle reference-style links: [link text] [id]
+     #
+     $bracket = self::$md_nested_brackets;
+     $text = preg_replace_callback("{
+ ( # wrap whole match in $1
+ \\[
+ ({$bracket}) # link text = $2
+ \\]
+
+ [ ]? # one optional space
+ (?:\\n[ ]*)? # one optional newline followed by spaces
+
+ \\[
+ (.*?) # id = $3
+ \\]
+ )
+ }xs",
+         array($this,'_DoAnchors_reference_callback'), $text);
+
+     #
+     # Next, inline-style links: [link text](url "optional title")
+     # The url may be wrapped in angle brackets; both brackets are
+     # optional and neither is captured as part of the href.
+     #
+     $text = preg_replace_callback("{
+ ( # wrap whole match in $1
+ \\[
+ ({$bracket}) # link text = $2
+ \\]
+ \\( # literal paren
+ [ \\t]*
+ <?(.*?)>? # href = $3
+ [ \\t]*
+ ( # $4
+ (['\"]) # quote char = $5
+ (.*?) # Title = $6
+ \\5 # matching quote
+ )? # title is optional
+ \\)
+ )
+ }xs",
+         array($this,'_DoAnchors_inline_callback'), $text);
+
+     return $text;
+ }
+ private function _DoAnchors_reference_callback($matches) {
+ $whole_match = $matches[1];
+ $link_text = $matches[2];
+ $link_id = strtolower($matches[3]);
+
+ if ($link_id == "") {
+ $link_id = strtolower($link_text); # for shortcut links like [this][].
+ }
+
+ if (isset($this->md_urls[$link_id])) {
+ $url = $this->md_urls[$link_id];
+ # We've got to encode these to avoid conflicting with italics/bold.
+ $url = str_replace(array('*', '_'),
+ array(self::$md_escape_table['*'], self::$md_escape_table['_']),
+ $url);
+ $result = "md_titles[$link_id] ) ) {
+ $title = $this->md_titles[$link_id];
+ $title = str_replace(array('*', '_'),
+ array(self::$md_escape_table['*'],
+ self::$md_escape_table['_']), $title);
+ $result .= " title=\"$title\"";
+ }
+ $result .= ">$link_text";
+ }
+ else {
+ $result = $whole_match;
+ }
+ return $result;
+ }
+ private function _DoAnchors_inline_callback($matches) {
+ $whole_match = $matches[1];
+ $link_text = $matches[2];
+ $url = $matches[3];
+ $title =& $matches[6];
+
+ # We've got to encode these to avoid conflicting with italics/bold.
+ $url = str_replace(array('*', '_'),
+ array(self::$md_escape_table['*'], self::$md_escape_table['_']),
+ $url);
+ $result = " tags.
+ #
+ #
+ # First, handle reference-style labeled images: ![alt text][id]
+ #
+ $text = preg_replace_callback('{
+ ( # wrap whole match in $1
+ !\[
+ ('.self::$md_nested_brackets.') # alt text = $2
+ \]
+
+ [ ]? # one optional space
+ (?:\n[ ]*)? # one optional newline followed by spaces
+
+ \[
+ (.*?) # id = $3
+ \]
+
+ )
+ }xs',
+ array($this,'_DoImages_reference_callback'), $text);
+
+ #
+ # Next, handle inline images: ![alt text](url "optional title")
+ # Don't forget: encode * and _
+
+ $text = preg_replace_callback('{
+ ( # wrap whole match in $1
+ !\[
+ ('.self::$md_nested_brackets.') # alt text = $2
+ \]
+ \( # literal paren
+ [ \t]*
+ (\S+?)>? # src url = $3
+ [ \t]*
+ ( # $4
+ ([\'"]) # quote char = $5
+ (.*?) # title = $6
+ \5 # matching quote
+ [ \t]*
+ )? # title is optional
+ \)
+ )
+ }xs',
+ array($this,'_DoImages_inline_callback'), $text);
+
+ return $text;
+ }
+ private function _DoImages_reference_callback($matches) {
+ $whole_match = $matches[1];
+ $alt_text = $matches[2];
+ $link_id = strtolower($matches[3]);
+
+ if ($link_id == "") {
+ $link_id = strtolower($alt_text); # for shortcut links like ![this][].
+ }
+
+ $alt_text = str_replace('"', '"', $alt_text);
+ if (isset($this->md_urls[$link_id])) {
+ $url = $this->md_urls[$link_id];
+ # We've got to encode these to avoid conflicting with italics/bold.
+ $url = str_replace(array('*', '_'),
+ array(self::$md_escape_table['*'], self::$md_escape_table['_']),
+ $url);
+ $result = "md_titles[$link_id])) {
+ $title = $this->md_titles[$link_id];
+ $title = str_replace(array('*', '_'),
+ array(self::$md_escape_table['*'],
+ self::$md_escape_table['_']), $title);
+ $result .= " title=\"$title\"";
+ }
+ $result .= $this->md_empty_element_suffix;
+ }
+ else {
+ # If there's no such link ID, leave intact:
+ $result = $whole_match;
+ }
+
+ return $result;
+ }
+ private function _DoImages_inline_callback($matches) {
+ $whole_match = $matches[1];
+ $alt_text = $matches[2];
+ $url = $matches[3];
+ $title = '';
+ if (isset($matches[6])) {
+ $title = $matches[6];
+ }
+
+ $alt_text = str_replace('"', '"', $alt_text);
+ $title = str_replace('"', '"', $title);
+ # We've got to encode these to avoid conflicting with italics/bold.
+ $url = str_replace(array('*', '_'),
+ array(self::$md_escape_table['*'], self::$md_escape_table['_']),
+ $url);
+ $result = "md_empty_element_suffix;
+
+ return $result;
+ }
+
+
+ private function _DoHeaders($text) {
+ # Setext-style headers:
+ # Header 1
+ # ========
+ #
+ # Header 2
+ # --------
+ #
+ $text = preg_replace(
+ array('{ ^(.+)[ \t]*\n=+[ \t]*\n+ }emx',
+ '{ ^(.+)[ \t]*\n-+[ \t]*\n+ }emx'),
+ array("'
'.\$this->_RunSpanGamut(\$this->_UnslashQuotes('\\1')).'
\n\n'",
+ "''.\$this->_RunSpanGamut(\$this->_UnslashQuotes('\\1')).'
\n\n'"),
+ $text);
+
+ # atx-style headers:
+ # # Header 1
+ # ## Header 2
+ # ## Header 2 with closing hashes ##
+ # ...
+ # ###### Header 6
+ #
+ $text = preg_replace("{
+ ^(\\#{1,6}) # $1 = string of #'s
+ [ \\t]*
+ (.+?) # $2 = Header text
+ [ \\t]*
+ \\#* # optional closing #'s (not counted)
+ \\n+
+ }xme",
+ "'` blocks.
+ #
+ $text = preg_replace_callback('{
+ (?:\n\n|\A)
+ ( # $1 = the code block -- one or more lines, starting with a space/tab
+ (?:
+ (?:[ ]{'.$this->md_tab_width.'} | \t) # Lines must start with a tab or a tab-width of spaces
+ .*\n+
+ )+
+ )
+ ((?=^[ ]{0,'.$this->md_tab_width.'}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
+ }xm',
+ array($this,'_DoCodeBlocks_callback'), $text);
+
+ return $text;
+ }
+ private function _DoCodeBlocks_callback($matches) {
+ $codeblock = $matches[1];
+
+ $codeblock = $this->_EncodeCode($this->_Outdent($codeblock));
+ // $codeblock = _Detab($codeblock);
+ # trim leading newlines and trailing whitespace
+ $codeblock = preg_replace(array('/\A\n+/', '/\s+\z/'), '', $codeblock);
+
+ $result = "\n\n
\n\n";
+
+ return $result;
+ }
+
+
+ private function _DoCodeSpans($text) {
+ #
+ # * Backtick quotes are used for " . $codeblock . "\n
spans.
+ #
+ # * You can use multiple backticks as the delimiters if you want to
+ # include literal backticks in the code span. So, this input:
+ #
+ # Just type ``foo `bar` baz`` at the prompt.
+ #
+ # Will translate to:
+ #
+ #
foo `bar` baz
at the prompt.`bar`
...
+ #
+ $text = preg_replace_callback('@
+ (?_EncodeCode($c);
+ return "$c
";
+ }
+
+
+ private function _EncodeCode($_) {
+     #
+     # Encode/escape certain characters inside Markdown code runs.
+     # The point is that in code, these characters are literals,
+     # and lose their special Markdown meanings.
+     #
+     # Takes the raw code text and returns it with &, < and > turned
+     # into HTML entities and every Markdown-special character replaced
+     # by its hash from self::$md_escape_table.
+     #
+
+     # Encode all ampersands first, so the entities produced by the
+     # next step are not encoded a second time.
+     $_ = str_replace('&', '&amp;', $_);
+
+     # Angle brackets become entities. This has to run before the escape
+     # table, which also has an entry for '>'.
+     $_ = str_replace(array('<', '>'),
+         array('&lt;', '&gt;'), $_);
+
+     # Now, escape characters that are magic in Markdown:
+     $_ = str_replace(array_keys(self::$md_escape_table),
+         array_values(self::$md_escape_table), $_);
+
+     return $_;
+ }
+
+
+ private function _DoItalicsAndBold($text) {
+ # must go first:
+ $text = preg_replace('{
+ ( # $1: Marker
+ (?\2', $text);
+ # Then :
+ $text = preg_replace(
+ '{ ( (?\2', $text);
+
+ return $text;
+ }
+
+
+ private function _DoBlockQuotes($text) {
+     #
+     # Finds every run of block-quoted text (a line starting with ">",
+     # the non-blank lines that follow it, and trailing blank lines,
+     # repeated) and replaces it with whatever _DoBlockQuotes_callback()
+     # returns for that run. Returns the resulting text.
+     #
+     $quote_run = '/
+ ( # Wrap whole match in $1
+ (
+ ^[ \t]*>[ \t]? # ">" at the start of a line
+ .+\n # rest of the first line
+ (.+\n)* # subsequent consecutive lines
+ \n* # blanks
+ )+
+ )
+ /xm';
+     $handler = array($this,'_DoBlockQuotes_callback');
+
+     return preg_replace_callback($quote_run, $handler, $text);
+ }
+ private function _DoBlockQuotes_callback($matches) {
+ $bq = $matches[1];
+ # trim one level of quoting - trim whitespace-only lines
+ $bq = preg_replace(array('/^[ \t]*>[ \t]?/m', '/^[ \t]+$/m'), '', $bq);
+ $bq = $this->_RunBlockGamut($bq); # recurse
+
+ $bq = preg_replace('/^/m', " ", $bq);
+ # These leading spaces screw with content, so we need to fix that:
+ $bq = preg_replace_callback('{(\s*
.+?
)}sx',
+ array($this,'_DoBlockQuotes_callback2'), $bq);
+
+ return "\n$bq\n
\n\n";
+ }
+ private function _DoBlockQuotes_callback2($matches) {
+     # preg_replace_callback handler: takes the first capture group and
+     # removes the leading space at the start of each of its lines.
+     return preg_replace('/^ /m', '', $matches[1]);
+ }
+
+
+ private function _FormParagraphs($text) {
+ #
+ # Params:
+ # $text - string to process with html as well).
+
+For more information about Markdown's syntax, see:
+
+