* $markdown = new MarkdownParser;
* echo $markdown->parse($text);
*
*/
class MarkdownParser
{
# Regex fragment matching nested square brackets; built once by initialize().
private static $md_nested_brackets;
# Maps each Markdown-special character to its MD5 hash (filled by initialize()).
private static $md_escape_table = array();
# The same hashes, keyed by the backslash-escaped form of each character.
private static $md_backslash_escape_table = array();
# Number of times the bracket sub-pattern is repeated in $md_nested_brackets.
private static $md_nested_brackets_depth = 6;
protected $md_empty_element_suffix = " />"; # Change to ">" for HTML output
# Tab width; indentation limits in the link-definition and code-block
# patterns are derived from it.
protected $md_tab_width = 4;
# NOTE(review): not used in the visible part of the file -- presumably the
# current list nesting depth; confirm against the list-handling code.
private $md_list_level = 0;
# Per-document state, cleared at the start of every parse() call:
private $md_urls = array(); # lower-cased link id => URL
private $md_titles = array(); # lower-cased link id => title
private $md_html_blocks = array(); # looked up by block text when forming paragraphs -- TODO confirm what the values hold
public function __construct()
{
    # The static lookup tables are shared by all instances, so only the
    # first instance created needs to build them.
    if (self::$md_nested_brackets !== null) {
        return;
    }
    $this->initialize();
}
private function initialize()
{
    # Nested-bracket pattern: the opening half repeated $depth times,
    # followed by the closing half repeated $depth times.
    $depth = self::$md_nested_brackets_depth;
    $opening = str_repeat('(?>[^\[\]]+|\[', $depth);
    $closing = str_repeat('\])*', $depth);
    self::$md_nested_brackets = $opening . $closing;

    # Table of hash values for escaped characters: every character that
    # is special in Markdown maps to its MD5 checksum.
    $specials = array(
        "\\", "`", "*", "_", "{", "}", "[", "]",
        "(", ")", ">", "#", "+", "-", ".", "!"
    );
    $hashes = array();
    foreach ($specials as $special) {
        $hashes[$special] = md5($special);
    }
    self::$md_escape_table = $hashes;

    # An identical table, keyed by the backslash-escaped form instead.
    foreach ($hashes as $special => $hash) {
        self::$md_backslash_escape_table["\\" . $special] = $hash;
    }
}
public function parse($text)
{
    #
    # Entry point: converts a Markdown document to markup.
    #
    # The order of the steps below is significant; each one prepares the
    # text for the next.
    #

    # Start from clean per-document state, otherwise link definitions
    # from a previously parsed document would leak into this one (e.g.
    # an index page that renders several articles in a row).
    $this->md_urls = $this->md_titles = $this->md_html_blocks = array();

    # Normalise DOS and Mac line endings to Unix, and guarantee the
    # document ends with a couple of newlines.
    $text = str_replace(array("\r\n", "\r"), "\n", $text) . "\n\n";

    # Tabs become spaces.
    $text = $this->_Detab($text);

    # Blank out lines made only of spaces/tabs, so later patterns can
    # match consecutive blank lines with a plain /\n+/.
    $text = preg_replace('/^[ \t]+$/m', '', $text);

    # Block-level HTML is swapped for hash entries, and link definitions
    # are pulled out of the text and remembered.
    $text = $this->_HashHTMLBlocks($text);
    $text = $this->_StripLinkDefinitions($text);

    # Run the block-level transformations, then undo the special-char
    # escaping.
    $html = $this->_UnescapeSpecialChars($this->_RunBlockGamut($text));

    return $html . "\n";
}
private function _StripLinkDefinitions($text) {
    #
    # Strips link definitions from $text and returns the remaining text.
    # Each definition is handed to _StripLinkDefinitions_callback(),
    # which records its URL and optional title.
    #
    # A definition may be indented by fewer spaces than one tab width.
    $less_than_tab = $this->md_tab_width - 1;

    # Link defs are in the form: ^[id]: url "optional title"
    # The URL may be wrapped in angle brackets; both the opening "<" and
    # the closing ">" are optional and are kept out of the captured URL.
    $text = preg_replace_callback('{
        ^[ ]{0,'.$less_than_tab.'}\[(.+)\]: # id = $1
        [ \t]*
        \n? # maybe *one* newline
        [ \t]*
        <?(\S+?)>? # url = $2
        [ \t]*
        \n? # maybe one newline
        [ \t]*
        (?:
        (?<=\s) # lookbehind for whitespace
        ["(]
        (.+?) # title = $3
        [")]
        [ \t]*
        )? # title is optional
        (?:\n+|\Z)
        }xm',
        array($this,'_StripLinkDefinitions_callback'),
        $text);
    return $text;
}
private function _StripLinkDefinitions_callback($matches) {
    #
    # Records one link definition. $matches comes from the pattern in
    # _StripLinkDefinitions(): [1] = id, [2] = URL, [3] = optional title.
    # Returns the empty string so the definition disappears from the text.
    #
    $link_id = strtolower($matches[1]);
    $this->md_urls[$link_id] = $this->_EncodeAmpsAndAngles($matches[2]);
    if (isset($matches[3])) {
        # The title ends up inside a double-quoted title="..." attribute,
        # so literal double quotes have to be written as an entity.
        $this->md_titles[$link_id] = str_replace('"', '&quot;', $matches[3]);
    }
    return ''; # String that will replace the block
}
private function _HashHTMLBlocks($text) {
$less_than_tab = $this->md_tab_width - 1;
# Hashify HTML blocks:
# We only want to do this for block-level HTML tags, such as headers,
# lists, and tables. That's because we still want to wrap s around # "paragraphs" that are wrapped in non-block-level tags, such as anchors, # phrase emphasis, and spans. The list of tags we're looking for is # hard-coded: $block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|'. 'script|noscript|form|fieldset|iframe|math|ins|del'; $block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|'. 'script|noscript|form|fieldset|iframe|math'; # First, look for nested blocks, e.g.: #
tags around block-level tags.
$text = $this->_HashHTMLBlocks($text);
$text = $this->_FormParagraphs($text);
return $text;
}
private function _RunSpanGamut($text) {
#
# These are all the transformations that occur *within* block-level
# tags like paragraphs, headers, and list items.
#
$text = $this->_DoCodeSpans($text);
$text = $this->_EscapeSpecialChars($text);
# Process anchor and image tags. Images must come first,
# because ![foo][f] looks like an anchor.
$text = $this->_DoImages($text);
$text = $this->_DoAnchors($text);
# Make links out of things like ` Just type tags
#
# Strip leading and trailing lines:
$text = preg_replace(array('/\A\n+/', '/\n+\z/'), '', $text);
$grafs = preg_split('/\n{2,}/', $text, -1, PREG_SPLIT_NO_EMPTY);
#
# Wrap tags.
#
foreach ($grafs as $key => $value) {
if (!isset( $this->md_html_blocks[$value] )) {
$value = $this->_RunSpanGamut($value);
$value = preg_replace('/^([ \t]*)/', ' ', $value);
$value .= "
md_empty_element_suffix}\n", $text);
return $text;
}
private function _EscapeSpecialChars($text) {
    #
    # Splits $text into tokens with _TokenizeHTML() and rebuilds it:
    # inside tag tokens, * and _ are replaced by their hash values; all
    # other tokens get their backslash escapes encoded.
    #
    $tokens = $this->_TokenizeHTML($text);
    $text = ''; # rebuild $text from the tokens
    # Disabled in the original: tracking of whether we are inside tags
    # whose content should be skipped.
    # $in_pre = 0;
    # $tags_to_skip = "!<(/?)(?:pre|code|kbd|script|math)[\s>]!";
    foreach ($tokens as $cur_token) {
        if ($cur_token[0] == 'tag') {
            # Within tags, encode * and _ so they don't conflict
            # with their use in Markdown for italics and strong.
            # We're replacing each such character with its
            # corresponding MD5 checksum value; this is likely
            # overkill, but it should prevent us from colliding
            # with the escape values by accident.
            $cur_token[1] = str_replace(array('*', '_'),
                array(self::$md_escape_table['*'], self::$md_escape_table['_']),
                $cur_token[1]);
            $text .= $cur_token[1];
        } else {
            $text .= $this->_EncodeBackslashEscapes($cur_token[1]);
        }
    }
    return $text;
}
private function _DoAnchors($text) {
    #
    # Turn Markdown link shortcuts into XHTML anchor tags.
    # Reference-style links are converted first, then inline links; the
    # markup itself is produced by the two callbacks.
    #
    $bracket = self::$md_nested_brackets;

    #
    # First, handle reference-style links: [link text] [id]
    #
    $text = preg_replace_callback("{
        ( # wrap whole match in $1
        \\[
        ({$bracket}) # link text = $2
        \\]
        [ ]? # one optional space
        (?:\\n[ ]*)? # one optional newline followed by spaces
        \\[
        (.*?) # id = $3
        \\]
        )
        }xs",
        array($this,'_DoAnchors_reference_callback'), $text);

    #
    # Next, inline-style links: [link text](url "optional title")
    # The URL may be wrapped in angle brackets; neither bracket is
    # captured as part of the href.
    #
    $text = preg_replace_callback("{
        ( # wrap whole match in $1
        \\[
        ({$bracket}) # link text = $2
        \\]
        \\( # literal paren
        [ \\t]*
        <?(.*?)>? # href = $3
        [ \\t]*
        ( # $4
        (['\"]) # quote char = $5
        (.*?) # Title = $6
        \\5 # matching quote
        )? # title is optional
        \\)
        )
        }xs",
        array($this,'_DoAnchors_inline_callback'), $text);
    return $text;
}
private function _DoAnchors_reference_callback($matches) {
    #
    # Builds the anchor for one reference-style link.
    # $matches: [1] = whole match, [2] = link text, [3] = link id.
    # Returns the anchor markup, or the untouched match when the id has
    # no stored link definition.
    #
    $whole_match = $matches[1];
    $link_text = $matches[2];
    $link_id = strtolower($matches[3]);
    if ($link_id == "") {
        $link_id = strtolower($link_text); # for shortcut links like [this][].
    }
    if (isset($this->md_urls[$link_id])) {
        $url = $this->md_urls[$link_id];
        # We've got to encode these to avoid conflicting with italics/bold.
        $url = str_replace(array('*', '_'),
            array(self::$md_escape_table['*'], self::$md_escape_table['_']),
            $url);
        $result = "<a href=\"$url\"";
        if (isset($this->md_titles[$link_id])) {
            $title = $this->md_titles[$link_id];
            $title = str_replace(array('*', '_'),
                array(self::$md_escape_table['*'],
                    self::$md_escape_table['_']), $title);
            $result .= " title=\"$title\"";
        }
        $result .= ">$link_text</a>";
    }
    else {
        # If there's no such link ID, leave intact:
        $result = $whole_match;
    }
    return $result;
}
private function _DoAnchors_inline_callback($matches) {
    #
    # Builds the anchor for one inline-style link.
    # $matches: [2] = link text, [3] = href, [6] = optional title.
    #
    $link_text = $matches[2];
    $url = $matches[3];
    $title = isset($matches[6]) ? $matches[6] : null;
    # We've got to encode these to avoid conflicting with italics/bold.
    $url = str_replace(array('*', '_'),
        array(self::$md_escape_table['*'], self::$md_escape_table['_']),
        $url);
    $result = "<a href=\"$url\"";
    if (isset($title)) {
        # The title sits inside a double-quoted attribute.
        $title = str_replace('"', '&quot;', $title);
        $title = str_replace(array('*', '_'),
            array(self::$md_escape_table['*'],
                self::$md_escape_table['_']), $title);
        $result .= " title=\"$title\"";
    }
    $result .= ">$link_text</a>";
    return $result;
}
private function _DoImages($text) {
    #
    # Turn Markdown image shortcuts into <img> tags.
    # Reference-style images are converted first, then inline images;
    # the markup itself is produced by the two callbacks.
    #
    #
    # First, handle reference-style labeled images: ![alt text][id]
    #
    $text = preg_replace_callback('{
        ( # wrap whole match in $1
        !\[
        ('.self::$md_nested_brackets.') # alt text = $2
        \]
        [ ]? # one optional space
        (?:\n[ ]*)? # one optional newline followed by spaces
        \[
        (.*?) # id = $3
        \]
        )
        }xs',
        array($this,'_DoImages_reference_callback'), $text);
    #
    # Next, handle inline images: ![alt text](url "optional title")
    # Don't forget: encode * and _
    # The URL may be wrapped in angle brackets; neither bracket is
    # captured as part of the src.
    #
    $text = preg_replace_callback('{
        ( # wrap whole match in $1
        !\[
        ('.self::$md_nested_brackets.') # alt text = $2
        \]
        \( # literal paren
        [ \t]*
        <?(\S+?)>? # src url = $3
        [ \t]*
        ( # $4
        ([\'"]) # quote char = $5
        (.*?) # title = $6
        \5 # matching quote
        [ \t]*
        )? # title is optional
        \)
        )
        }xs',
        array($this,'_DoImages_inline_callback'), $text);
    return $text;
}
private function _DoImages_reference_callback($matches) {
    #
    # Builds the image tag for one reference-style image.
    # $matches: [1] = whole match, [2] = alt text, [3] = link id.
    # Returns the tag, or the untouched match when the id has no stored
    # link definition.
    #
    $whole_match = $matches[1];
    $alt_text = $matches[2];
    $link_id = strtolower($matches[3]);
    if ($link_id == "") {
        $link_id = strtolower($alt_text); # for shortcut links like ![this][].
    }
    # The alt text sits inside a double-quoted attribute.
    $alt_text = str_replace('"', '&quot;', $alt_text);
    if (isset($this->md_urls[$link_id])) {
        $url = $this->md_urls[$link_id];
        # We've got to encode these to avoid conflicting with italics/bold.
        $url = str_replace(array('*', '_'),
            array(self::$md_escape_table['*'], self::$md_escape_table['_']),
            $url);
        $result = "<img src=\"$url\" alt=\"$alt_text\"";
        if (isset($this->md_titles[$link_id])) {
            $title = $this->md_titles[$link_id];
            $title = str_replace(array('*', '_'),
                array(self::$md_escape_table['*'],
                    self::$md_escape_table['_']), $title);
            $result .= " title=\"$title\"";
        }
        $result .= $this->md_empty_element_suffix;
    }
    else {
        # If there's no such link ID, leave intact:
        $result = $whole_match;
    }
    return $result;
}
private function _DoImages_inline_callback($matches) {
    #
    # Builds the image tag for one inline image.
    # $matches: [2] = alt text, [3] = src URL, [6] = optional title.
    #
    $alt_text = $matches[2];
    $url = $matches[3];
    $title = '';
    if (isset($matches[6])) {
        $title = $matches[6];
    }
    # Both values sit inside double-quoted attributes.
    $alt_text = str_replace('"', '&quot;', $alt_text);
    $title = str_replace('"', '&quot;', $title);
    # We've got to encode these to avoid conflicting with italics/bold.
    $url = str_replace(array('*', '_'),
        array(self::$md_escape_table['*'], self::$md_escape_table['_']),
        $url);
    $result = "<img src=\"$url\" alt=\"$alt_text\"";
    # NOTE(review): an empty title is omitted here; confirm whether the
    # original emitted title="" in that case.
    if ($title !== '') {
        $title = str_replace(array('*', '_'),
            array(self::$md_escape_table['*'],
                self::$md_escape_table['_']), $title);
        $result .= " title=\"$title\"";
    }
    $result .= $this->md_empty_element_suffix;
    return $result;
}
private function _DoHeaders($text) {
    #
    # Converts Setext-style and atx-style headers into header tags.
    # Callbacks are used rather than the /e (eval) pattern modifier,
    # which PHP 7 removed.
    #
    # Setext-style headers:
    # Header 1
    # ========
    #
    # Header 2
    # --------
    #
    $text = preg_replace_callback('{ ^(.+)[ \t]*\n=+[ \t]*\n+ }mx',
        array($this, '_DoHeaders_callback_setext_h1'), $text);
    $text = preg_replace_callback('{ ^(.+)[ \t]*\n-+[ \t]*\n+ }mx',
        array($this, '_DoHeaders_callback_setext_h2'), $text);
    # atx-style headers:
    # # Header 1
    # ## Header 2
    # ## Header 2 with closing hashes ##
    # ...
    # ###### Header 6
    #
    $text = preg_replace_callback("{
        ^(\\#{1,6}) # $1 = string of #'s
        [ \\t]*
        (.+?) # $2 = Header text
        [ \\t]*
        \\#* # optional closing #'s (not counted)
        \\n+
        }xm",
        array($this, '_DoHeaders_callback_atx'), $text);
    return $text;
}
private function _DoHeaders_callback_setext_h1($matches) {
    # Level-1 header from a line underlined with "=".
    return "<h1>" . $this->_RunSpanGamut($matches[1]) . "</h1>\n\n";
}
private function _DoHeaders_callback_setext_h2($matches) {
    # Level-2 header from a line underlined with "-".
    return "<h2>" . $this->_RunSpanGamut($matches[1]) . "</h2>\n\n";
}
private function _DoHeaders_callback_atx($matches) {
    # The header level is the number of leading # characters.
    $level = strlen($matches[1]);
    return "<h$level>" . $this->_RunSpanGamut($matches[2]) . "</h$level>\n\n";
}
private function _DoCodeBlocks($text) {
    #
    # Process Markdown code blocks: runs of lines indented by a tab or a
    # tab-width of spaces are handed to _DoCodeBlocks_callback().
    #
    $text = preg_replace_callback('{
        (?:\n\n|\A)
        ( # $1 = the code block -- one or more lines, starting with a space/tab
        (?:
        (?:[ ]{'.$this->md_tab_width.'} | \t) # Lines must start with a tab or a tab-width of spaces
        .*\n+
        )+
        )
        ((?=^[ ]{0,'.$this->md_tab_width.'}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
        }xm',
        array($this,'_DoCodeBlocks_callback'), $text);
    return $text;
}
private function _DoCodeBlocks_callback($matches) {
    #
    # Renders one indented code block. $matches[1] is the raw block.
    #
    $codeblock = $matches[1];
    # Remove one level of indentation, then neutralise characters that
    # are special in Markdown.
    $codeblock = $this->_EncodeCode($this->_Outdent($codeblock));
    # trim leading newlines and trailing whitespace
    $codeblock = preg_replace(array('/\A\n+/', '/\s+\z/'), '', $codeblock);
    # The processed block has to be part of the result; without it the
    # code block's content would be dropped from the output.
    $result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n";
    return $result;
}
private function _DoCodeSpans($text) {
#
# * Backtick quotes are used for " . $codeblock . "\n
spans.
#
# * You can use multiple backticks as the delimiters if you want to
# include literal backticks in the code span. So, this input:
#
# Just type ``foo `bar` baz`` at the prompt.
#
# Will translate to:
#
#
foo `bar` baz
at the prompt.`bar`
...
#
$text = preg_replace_callback('@
(?_EncodeCode($c);
return "$c
";
}
private function _EncodeCode($_) {
    #
    # Encode/escape certain characters inside Markdown code runs.
    # The point is that in code, these characters are literals,
    # and lose their special Markdown meanings.
    # Returns the encoded string.
    #
    # Encode all ampersands; HTML entities are not
    # entities within a Markdown code span. This must run first so the
    # ampersands of the entities added below are not encoded again.
    $_ = str_replace('&', '&amp;', $_);
    # Do the angle bracket song and dance:
    $_ = str_replace(array('<', '>'),
        array('&lt;', '&gt;'), $_);
    # Now, escape characters that are magic in Markdown:
    $_ = str_replace(array_keys(self::$md_escape_table),
        array_values(self::$md_escape_table), $_);
    return $_;
}
private function _DoItalicsAndBold($text) {
# must go first:
$text = preg_replace('{
( # $1: Marker
(?\2', $text);
# Then :
$text = preg_replace(
'{ ( (?\2', $text);
return $text;
}
private function _DoBlockQuotes($text) {
    #
    # Finds every run of ">"-prefixed lines (with their continuation
    # lines and trailing blanks) and passes it to
    # _DoBlockQuotes_callback() for conversion.
    #
    $quote_pattern = '/
( # Wrap whole match in $1
(
^[ \t]*>[ \t]? # ">" at the start of a line
.+\n # rest of the first line
(.+\n)* # subsequent consecutive lines
\n* # blanks
)+
)
/xm';
    $handler = array($this, '_DoBlockQuotes_callback');
    return preg_replace_callback($quote_pattern, $handler, $text);
}
private function _DoBlockQuotes_callback($matches) {
    #
    # Renders one block quote. $matches[1] is the quoted run of lines.
    #
    $bq = $matches[1];
    # trim one level of quoting - trim whitespace-only lines
    $bq = preg_replace(array('/^[ \t]*>[ \t]?/m', '/^[ \t]+$/m'), '', $bq);
    $bq = $this->_RunBlockGamut($bq); # recurse
    # Indent the quoted content by one space per line.
    $bq = preg_replace('/^/m', " ", $bq);
    # These leading spaces screw with <pre> content, so we need to fix that:
    # only <pre>...</pre> sections get the indent removed again.
    $bq = preg_replace_callback('{(\s*<pre>.+?</pre>)}sx',
        array($this,'_DoBlockQuotes_callback2'), $bq);
    return "<blockquote>\n$bq\n</blockquote>\n\n";
}
private function _DoBlockQuotes_callback2($matches) {
    # Strips one leading space from every line of the captured text,
    # undoing the indent that _DoBlockQuotes_callback() added.
    return preg_replace('/^ /m', '', $matches[1]);
}
private function _FormParagraphs($text) {
#
# Params:
# $text - string to process with html as well).
For more information about Markdown's syntax, see: