diff options
author | xue <> | 2006-06-02 18:27:02 +0000 |
---|---|---|
committer | xue <> | 2006-06-02 18:27:02 +0000 |
commit | 0f3a577bed4d828472469675e90fcab032e33f44 (patch) | |
tree | 3ca817247b8006563900d5fb8995d6a6f0627a2b /framework/3rdParty/Markdown/MarkdownParser.php | |
parent | 067ab51fbd9b2f18f63fc80895476e5b0e2f9bfb (diff) |
merge from 3.0 branch till 1133.
Diffstat (limited to 'framework/3rdParty/Markdown/MarkdownParser.php')
-rw-r--r-- | framework/3rdParty/Markdown/MarkdownParser.php | 1257 |
1 files changed, 1257 insertions, 0 deletions
<?php

#
# Markdown  -  A text-to-HTML conversion tool for web writers
#
# Copyright (c) 2004-2005 John Gruber
# <http://daringfireball.net/projects/markdown/>
#
# Copyright (c) 2004-2005 Michel Fortin - PHP Port
# <http://www.michelf.com/projects/php-markdown/>
#

/**
 * PHP5 version of the markdown parser.
 * Usage:
 * <code>
 * $markdown = new MarkdownParser;
 * echo $markdown->parse($text);
 * </code>
 *
 * Special characters that must survive later passes are temporarily replaced
 * by their MD5 hash (see $md_escape_table) and swapped back at the very end
 * of parse().
 */
class MarkdownParser
{
    # Regex fragment matching balanced square brackets; built once, lazily.
    private static $md_nested_brackets;
    # Map: special character => MD5 placeholder.
    private static $md_escape_table = array();
    # Map: backslash + special character => the same MD5 placeholder.
    private static $md_backslash_escape_table = array();
    # Maximum nesting depth of [brackets] recognised in link text.
    private static $md_nested_brackets_depth = 6;

    protected $md_empty_element_suffix = " />";     # Change to ">" for HTML output
    protected $md_tab_width = 4;

    # Greater than zero while list items are being processed.
    private $md_list_level = 0;
    # Link definitions collected from the document: id => url / id => title.
    private $md_urls = array();
    private $md_titles = array();
    # Hashed block-level HTML: md5 key => original markup.
    private $md_html_blocks = array();

    /**
     * Builds the shared static tables the first time a parser is created.
     */
    public function __construct()
    {
        if (is_null(self::$md_nested_brackets)) {
            $this->initialize();
        }
    }

    /**
     * Fills the static nested-bracket pattern and the two escape tables.
     */
    private function initialize()
    {
        self::$md_nested_brackets =
            str_repeat('(?>[^\[\]]+|\[', self::$md_nested_brackets_depth) .
            str_repeat('\])*', self::$md_nested_brackets_depth);

        # The backslash must stay first: escaped backslashes have to be
        # processed before any other backslash escape.
        $special_chars = array(
            '\\', '`', '*', '_', '{', '}', '[', ']',
            '(', ')', '>', '#', '+', '-', '.', '!',
        );

        foreach ($special_chars as $char) {
            $hash = md5($char);
            self::$md_escape_table[$char] = $hash;
            # Identical table, but keyed by the backslash-escaped character.
            self::$md_backslash_escape_table['\\' . $char] = $hash;
        }
    }

    /**
     * Converts Markdown source into (X)HTML.
     *
     * The order in which the passes are called is essential. Link and image
     * substitutions need to happen before emphasis processing so that any
     * *'s or _'s inside the generated <a> and <img> tags get encoded.
     *
     * @param string $text Markdown source
     * @return string generated markup, terminated by a newline
     */
    public function parse($text)
    {
        # Clear the hashes. If we don't clear these, you get conflicts
        # from other articles when generating a page which contains more than
        # one article (e.g. an index page that shows the N most recent
        # articles):
        $this->md_urls = array();
        $this->md_titles = array();
        $this->md_html_blocks = array();

        # Standardize line endings:
        #   DOS to Unix and Mac to Unix
        $text = str_replace(array("\r\n", "\r"), "\n", $text);

        # Make sure $text ends with a couple of newlines:
        $text .= "\n\n";

        # Convert all tabs to spaces.
        $text = $this->_Detab($text);

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ \t]*\n+/ .
        $text = preg_replace('/^[ \t]+$/m', '', $text);

        # Turn block-level HTML blocks into hash entries
        $text = $this->_HashHTMLBlocks($text);

        # Strip link definitions, store in hashes.
        $text = $this->_StripLinkDefinitions($text);

        $text = $this->_RunBlockGamut($text);

        $text = $this->_UnescapeSpecialChars($text);

        return $text . "\n";
    }

    /**
     * Replaces every '*' and '_' by its MD5 placeholder so that the
     * character cannot be picked up later as an emphasis marker.
     *
     * @param string $text
     * @return string
     */
    private function _EscapeEmphasisChars($text)
    {
        return str_replace(
            array('*', '_'),
            array(self::$md_escape_table['*'], self::$md_escape_table['_']),
            $text
        );
    }

    /**
     * Strips link definitions from the text and stores the URLs and titles
     * in $md_urls / $md_titles.
     *
     * @param string $text
     * @return string text without the definition lines
     */
    private function _StripLinkDefinitions($text)
    {
        $less_than_tab = $this->md_tab_width - 1;

        # Link defs are in the form: ^[id]: url "optional title"
        $text = preg_replace_callback('{
                ^[ ]{0,' . $less_than_tab . '}\[(.+)\]:  # id = $1
                  [ \t]*
                  \n?               # maybe *one* newline
                  [ \t]*
                <?(\S+?)>?          # url = $2
                  [ \t]*
                  \n?               # maybe one newline
                  [ \t]*
                (?:
                    (?<=\s)         # lookbehind for whitespace
                    ["(]
                    (.+?)           # title = $3
                    [")]
                    [ \t]*
                )?  # title is optional
                (?:\n+|\Z)
            }xm',
            array($this, '_StripLinkDefinitions_callback'),
            $text);
        return $text;
    }

    /**
     * Records one link definition; link ids are case-insensitive.
     *
     * @param array $matches regex groups: 1 = id, 2 = url, 3 = optional title
     * @return string always '' (the definition is removed from the text)
     */
    private function _StripLinkDefinitions_callback($matches)
    {
        $link_id = strtolower($matches[1]);
        $this->md_urls[$link_id] = $this->_EncodeAmpsAndAngles($matches[2]);
        if (isset($matches[3])) {
            # The title ends up inside a double-quoted attribute.
            $this->md_titles[$link_id] = str_replace('"', '&quot;', $matches[3]);
        }
        return ''; # String that will replace the block
    }

    /**
     * Replaces block-level HTML by an MD5 key surrounded by blank lines,
     * remembering the markup in $md_html_blocks so that _FormParagraphs()
     * can restore it untouched.
     *
     * @param string $text
     * @return string
     */
    private function _HashHTMLBlocks($text)
    {
        $less_than_tab = $this->md_tab_width - 1;
        $callback = array($this, '_HashHTMLBlocks_callback');

        # Hashify HTML blocks:
        # We only want to do this for block-level HTML tags, such as headers,
        # lists, and tables. That's because we still want to wrap <p>s around
        # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        # phrase emphasis, and spans. The list of tags we're looking for is
        # hard-coded:
        $block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|' .
                        'script|noscript|form|fieldset|iframe|math|ins|del';
        $block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|' .
                        'script|noscript|form|fieldset|iframe|math';

        # First, look for nested blocks, e.g.:
        #   <div>
        #       <div>
        #       tags for inner block must be indented.
        #       </div>
        #   </div>
        #
        # The outermost tags must start at the left margin for this to match, and
        # the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the next
        # match will start at the first `<div>` and stop at the first `</div>`.
        $text = preg_replace_callback('{
                (                       # save in $1
                    ^                   # start of line  (with /m)
                    <(' . $block_tags_a . ')    # start tag = $2
                    \b                  # word break
                    (.*\n)*?            # any number of lines, minimally matching
                    </\2>               # the matching end tag
                    [ \t]*              # trailing spaces/tabs
                    (?=\n+|\Z)          # followed by a newline or end of document
                )
            }xm',
            $callback,
            $text);

        #
        # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
        #
        $text = preg_replace_callback('{
                (                       # save in $1
                    ^                   # start of line  (with /m)
                    <(' . $block_tags_b . ')    # start tag = $2
                    \b                  # word break
                    (.*\n)*?            # any number of lines, minimally matching
                    .*</\2>             # the matching end tag
                    [ \t]*              # trailing spaces/tabs
                    (?=\n+|\Z)          # followed by a newline or end of document
                )
            }xm',
            $callback,
            $text);

        # Special case just for <hr />. It was easier to make a special case than
        # to make the other regex more complicated.
        $text = preg_replace_callback('{
                (?:
                    (?<=\n\n)       # Starting after a blank line
                    |               # or
                    \A\n?           # the beginning of the doc
                )
                (                       # save in $1
                    [ ]{0,' . $less_than_tab . '}
                    <(hr)               # start tag = $2
                    \b                  # word break
                    ([^<>])*?           #
                    /?>                 # the matching end tag
                    [ \t]*
                    (?=\n{2,}|\Z)       # followed by a blank line or end of document
                )
            }x',
            $callback,
            $text);

        # Special case for standalone HTML comments:
        $text = preg_replace_callback('{
                (?:
                    (?<=\n\n)       # Starting after a blank line
                    |               # or
                    \A\n?           # the beginning of the doc
                )
                (                       # save in $1
                    [ ]{0,' . $less_than_tab . '}
                    (?s:
                        <!
                        (--.*?--\s*)+
                        >
                    )
                    [ \t]*
                    (?=\n{2,}|\Z)       # followed by a blank line or end of document
                )
            }x',
            $callback,
            $text);

        return $text;
    }

    /**
     * Stores one HTML block under its MD5 key.
     *
     * @param array $matches regex groups: 1 = the whole HTML block
     * @return string the key, isolated by blank lines
     */
    private function _HashHTMLBlocks_callback($matches)
    {
        $text = $matches[1];
        $key = md5($text);
        $this->md_html_blocks[$key] = $text;
        return "\n\n$key\n\n"; # String that will replace the block
    }

    /**
     * Runs all the transformations that form block-level tags like
     * paragraphs, headers, and list items.
     *
     * @param string $text
     * @return string
     */
    private function _RunBlockGamut($text)
    {
        $text = $this->_DoHeaders($text);

        # Do Horizontal Rules:
        $text = preg_replace(
            array('{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}mx',
                  '{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}mx',
                  '{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}mx'),
            "\n<hr{$this->md_empty_element_suffix}\n",
            $text);

        $text = $this->_DoLists($text);
        $text = $this->_DoCodeBlocks($text);
        $text = $this->_DoBlockQuotes($text);

        # We already ran _HashHTMLBlocks() before, in parse(), but that
        # was to escape raw HTML in the original Markdown source. This time,
        # we're escaping the markup we've just created, so that we don't wrap
        # <p> tags around block-level tags.
        $text = $this->_HashHTMLBlocks($text);
        $text = $this->_FormParagraphs($text);

        return $text;
    }

    /**
     * Runs all the transformations that occur *within* block-level tags
     * like paragraphs, headers, and list items.
     *
     * @param string $text
     * @return string
     */
    private function _RunSpanGamut($text)
    {
        $text = $this->_DoCodeSpans($text);

        $text = $this->_EscapeSpecialChars($text);

        # Process anchor and image tags. Images must come first,
        # because ![foo][f] looks like an anchor.
        $text = $this->_DoImages($text);
        $text = $this->_DoAnchors($text);

        # Make links out of things like `<http://example.com/>`
        # Must come after _DoAnchors(), because you can use < and >
        # delimiters in inline links like [this](<url>).
        $text = $this->_DoAutoLinks($text);
        $text = $this->_EncodeAmpsAndAngles($text);
        $text = $this->_DoItalicsAndBold($text);

        # Do hard breaks:
        $text = preg_replace('/ {2,}\n/', "<br{$this->md_empty_element_suffix}\n", $text);

        return $text;
    }

    /**
     * Protects emphasis characters inside HTML tags and resolves backslash
     * escapes in the text between tags.
     *
     * @param string $text
     * @return string
     */
    private function _EscapeSpecialChars($text)
    {
        $rebuilt = '';

        foreach ($this->_TokenizeHTML($text) as $token) {
            if ($token[0] == 'tag') {
                # Within tags, encode * and _ so they don't conflict
                # with their use in Markdown for italics and strong.
                # We're replacing each such character with its
                # corresponding MD5 checksum value; this is likely
                # overkill, but it should prevent us from colliding
                # with the escape values by accident.
                $rebuilt .= $this->_EscapeEmphasisChars($token[1]);
            } else {
                $rebuilt .= $this->_EncodeBackslashEscapes($token[1]);
            }
        }
        return $rebuilt;
    }

    /**
     * Turns Markdown link shortcuts into XHTML <a> tags.
     *
     * @param string $text
     * @return string
     */
    private function _DoAnchors($text)
    {
        $bracket = self::$md_nested_brackets;

        #
        # First, handle reference-style links: [link text] [id]
        #
        $text = preg_replace_callback('{
            (                   # wrap whole match in $1
              \[
                (' . $bracket . ')  # link text = $2
              \]

              [ ]?              # one optional space
              (?:\n[ ]*)?       # one optional newline followed by spaces

              \[
                (.*?)           # id = $3
              \]
            )
            }xs',
            array($this, '_DoAnchors_reference_callback'), $text);

        #
        # Next, inline-style links: [link text](url "optional title")
        #
        $text = preg_replace_callback('{
            (               # wrap whole match in $1
              \[
                (' . $bracket . ')  # link text = $2
              \]
              \(            # literal paren
                [ \t]*
                <?(.*?)>?   # href = $3
                [ \t]*
                (           # $4
                  ([\'"])   # quote char = $5
                  (.*?)     # Title = $6
                  \5        # matching quote
                )?          # title is optional
              \)
            )
            }xs',
            array($this, '_DoAnchors_inline_callback'), $text);

        return $text;
    }

    /**
     * Builds the <a> tag for a reference-style link, or returns the source
     * text unchanged when the link id was never defined.
     *
     * @param array $matches regex groups: 1 = whole match, 2 = text, 3 = id
     * @return string
     */
    private function _DoAnchors_reference_callback($matches)
    {
        $link_text = $matches[2];
        $link_id = strtolower($matches[3]);

        if ($link_id == "") {
            $link_id = strtolower($link_text); # for shortcut links like [this][].
        }

        if (!isset($this->md_urls[$link_id])) {
            return $matches[1];
        }

        # We've got to encode these to avoid conflicting with italics/bold.
        $url = $this->_EscapeEmphasisChars($this->md_urls[$link_id]);
        $result = "<a href=\"$url\"";
        if (isset($this->md_titles[$link_id])) {
            $title = $this->_EscapeEmphasisChars($this->md_titles[$link_id]);
            $result .= " title=\"$title\"";
        }
        return $result . ">$link_text</a>";
    }

    /**
     * Builds the <a> tag for an inline link.
     *
     * @param array $matches regex groups: 2 = text, 3 = href, 6 = optional title
     * @return string
     */
    private function _DoAnchors_inline_callback($matches)
    {
        $link_text = $matches[2];

        # We've got to encode these to avoid conflicting with italics/bold.
        $url = $this->_EscapeEmphasisChars($matches[3]);
        $result = "<a href=\"$url\"";
        if (isset($matches[6])) {
            $title = str_replace('"', '&quot;', $matches[6]);
            $title = $this->_EscapeEmphasisChars($title);
            $result .= " title=\"$title\"";
        }

        return $result . ">$link_text</a>";
    }

    /**
     * Turns Markdown image shortcuts into <img> tags.
     *
     * @param string $text
     * @return string
     */
    private function _DoImages($text)
    {
        #
        # First, handle reference-style labeled images: ![alt text][id]
        #
        $text = preg_replace_callback('{
            (               # wrap whole match in $1
              !\[
                (' . self::$md_nested_brackets . ')     # alt text = $2
              \]

              [ ]?              # one optional space
              (?:\n[ ]*)?       # one optional newline followed by spaces

              \[
                (.*?)       # id = $3
              \]

            )
            }xs',
            array($this, '_DoImages_reference_callback'), $text);

        #
        # Next, handle inline images:  ![alt text](url "optional title")
        # Don't forget: encode * and _
        #
        $text = preg_replace_callback('{
            (               # wrap whole match in $1
              !\[
                (' . self::$md_nested_brackets . ')     # alt text = $2
              \]
              \(            # literal paren
                [ \t]*
                <?(\S+?)>?  # src url = $3
                [ \t]*
                (           # $4
                  ([\'"])   # quote char = $5
                  (.*?)     # title = $6
                  \5        # matching quote
                  [ \t]*
                )?          # title is optional
              \)
            )
            }xs',
            array($this, '_DoImages_inline_callback'), $text);

        return $text;
    }

    /**
     * Builds the <img> tag for a reference-style image, or returns the
     * source text unchanged when the id was never defined.
     *
     * @param array $matches regex groups: 1 = whole match, 2 = alt text, 3 = id
     * @return string
     */
    private function _DoImages_reference_callback($matches)
    {
        $alt_text = $matches[2];
        $link_id = strtolower($matches[3]);

        if ($link_id == "") {
            $link_id = strtolower($alt_text); # for shortcut links like ![this][].
        }

        if (!isset($this->md_urls[$link_id])) {
            # If there's no such link ID, leave intact:
            return $matches[1];
        }

        $alt_text = str_replace('"', '&quot;', $alt_text);
        # We've got to encode these to avoid conflicting with italics/bold.
        $url = $this->_EscapeEmphasisChars($this->md_urls[$link_id]);
        $result = "<img src=\"$url\" alt=\"$alt_text\"";
        if (isset($this->md_titles[$link_id])) {
            $title = $this->_EscapeEmphasisChars($this->md_titles[$link_id]);
            $result .= " title=\"$title\"";
        }
        return $result . $this->md_empty_element_suffix;
    }

    /**
     * Builds the <img> tag for an inline image.
     *
     * The title attribute is always emitted, empty when no title was given;
     * this is the behaviour of the original implementation and is kept.
     *
     * @param array $matches regex groups: 2 = alt text, 3 = src, 6 = optional title
     * @return string
     */
    private function _DoImages_inline_callback($matches)
    {
        $title = isset($matches[6]) ? $matches[6] : '';

        $alt_text = str_replace('"', '&quot;', $matches[2]);
        $title = str_replace('"', '&quot;', $title);
        # We've got to encode these to avoid conflicting with italics/bold.
        $url = $this->_EscapeEmphasisChars($matches[3]);
        $title = $this->_EscapeEmphasisChars($title);

        return "<img src=\"$url\" alt=\"$alt_text\" title=\"$title\""
            . $this->md_empty_element_suffix;
    }

    /**
     * Converts Setext-style and atx-style headers into <h1>..<h6>.
     *
     * Implemented with preg_replace_callback(): the /e (eval) modifier used
     * previously no longer exists in PHP 7 and later.
     *
     * @param string $text
     * @return string
     */
    private function _DoHeaders($text)
    {
        # Setext-style headers:
        #     Header 1
        #     ========
        #
        #     Header 2
        #     --------
        #
        $text = preg_replace_callback('{ ^(.+)[ \t]*\n=+[ \t]*\n+ }mx',
            array($this, '_DoHeaders_callback_setext_h1'), $text);
        $text = preg_replace_callback('{ ^(.+)[ \t]*\n-+[ \t]*\n+ }mx',
            array($this, '_DoHeaders_callback_setext_h2'), $text);

        # atx-style headers:
        #   # Header 1
        #   ## Header 2
        #   ## Header 2 with closing hashes ##
        #   ...
        #   ###### Header 6
        #
        $text = preg_replace_callback('{
                ^(\#{1,6})  # $1 = run of hash marks
                [ \t]*
                (.+?)       # $2 = header text
                [ \t]*
                \#*         # optional closing hash marks (not counted)
                \n+
            }xm',
            array($this, '_DoHeaders_callback_atx'), $text);

        return $text;
    }

    /**
     * @param array $matches regex groups: 1 = header text
     * @return string <h1> element followed by a blank line
     */
    private function _DoHeaders_callback_setext_h1($matches)
    {
        return '<h1>' . $this->_RunSpanGamut($matches[1]) . "</h1>\n\n";
    }

    /**
     * @param array $matches regex groups: 1 = header text
     * @return string <h2> element followed by a blank line
     */
    private function _DoHeaders_callback_setext_h2($matches)
    {
        return '<h2>' . $this->_RunSpanGamut($matches[1]) . "</h2>\n\n";
    }

    /**
     * @param array $matches regex groups: 1 = run of '#', 2 = header text
     * @return string header element whose level is the number of '#'
     */
    private function _DoHeaders_callback_atx($matches)
    {
        $level = strlen($matches[1]);
        return "<h$level>" . $this->_RunSpanGamut($matches[2]) . "</h$level>\n\n";
    }

    /**
     * Forms HTML ordered (numbered) and unordered (bulleted) lists.
     *
     * @param string $text
     * @return string
     */
    private function _DoLists($text)
    {
        $less_than_tab = $this->md_tab_width - 1;

        # Patterns matching list item bullets and number markers:
        $markers = array('[*+-]', '\d+[.]');

        foreach ($markers as $marker) {
            # Re-usable pattern to match any entire ul or ol list:
            $whole_list = '
                (                               # $1 = whole list
                  (                             # $2
                    [ ]{0,' . $less_than_tab . '}
                    (' . $marker . ')           # $3 = first list item marker
                    [ \t]+
                  )
                  (?s:.+?)
                  (                             # $4
                      \z
                    |
                      \n{2,}
                      (?=\S)
                      (?!                       # Negative lookahead for another list item marker
                        [ \t]*
                        ' . $marker . '[ \t]+
                      )
                  )
                )
            '; // mx

            # We use a different prefix before nested lists than top-level lists.
            # See extended comment in _ProcessListItems().
            # NOTE: despite its name, _DoLists_callback_top is the callback
            # used while inside a list, and _DoLists_callback_nested is the
            # one used at the top level.
            if ($this->md_list_level) {
                $text = preg_replace_callback('{
                        ^
                        ' . $whole_list . '
                    }mx',
                    array($this, '_DoLists_callback_top'), $text);
            } else {
                $text = preg_replace_callback('{
                        (?:(?<=\n\n)|\A\n?)
                        ' . $whole_list . '
                    }mx',
                    array($this, '_DoLists_callback_nested'), $text);
            }
        }

        return $text;
    }

    /**
     * Renders the <li> items of one matched list and reports its tag name.
     *
     * @param array $matches regex groups: 1 = whole list, 3 = first marker
     * @return array two elements: tag name ('ul' or 'ol'), rendered items
     */
    private function _RenderListItems($matches)
    {
        $marker_ul = '[*+-]';
        $marker_ol = '\d+[.]';

        $is_bulleted = preg_match("/$marker_ul/", $matches[3]);
        $list_type = $is_bulleted ? "ul" : "ol";
        $item_marker = $is_bulleted ? $marker_ul : $marker_ol;

        # Turn double returns into triple returns, so that we can make a
        # paragraph for the last item in a list, if necessary:
        $list = preg_replace("/\n{2,}/", "\n\n\n", $matches[1]);

        return array($list_type, $this->_ProcessListItems($list, $item_marker));
    }

    /**
     * Callback for a list found while already inside a list.
     *
     * @param array $matches see _RenderListItems()
     * @return string
     */
    private function _DoLists_callback_top($matches)
    {
        list($list_type, $items) = $this->_RenderListItems($matches);

        # Trim any trailing whitespace, to put the closing `</$list_type>`
        # up on the preceding line, to get it past the current stupid
        # HTML block parser. This is a hack to work around the terrible
        # hack that is the HTML block parser.
        return "<$list_type>" . rtrim($items) . "</$list_type>\n";
    }

    /**
     * Callback for a list found at the top level of a block.
     *
     * @param array $matches see _RenderListItems()
     * @return string
     */
    private function _DoLists_callback_nested($matches)
    {
        list($list_type, $items) = $this->_RenderListItems($matches);
        return "<$list_type>\n" . $items . "</$list_type>\n";
    }

    /**
     * Processes the contents of a single ordered or unordered list,
     * splitting it into individual list items.
     *
     * @param string $list_str   source of the whole list
     * @param string $marker_any regex fragment matching the item marker
     * @return string the <li> elements
     */
    private function _ProcessListItems($list_str, $marker_any)
    {
        # The $md_list_level keeps track of when we're inside a list.
        # Each time we enter a list, we increment it; when we leave a list,
        # we decrement. If it's zero, we're not in a list anymore.
        #
        # We do this because when we're not inside a list, we want to treat
        # something like this:
        #
        #       I recommend upgrading to version
        #       8. Oops, now this line is treated
        #       as a sub-list.
        #
        # As a single paragraph, despite the fact that the second line starts
        # with a digit-period-space sequence.
        #
        # Whereas when we're inside a list (or sub-list), that line will be
        # treated as the start of a sub-list. What a kludge, huh? This is
        # an aspect of Markdown's syntax that's hard to parse perfectly
        # without resorting to mind-reading. Perhaps the solution is to
        # change the syntax rules such that sub-lists must start with a
        # starting cardinal number; e.g. "1." or "a.".

        $this->md_list_level++;

        # trim trailing blank lines:
        $list_str = preg_replace('/\n{2,}\z/', "\n", $list_str);

        $list_str = preg_replace_callback('{
            (\n)?                           # leading line = $1
            (^[ \t]*)                       # leading whitespace = $2
            (' . $marker_any . ') [ \t]+    # list marker = $3
            ((?s:.+?)                       # list item text   = $4
            (\n{1,2}))
            (?= \n* (\z | \2 (' . $marker_any . ') [ \t]+))
            }xm',
            array($this, '_ProcessListItems_callback'), $list_str);

        $this->md_list_level--;
        return $list_str;
    }

    /**
     * Renders one list item. Items preceded by a blank line, or containing
     * one, get full block processing; others only sub-lists and span markup.
     *
     * @param array $matches regex groups: 1 = leading newline, 4 = item text
     * @return string
     */
    private function _ProcessListItems_callback($matches)
    {
        $item = $this->_Outdent($matches[4]);

        if ($matches[1] || preg_match('/\n{2,}/', $matches[4])) {
            $item = $this->_RunBlockGamut($item);
        } else {
            # Recursion for sub-lists:
            $item = $this->_DoLists($item);
            $item = preg_replace('/\n+$/', '', $item);
            $item = $this->_RunSpanGamut($item);
        }

        return "<li>" . $item . "</li>\n";
    }

    /**
     * Processes Markdown `<pre><code>` blocks (lines indented by a tab or a
     * tab-width of spaces).
     *
     * @param string $text
     * @return string
     */
    private function _DoCodeBlocks($text)
    {
        $text = preg_replace_callback('{
                (?:\n\n|\A)
                (               # $1 = the code block -- one or more lines, starting with a space/tab
                  (?:
                    (?:[ ]{' . $this->md_tab_width . '} | \t)  # Lines must start with a tab or a tab-width of spaces
                    .*\n+
                  )+
                )
                ((?=^[ ]{0,' . $this->md_tab_width . '}\S)|\Z)  # Lookahead for non-space at line-start, or end of doc
            }xm',
            array($this, '_DoCodeBlocks_callback'), $text);

        return $text;
    }

    /**
     * @param array $matches regex groups: 1 = the indented code block
     * @return string the <pre><code> element isolated by blank lines
     */
    private function _DoCodeBlocks_callback($matches)
    {
        $codeblock = $this->_EncodeCode($this->_Outdent($matches[1]));
        # trim leading newlines and trailing whitespace
        $codeblock = preg_replace(array('/\A\n+/', '/\s+\z/'), '', $codeblock);

        return "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n";
    }

    /**
     * Converts backtick-quoted runs into <code> spans.
     *
     *  *   Backtick quotes are used for <code></code> spans.
     *
     *  *   You can use multiple backticks as the delimiters if you want to
     *      include literal backticks in the code span. So, this input:
     *
     *        Just type ``foo `bar` baz`` at the prompt.
     *
     *      Will translate to:
     *
     *        <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
     *
     *      There's no arbitrary limit to the number of backticks you
     *      can use as delimiters. If you need three consecutive backticks
     *      in your code, use four for delimiters, etc.
     *
     *  *   You can use spaces to get literal backticks at the edges:
     *
     *        ... type `` `bar` `` ...
     *
     *      Turns to:
     *
     *        ... type <code>`bar`</code> ...
     *
     * @param string $text
     * @return string
     */
    private function _DoCodeSpans($text)
    {
        $text = preg_replace_callback('@
                (?<!\\\\)   # Character before opening ` must not be a backslash
                (`+)        # $1 = Opening run of `
                (.+?)       # $2 = The code block
                (?<!`)
                \1          # Matching closer
                (?!`)
            @xs',
            array($this, '_DoCodeSpans_callback'), $text);

        return $text;
    }

    /**
     * @param array $matches regex groups: 2 = content between the backticks
     * @return string
     */
    private function _DoCodeSpans_callback($matches)
    {
        $c = $matches[2];
        $c = preg_replace('/^[ \t]*/', '', $c); # leading whitespace
        $c = preg_replace('/[ \t]*$/', '', $c); # trailing whitespace
        $c = $this->_EncodeCode($c);
        return "<code>$c</code>";
    }

    /**
     * Encodes/escapes certain characters inside Markdown code runs.
     * The point is that in code, these characters are literals,
     * and lose their special Markdown meanings.
     *
     * @param string $_ raw code
     * @return string
     */
    private function _EncodeCode($_)
    {
        # Encode all ampersands; HTML entities are not
        # entities within a Markdown code span.
        $_ = str_replace('&', '&amp;', $_);

        # Do the angle bracket song and dance:
        $_ = str_replace(array('<', '>'),
                         array('&lt;', '&gt;'), $_);

        # Now, escape characters that are magic in Markdown:
        $_ = str_replace(array_keys(self::$md_escape_table),
                         array_values(self::$md_escape_table), $_);

        return $_;
    }

    /**
     * Converts ** / __ pairs into <strong> and * / _ pairs into <em>.
     *
     * @param string $text
     * @return string
     */
    private function _DoItalicsAndBold($text)
    {
        # <strong> must go first:
        $text = preg_replace('{
                (                       # $1: Marker
                    (?<!\*\*) \*\* |    #     (not preceded by two chars of
                    (?<!__)   __        #      the same marker)
                )
                (?=\S)                  # Not followed by whitespace
                (?!\1)                  #   or two others marker chars.
                (                       # $2: Content
                    (?:
                        [^*_]+?         # Anything not em markers.
                    |
                                        # Balance any regular emphasis inside.
                        ([*_]) (?=\S) .+? (?<=\S) \3    # $3: em char (* or _)
                    |
                        (?! \1 ) .      # Allow unbalanced * and _.
                    )+?
                )
                (?<=\S) \1              # End mark not preceded by whitespace.
            }sx',
            '<strong>\2</strong>', $text);
        # Then <em>:
        $text = preg_replace(
            '{ ( (?<!\*)\* | (?<!_)_ ) (?=\S) (?! \1) (.+?) (?<=\S) \1 }sx',
            '<em>\2</em>', $text);

        return $text;
    }

    /**
     * Converts runs of lines quoted with ">" into <blockquote> elements.
     *
     * @param string $text
     * @return string
     */
    private function _DoBlockQuotes($text)
    {
        $text = preg_replace_callback('/
              (                             # Wrap whole match in $1
                (
                  ^[ \t]*>[ \t]?            # ">" at the start of a line
                    .+\n                    # rest of the first line
                  (.+\n)*                   # subsequent consecutive lines
                  \n*                       # blanks
                )+
              )
            /xm',
            array($this, '_DoBlockQuotes_callback'), $text);

        return $text;
    }

    /**
     * @param array $matches regex groups: 1 = the quoted lines
     * @return string the <blockquote> element followed by a blank line
     */
    private function _DoBlockQuotes_callback($matches)
    {
        $bq = $matches[1];
        # trim one level of quoting - trim whitespace-only lines
        $bq = preg_replace(array('/^[ \t]*>[ \t]?/m', '/^[ \t]+$/m'), '', $bq);
        $bq = $this->_RunBlockGamut($bq);       # recurse

        # Indent the quoted markup by two spaces.
        $bq = preg_replace('/^/m', "  ", $bq);
        # These leading spaces screw with <pre> content, so we need to fix that:
        $bq = preg_replace_callback('{(\s*<pre>.+?</pre>)}sx',
            array($this, '_DoBlockQuotes_callback2'), $bq);

        return "<blockquote>\n$bq\n</blockquote>\n\n";
    }

    /**
     * Removes the blockquote indentation from the lines of a <pre> block.
     *
     * @param array $matches regex groups: 1 = the <pre> element
     * @return string
     */
    private function _DoBlockQuotes_callback2($matches)
    {
        return preg_replace('/^  /m', '', $matches[1]);
    }

    /**
     * Wraps every remaining chunk of text in <p> tags and restores the
     * hashed HTML blocks.
     *
     * @param string $text string to process with html <p> tags
     * @return string
     */
    private function _FormParagraphs($text)
    {
        # Strip leading and trailing lines:
        $text = preg_replace(array('/\A\n+/', '/\n+\z/'), '', $text);

        $grafs = preg_split('/\n{2,}/', $text, -1, PREG_SPLIT_NO_EMPTY);

        foreach ($grafs as $key => $value) {
            if (isset($this->md_html_blocks[$value])) {
                # Unhashify HTML blocks
                $grafs[$key] = $this->md_html_blocks[$value];
            } else {
                # Wrap <p> tags.
                $value = $this->_RunSpanGamut($value);
                $value = preg_replace('/^([ \t]*)/', '<p>', $value);
                $grafs[$key] = $value . "</p>";
            }
        }

        return implode("\n\n", $grafs);
    }

    /**
     * Smart processing for ampersands and angle brackets that need to be
     * encoded: existing entities and tags are left alone.
     *
     * @param string $text
     * @return string
     */
    private function _EncodeAmpsAndAngles($text)
    {
        # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
        #   http://bumppo.net/projects/amputator/
        $text = preg_replace('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/',
                             '&amp;', $text);

        # Encode naked <'s
        $text = preg_replace('{<(?![a-z/?\$!])}i', '&lt;', $text);

        return $text;
    }

    /**
     * Replaces backslash escape sequences by the MD5 placeholder of the
     * escaped character. Escaped backslashes are processed first.
     *
     * @param string $text
     * @return string
     */
    private function _EncodeBackslashEscapes($text)
    {
        return str_replace(array_keys(self::$md_backslash_escape_table),
                           array_values(self::$md_backslash_escape_table), $text);
    }

    /**
     * Makes links out of <http://example.com/> and <address@domain.foo>.
     *
     * @param string $text
     * @return string
     */
    private function _DoAutoLinks($text)
    {
        $text = preg_replace('!<((https?|ftp):[^\'">\s]+)>!',
                             '<a href="\1">\1</a>', $text);

        # Email addresses: <address@domain.foo>
        $text = preg_replace_callback('{
                <
                (?:mailto:)?
                (
                    [-.\w]+
                    \@
                    [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
                )
                >
            }xi',
            array($this, '_DoAutoLinks_email_callback'),
            $text);

        return $text;
    }

    /**
     * @param array $matches regex groups: 1 = the e-mail address
     * @return string obfuscated mailto link
     */
    private function _DoAutoLinks_email_callback($matches)
    {
        return $this->_EncodeEmailAddress($this->_UnescapeSpecialChars($matches[1]));
    }

    /**
     * Turns an e-mail address into a mailto link in which each character is
     * randomly written raw, as a decimal entity or as a hex entity, in
     * the hopes of foiling most address harvesting spam bots.
     *
     * Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
     * mailing list: <http://tinyurl.com/yu7ue>
     *
     * @param string $addr an email address, e.g. "foo@example.com"
     * @return string
     */
    private function _EncodeEmailAddress($addr)
    {
        $addr = "mailto:" . $addr;

        # leave ':' alone (to spot mailto: later)
        $addr = preg_replace_callback('/([^\:])/',
            array($this, '_EncodeEmailAddress_callback'), $addr);

        $addr = "<a href=\"$addr\">$addr</a>";
        # strip the mailto: from the visible part
        $addr = preg_replace('/">.+?:/', '">', $addr);

        return $addr;
    }

    /**
     * Picks the encoding of one character of an e-mail address.
     *
     * @param array $matches regex groups: 1 = the character
     * @return string
     */
    private function _EncodeEmailAddress_callback($matches)
    {
        $char = $matches[1];
        $r = rand(0, 100);
        # roughly 10% raw, 45% hex, 45% dec
        # '@' *must* be encoded. I insist.
        if ($r > 90 && $char != '@') {
            return $char;
        }
        if ($r < 45) {
            return '&#x' . dechex(ord($char)) . ';';
        }
        return '&#' . ord($char) . ';';
    }

    /**
     * Swaps back in all the special characters we've hidden.
     *
     * @param string $text
     * @return string
     */
    private function _UnescapeSpecialChars($text)
    {
        return str_replace(array_values(self::$md_escape_table),
                           array_keys(self::$md_escape_table), $text);
    }

    /**
     * Splits a string containing HTML markup into tokens.
     *
     * Each token is either a tag (possibly with nested tags contained
     * therein, such as <a href="<MTFoo>">), or a run of text between tags.
     * Empty text runs are not reported.
     *
     * Regular expression derived from the _tokenize() subroutine in
     * Brad Choate's MTRegex plugin.
     * <http://www.bradchoate.com/past/mtregex.php>
     *
     * @param string $str
     * @return array list of two-element arrays: 'tag' or 'text', then the value
     */
    private function _TokenizeHTML($str)
    {
        $match = '(?s:<!(?:--.*?--\s*)+>)|' .   # comment
                 '(?s:<\?.*?\?>)|' .            # processing instruction
                                                # regular tags
                 '(?:<[/!$]?[-a-zA-Z0-9:]+\b(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*>)';

        # With PREG_SPLIT_DELIM_CAPTURE the parts alternate:
        # text, tag, text, tag, ... starting with text.
        $parts = preg_split("{($match)}", $str, -1, PREG_SPLIT_DELIM_CAPTURE);

        $tokens = array();
        foreach ($parts as $position => $part) {
            if ($position % 2) {
                $tokens[] = array('tag', $part);
            } elseif ($part != '') {
                $tokens[] = array('text', $part);
            }
        }

        return $tokens;
    }

    /**
     * Removes one level of line-leading tabs or spaces.
     *
     * @param string $text
     * @return string
     */
    private function _Outdent($text)
    {
        return preg_replace('/^(\t|[ ]{1,' . $this->md_tab_width . '})/m', '', $text);
    }

    /**
     * Replaces tabs with the appropriate amount of space.
     *
     * For each line we separate the line in blocks delimited by
     * tab characters. Then we reconstruct every line by adding the
     * appropriate number of spaces between each block.
     *
     * Every line, including the last one, is terminated by a newline in the
     * result. Widths are counted in bytes.
     *
     * @param string $text
     * @return string
     */
    private function _Detab($text)
    {
        $result = "";

        foreach (explode("\n", $text) as $line) {
            # Split in blocks; the first block starts the rebuilt line.
            $blocks = explode("\t", $line);
            $rebuilt = array_shift($blocks);
            foreach ($blocks as $block) {
                # Calculate amount of space, insert spaces, insert block.
                $amount = $this->md_tab_width - strlen($rebuilt) % $this->md_tab_width;
                $rebuilt .= str_repeat(" ", $amount) . $block;
            }
            $result .= "$rebuilt\n";
        }
        return $result;
    }
}

/*

PHP Markdown
============

Description
-----------

This is a PHP translation of the original Markdown formatter written in
Perl by John Gruber.

Markdown is a text-to-HTML filter; it translates an easy-to-read /
easy-to-write structured text format into HTML. Markdown's text format
is most similar to that of plain text email, and supports features such
as headers, *emphasis*, code blocks, blockquotes, and links.

Markdown's syntax is designed not as a generic markup language, but
specifically to serve as a front-end to (X)HTML. You can use span-level
HTML tags anywhere in a Markdown document, and you can use block level
HTML tags (like <div> and <table> as well).

For more information about Markdown's syntax, see:

<http://daringfireball.net/projects/markdown/>


Bugs
----

To file bug reports please send email to:

<michel.fortin@michelf.com>

Please include with your report: (1) the example input; (2) the output you
expected; (3) the output Markdown actually produced.


Version History
---------------

See the readme file for detailed release notes for this version.

1.0.1c - 9 Dec 2005

1.0.1b - 6 Jun 2005

1.0.1a - 15 Apr 2005

1.0.1 - 16 Dec 2004

1.0 - 21 Aug 2004


Author & Contributors
---------------------

Original Perl version by John Gruber
<http://daringfireball.net/>

PHP port and other contributions by Michel Fortin
<http://www.michelf.com/>


Copyright and License
---------------------

Copyright (c) 2004-2005 Michel Fortin
<http://www.michelf.com/>
All rights reserved.

Copyright (c) 2003-2004 John Gruber
<http://daringfireball.net/>
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

*   Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

*   Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.

*   Neither the name "Markdown" nor the names of its contributors may
    be used to endorse or promote products derived from this software
    without specific prior written permission.

This software is provided by the copyright holders and contributors "as
is" and any express or implied warranties, including, but not limited
to, the implied warranties of merchantability and fitness for a
particular purpose are disclaimed. In no event shall the copyright owner
or contributors be liable for any direct, indirect, incidental, special,
exemplary, or consequential damages (including, but not limited to,
procurement of substitute goods or services; loss of use, data, or
profits; or business interruption) however caused and on any theory of
liability, whether in contract, strict liability, or tort (including
negligence or otherwise) arising in any way out of the use of this
software, even if advised of the possibility of such damage.

*/