From 972020d5308a262932b16d2e235af9923168f079 Mon Sep 17 00:00:00 2001 From: Nik Nyby Date: Wed, 4 Jan 2017 22:08:56 -0500 Subject: [PATCH] Update commonmark spec to 0.27 --- CHANGELOG.md | 3 + CommonMark/blocks.py | 2 +- CommonMark/inlines.py | 60 ++++++++++++------ spec.txt | 142 +++++++++++++++++++++++++++--------------- 4 files changed, 135 insertions(+), 72 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 23344ab..f12d075 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +## 0.7.3 +- The CommonMark spec has been updated to 0.27. + ## 0.7.2 (2016-08-10) - Removed outdated files from distributed packages, reported by @hyperknot diff --git a/CommonMark/blocks.py b/CommonMark/blocks.py index 00fa4ae..ad4ff09 100644 --- a/CommonMark/blocks.py +++ b/CommonMark/blocks.py @@ -47,7 +47,7 @@ reATXHeadingMarker = re.compile(r'^#{1,6}(?:[ \t]+|$)') reCodeFence = re.compile(r'^`{3,}(?!.*`)|^~{3,}(?!.*~)') reClosingCodeFence = re.compile(r'^(?:`{3,}|~{3,})(?= *$)') -reSetextHeadingLine = re.compile(r'^(?:=+|-+) *$') +reSetextHeadingLine = re.compile(r'^(?:=+|-+)[ \t]*$') reLineEnding = re.compile(r'\r\n|\n|\r') diff --git a/CommonMark/inlines.py b/CommonMark/inlines.py index 54af6e5..ee56bd5 100644 --- a/CommonMark/inlines.py +++ b/CommonMark/inlines.py @@ -20,8 +20,6 @@ # Some regexps used in inline parser: ESCAPED_CHAR = '\\\\' + common.ESCAPABLE -REG_CHAR = '[^\\\\()\\x00-\\x20]' -IN_PARENS_NOSP = '\\((' + REG_CHAR + '|' + ESCAPED_CHAR + '|\\\\)*\\)' rePunctuation = re.compile( r'^[\u2000-\u206F\u2E00-\u2E7F\\' + "'" + '!"#\$%&\(\)' @@ -36,9 +34,6 @@ reLinkDestinationBraces = re.compile( '^(?:[<](?:[^ <>\\t\\n\\\\\\x00]' + '|' + ESCAPED_CHAR + '|' + '\\\\)*[>])') -reLinkDestination = re.compile( - '^(?:' + REG_CHAR + '+|' + ESCAPED_CHAR + '|\\\\|' + - IN_PARENS_NOSP + ')*') reEscapable = re.compile('^' + common.ESCAPABLE) reEntityHere = re.compile('^' + common.ENTITY, re.IGNORECASE) @@ -54,8 +49,9 @@ 
r'^<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>', re.IGNORECASE) reSpnl = re.compile(r'^ *(?:\n *)?') -reWhitespaceChar = re.compile(r'^\s') -reWhitespace = re.compile(r'\s+') +reWhitespaceChar = re.compile(r'^^[ \t\n\x0b\x0c\x0d]') +reWhitespace = re.compile(r'[ \t\n\x0b\x0c\x0d]+') +reUnicodeWhitespaceChar = re.compile(r'^\s') reFinalSpace = re.compile(r' *$') reInitialSpace = re.compile(r'^ *') reSpaceAtEndOfLine = re.compile(r'^ *(?:\n|$)') @@ -262,10 +258,10 @@ def scanDelims(self, c): c_after = '\n' # Python 2 doesn't recognize '\xa0' as whitespace - after_is_whitespace = re.match(reWhitespaceChar, c_after) or \ + after_is_whitespace = re.match(reUnicodeWhitespaceChar, c_after) or \ c_after == '\xa0' after_is_punctuation = re.match(rePunctuation, c_after) - before_is_whitespace = re.match(reWhitespaceChar, c_before) or \ + before_is_whitespace = re.match(reUnicodeWhitespaceChar, c_before) or \ c_before == '\xa0' before_is_punctuation = re.match(rePunctuation, c_before) @@ -318,6 +314,7 @@ def handleDelim(self, cc, block): self.delimiters = { 'cc': cc, 'numdelims': numdelims, + 'origdelims': numdelims, 'node': node, 'previous': self.delimiters, 'next': None, @@ -372,8 +369,8 @@ def processEmphasis(self, stack_bottom): opener != openers_bottom[closercc]): odd_match = (closer.get('can_open') or opener.get('can_close')) and \ - (opener.get('numdelims') + - closer.get('numdelims')) % 3 == 0 + (opener.get('origdelims') + + closer.get('origdelims')) % 3 == 0 if opener.get('cc') == closercc and \ opener.get('can_open') and \ not odd_match: @@ -487,11 +484,31 @@ def parseLinkDestination(self): """ res = self.match(reLinkDestinationBraces) if res is None: - res = self.match(reLinkDestination) - if res is None: - return None - else: - return normalize_uri(unescape_string(res)) + # TODO handrolled parser; res should be None or the string + savepos = self.pos + openparens = 0 + c = self.peek() + while c is not None: + if c == '\\': + self.pos += 1 + if self.peek() is 
not None: + self.pos += 1 + elif c == '(': + self.pos += 1 + openparens += 1 + elif c == ')': + if openparens < 1: + break + else: + self.pos += 1 + openparens -= 1 + elif re.match(reWhitespaceChar, c): + break + else: + self.pos += 1 + c = self.peek() + res = self.subject[savepos:self.pos] + return normalize_uri(unescape_string(res)) else: # chop off surrounding <..>: return normalize_uri(unescape_string(res[1:-1])) @@ -575,22 +592,25 @@ def parseCloseBracket(self, block): # Check to see if we have a link/image + savepos = self.pos + # Inline link? if self.peek() == '(': self.pos += 1 self.spnl() dest = self.parseLinkDestination() - if dest is not None and \ - self.spnl(): + if dest is not None and self.spnl(): # make sure there's a space before the title if re.match(reWhitespaceChar, self.subject[self.pos-1]): title = self.parseLinkTitle() if self.spnl() and self.peek() == ')': self.pos += 1 matched = True - else: + else: + self.pos = savepos + + if not matched: # Next, see if there's a link label - savepos = self.pos beforelabel = self.pos n = self.parseLinkLabel() if n > 2: diff --git a/spec.txt b/spec.txt index e2b6834..c49e85b 100644 --- a/spec.txt +++ b/spec.txt @@ -1,8 +1,8 @@ --- title: CommonMark Spec author: John MacFarlane -version: 0.26 -date: '2016-07-15' +version: 0.27 +date: '2016-11-18' license: '[CC-BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)' ... @@ -1985,7 +1985,7 @@ by their start and end conditions. The block begins with a line that meets a [start condition](@) (after up to three spaces optional indentation). It ends with the first subsequent line that meets a matching [end condition](@), or the last line of -the document or other [container block](@), if no line is encountered that meets the +the document or other [container block]), if no line is encountered that meets the [end condition]. If the first line meets both the [start condition] and the [end condition], the block will contain just that line. 
@@ -2015,7 +2015,8 @@ followed by one of the strings (case-insensitive) `address`, `article`, `aside`, `base`, `basefont`, `blockquote`, `body`, `caption`, `center`, `col`, `colgroup`, `dd`, `details`, `dialog`, `dir`, `div`, `dl`, `dt`, `fieldset`, `figcaption`, `figure`, -`footer`, `form`, `frame`, `frameset`, `h1`, `head`, `header`, `hr`, +`footer`, `form`, `frame`, `frameset`, +`h1`, `h2`, `h3`, `h4`, `h5`, `h6`, `head`, `header`, `hr`, `html`, `iframe`, `legend`, `li`, `link`, `main`, `menu`, `menuitem`, `meta`, `nav`, `noframes`, `ol`, `optgroup`, `option`, `p`, `param`, `section`, `source`, `summary`, `table`, `tbody`, `td`, @@ -3636,11 +3637,11 @@ The following rules define [list items]: If the list item is ordered, then it is also assigned a start number, based on the ordered list marker. - Exceptions: When the list item interrupts a paragraph---that - is, when it starts on a line that would otherwise count as - [paragraph continuation text]---then (a) the lines *Ls* must - not begin with a blank line, and (b) if the list item is - ordered, the start number must be 1. + Exceptions: When the first list item in a [list] interrupts + a paragraph---that is, when it starts on a line that would + otherwise count as [paragraph continuation text]---then (a) + the lines *Ls* must not begin with a blank line, and (b) if + the list item is ordered, the start number must be 1. For example, let *Ls* be the lines @@ -4730,8 +4731,7 @@ takes four spaces (a common case), but diverge in other cases. A [list](@) is a sequence of one or more list items [of the same type]. The list items -may be separated by single [blank lines], but two -blank lines end all containing lists. +may be separated by any number of blank lines. Two list items are [of the same type](@) if they begin with a [list marker] of the same type. 
@@ -4809,10 +4809,11 @@ Foo `Markdown.pl` does not allow this, through fear of triggering a list via a numeral in a hard-wrapped line: -```````````````````````````````` markdown +``` markdown The number of windows in my house is 14. The number of doors is 6. -```````````````````````````````` +``` + Oddly, though, `Markdown.pl` *does* allow a blockquote to interrupt a paragraph, even though the same considerations might apply. @@ -4821,10 +4822,12 @@ In CommonMark, we do allow lists to interrupt paragraphs, for two reasons. First, it is natural and not uncommon for people to start lists without blank lines: - I need to buy - - new shoes - - a coat - - a plane ticket +``` markdown +I need to buy +- new shoes +- a coat +- a plane ticket +``` Second, we are attracted to a @@ -4836,20 +4839,24 @@ Second, we are attracted to a (Indeed, the spec for [list items] and [block quotes] presupposes this principle.) This principle implies that if - * I need to buy - - new shoes - - a coat - - a plane ticket +``` markdown + * I need to buy + - new shoes + - a coat + - a plane ticket +``` is a list item containing a paragraph followed by a nested sublist, as all Markdown implementations agree it is (though the paragraph may be rendered without `

` tags, since the list is "tight"), then - I need to buy - - new shoes - - a coat - - a plane ticket +``` markdown +I need to buy +- new shoes +- a coat +- a plane ticket +``` by itself should be a paragraph followed by a nested sublist. @@ -5671,6 +5678,16 @@ single spaces, just as they would be by a browser: ```````````````````````````````` +Not all [Unicode whitespace] (for instance, non-breaking space) is +collapsed, however: + +```````````````````````````````` example +`a  b` +. +

a  b

+```````````````````````````````` + + Q: Why not just leave the spaces, since browsers will collapse them anyway? A: Because we might be targeting a non-HTML format, and we shouldn't rely on HTML-specific rendering assumptions. @@ -6558,7 +6575,7 @@ Note that in the preceding case, the interpretation is precluded by the condition that a delimiter that -can both open and close (like the `*` after `foo` +can both open and close (like the `*` after `foo`) cannot form emphasis if the sum of the lengths of the delimiter runs containing the opening and closing delimiters is a multiple of 3. @@ -6590,12 +6607,6 @@ omitted: ```````````````````````````````` -```````````````````````````````` example -*foo**bar*** -. -

<p><em>foo<strong>bar</strong></em></p>

-```````````````````````````````` - Indefinite levels of nesting are possible: ```````````````````````````````` example @@ -7137,8 +7148,7 @@ A [link destination](@) consists of either - a nonempty sequence of characters that does not include ASCII space or control characters, and includes parentheses only if (a) they are backslash-escaped or (b) they are part of - a balanced pair of unescaped parentheses that is not itself - inside a balanced pair of unescaped parentheses. + a balanced pair of unescaped parentheses. A [link title](@) consists of either @@ -7244,35 +7254,29 @@ Parentheses inside the link destination may be escaped:

link

```````````````````````````````` -One level of balanced parentheses is allowed without escaping: - -```````````````````````````````` example -[link]((foo)and(bar)) -. -

<p><a href="(foo)and(bar)">link</a></p>

-```````````````````````````````` - -However, if you have parentheses within parentheses, you need to escape -or use the `<...>` form: +Any number of parentheses are allowed without escaping, as long as they are +balanced: ```````````````````````````````` example [link](foo(and(bar))) . -

<p>[link](foo(and(bar)))</p>

+

<p><a href="foo(and(bar))">link</a></p>

```````````````````````````````` +However, if you have unbalanced parentheses, you need to escape or use the +`<...>` form: ```````````````````````````````` example -[link](foo(and\(bar\))) +[link](foo\(and\(bar\)) . -

<p><a href="foo(and(bar))">link</a></p>

+

<p><a href="foo(and(bar)">link</a></p>

```````````````````````````````` ```````````````````````````````` example -[link](<foo(and(bar))>) +[link](<foo(and(bar)>) . -

<p><a href="foo(and(bar))">link</a></p>

+

<p><a href="foo(and(bar)">link</a></p>

```````````````````````````````` @@ -7361,6 +7365,16 @@ may be used in titles: ```````````````````````````````` +Titles must be separated from the link using a [whitespace]. +Other [Unicode whitespace] like non-breaking space doesn't work. + +```````````````````````````````` example +[link](/url "title") +. +

<p><a href="/url%C2%A0%22title%22">link</a></p>

+```````````````````````````````` + + Nested balanced quotes are not allowed without escaping: ```````````````````````````````` example @@ -8025,7 +8039,8 @@ following closing bracket: ```````````````````````````````` -Full references take precedence over shortcut references: +Full and compact references take precedence over shortcut +references: ```````````````````````````````` example [foo][bar] @@ -8036,6 +8051,31 @@ Full references take precedence over shortcut references:

foo

```````````````````````````````` +```````````````````````````````` example +[foo][] + +[foo]: /url1 +. +

<p><a href="/url1">foo</a></p>

+```````````````````````````````` + +Inline links also take precedence: + +```````````````````````````````` example +[foo]() + +[foo]: /url1 +. +

<p><a href="">foo</a></p>

+```````````````````````````````` + +```````````````````````````````` example +[foo](not a link) + +[foo]: /url1 +. +

<p><a href="/url1">foo</a>(not a link)</p>

+```````````````````````````````` In the following case `[bar][baz]` is parsed as a reference, `[foo]` as normal text: @@ -9045,7 +9085,7 @@ blocks. But we cannot close unmatched blocks yet, because we may have a [lazy continuation line]. 2. Next, after consuming the continuation markers for existing -blocks, we look for new block starts (e.g. `>` for a block quote. +blocks, we look for new block starts (e.g. `>` for a block quote). If we encounter a new block start, we close any blocks unmatched in step 1 before creating the new block as a child of the last matched block.