KiBot/kibot/PcbDraw/mistune/block_parser.py

367 lines
11 KiB
Python

import re
from .scanner import ScannerParser, Matcher
from .inline_parser import ESCAPE_CHAR, LINK_LABEL
from .util import unikey
_NEW_LINES = re.compile(r'\r\n|\r')
_BLANK_LINES = re.compile(r'^ +$', re.M)
_TRIM_4 = re.compile(r'^ {1,4}')
_EXPAND_TAB = re.compile(r'^( {0,3})\t', flags=re.M)
_INDENT_CODE_TRIM = re.compile(r'^ {1,4}', flags=re.M)
_BLOCK_QUOTE_TRIM = re.compile(r'^ {0,1}', flags=re.M)
_BLOCK_QUOTE_LEADING = re.compile(r'^ *>', flags=re.M)
_BLOCK_TAGS = {
'address', 'article', 'aside', 'base', 'basefont', 'blockquote',
'body', 'caption', 'center', 'col', 'colgroup', 'dd', 'details',
'dialog', 'dir', 'div', 'dl', 'dt', 'fieldset', 'figcaption',
'figure', 'footer', 'form', 'frame', 'frameset', 'h1', 'h2', 'h3',
'h4', 'h5', 'h6', 'head', 'header', 'hr', 'html', 'iframe',
'legend', 'li', 'link', 'main', 'menu', 'menuitem', 'meta', 'nav',
'noframes', 'ol', 'optgroup', 'option', 'p', 'param', 'section',
'source', 'summary', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead',
'title', 'tr', 'track', 'ul'
}
_BLOCK_HTML_RULE6 = (
r'</?(?:' + '|'.join(_BLOCK_TAGS) + r')'
r'(?: +|\n|/?>)[\s\S]*?'
r'(?:\n{2,}|\n*$)'
)
_BLOCK_HTML_RULE7 = (
# open tag
r'<(?!script|pre|style)([a-z][\w-]*)(?:'
r' +[a-zA-Z:_][\w.:-]*(?: *= *"[^"\n]*"|'
r''' *= *'[^'\n]*'| *= *[^\s"'=<>`]+)?'''
r')*? */?>(?=\s*\n)[\s\S]*?(?:\n{2,}|\n*$)|'
# close tag
r'</(?!script|pre|style)[a-z][\w-]*\s*>(?=\s*\n)[\s\S]*?(?:\n{2,}|\n*$)'
)
_PARAGRAPH_SPLIT = re.compile(r'\n{2,}')
_LIST_BULLET = re.compile(r'^ *([\*\+-]|\d+[.)])')
class BlockParser(ScannerParser):
scanner_cls = Matcher
NEWLINE = re.compile(r'\n+')
DEF_LINK = re.compile(
r' {0,3}\[(' + LINK_LABEL + r')\]:(?:[ \t]*\n)?[ \t]*'
r'<?([^\s>]+)>?(?:[ \t]*\n)?'
r'(?: +["(]([^\n]+)[")])? *\n+'
)
AXT_HEADING = re.compile(
r' {0,3}(#{1,6})(?!#+)(?: *\n+|'
r'\s+([^\n]*?)(?:\n+|\s+?#+\s*\n+))'
)
SETEX_HEADING = re.compile(r'([^\n]+)\n *(=|-){2,}[ \t]*\n+')
THEMATIC_BREAK = re.compile(
r' {0,3}((?:-[ \t]*){3,}|'
r'(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})\n+'
)
INDENT_CODE = re.compile(r'(?:\n*)(?:(?: {4}| *\t)[^\n]+\n*)+')
FENCED_CODE = re.compile(
r'( {0,3})(`{3,}|~{3,})([^`\n]*)\n'
r'(?:|([\s\S]*?)\n)'
r'(?: {0,3}\2[~`]* *\n+|$)'
)
BLOCK_QUOTE = re.compile(
r'(?: {0,3}>[^\n]*\n)+'
)
LIST_START = re.compile(
r'( {0,3})([\*\+-]|\d{1,9}[.)])(?:[ \t]*|[ \t][^\n]+)\n+'
)
BLOCK_HTML = re.compile((
r' {0,3}(?:'
r'<(script|pre|style)[\s>][\s\S]*?(?:</\1>[^\n]*\n+|$)|'
r'<!--(?!-?>)[\s\S]*?-->[^\n]*\n+|'
r'<\?[\s\S]*?\?>[^\n]*\n+|'
r'<![A-Z][\s\S]*?>[^\n]*\n+|'
r'<!\[CDATA\[[\s\S]*?\]\]>[^\n]*\n+'
r'|' + _BLOCK_HTML_RULE6 + '|' + _BLOCK_HTML_RULE7 + ')'
), re.I)
LIST_MAX_DEPTH = 6
BLOCK_QUOTE_MAX_DEPTH = 6
RULE_NAMES = (
'newline', 'thematic_break',
'fenced_code', 'indent_code',
'block_quote', 'block_html',
'list_start',
'axt_heading', 'setex_heading',
'def_link',
)
def __init__(self):
super(BlockParser, self).__init__()
self.block_quote_rules = list(self.RULE_NAMES)
self.list_rules = list(self.RULE_NAMES)
def parse_newline(self, m, state):
return {'type': 'newline', 'blank': True}
def parse_thematic_break(self, m, state):
return {'type': 'thematic_break', 'blank': True}
def parse_indent_code(self, m, state):
text = expand_leading_tab(m.group(0))
code = _INDENT_CODE_TRIM.sub('', text)
code = code.lstrip('\n')
return self.tokenize_block_code(code, None, state)
def parse_fenced_code(self, m, state):
info = ESCAPE_CHAR.sub(r'\1', m.group(3))
spaces = m.group(1)
code = m.group(4) or ''
if spaces and code:
_trim_pattern = re.compile('^' + spaces, re.M)
code = _trim_pattern.sub('', code)
return self.tokenize_block_code(code + '\n', info, state)
def tokenize_block_code(self, code, info, state):
token = {'type': 'block_code', 'raw': code}
if info:
token['params'] = (info, )
return token
def parse_axt_heading(self, m, state):
level = len(m.group(1))
text = m.group(2) or ''
text = text.strip()
if set(text) == {'#'}:
text = ''
return self.tokenize_heading(text, level, state)
def parse_setex_heading(self, m, state):
level = 1 if m.group(2) == '=' else 2
text = m.group(1)
text = text.strip()
return self.tokenize_heading(text, level, state)
def tokenize_heading(self, text, level, state):
return {'type': 'heading', 'text': text, 'params': (level,)}
def get_block_quote_rules(self, depth):
if depth > self.BLOCK_QUOTE_MAX_DEPTH - 1:
rules = list(self.block_quote_rules)
rules.remove('block_quote')
return rules
return self.block_quote_rules
def parse_block_quote(self, m, state):
depth = state.get('block_quote_depth', 0) + 1
state['block_quote_depth'] = depth
# normalize block quote text
text = _BLOCK_QUOTE_LEADING.sub('', m.group(0))
text = expand_leading_tab(text)
text = _BLOCK_QUOTE_TRIM.sub('', text)
text = cleanup_lines(text)
rules = self.get_block_quote_rules(depth)
children = self.parse(text, state, rules)
state['block_quote_depth'] = depth - 1
return {'type': 'block_quote', 'children': children}
def get_list_rules(self, depth):
if depth > self.LIST_MAX_DEPTH - 1:
rules = list(self.list_rules)
rules.remove('list_start')
return rules
return self.list_rules
def parse_list_start(self, m, state, string):
items = []
spaces = m.group(1)
marker = m.group(2)
items, pos = _find_list_items(string, m.start(), spaces, marker)
tight = '\n\n' not in ''.join(items).strip()
ordered = len(marker) != 1
if ordered:
start = int(marker[:-1])
if start == 1:
start = None
else:
start = None
list_tights = state.get('list_tights', [])
list_tights.append(tight)
state['list_tights'] = list_tights
depth = len(list_tights)
rules = self.get_list_rules(depth)
children = [
self.parse_list_item(item, depth, state, rules)
for item in items
]
list_tights.pop()
params = (ordered, depth, start)
token = {'type': 'list', 'children': children, 'params': params}
return token, pos
def parse_list_item(self, text, depth, state, rules):
text = self.normalize_list_item_text(text)
if not text:
children = [{'type': 'block_text', 'text': ''}]
else:
children = self.parse(text, state, rules)
return {
'type': 'list_item',
'params': (depth,),
'children': children,
}
@staticmethod
def normalize_list_item_text(text):
text_length = len(text)
text = _LIST_BULLET.sub('', text)
if not text.strip():
return ''
space = text_length - len(text)
text = expand_leading_tab(text)
if text.startswith(' '):
text = text[1:]
space += 1
else:
text_length = len(text)
text = _TRIM_4.sub('', text)
space += max(text_length - len(text), 1)
# outdent
if '\n ' in text:
pattern = re.compile(r'\n {1,' + str(space) + r'}')
text = pattern.sub(r'\n', text)
return text
def parse_block_html(self, m, state):
html = m.group(0).rstrip()
return {'type': 'block_html', 'raw': html}
def parse_def_link(self, m, state):
key = unikey(m.group(1))
link = m.group(2)
title = m.group(3)
if key not in state['def_links']:
state['def_links'][key] = (link, title)
def parse_text(self, text, state):
list_tights = state.get('list_tights')
if list_tights and list_tights[-1]:
return {'type': 'block_text', 'text': text.strip()}
tokens = []
for s in _PARAGRAPH_SPLIT.split(text):
s = s.strip()
if s:
tokens.append({'type': 'paragraph', 'text': s})
return tokens
def parse(self, s, state, rules=None):
if rules is None:
rules = self.rules
return list(self._scan(s, state, rules))
def render(self, tokens, inline, state):
data = self._iter_render(tokens, inline, state)
return inline.renderer.finalize(data)
def _iter_render(self, tokens, inline, state):
for tok in tokens:
method = inline.renderer._get_method(tok['type'])
if 'blank' in tok:
yield method()
continue
if 'children' in tok:
children = self.render(tok['children'], inline, state)
elif 'raw' in tok:
children = tok['raw']
else:
children = inline(tok['text'], state)
params = tok.get('params')
if params:
yield method(children, *params)
else:
yield method(children)
def cleanup_lines(s):
s = _NEW_LINES.sub('\n', s)
s = _BLANK_LINES.sub('', s)
return s
def expand_leading_tab(text):
return _EXPAND_TAB.sub(_expand_tab_repl, text)
def _expand_tab_repl(m):
s = m.group(1)
return s + ' ' * (4 - len(s))
def _create_list_item_pattern(spaces, marker):
prefix = r'( {0,' + str(len(spaces) + len(marker)) + r'})'
if len(marker) > 1:
if marker[-1] == '.':
prefix = prefix + r'\d{0,9}\.'
else:
prefix = prefix + r'\d{0,9}\)'
else:
if marker == '*':
prefix = prefix + r'\*'
elif marker == '+':
prefix = prefix + r'\+'
else:
prefix = prefix + r'-'
s1 = ' {' + str(len(marker) + 1) + ',}'
if len(marker) > 4:
s2 = ' {' + str(len(marker) - 4) + r',}\t'
else:
s2 = r' *\t'
return re.compile(
prefix + r'(?:[ \t]*|[ \t]+[^\n]+)\n+'
r'(?:\1(?:' + s1 + '|' + s2 + ')'
r'[^\n]+\n+)*'
)
def _find_list_items(string, pos, spaces, marker):
items = []
if marker in {'*', '-'}:
is_hr = re.compile(
r' *((?:-[ \t]*){3,}|(?:\*[ \t]*){3,})\n+'
)
else:
is_hr = None
pattern = _create_list_item_pattern(spaces, marker)
while 1:
m = pattern.match(string, pos)
if not m:
break
text = m.group(0)
if is_hr and is_hr.match(text):
break
new_spaces = m.group(1)
if new_spaces != spaces:
spaces = new_spaces
pattern = _create_list_item_pattern(spaces, marker)
items.append(text)
pos = m.end()
return items, pos