299 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			299 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """
 | |
| PRE-PROCESSORS
 | |
| =============================================================================
 | |
| 
 | |
| Preprocessors work on source text before we start doing anything too
 | |
| complicated. 
 | |
| """
 | |
| 
 | |
| from __future__ import absolute_import
 | |
| from __future__ import unicode_literals
 | |
| from . import util
 | |
| from . import odict
 | |
| import re
 | |
| 
 | |
| 
 | |
def build_preprocessors(md_instance, **kwargs):
    """
    Assemble the default, ordered collection of preprocessors.

    Whitespace normalisation is registered first and reference extraction
    last. The raw-HTML block preprocessor sits between the two and is left
    out when ``md_instance.safeMode`` equals ``'escape'``.

    ``kwargs`` is accepted but not used by this function.
    """
    registry = odict.OrderedDict()
    registry['normalize_whitespace'] = NormalizeWhitespace(md_instance)
    html_blocks_enabled = md_instance.safeMode != 'escape'
    if html_blocks_enabled:
        registry["html_block"] = HtmlBlockPreprocessor(md_instance)
    registry["reference"] = ReferencePreprocessor(md_instance)
    return registry
 | |
| 
 | |
| 
 | |
class Preprocessor(util.Processor):
    """
    Base class for processors that run once the text is broken into lines.

    A preprocessor exposes a single "run" method. It receives the document
    as a list of lines, changes whatever it needs to, and hands back a list
    of lines -- either the very list it was given or a freshly built one.

    Preprocessors must extend markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Transform `lines` and return the resulting list of lines.

        `lines` is the document as a list of strings split by newlines.
        Every subclass should override this method; the implementation here
        does nothing and returns None.

        """
        return None
 | |
| 
 | |
| 
 | |
class NormalizeWhitespace(Preprocessor):
    """ Normalize whitespace for consistent parsing. """

    def run(self, lines):
        """
        Return `lines` re-split after cleaning up the joined text.

        The steps, in order: remove every occurrence of `util.STX` and
        `util.ETX`, convert "\\r\\n" and "\\r" line endings to "\\n", append
        two newlines, expand tabs to `self.markdown.tab_length` columns and
        empty out lines that consist only of spaces.
        """
        text = '\n'.join(lines)
        # NOTE(review): STX/ETX are presumably reserved for internal
        # placeholders -- confirm in util.
        for marker in (util.STX, util.ETX):
            text = text.replace(marker, "")
        # "\r\n" has to be handled before the lone "\r".
        for ending in ("\r\n", "\r"):
            text = text.replace(ending, "\n")
        text += "\n\n"
        text = text.expandtabs(self.markdown.tab_length)
        # Only space-only lines that follow a newline are emptied.
        text = re.sub(r'(?<=\n) +\n', '\n', text)
        return text.split('\n')
 | |
| 
 | |
| 
 | |
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Templates (filled with the opening tag name) tried in order when
    # looking for the text that closes an element.
    right_tag_patterns = ["</%s>", "%s>"]
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # Matches a `markdown` attribute (with or without a value) so it can be
    # removed from an opening tag before that tag is stashed.
    markdown_attr_re = re.compile(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?')
    # When true, elements carrying a `markdown` attribute have only their
    # opening and closing tags stashed; the content stays in the text.
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """
        Parse the opening tag at the start of `block`.

        Returns a tuple `(tag, length, attrs)`: the tag name, the length of
        the text matched as the opening tag, and a dict of its attributes
        (attributes without a value map to ""). When the tag regex does not
        match, the tag is the lowercased text between the first character
        and the first ">", the length is `len(tag) + 2` and `attrs` is empty.
        """
        m = self.left_tag_re.match(block)
        if not m:
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag)+2, {}

        attrs = {}
        raw_attrs = m.group('attrs')
        if raw_attrs:
            for ma in self.attrs_re.finditer(raw_attrs):
                if ma.group('attr'):
                    # attr="value"
                    attrs[ma.group('attr').strip()] = ma.group('value') or ""
                elif ma.group('attr1'):
                    # attr=value
                    attrs[ma.group('attr1').strip()] = ma.group('value1') or ""
                elif ma.group('attr2'):
                    # bare attr
                    attrs[ma.group('attr2').strip()] = ""
        return m.group('tag'), len(m.group(0)), attrs

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """
        Find the `rtag` in `block` that balances an already opened `ltag`.

        Searching starts at `start_index`. Each further `ltag` met before the
        next `rtag` is resolved by a recursive call, so nested elements are
        skipped. Returns the index just past the balancing `rtag`, or -1
        when there is none.
        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if j > i or j == -1:
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """
        Locate the text that closes `left_tag` in `block`.

        Each template in `right_tag_patterns` is tried in turn, searching
        from `left_index`. Returns `(right_tag, index)` where `right_tag` is
        the matched text without its leading "<" / trailing ">" and `index`
        is the position just past it. When nothing is found, the result is
        the lowercased tail of the right-stripped block and `len(block)`.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """
        Return True when `right_tag` is accepted as closing `left_tag`.

        Tags starting with "?", "@" or "%" always count as closed, a tag
        is closed by "/" followed by its own name, and the comment marker
        "--" is closed by "--".
        """
        # Slicing instead of indexing keeps an empty tag from raising.
        if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        if right_tag == "/" + left_tag:
            return True
        return left_tag == "--" and right_tag == "--"

    def _is_oneliner(self, tag):
        """Return True for tags that never have a closing tag (`hr`)."""
        return (tag in ['hr', 'hr/'])

    def _stash_markdown_items(self, items, left_index, right_tag,
                              skip_blank_end=False):
        """
        Stash only the opening and closing tags of the element in `items`.

        `items` is modified in place: the opening tag (its first
        `left_index` characters) is cut from the first entry and the last
        `len(right_tag) + 2` characters are cut from the last entry. Returns
        the list of blocks to emit: the placeholder for the opening tag
        (with its `markdown` attribute removed), the remaining items, and
        the placeholder for the closing text. With `skip_blank_end`, a
        closing text that is only whitespace is not stashed.
        """
        start = self.markdown_attr_re.sub('', items[0][:left_index])
        items[0] = items[0][left_index:]
        end = items[-1][-len(right_tag)-2:]
        items[-1] = items[-1][:-len(right_tag)-2]
        result = [self.markdown.htmlStash.store(start)]
        result.extend(items)
        if end.strip() or not skip_blank_end:
            result.append(self.markdown.htmlStash.store(end))
        return result

    def run(self, lines):
        """
        Replace raw HTML blocks in `lines` with stash placeholders.

        The text is split into blocks on blank lines. A block that starts
        with "<" and is a block-level element, a comment or a "<?", "<@" or
        "<%" construct is stored in `self.markdown.htmlStash`, and the value
        returned by the stash takes its place. Elements spanning several
        blocks are collected until their closing tag is found (or the text
        ends) and stored as one unit. Returns the resulting list of lines.
        """
        from collections import deque

        # A deque gives O(1) removal and re-insertion at the front.
        # rsplit (not split) is intentional: it scans from the right, so for
        # an odd run of newlines the spare one stays at the end of the
        # preceding block.
        pending = deque("\n".join(lines).rsplit("\n\n"))
        new_blocks = []
        items = []          # blocks of the element currently being collected
        left_tag = ''
        right_tag = ''
        left_index = 0
        attrs = {}
        in_tag = False      # True while inside an unclosed element

        while pending:
            block = pending.popleft()
            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1] == "!":
                        # is a comment block
                        left_tag, left_index, attrs = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag,
                                                                left_index,
                                                                block)

                    # Text after the closing tag goes back on the queue and
                    # is handled as a block of its own.
                    if data_index < len(block) \
                        and (util.isBlockLevel(left_tag)
                             or left_tag == '--'):
                        pending.appendleft(block[data_index:])
                        block = block[:data_index]

                    # Anything else starting with "<" is left in the text.
                    if not (util.isBlockLevel(left_tag)
                            or block[1] in ["!", "?", "@", "%"]):
                        new_blocks.append(block)
                        continue

                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                        and self._equal_tags(left_tag, right_tag):
                        # The element is complete within this block.
                        if self.markdown_in_raw and 'markdown' in attrs:
                            start = self.markdown_attr_re.sub(
                                '', block[:left_index])
                            end = block[-len(right_tag)-2:]
                            block = block[left_index:-len(right_tag)-2]
                            new_blocks.append(
                                self.markdown.htmlStash.store(start))
                            new_blocks.append(block)
                            new_blocks.append(
                                self.markdown.htmlStash.store(end))
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue

                    # The element is not complete. `and` binds tighter than
                    # `or`; the parentheses only spell out the grouping the
                    # expression always had: every block-level tag starts
                    # collecting, a comment only when the block does not end
                    # with ">".
                    if util.isBlockLevel(left_tag) or (
                            left_tag == "--"
                            and not block.rstrip().endswith(">")):
                        items.append(block.strip())
                        in_tag = True
                    else:
                        new_blocks.append(
                            self.markdown.htmlStash.store(block.strip()))
                    continue

                new_blocks.append(block)

            else:
                items.append(block)

                right_tag, data_index = self._get_right_tag(left_tag, 0, block)

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        pending.appendleft(block[data_index:])

                    in_tag = False
                    if self.markdown_in_raw and 'markdown' in attrs:
                        new_blocks.extend(self._stash_markdown_items(
                            items, left_index, right_tag))
                    else:
                        new_blocks.append(
                            self.markdown.htmlStash.store('\n\n'.join(items)))
                    items = []

        if items:
            # The text ended while an element was still being collected.
            if self.markdown_in_raw and 'markdown' in attrs:
                new_blocks.extend(self._stash_markdown_items(
                    items, left_index, right_tag, skip_blank_end=True))
            else:
                new_blocks.append(
                    self.markdown.htmlStash.store('\n\n'.join(items)))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
 | |
| 
 | |
| 
 | |
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # A title wrapped in double quotes, single quotes or parentheses.
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # "[id]: link" with an optional title on the same line.
    RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
    # A line consisting of nothing but a title.
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """
        Collect reference definitions and return the remaining lines.

        Every line matching `RE` is left out of the result and recorded in
        `self.markdown.references` as `id -> (link, title)`, with the id
        stripped and lowercased and surrounding "<" / ">" removed from the
        link. When the definition has no title and the following line is
        only a title, that line is consumed as well. The title is None when
        none is found.

        `lines` itself is not modified; a new list is returned.
        """
        new_text = []
        index = 0
        total = len(lines)
        while index < total:
            line = lines[index]
            index += 1
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            # Groups 5-7 are the three quoting styles of the inline title.
            title = m.group(5) or m.group(6) or m.group(7)
            # Look at the next line for a title, but only if one exists: a
            # definition on the very last line must not raise.
            if not title and index < total:
                tm = self.TITLE_RE.match(lines[index])
                if tm:
                    index += 1
                    title = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, title)

        return new_text
 |