#
# Copyright (C) 2014 Vadym Khokhlov
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Parse links in Markdown files.
Supported links are:
<http://autolink.com>
[name](http://link.com "Optional title")
[id]: http://link.com "Optional title"
"""
# Some ideas and code were borrowed from https://pypi.python.org/pypi/markdown2 project
import re
from . import _ContentPlugin
from .. import log, LOG_PLUGIN
class MarkdownCheck(_ContentPlugin):
    """Markdown parsing plugin.

    Collects URLs from three Markdown constructs and registers each of them
    on the checked ``url_data`` object via ``url_data.add_url``:

    * autolinks: ``<http://autolink.com>``
    * reference definitions: ``[id]: http://link.com "Optional title"``
    * inline links and images: ``[name](http://link.com "Optional title")``
    """

    # Name of the configuration option holding a custom filename pattern.
    _filename_re_key = "filename_re"
    # Filename pattern used when the option is unset or fails to compile.
    _default_filename_re = re.compile(r'.*\.(markdown|md(own)?|mkdn?)$')

    # Patterns handled by _check_by_re(); group 1 of each match is the URL.
    _link_res = [
        # Autolink: <http://...>, <https://...> or <ftp://...>
        re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I),
        # Reference definition: [id]: url "optional title"
        re.compile(
            r"""
            \[.+\]:         # id
            [ \t]*\n?       # maybe *one* newline
            [ \t]*
            <?(.+?)>?       # url = \1
            [ \t]*
            (?:
                \n?         # maybe one newline
                [ \t]*
                (?<=\s)     # lookbehind for whitespace
                ['"(]
                [^\n]*      # title
                ['")]
                [ \t]*
            )?              # title is optional
            (?:\n+|\Z)
            """,
            re.X | re.M | re.U,
        ),
    ]

    _whitespace = re.compile(r'\s*')
    _strip_anglebrackets = re.compile(r'<(.*)>.*')
    # Matches the optional title and the closing parenthesis at the very end
    # of the searched region of an inline link tail.
    _inline_link_title = re.compile(
        r'''
            (                   # \1
              [ \t]+
              (['"])            # quote char
              (.*?)
            )?                  # title is optional
          \)$
        ''',
        re.X | re.S,
    )
    # Deletes newlines and spaces so that a stored URL is a single token.
    # Built once here instead of on every _save_url() call.
    _url_strip_table = str.maketrans("", "", '\n ')

    def __init__(self, config):
        """Initialize the plugin and compile the filename pattern.

        :param config: plugin configuration; the value stored under
            ``filename_re`` (if any) replaces the default filename pattern.
            An invalid pattern is reported with a warning and the default
            pattern stays in effect.
        """
        super().__init__(config)
        self.filename_re = self._default_filename_re
        pattern = config.get(self._filename_re_key)
        if pattern:
            try:
                self.filename_re = re.compile(pattern)
            except re.error as msg:
                log.warn(LOG_PLUGIN, _("Invalid regex pattern %r: %s"), pattern, msg)

    @classmethod
    def read_config(cls, configparser):
        """Read configuration file options.

        :param configparser: parser queried with ``has_option``/``get`` for
            the section named after this class
        :return: dict with the single key ``filename_re``; its value is the
            configured pattern string, or None when the option is absent
        """
        section = cls.__name__
        option = cls._filename_re_key
        value = None
        if configparser.has_option(section, option):
            value = configparser.get(section, option)
        return {option: value}

    def applies_to(self, url_data, pagetype=None):
        """Check for Markdown file.

        :param url_data: object whose ``base_url`` is matched against the
            filename pattern
        :param pagetype: not used by this plugin
        :return: True if ``url_data.base_url`` matches the filename pattern
        """
        return self.filename_re.search(url_data.base_url) is not None

    def check(self, url_data):
        """Extracts urls from the file.

        :param url_data: object providing the content via ``get_content()``
            and receiving the found URLs
        """
        content = url_data.get_content()
        self._check_by_re(url_data, content)
        self._check_inline_links(url_data, content)

    def _save_url(self, url_data, content, url_text, url_pos):
        """Saves url. Converts url to 1-line text and url position as offset
        from the file beginning to (line, column).

        :param url_data: object for url storing
        :param content: file content
        :param url_text: url text
        :param url_pos: url position from the beginning
        """
        # Lines are 1-based: one more than the newlines preceding the offset.
        line = content.count('\n', 0, url_pos) + 1
        # Distance from the preceding newline; rfind() returns -1 on the
        # first line, which makes the column 1-based there as well.
        column = url_pos - content.rfind('\n', 0, url_pos)
        url_data.add_url(
            url_text.translate(self._url_strip_table), line=line, column=column
        )

    def _check_by_re(self, url_data, content):
        """Finds urls by re.

        :param url_data: object for url storing
        :param content: file content
        """
        for link_re in self._link_res:
            for found in link_re.finditer(content):
                self._save_url(url_data, content, found.group(1), found.start(1))

    def _find_balanced(self, text, start, open_c, close_c):
        """Returns the index where the open_c and close_c characters balance
        out - the same number of open_c and close_c are encountered - or the
        end of string if it's reached before the balance point is found.

        One ``open_c`` is assumed to have been consumed before ``start``,
        so the nesting depth begins at 1.

        :param text: text to scan
        :param start: index of the first character to examine
        :param open_c: character that increases the nesting depth
        :param close_c: character that decreases the nesting depth
        :return: index just past the balancing ``close_c``, or ``len(text)``
            (or ``start`` itself when ``start`` is not below ``len(text)``)
        """
        i = start
        n = len(text)
        count = 1
        while count > 0 and i < n:
            if text[i] == open_c:
                count += 1
            elif text[i] == close_c:
                count -= 1
            i += 1
        return i

    def _extract_url_and_title(self, text, start):
        """Extracts the url from the tail of a link.

        :param text: text containing the link
        :param start: index of the opening parenthesis of the link tail
        :return: tuple ``(url, end_idx)`` where ``end_idx`` is the index just
            past the closing parenthesis, or ``(None, None)`` if the tail is
            not a well-formed ``(url "optional title")``
        """
        # text[start] equals the opening parenthesis
        idx = self._whitespace.match(text, start + 1).end()
        if idx == len(text):
            return None, None
        end_idx = idx
        has_anglebrackets = text[idx] == "<"
        if has_anglebrackets:
            # Skip over the <...> part first so that parentheses inside the
            # angle brackets do not take part in the balancing below.
            end_idx = self._find_balanced(text, end_idx + 1, "<", ">")
        end_idx = self._find_balanced(text, end_idx, "(", ")")
        # The searched region is limited to [idx, end_idx), so the pattern's
        # "$" anchors at end_idx and the region must end with ")".
        match = self._inline_link_title.search(text, idx, end_idx)
        if not match:
            return None, None
        # Everything before the optional title / closing parenthesis.
        url = text[idx:match.start()]
        if has_anglebrackets:
            url = self._strip_anglebrackets.sub(r'\1', url)
        return url, end_idx

    def _check_inline_links(self, url_data, content):
        """Checks inline links.

        Scans for ``[text](url "title")`` constructs (anchors and images)
        and saves each url found. The position stored with a url is that of
        the opening parenthesis of the link tail.

        :param url_data: url_data object
        :param content: content for processing
        """
        # A '[' without its matching ']' within this many characters is not
        # treated as link markup.
        MAX_LINK_TEXT_SENTINEL = 3000

        curr_pos = 0
        content_length = len(content)

        while True:  # Handle the next link.
            # The next '[' is the start of:
            # - an inline anchor:   [text](url "title")
            # - an inline img:      ![text](url "title")
            # - not markup:         [...anything else...
            try:
                start_idx = content.index('[', curr_pos)
            except ValueError:
                break

            # Find the matching closing ']'.
            bracket_depth = 0
            for p in range(
                start_idx + 1, min(start_idx + MAX_LINK_TEXT_SENTINEL, content_length)
            ):
                if content[p] == ']':
                    bracket_depth -= 1
                    if bracket_depth < 0:
                        break
                elif content[p] == '[':
                    bracket_depth += 1
            else:
                # Closing bracket not found within sentinel length. This isn't markup.
                curr_pos = start_idx + 1
                continue

            # Now determine what this is by the remainder.
            p += 1
            if p >= content_length:
                return

            if content[p] == '(':
                url, url_end_idx = self._extract_url_and_title(content, p)
                if url is not None:
                    self._save_url(url_data, content, url, p)
                    # url_end_idx already points just past the closing ')'.
                    # Resume exactly there: skipping one more character would
                    # miss a link that follows immediately, as in
                    # "[a](b)[c](d)".
                    curr_pos = url_end_idx
                    continue

            # Otherwise, it isn't markup.
            curr_pos = start_idx + 1