"""GFM autolink extension rules.
Three inline scanners are registered:
- **gfm_autolink_www** (char ``w``): bare ``www.`` URLs.
Uses ``add_terminator_char("w")`` so the text scanner interrupts at ``w``.
- **gfm_autolink_protocol** (char ``:``): ``http://``, ``https://``,
``mailto:``, ``xmpp:`` URLs via back-scanning ``pending``.
- **gfm_autolink_email** (char ``@``): bare email addresses via
back-scanning ``pending``.
Since ``:`` and ``@`` are already default terminator characters in
markdown-it-py, the protocol and email rules are invoked at every occurrence
of those characters. They use a *back-scanning* approach: looking backwards
through ``state.pending`` for a protocol prefix or email local-part that was
accumulated by the text rule. This means every ``:`` and ``@`` in the
document incurs a (cheap) regex check or character scan of pending text.
The trade-off vs. a **core-rule** (post-processing) approach — which would
walk the final token stream, find autolink patterns in text tokens, and
split them — is:
- **Inline approach** (current): simpler, integrates naturally with
``state.linkLevel`` to suppress matching inside links, but relies on the
prefix being present in ``state.pending`` (if a prior inline rule consumed
part of the prefix, matching would fail — unlikely in practice).
- **Core-rule approach**: guaranteed to find all autolinks regardless of
inline rule ordering, but requires token-stream surgery (splitting text
tokens and inserting link tokens) and cannot easily interact with nesting
guards like ``linkLevel``.
The ``w`` terminator is the only *new* terminator added. It causes the text
rule to interrupt at every ``w``, which is a minor performance cost for
documents heavy in that letter, but necessary since ``www.`` must be matched
from the start of the URL.
Specification: https://github.github.com/gfm/#autolinks-extension-
.. versionadded:: 0.5.0
Requires markdown-it-py ≥ 4.1.0.
"""
from __future__ import annotations
import re
from markdown_it import MarkdownIt
from markdown_it.rules_inline import StateInline
from ._match import check_prev, match_any_email, match_http, match_www
# Regex to back-scan pending text for a protocol name ending at the current
# position (the colon character).
_PROTO_RE = re.compile(r"(?:^|.)(https?|mailto|xmpp)$", re.DOTALL)
def gfm_autolink_plugin(md: MarkdownIt) -> None:
    """Enable the GFM autolink extension.

    Recognises bare ``www.`` URLs, ``http(s)://`` URLs,
    ``mailto:``/``xmpp:`` links, and bare email addresses.

    The plugin registers ``w`` as an additional terminator character on
    ``md.inline`` and pushes three rules onto ``md.inline.ruler``:
    ``gfm_autolink_www``, ``gfm_autolink_protocol`` and
    ``gfm_autolink_email``.

    Requires markdown-it-py ≥ 4.1.0.

    :param md: the parser instance to extend; it is modified in place.
    :raises RuntimeError: if ``md.inline`` has no ``add_terminator_char``
        attribute, which is taken to mean the installed markdown-it-py is
        older than 4.1.0.
    """
    inline = md.inline

    # Fail early with a clear message rather than with an AttributeError on
    # the add_terminator_char() call below.
    if not hasattr(inline, "add_terminator_char"):
        raise RuntimeError("gfm_autolink_plugin requires markdown-it-py >= 4.1.0")

    # The www rule is triggered by the letter itself, so the text scanner has
    # to stop at every ``w``.
    inline.add_terminator_char("w")

    inline.ruler.push("gfm_autolink_www", _www_inline_rule)
    inline.ruler.push("gfm_autolink_protocol", _protocol_rule)
    inline.ruler.push("gfm_autolink_email", _email_rule)
# ---------------------------------------------------------------------------
# Helpers (inline rules)
# ---------------------------------------------------------------------------
def _preceding_ok(state: StateInline, bscan_len: int) -> bool:
"""Check whether the character before the back-scanned portion allows an autolink."""
abs_pos = state.pos - bscan_len
if abs_pos <= 0:
return True
preceding = state.src[abs_pos - 1]
return check_prev(preceding)
def _create_autolink(
state: StateInline,
bscan_len: int,
total_len: int,
url: str,
text: str,
) -> bool:
"""Emit ``link_open`` / ``text`` / ``link_close`` tokens.
*bscan_len* characters are trimmed from the end of ``state.pending``
(the back-scanned protocol or local part). The parser position is
then advanced by ``total_len - bscan_len`` characters.
"""
if bscan_len:
state.pending = state.pending[:-bscan_len]
full_url = state.md.normalizeLink(url)
if not state.md.validateLink(full_url):
return False
token = state.push("link_open", "a", 1)
token.attrs = {"href": full_url}
token.markup = "autolink"
token.info = "auto"
token = state.push("text", "", 0)
token.content = state.md.normalizeLinkText(text)
token = state.push("link_close", "a", -1)
token.markup = "autolink"
token.info = "auto"
state.pos += total_len - bscan_len
return True
# ---------------------------------------------------------------------------
# www inline rule (requires add_terminator_char("w") — markdown-it-py >= 4.1.0)
# ---------------------------------------------------------------------------
def _www_inline_rule(state: StateInline, silent: bool) -> bool:
"""Match ``www.`` autolinks as an inline rule (trigger char: ``w``)."""
if state.linkLevel > 0:
return False
pos = state.pos
src = state.src
# Quick check: must be 'w' and form "www."
if src[pos] != "w":
return False
if pos + 4 > state.posMax or src[pos : pos + 4] != "www.":
return False
# Check preceding character (from pending text or start-of-line).
if state.pending:
preceding = state.pending[-1]
if not check_prev(preceding):
return False
elif pos > 0:
preceding = src[pos - 1]
if not check_prev(preceding):
return False
result = match_www(src[pos : state.posMax])
if result is None:
return False
url, length = result
label = src[pos : pos + length]
if silent:
return True
full_url = state.md.normalizeLink(url)
if not state.md.validateLink(full_url):
return False
token = state.push("link_open", "a", 1)
token.attrs = {"href": full_url}
token.markup = "autolink"
token.info = "auto"
token = state.push("text", "", 0)
token.content = state.md.normalizeLinkText(label)
token = state.push("link_close", "a", -1)
token.markup = "autolink"
token.info = "auto"
state.pos += length
return True
# ---------------------------------------------------------------------------
# Protocol scanner (trigger char: ':')
# ---------------------------------------------------------------------------
def _protocol_rule(state: StateInline, silent: bool) -> bool:
if state.linkLevel > 0:
return False
pos = state.pos
remaining = state.src[pos : state.posMax]
# Must start with ':' and have at least 3 more characters.
if len(remaining) < 4 or remaining[0] != ":":
return False
# Back-scan pending text for a known protocol name.
m = _PROTO_RE.search(state.pending)
if m is None:
return False
proto = m.group(1)
bscan_len = len(proto)
if not _preceding_ok(state, bscan_len):
return False
# Combine back-scanned protocol with the remaining text.
combined = proto + remaining
if proto in ("mailto", "xmpp"):
result = match_any_email(combined, bscan_len + 1, proto)
else:
result = match_http(combined)
if result is None:
return False
full_url, total_len = result
label = combined[:total_len]
if silent:
return True
return _create_autolink(state, bscan_len, total_len, full_url, label)
# ---------------------------------------------------------------------------
# Bare email scanner (trigger char: '@')
# ---------------------------------------------------------------------------
def _email_rule(state: StateInline, silent: bool) -> bool:
if state.linkLevel > 0:
return False
pos = state.pos
if pos >= state.posMax or state.src[pos] != "@":
return False
# Need at least one character after '@'.
if pos + 1 >= state.posMax:
return False
# Back-scan pending text for the local part of the email.
local_rev: list[str] = []
for ch in reversed(state.pending):
if ch.isascii() and (ch.isalnum() or ch in ".+-_"):
local_rev.append(ch)
else:
break
if not local_rev:
return False
local_len = len(local_rev)
if not _preceding_ok(state, local_len):
return False
# Forward-scan for the domain part.
after_at = state.src[pos + 1 : state.posMax]
domain_len = 0
num_period = 0
for i, ch in enumerate(after_at):
if ch.isascii() and ch.isalnum():
pass
elif ch == "@":
return False
elif (
ch == "."
and i + 1 < len(after_at)
and after_at[i + 1].isascii()
and after_at[i + 1].isalnum()
):
num_period += 1
elif ch != "-" and ch != "_":
break
domain_len += 1
if domain_len == 0 or num_period == 0:
return False
last_ch = after_at[domain_len - 1]
if not (last_ch.isascii() and last_ch.isalnum()) and last_ch != ".":
return False
local_part = "".join(reversed(local_rev))
email_text = local_part + state.src[pos : pos + 1 + domain_len]
total_len = local_len + 1 + domain_len
url = "mailto:" + email_text
if silent:
return True
return _create_autolink(state, local_len, total_len, url, email_text)