Source code for mdit_py_plugins.gfm_autolink.index

"""GFM autolink extension rules.

Three inline scanners are registered:

- **gfm_autolink_www** (char ``w``): bare ``www.`` URLs.
  Uses ``add_terminator_char("w")`` so the text scanner interrupts at ``w``.
- **gfm_autolink_protocol** (char ``:``): ``http://``, ``https://``,
  ``mailto:``, ``xmpp:`` URLs via back-scanning ``pending``.
- **gfm_autolink_email** (char ``@``): bare email addresses via
  back-scanning ``pending``.

Since ``:`` and ``@`` are already default terminator characters in
markdown-it-py, the protocol and email rules are invoked at every occurrence
of those characters. They use a *back-scanning* approach: looking backwards
through ``state.pending`` for a protocol prefix or email local-part that was
accumulated by the text rule. This means every ``:`` and ``@`` in the
document incurs a (cheap) regex check or character scan of pending text.

The trade-off vs. a **core-rule** (post-processing) approach — which would
walk the final token stream, find autolink patterns in text tokens, and
split them — is:

- **Inline approach** (current): simpler, integrates naturally with
  ``state.linkLevel`` to suppress matching inside links, but relies on the
  prefix being present in ``state.pending`` (if a prior inline rule consumed
  part of the prefix, matching would fail — unlikely in practice).
- **Core-rule approach**: guaranteed to find all autolinks regardless of
  inline rule ordering, but requires token-stream surgery (splitting text
  tokens and inserting link tokens) and cannot easily interact with nesting
  guards like ``linkLevel``.

The ``w`` terminator is the only *new* terminator added. It causes the text
rule to interrupt at every ``w``, which is a minor performance cost for
documents heavy in that letter, but necessary since ``www.`` must be matched
from the start of the URL.

Specification: https://github.github.com/gfm/#autolinks-extension-

.. versionadded:: 0.5.0

Requires markdown-it-py ≥ 4.1.0.
"""

from __future__ import annotations

import re

from markdown_it import MarkdownIt
from markdown_it.rules_inline import StateInline

from ._match import check_prev, match_any_email, match_http, match_www

# Regex to back-scan pending text for a protocol name ending at the current
# position (the colon character).
_PROTO_RE = re.compile(r"(?:^|.)(https?|mailto|xmpp)$", re.DOTALL)






# ---------------------------------------------------------------------------
# Helpers (inline rules)
# ---------------------------------------------------------------------------


def _preceding_ok(state: StateInline, bscan_len: int) -> bool:
    """Check whether the character before the back-scanned portion allows an autolink."""
    abs_pos = state.pos - bscan_len
    if abs_pos <= 0:
        return True
    preceding = state.src[abs_pos - 1]
    return check_prev(preceding)


def _create_autolink(
    state: StateInline,
    bscan_len: int,
    total_len: int,
    url: str,
    text: str,
) -> bool:
    """Emit ``link_open`` / ``text`` / ``link_close`` tokens.

    *bscan_len* characters are trimmed from the end of ``state.pending``
    (the back-scanned protocol or local part).  The parser position is
    then advanced by ``total_len - bscan_len`` characters.
    """
    if bscan_len:
        state.pending = state.pending[:-bscan_len]

    full_url = state.md.normalizeLink(url)
    if not state.md.validateLink(full_url):
        return False

    token = state.push("link_open", "a", 1)
    token.attrs = {"href": full_url}
    token.markup = "autolink"
    token.info = "auto"

    token = state.push("text", "", 0)
    token.content = state.md.normalizeLinkText(text)

    token = state.push("link_close", "a", -1)
    token.markup = "autolink"
    token.info = "auto"

    state.pos += total_len - bscan_len
    return True


# ---------------------------------------------------------------------------
# www inline rule  (requires add_terminator_char("w") — markdown-it-py >= 4.1.0)
# ---------------------------------------------------------------------------


def _www_inline_rule(state: StateInline, silent: bool) -> bool:
    """Match ``www.`` autolinks as an inline rule (trigger char: ``w``)."""
    if state.linkLevel > 0:
        return False

    pos = state.pos
    src = state.src

    # Quick check: must be 'w' and form "www."
    if src[pos] != "w":
        return False
    if pos + 4 > state.posMax or src[pos : pos + 4] != "www.":
        return False

    # Check preceding character (from pending text or start-of-line).
    if state.pending:
        preceding = state.pending[-1]
        if not check_prev(preceding):
            return False
    elif pos > 0:
        preceding = src[pos - 1]
        if not check_prev(preceding):
            return False

    result = match_www(src[pos : state.posMax])
    if result is None:
        return False

    url, length = result
    label = src[pos : pos + length]

    if silent:
        return True

    full_url = state.md.normalizeLink(url)
    if not state.md.validateLink(full_url):
        return False

    token = state.push("link_open", "a", 1)
    token.attrs = {"href": full_url}
    token.markup = "autolink"
    token.info = "auto"

    token = state.push("text", "", 0)
    token.content = state.md.normalizeLinkText(label)

    token = state.push("link_close", "a", -1)
    token.markup = "autolink"
    token.info = "auto"

    state.pos += length
    return True


# ---------------------------------------------------------------------------
# Protocol scanner  (trigger char: ':')
# ---------------------------------------------------------------------------


def _protocol_rule(state: StateInline, silent: bool) -> bool:
    if state.linkLevel > 0:
        return False

    pos = state.pos
    remaining = state.src[pos : state.posMax]

    # Must start with ':' and have at least 3 more characters.
    if len(remaining) < 4 or remaining[0] != ":":
        return False

    # Back-scan pending text for a known protocol name.
    m = _PROTO_RE.search(state.pending)
    if m is None:
        return False

    proto = m.group(1)
    bscan_len = len(proto)

    if not _preceding_ok(state, bscan_len):
        return False

    # Combine back-scanned protocol with the remaining text.
    combined = proto + remaining

    if proto in ("mailto", "xmpp"):
        result = match_any_email(combined, bscan_len + 1, proto)
    else:
        result = match_http(combined)

    if result is None:
        return False

    full_url, total_len = result
    label = combined[:total_len]

    if silent:
        return True
    return _create_autolink(state, bscan_len, total_len, full_url, label)


# ---------------------------------------------------------------------------
# Bare email scanner  (trigger char: '@')
# ---------------------------------------------------------------------------


def _email_rule(state: StateInline, silent: bool) -> bool:
    if state.linkLevel > 0:
        return False

    pos = state.pos
    if pos >= state.posMax or state.src[pos] != "@":
        return False
    # Need at least one character after '@'.
    if pos + 1 >= state.posMax:
        return False

    # Back-scan pending text for the local part of the email.
    local_rev: list[str] = []
    for ch in reversed(state.pending):
        if ch.isascii() and (ch.isalnum() or ch in ".+-_"):
            local_rev.append(ch)
        else:
            break

    if not local_rev:
        return False

    local_len = len(local_rev)
    if not _preceding_ok(state, local_len):
        return False

    # Forward-scan for the domain part.
    after_at = state.src[pos + 1 : state.posMax]
    domain_len = 0
    num_period = 0
    for i, ch in enumerate(after_at):
        if ch.isascii() and ch.isalnum():
            pass
        elif ch == "@":
            return False
        elif (
            ch == "."
            and i + 1 < len(after_at)
            and after_at[i + 1].isascii()
            and after_at[i + 1].isalnum()
        ):
            num_period += 1
        elif ch != "-" and ch != "_":
            break
        domain_len += 1

    if domain_len == 0 or num_period == 0:
        return False

    last_ch = after_at[domain_len - 1]
    if not (last_ch.isascii() and last_ch.isalnum()) and last_ch != ".":
        return False

    local_part = "".join(reversed(local_rev))
    email_text = local_part + state.src[pos : pos + 1 + domain_len]
    total_len = local_len + 1 + domain_len
    url = "mailto:" + email_text

    if silent:
        return True
    return _create_autolink(state, local_len, total_len, url, email_text)