Skip to content

Commit

Permalink
Macro rewrite (#2702)
Browse files Browse the repository at this point in the history
* Move all remaining fixText() stuff (macros, curly aposes, and em dashes) into the parser. Manual test changes are now 100% expected.

* Remove stray print()

* Properly revert an unmatched macro to [] characters.

* No need to check for a comment start, since that's already been parsed.

* Fix the regex

* Don't eagerly lowercase macros on parse. Recursively replace HTML-parsed macros.

* Remove the zwsp from the em-dash, as browsers already allow a break opportunity there anyway.

* Pipe previous token into the context, so apostrophe handling can be done after an element's end tag.

* Pull out the 'turn metadata into properly-parsed text' into a function, invoke it on all the macros that need it.

* Add parseTitle() for generating <title>-safe content, and use it (and parseText()) in more metadata. Add a printNodeTree() debugging tool. Rebase some tests, whose changes should all be known-good.

* Actually create Doctype nodes, so I don't accidentally kick docs into quirks mode.

* Rebase all the tests that look expected so far.

* Whoops, restore header/footer addition.

* Handle lists in HTML trees.

* Correctly handle otherMetadata so it doesn't double-wrap with <dd>

* update docs

* Correct the line numbers downstream when I remove a newline while handling em-dashes.

* rebase tests

* Rebase tests that have expected changes

* Switch parser functions to taking a ParseConfig

* Whoops, give Note: paragraphs a line number.

* Make multi-line start tags emit IncrementLineCountChar characters, which increment the offset for Lines and are removed from the output. Switch em-dash line correction to use them. Store macro start/end chars as named constants, too.

* Rather than eagerly adding ilccs, only generate them *on request when stringifying* if the reported line span (endLine - line) is different from the actual line span

* Rebase tests with fixed line numbers

* Add line-count test to exercise the errors more directly.

* Add a test for accidental raw text line count

* Parse char references in text (and properly escape text).

* Instead of stripping comments for Markdown, replace them with a recognizable comment string. Then, if they're the only thing on the line, Markdown can just drop them.

* rebase tests

* Rebase more tests. Don't output escaped text, but do output char refs as the charref, not the underlying character. Explicitly handle whole-line comments in datablocks, now that I don't strip them early.

* lint

* Regen docs

* rebase one final test
  • Loading branch information
tabatkins authored Oct 26, 2023
1 parent 29c533f commit 5c6f7af
Show file tree
Hide file tree
Showing 305 changed files with 2,963 additions and 2,188 deletions.
24 changes: 22 additions & 2 deletions bikeshed/InputSource.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import requests
import tenacity

from . import config, line, t
from . import config, constants, line, t
from . import messages as m


Expand All @@ -24,7 +24,27 @@ class InputContent:

@property
def lines(self) -> list[line.Line]:
    """Turn self.rawLines into line.Line objects with corrected line numbers.

    The early HTML parser runs before Markdown, and in some cases removes
    linebreaks that were present in the source. When properly invoked, it
    inserts a special PUA char for each omitted linebreak, which is stripped
    here while bumping the line-number offset for all following lines.
    Current known causes of this:
    * line-ending -- turned into em dashes
    * multi-line start tags
    """
    numbered: list[line.Line] = []
    extraLines = 0
    marker = constants.incrementLineCountChar
    for index, text in enumerate(self.rawLines, start=1):
        # Number this line using the offset accumulated from *previous*
        # lines; markers found on this line only affect later lines.
        lineNo = index + extraLines
        if marker in text:
            extraLines += text.count(marker)
            text = text.replace(marker, "")
        numbered.append(line.Line(lineNo, text))
    return numbered

@property
def content(self) -> str:
Expand Down
21 changes: 4 additions & 17 deletions bikeshed/Spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import sys
from collections import OrderedDict, defaultdict
from datetime import datetime
from functools import partial as curry

from . import (
InputSource,
Expand Down Expand Up @@ -146,7 +145,7 @@ def earlyParse(self, inputContent: InputSource.InputContent) -> list[l.Line]:
)
self.md = metadata.join(self.mdBaseline, self.mdDefaults, self.mdDocument, self.mdCommandLine)

text = h.strFromNodes(h.initialDocumentParse(inputContent.content, doc=self))
text = h.strFromNodes(h.initialDocumentParse(inputContent.content, h.ParseConfig.fromSpec(self)), withIlcc=True)
inputContent.rawLines = [x + "\n" for x in text.split("\n")]
return inputContent.lines

Expand All @@ -167,7 +166,6 @@ def assembleDocument(self) -> Spec:
u.stripBOM(self)
if self.lineNumbers:
self.lines = u.hackyLineNumbers(self.lines)
self.lines = markdown.stripComments(self.lines)
self.recordDependencies(self.inputSource)
# Extract and process metadata
self.lines, self.mdDocument = metadata.parse(lines=self.lines)
Expand All @@ -182,7 +180,7 @@ def assembleDocument(self) -> Spec:
# Using all of that, load up the text macros so I can sub them into the computed-metadata file.
self.md.fillTextMacros(self.macros, doc=self)
jsonEscapedMacros = {k: json.dumps(v)[1:-1] for k, v in self.macros.items()}
computedMdText = h.replaceMacros(
computedMdText = h.replaceMacrosTextly(
retrieve.retrieveBoilerplateFile(self, "computed-metadata", error=True),
macros=jsonEscapedMacros,
)
Expand Down Expand Up @@ -235,7 +233,7 @@ def assembleDocument(self) -> Spec:
# Convert to a single string of html now, for convenience.
self.html = "".join(x.text for x in self.lines)
boilerplate.addHeaderFooter(self)
self.html = self.fixText(self.html)
self.html = h.replaceMacros(self.html, self.macros)

# Build the document
self.document = h.parseDocument(self.html)
Expand Down Expand Up @@ -470,20 +468,9 @@ def log_message(self, format: t.Any, *args: t.Any) -> None:
except Exception as e:
m.die(f"Something went wrong while watching the file:\n{e}")

def fixText(self, text: str, moreMacros: dict[str, str] | None = None) -> str:
# Do several textual replacements that need to happen *before* the document is parsed as h.

# If markdown shorthands are on, remove all `foo`s while processing,
# so their contents don't accidentally trigger other stuff.
# Also handle markdown escapes.
if moreMacros is None:
moreMacros = {}
def fixText(self, text: str) -> str:
textFunctor: func.Functor = func.Functor(text)

macros = dict(self.macros, **moreMacros)
textFunctor = textFunctor.map(curry(h.replaceMacros, macros=macros))
textFunctor = textFunctor.map(h.fixTypography)

return t.cast(str, textFunctor.extract())

def printTargets(self) -> None:
Expand Down
24 changes: 19 additions & 5 deletions bikeshed/boilerplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@


def boilerplateFromHtml(doc: t.SpecT, htmlString: str) -> t.NodesT:
htmlString = h.parseText(htmlString, h.ParseConfig.fromSpec(doc))
htmlString = h.replaceMacros(htmlString, doc.macros)
htmlString = doc.fixText(htmlString)
bp = h.E.div({}, h.parseHTML(htmlString))
conditional.processConditionals(doc, bp)
Expand Down Expand Up @@ -121,7 +123,9 @@ def addHeaderFooter(doc: t.SpecT) -> None:
header = retrieve.retrieveBoilerplateFile(doc, "header") if "header" in doc.md.boilerplate else ""
footer = retrieve.retrieveBoilerplateFile(doc, "footer") if "footer" in doc.md.boilerplate else ""

doc.html = "\n".join([header, doc.html, footer])
doc.html = "\n".join(
[h.parseText(header, h.ParseConfig.fromSpec(doc)), doc.html, h.parseText(footer, h.ParseConfig.fromSpec(doc))],
)


def fillWith(tag: str, newElements: t.NodesT, doc: t.SpecT) -> None:
Expand Down Expand Up @@ -213,14 +217,16 @@ def addAtRisk(doc: t.SpecT) -> None:
return
html = "<p>The following features are at-risk, and may be dropped during the CR period:\n<ul>"
for feature in doc.md.atRisk:
html += "<li>" + doc.fixText(h.parseText(feature))
html += "<li>" + doc.fixText(h.parseText(feature, h.ParseConfig.fromSpec(doc)))
html += (
"</ul><p>“At-risk” is a W3C Process term-of-art, and does not necessarily imply that the feature is in danger of being dropped or delayed. "
+ "It means that the WG believes the feature may have difficulty being interoperably implemented in a timely manner, "
+ "and marking it as such allows the WG to drop the feature if necessary when transitioning to the Proposed Rec stage, "
+ "without having to publish a new Candidate Rec without the feature first."
)
fillWith("at-risk", h.parseHTML(html), doc=doc)
html = h.replaceMacros(html, doc.macros)
frag = h.parseHTML(html)
fillWith("at-risk", frag, doc=doc)


def addStyles(doc: t.SpecT) -> None:
Expand Down Expand Up @@ -1031,7 +1037,15 @@ def printPreviousVersion(v: dict[str, str]) -> t.ElementT | None:
# and upgrade html-text values into real elements
otherMd: OrderedDict[str, list[MetadataValueT]] = OrderedDict()
for k, vs in doc.md.otherMetadata.items():
parsed: list[t.NodesT] = [h.parseHTML(doc.fixText(v)) if isinstance(v, str) else v for v in vs]
parsed: list[t.NodesT] = []
for v in vs:
if isinstance(v, str):
htmlText = h.parseText(v, h.ParseConfig.fromSpec(doc))
htmlText = h.replaceMacros(htmlText, doc.macros)
htmlText = doc.fixText(htmlText)
parsed.append(h.parseHTML(htmlText))
else:
parsed.append(v)
if k in md:
md[k].extend(parsed)
else:
Expand Down Expand Up @@ -1075,7 +1089,7 @@ def createMdEntry(key: str, dirtyVals: t.Sequence[MetadataValueT], doc: t.SpecT)
ret = [h.E.dt(displayKey, ":")]
# Add all the values, wrapping in a <dd> if necessary.
for val in vals:
if h.isElement(val) and val.tag == "dd":
if h.isElement(val) and h.tagName(val) == "dd":
ret.append(val)
else:
ret.append(h.E.dd({}, val))
Expand Down
10 changes: 5 additions & 5 deletions bikeshed/config/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"DRAFT-FINDING": "Draft Finding",
"FINDING": "Finding",
"whatwg/RD": "Review Draft",
"w3c/ED": "Editor's Draft",
"w3c/ED": "Editors Draft",
"w3c/WD": "W3C Working Draft",
"w3c/FPWD": "W3C First Public Working Draft",
"w3c/LCWD": "W3C Last Call Working Draft",
Expand All @@ -26,7 +26,7 @@
"w3c/WG-NOTE": "W3C Group Note",
"w3c/IG-NOTE": "W3C Group Note",
"w3c/NOTE": "W3C Group Note",
"w3c/NOTE-ED": "Editor's Draft",
"w3c/NOTE-ED": "Editors Draft",
"w3c/NOTE-WD": "W3C Group Draft Note",
"w3c/NOTE-FPWD": "W3C Group Draft Note",
"w3c/DRY": "W3C Draft Registry",
Expand All @@ -49,7 +49,7 @@
"iso/MEET": "Meeting Announcements",
"iso/RESP": "Records of Response",
"iso/MIN": "Minutes",
"iso/ER": "Editor's Report",
"iso/ER": "Editors Report",
"iso/SD": "Standing Document",
"iso/PWI": "Preliminary Work Item",
"iso/NP": "New Proposal",
Expand Down Expand Up @@ -80,13 +80,13 @@
"iso/FD-AMD": "Final Draft Amendment",
"iso/PRF-AMD": "Proof Amendment",
"iso/AMD": "Amendment",
"fido/ED": "Editor's Draft",
"fido/ED": "Editors Draft",
"fido/WD": "Working Draft",
"fido/RD": "Review Draft",
"fido/ID": "Implementation Draft",
"fido/PS": "Proposed Standard",
"fido/FD": "Final Document",
"khronos/ED": "Editor's Draft",
"khronos/ED": "Editors Draft",
"aom/PD": "Pre-Draft",
"aom/WGD": "AOM Working Group Draft",
"aom/WGA": "AOM Working Group Approved Draft",
Expand Down
5 changes: 5 additions & 0 deletions bikeshed/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,8 @@
biblioDisplay: StringEnum = StringEnum("index", "inline", "direct")
chroot: bool = True
executeCode: bool = False

# Private-Use-Area sentinel characters, used to smuggle parser signals
# through document text without colliding with real content.
macroStartChar = "\uebbb"  # marks the start of a parsed [MACRO]
macroEndChar = "\uebbc"  # marks the end of a parsed [MACRO]
# Emitted by the early HTML parser when it removes a source newline
# (e.g. em-dash handling, multi-line start tags), so downstream line
# numbering can be corrected before the char is stripped back out.
incrementLineCountChar = "\uebbd"
# Recognizable stand-in for stripped comments; lines consisting solely of
# this string are dropped later (e.g. by the datablock/Markdown passes).
bsComment = "<!--\uebbe-->"
10 changes: 8 additions & 2 deletions bikeshed/datablocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import attr

from . import biblio, config, h, printjson, refs, t
from . import biblio, config, constants, h, printjson, refs, t
from . import messages as m
from .line import Line

Expand Down Expand Up @@ -91,6 +91,8 @@ def transformDataBlocks(doc: t.SpecT, lines: list[Line] | list[str]) -> list[Lin
blockLines: list[Line] = []
newLines: list[Line] = []
for line in _lines:
if line.text.strip() == constants.bsComment:
continue
# Look for the start of a block.
match = re.match(r"\s*<(pre|xmp)[\s>]", line.text, re.I)
# Note that, by design, I don't pay attention to anything on the same line as the start tag,
Expand Down Expand Up @@ -611,7 +613,7 @@ def parseDefBlock(
else:
vals[key] = val
for key, val in vals.items():
vals[key] = h.parseText(val)
vals[key] = h.parseText(val, h.ParseConfig.fromSpec(doc))
return vals


Expand Down Expand Up @@ -1059,6 +1061,10 @@ def extendData(datas: InfoTreeT, infoLevels: InfoTreeT) -> None:
thisLine = None
if line.strip() == "":
continue
if re.match(r"^\s*<!--.*-->\s*$", line):
# HTML comment filling the whole line,
# go ahead and strip it
continue
ws, text = t.cast("re.Match", re.match(r"(\s*)(.*)", line)).groups()
if text.startswith("#"): # comment
continue
Expand Down
4 changes: 4 additions & 0 deletions bikeshed/h/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,14 @@
parseHTML,
prependChild,
previousElements,
printNodeTree,
relevantHeadings,
removeAttr,
removeClass,
removeNode,
replaceContents,
replaceMacros,
replaceMacrosTextly,
replaceNode,
replaceWithContents,
safeID,
Expand All @@ -80,6 +82,7 @@
Comment,
EndTag,
Failure,
ParseConfig,
ParseFailure,
Result,
StartTag,
Expand Down Expand Up @@ -109,6 +112,7 @@
parseStyleToEnd,
parseTagName,
parseText,
parseTitle,
parseUnquotedAttrValue,
parseWhitespace,
parseXmpToEnd,
Expand Down
60 changes: 55 additions & 5 deletions bikeshed/h/dom.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from lxml.cssselect import CSSSelector
from lxml.html import tostring

from .. import t
from .. import constants, t
from ..messages import die, warn

if t.TYPE_CHECKING:
Expand Down Expand Up @@ -155,6 +155,27 @@ def outerHTML(el: t.NodesT | None, literal: bool = False, with_tail: bool = Fals
return t.cast(str, tostring(el, with_tail=with_tail, encoding="unicode"))


def printNodeTree(node: t.NodeT | str) -> str:
    """Render a parse-tree node and its descendants as an ASCII-art tree.

    Debugging aid only. Text nodes render as `#text: <repr>`; lists act as
    anonymous fragment containers and render as `[]`; elements render via
    serializeTag(). Children hang off ├/╰ branches with │ continuation rails.
    """
    if isinstance(node, str):
        return "#text: " + repr(node)
    rendered = "[]" if isinstance(node, list) else f"{serializeTag(node)}"
    childBlocks = [printNodeTree(kid).split("\n") for kid in childNodes(node)]
    if childBlocks:
        # Every child except the last gets a ├ branch and a │ rail for its
        # continuation lines; the last gets ╰ and plain-space continuation.
        *initial, final = childBlocks
        for block in initial:
            pieces = [" ├" + block[0]] + [" │" + ln for ln in block[1:]]
            rendered += "\n" + "\n".join(pieces)
        pieces = [" ╰" + final[0]] + ["  " + ln for ln in final[1:]]
        rendered += "\n" + "\n".join(pieces)
    return rendered


def linkTextsFromElement(el: t.ElementT) -> list[str]:
if el.get("data-lt") == "":
return []
Expand Down Expand Up @@ -787,10 +808,6 @@ def hasOnlyChild(el: t.ElementT, wsAllowed: bool = True) -> t.ElementT | None:

def fixTypography(text: str) -> str:
# Replace straight aposes with curly quotes for possessives and contractions.
text = re.sub(r"([\w])'([\w])", r"\1’\2", text)
text = re.sub(r"(</[\w]+>)'([\w])", r"\1’\2", text)
# Fix line-ending em dashes, or --, by moving the previous line up, so no space.
text = re.sub(r"([^<][^!])(—|--)\r?\n\s*(\S)", r"\1—<wbr>\3", text)
return text


Expand Down Expand Up @@ -834,6 +851,39 @@ def replaceMacros(text: str, macros: t.Mapping[str, str]) -> str:
# Macro syntax is [FOO], where FOO is /[A-Z0-9-]+/
# If written as [FOO?], failure to find a matching macro just replaced it with nothing;
# otherwise, it throws a fatal error.

def macroReplacer(match: re.Match) -> str:
text = match.group(1).lower().strip()
if text.endswith("?"):
text = text[:-1].strip()
optional = True
else:
optional = False
if text in macros:
# For some reason I store all the macros in lowercase,
# despite requiring them to be spelled with uppercase.
return str(macros[text])
# Nothing has matched, so start failing the macros.
if optional:
return ""
die(
f"Found unmatched text macro [{match.group(1)}]. Correct the macro, or escape it somehow (leading backslash, html escape, etc).",
)
return t.cast(str, "[" + match.group(0)[1:-1] + "]")

while "\uebbb" in text:
# Loop, as macros might expand to more macros
# (which hopefully were HTML-parsed).
ms = constants.macroStartChar
me = constants.macroEndChar
text = re.sub(f"{ms}(.+?){me}", macroReplacer, text)
return text


def replaceMacrosTextly(text: str, macros: t.Mapping[str, str]) -> str:
# Same as replaceMacros(), but does the substitution
# directly on the text, rather than relying on the
# html parser to have preparsed the macro syntax
def macroReplacer(match: re.Match) -> str:
fullText = t.cast(str, match.group(0))
innerText = match.group(2).lower() or ""
Expand Down
Loading

0 comments on commit 5c6f7af

Please sign in to comment.