Skip to content

Commit

Permalink
Macro rewrite (#2702)
Browse files Browse the repository at this point in the history
* Move all remaining fixText() stuff (macros, curly aposes, and em dashes) into the parser. Manual test changes are now 100% expected.

* Remove stray print()

* Properly revert an unmatched macro to [] characters.

* No need to check for a comment start, since that's already been parsed.

* Fix the regex

* Don't eagerly lowercase macros on parse. Recursively replace HTML-parsed macros.

* Remove the zwsp from the em-dash, as browsers already allow a break opportunity there anyway.

* Pipe previous token into the context, so apostrophe handling can be done after an element's end tag.

* Pull out the 'turn metadata into properly-parsed text' into a function, invoke it on all the macros that need it.

* Add parseTitle() for generating <title>-safe content, and use it (and parseText()) in more metadata. Add a printNodeTree() debugging tool. Rebase some tests, whose changes should all be known-good.

* Actually create Doctype nodes, so I don't accidentally kick docs into quirks mode.

* Rebase all the tests that look expected so far.

* Whoops, restore header/footer addition.

* Handle lists in HTML trees.

* Correctly handle otherMetadata so it doesn't double-wrap with <dd>

* update docs

* Correct the line numbers downstream when I remove a newline while handling em-dashes.

* rebase tests

* Rebase tests that have expected changes

* Switch parser functions to taking a ParseConfig

* Whoops, give Note: paragraphs a line number.

* Make multi-line start tags emit IncrementLineCountChar characters, which increment the offset for Lines and are removed from the output. Switch em-dash line correction to use them. Store macro start/end chars as named constants, too.

* Rather than eagerly adding ilccs, only generate them *on request when stringifying* if the reported line span (endLine - line) is different from the actual line span

* Rebase tests with fixed line numbers

* Add line-count test to exercise the errors more directly.

* Add a test for accidental raw text line count

* Parse char references in text (and properly escape text).

* Instead of stripping comments for Markdown, replace them with a recognizable comment string. Then, if they're the only thing on the line, Markdown can just drop them.

* rebase tests

* Rebase more tests. Don't output escaped text, but do output char refs as the charref, not the underlying character. Explicitly handle whole-line comments in datablocks, now that I don't strip them early.

* lint

* Regen docs

* rebase one final test
  • Loading branch information
tabatkins authored Oct 26, 2023
1 parent 29c533f commit 5c6f7af
Show file tree
Hide file tree
Showing 305 changed files with 2,963 additions and 2,188 deletions.
24 changes: 22 additions & 2 deletions bikeshed/InputSource.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import requests
import tenacity

from . import config, line, t
from . import config, constants, line, t
from . import messages as m


Expand All @@ -24,7 +24,27 @@ class InputContent:

@property
def lines(self) -> list[line.Line]:
    """Turn self.rawLines into line.Line objects with corrected line numbers.

    The early HTML parser runs before Markdown, and in some cases removes
    linebreaks that were present in the source. When properly invoked, it
    inserts a special PUA char for each omitted linebreak, which is stripped
    here while bumping the line-number offset for all following lines.
    Current known causes of this:
    * line-ending -- turned into em dashes
    * multi-line start tags
    """
    numbered: list[line.Line] = []
    extraLines = 0
    marker = constants.incrementLineCountChar
    for index, text in enumerate(self.rawLines, start=1):
        # Number this line using the offset accumulated from *previous*
        # lines; markers found on this line only affect later lines.
        lineNo = index + extraLines
        if marker in text:
            extraLines += text.count(marker)
            text = text.replace(marker, "")
        numbered.append(line.Line(lineNo, text))
    return numbered

@property
def content(self) -> str:
Expand Down
21 changes: 4 additions & 17 deletions bikeshed/Spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import sys
from collections import OrderedDict, defaultdict
from datetime import datetime
from functools import partial as curry

from . import (
InputSource,
Expand Down Expand Up @@ -146,7 +145,7 @@ def earlyParse(self, inputContent: InputSource.InputContent) -> list[l.Line]:
)
self.md = metadata.join(self.mdBaseline, self.mdDefaults, self.mdDocument, self.mdCommandLine)

text = h.strFromNodes(h.initialDocumentParse(inputContent.content, doc=self))
text = h.strFromNodes(h.initialDocumentParse(inputContent.content, h.ParseConfig.fromSpec(self)), withIlcc=True)
inputContent.rawLines = [x + "\n" for x in text.split("\n")]
return inputContent.lines

Expand All @@ -167,7 +166,6 @@ def assembleDocument(self) -> Spec:
u.stripBOM(self)
if self.lineNumbers:
self.lines = u.hackyLineNumbers(self.lines)
self.lines = markdown.stripComments(self.lines)
self.recordDependencies(self.inputSource)
# Extract and process metadata
self.lines, self.mdDocument = metadata.parse(lines=self.lines)
Expand All @@ -182,7 +180,7 @@ def assembleDocument(self) -> Spec:
# Using all of that, load up the text macros so I can sub them into the computed-metadata file.
self.md.fillTextMacros(self.macros, doc=self)
jsonEscapedMacros = {k: json.dumps(v)[1:-1] for k, v in self.macros.items()}
computedMdText = h.replaceMacros(
computedMdText = h.replaceMacrosTextly(
retrieve.retrieveBoilerplateFile(self, "computed-metadata", error=True),
macros=jsonEscapedMacros,
)
Expand Down Expand Up @@ -235,7 +233,7 @@ def assembleDocument(self) -> Spec:
# Convert to a single string of html now, for convenience.
self.html = "".join(x.text for x in self.lines)
boilerplate.addHeaderFooter(self)
self.html = self.fixText(self.html)
self.html = h.replaceMacros(self.html, self.macros)

# Build the document
self.document = h.parseDocument(self.html)
Expand Down Expand Up @@ -470,20 +468,9 @@ def log_message(self, format: t.Any, *args: t.Any) -> None:
except Exception as e:
m.die(f"Something went wrong while watching the file:\n{e}")

def fixText(self, text: str, moreMacros: dict[str, str] | None = None) -> str:
# Do several textual replacements that need to happen *before* the document is parsed as h.

# If markdown shorthands are on, remove all `foo`s while processing,
# so their contents don't accidentally trigger other stuff.
# Also handle markdown escapes.
if moreMacros is None:
moreMacros = {}
def fixText(self, text: str) -> str:
textFunctor: func.Functor = func.Functor(text)

macros = dict(self.macros, **moreMacros)
textFunctor = textFunctor.map(curry(h.replaceMacros, macros=macros))
textFunctor = textFunctor.map(h.fixTypography)

return t.cast(str, textFunctor.extract())

def printTargets(self) -> None:
Expand Down
24 changes: 19 additions & 5 deletions bikeshed/boilerplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@


def boilerplateFromHtml(doc: t.SpecT, htmlString: str) -> t.NodesT:
htmlString = h.parseText(htmlString, h.ParseConfig.fromSpec(doc))
htmlString = h.replaceMacros(htmlString, doc.macros)
htmlString = doc.fixText(htmlString)
bp = h.E.div({}, h.parseHTML(htmlString))
conditional.processConditionals(doc, bp)
Expand Down Expand Up @@ -121,7 +123,9 @@ def addHeaderFooter(doc: t.SpecT) -> None:
header = retrieve.retrieveBoilerplateFile(doc, "header") if "header" in doc.md.boilerplate else ""
footer = retrieve.retrieveBoilerplateFile(doc, "footer") if "footer" in doc.md.boilerplate else ""

doc.html = "\n".join([header, doc.html, footer])
doc.html = "\n".join(
[h.parseText(header, h.ParseConfig.fromSpec(doc)), doc.html, h.parseText(footer, h.ParseConfig.fromSpec(doc))],
)


def fillWith(tag: str, newElements: t.NodesT, doc: t.SpecT) -> None:
Expand Down Expand Up @@ -213,14 +217,16 @@ def addAtRisk(doc: t.SpecT) -> None:
return
html = "<p>The following features are at-risk, and may be dropped during the CR period:\n<ul>"
for feature in doc.md.atRisk:
html += "<li>" + doc.fixText(h.parseText(feature))
html += "<li>" + doc.fixText(h.parseText(feature, h.ParseConfig.fromSpec(doc)))
html += (
"</ul><p>“At-risk” is a W3C Process term-of-art, and does not necessarily imply that the feature is in danger of being dropped or delayed. "
+ "It means that the WG believes the feature may have difficulty being interoperably implemented in a timely manner, "
+ "and marking it as such allows the WG to drop the feature if necessary when transitioning to the Proposed Rec stage, "
+ "without having to publish a new Candidate Rec without the feature first."
)
fillWith("at-risk", h.parseHTML(html), doc=doc)
html = h.replaceMacros(html, doc.macros)
frag = h.parseHTML(html)
fillWith("at-risk", frag, doc=doc)


def addStyles(doc: t.SpecT) -> None:
Expand Down Expand Up @@ -1031,7 +1037,15 @@ def printPreviousVersion(v: dict[str, str]) -> t.ElementT | None:
# and upgrade html-text values into real elements
otherMd: OrderedDict[str, list[MetadataValueT]] = OrderedDict()
for k, vs in doc.md.otherMetadata.items():
parsed: list[t.NodesT] = [h.parseHTML(doc.fixText(v)) if isinstance(v, str) else v for v in vs]
parsed: list[t.NodesT] = []
for v in vs:
if isinstance(v, str):
htmlText = h.parseText(v, h.ParseConfig.fromSpec(doc))
htmlText = h.replaceMacros(htmlText, doc.macros)
htmlText = doc.fixText(htmlText)
parsed.append(h.parseHTML(htmlText))
else:
parsed.append(v)
if k in md:
md[k].extend(parsed)
else:
Expand Down Expand Up @@ -1075,7 +1089,7 @@ def createMdEntry(key: str, dirtyVals: t.Sequence[MetadataValueT], doc: t.SpecT)
ret = [h.E.dt(displayKey, ":")]
# Add all the values, wrapping in a <dd> if necessary.
for val in vals:
if h.isElement(val) and val.tag == "dd":
if h.isElement(val) and h.tagName(val) == "dd":
ret.append(val)
else:
ret.append(h.E.dd({}, val))
Expand Down
10 changes: 5 additions & 5 deletions bikeshed/config/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"DRAFT-FINDING": "Draft Finding",
"FINDING": "Finding",
"whatwg/RD": "Review Draft",
"w3c/ED": "Editor's Draft",
"w3c/ED": "Editors Draft",
"w3c/WD": "W3C Working Draft",
"w3c/FPWD": "W3C First Public Working Draft",
"w3c/LCWD": "W3C Last Call Working Draft",
Expand All @@ -26,7 +26,7 @@
"w3c/WG-NOTE": "W3C Group Note",
"w3c/IG-NOTE": "W3C Group Note",
"w3c/NOTE": "W3C Group Note",
"w3c/NOTE-ED": "Editor's Draft",
"w3c/NOTE-ED": "Editors Draft",
"w3c/NOTE-WD": "W3C Group Draft Note",
"w3c/NOTE-FPWD": "W3C Group Draft Note",
"w3c/DRY": "W3C Draft Registry",
Expand All @@ -49,7 +49,7 @@
"iso/MEET": "Meeting Announcements",
"iso/RESP": "Records of Response",
"iso/MIN": "Minutes",
"iso/ER": "Editor's Report",
"iso/ER": "Editors Report",
"iso/SD": "Standing Document",
"iso/PWI": "Preliminary Work Item",
"iso/NP": "New Proposal",
Expand Down Expand Up @@ -80,13 +80,13 @@
"iso/FD-AMD": "Final Draft Amendment",
"iso/PRF-AMD": "Proof Amendment",
"iso/AMD": "Amendment",
"fido/ED": "Editor's Draft",
"fido/ED": "Editors Draft",
"fido/WD": "Working Draft",
"fido/RD": "Review Draft",
"fido/ID": "Implementation Draft",
"fido/PS": "Proposed Standard",
"fido/FD": "Final Document",
"khronos/ED": "Editor's Draft",
"khronos/ED": "Editors Draft",
"aom/PD": "Pre-Draft",
"aom/WGD": "AOM Working Group Draft",
"aom/WGA": "AOM Working Group Approved Draft",
Expand Down
5 changes: 5 additions & 0 deletions bikeshed/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,8 @@
biblioDisplay: StringEnum = StringEnum("index", "inline", "direct")
chroot: bool = True
executeCode: bool = False

# Private-Use-Area sentinel characters, used to smuggle parser signals
# through document text without colliding with real content.
macroStartChar = "\uebbb"  # marks the start of a parsed [MACRO]
macroEndChar = "\uebbc"  # marks the end of a parsed [MACRO]
# Emitted by the early HTML parser when it removes a source newline
# (e.g. em-dash handling, multi-line start tags), so downstream line
# numbering can be corrected before the char is stripped back out.
incrementLineCountChar = "\uebbd"
# Recognizable stand-in for stripped comments; lines consisting solely of
# this string are dropped later (e.g. by the datablock/Markdown passes).
bsComment = "<!--\uebbe-->"
10 changes: 8 additions & 2 deletions bikeshed/datablocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import attr

from . import biblio, config, h, printjson, refs, t
from . import biblio, config, constants, h, printjson, refs, t
from . import messages as m
from .line import Line

Expand Down Expand Up @@ -91,6 +91,8 @@ def transformDataBlocks(doc: t.SpecT, lines: list[Line] | list[str]) -> list[Lin
blockLines: list[Line] = []
newLines: list[Line] = []
for line in _lines:
if line.text.strip() == constants.bsComment:
continue
# Look for the start of a block.
match = re.match(r"\s*<(pre|xmp)[\s>]", line.text, re.I)
# Note that, by design, I don't pay attention to anything on the same line as the start tag,
Expand Down Expand Up @@ -611,7 +613,7 @@ def parseDefBlock(
else:
vals[key] = val
for key, val in vals.items():
vals[key] = h.parseText(val)
vals[key] = h.parseText(val, h.ParseConfig.fromSpec(doc))
return vals


Expand Down Expand Up @@ -1059,6 +1061,10 @@ def extendData(datas: InfoTreeT, infoLevels: InfoTreeT) -> None:
thisLine = None
if line.strip() == "":
continue
if re.match(r"^\s*<!--.*-->\s*$", line):
# HTML comment filling the whole line,
# go ahead and strip it
continue
ws, text = t.cast("re.Match", re.match(r"(\s*)(.*)", line)).groups()
if text.startswith("#"): # comment
continue
Expand Down
4 changes: 4 additions & 0 deletions bikeshed/h/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,14 @@
parseHTML,
prependChild,
previousElements,
printNodeTree,
relevantHeadings,
removeAttr,
removeClass,
removeNode,
replaceContents,
replaceMacros,
replaceMacrosTextly,
replaceNode,
replaceWithContents,
safeID,
Expand All @@ -80,6 +82,7 @@
Comment,
EndTag,
Failure,
ParseConfig,
ParseFailure,
Result,
StartTag,
Expand Down Expand Up @@ -109,6 +112,7 @@
parseStyleToEnd,
parseTagName,
parseText,
parseTitle,
parseUnquotedAttrValue,
parseWhitespace,
parseXmpToEnd,
Expand Down
60 changes: 55 additions & 5 deletions bikeshed/h/dom.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from lxml.cssselect import CSSSelector
from lxml.html import tostring

from .. import t
from .. import constants, t
from ..messages import die, warn

if t.TYPE_CHECKING:
Expand Down Expand Up @@ -155,6 +155,27 @@ def outerHTML(el: t.NodesT | None, literal: bool = False, with_tail: bool = Fals
return t.cast(str, tostring(el, with_tail=with_tail, encoding="unicode"))


def printNodeTree(node: t.NodeT | str) -> str:
    """Render a parse-tree node and its descendants as an ASCII-art tree.

    Debugging aid only. Text nodes render as `#text: <repr>`; lists act as
    anonymous fragment containers and render as `[]`; elements render via
    serializeTag(). Children hang off ├/╰ branches with │ continuation rails.
    """
    if isinstance(node, str):
        return "#text: " + repr(node)
    rendered = "[]" if isinstance(node, list) else f"{serializeTag(node)}"
    childBlocks = [printNodeTree(kid).split("\n") for kid in childNodes(node)]
    if childBlocks:
        # Every child except the last gets a ├ branch and a │ rail for its
        # continuation lines; the last gets ╰ and plain-space continuation.
        *initial, final = childBlocks
        for block in initial:
            pieces = [" ├" + block[0]] + [" │" + ln for ln in block[1:]]
            rendered += "\n" + "\n".join(pieces)
        pieces = [" ╰" + final[0]] + ["  " + ln for ln in final[1:]]
        rendered += "\n" + "\n".join(pieces)
    return rendered


def linkTextsFromElement(el: t.ElementT) -> list[str]:
if el.get("data-lt") == "":
return []
Expand Down Expand Up @@ -787,10 +808,6 @@ def hasOnlyChild(el: t.ElementT, wsAllowed: bool = True) -> t.ElementT | None:

def fixTypography(text: str) -> str:
# Replace straight aposes with curly quotes for possessives and contractions.
text = re.sub(r"([\w])'([\w])", r"\1’\2", text)
text = re.sub(r"(</[\w]+>)'([\w])", r"\1’\2", text)
# Fix line-ending em dashes, or --, by moving the previous line up, so no space.
text = re.sub(r"([^<][^!])(—|--)\r?\n\s*(\S)", r"\1—<wbr>\3", text)
return text


Expand Down Expand Up @@ -834,6 +851,39 @@ def replaceMacros(text: str, macros: t.Mapping[str, str]) -> str:
# Macro syntax is [FOO], where FOO is /[A-Z0-9-]+/
# If written as [FOO?], failure to find a matching macro just replaced it with nothing;
# otherwise, it throws a fatal error.

def macroReplacer(match: re.Match) -> str:
text = match.group(1).lower().strip()
if text.endswith("?"):
text = text[:-1].strip()
optional = True
else:
optional = False
if text in macros:
# For some reason I store all the macros in lowercase,
# despite requiring them to be spelled with uppercase.
return str(macros[text])
# Nothing has matched, so start failing the macros.
if optional:
return ""
die(
f"Found unmatched text macro [{match.group(1)}]. Correct the macro, or escape it somehow (leading backslash, html escape, etc).",
)
return t.cast(str, "[" + match.group(0)[1:-1] + "]")

while "\uebbb" in text:
# Loop, as macros might expand to more macros
# (which hopefully were HTML-parsed).
ms = constants.macroStartChar
me = constants.macroEndChar
text = re.sub(f"{ms}(.+?){me}", macroReplacer, text)
return text


def replaceMacrosTextly(text: str, macros: t.Mapping[str, str]) -> str:
# Same as replaceMacros(), but does the substitution
# directly on the text, rather than relying on the
# html parser to have preparsed the macro syntax
def macroReplacer(match: re.Match) -> str:
fullText = t.cast(str, match.group(0))
innerText = match.group(2).lower() or ""
Expand Down
Loading

0 comments on commit 5c6f7af

Please sign in to comment.