Phase 1 of new HTML parser (#2602)

speced · Jul 11, 2023 · ad86d38
1 parent 20bfb0f
commit ad86d38
Show file tree

Hide file tree

Showing 95 changed files with 1,956 additions and 1,171 deletions.
diff --git a/bikeshed/Spec.py b/bikeshed/Spec.py
@@ -4,6 +4,7 @@
 import glob
 import json
 import os
+import re
 import sys
 from collections import OrderedDict, defaultdict
 from datetime import datetime
@@ -28,7 +29,6 @@
     includes,
     inlineTags,
     language,
-    line,
     lint,
     markdown,
     mdn,
@@ -40,6 +40,7 @@
     t,
     wpt,
 )
+from . import line as l
 from . import messages as m
 from . import unsortedJunk as u
 
@@ -81,7 +82,7 @@ def __init__(
         else:
             self.dataFile = fileRequester
 
-        self.lines: list[line.Line] = []
+        self.lines: list[l.Line] = []
         self.valid = self.initializeState()
 
     def initializeState(self) -> bool:
@@ -121,7 +122,7 @@ def initializeState(self) -> bool:
 
         try:
             inputContent = self.inputSource.read()
-            self.lines = inputContent.lines
+            self.lines = self.earlyParse(inputContent)
             if inputContent.date is not None:
                 self.mdBaseline.addParsedData("Date", inputContent.date)
         except FileNotFoundError:
@@ -133,6 +134,22 @@ def initializeState(self) -> bool:
 
         return True
 
+    def earlyParse(self, inputContent: InputSource.InputContent) -> list[l.Line]:  # sets self.mdDocument, self.mdDefaults and self.md; overwrites inputContent.rawLines
+        _, self.mdDocument = metadata.parse(lines=inputContent.lines)  # only the second item of the returned pair is kept
+
+        # First load the metadata sources from 'local' data
+        self.md = metadata.join(self.mdBaseline, self.mdDocument, self.mdCommandLine)
+        # Using that to determine the Group and Status, load the correct defaults.include boilerplate
+        self.mdDefaults = metadata.fromJson(
+            data=retrieve.retrieveBoilerplateFile(self, "defaults", error=True),
+            source="defaults",
+        )
+        self.md = metadata.join(self.mdBaseline, self.mdDefaults, self.mdDocument, self.mdCommandLine)  # rebuild self.md, now with mdDefaults included in the join
+
+        text = h.strFromNodes(h.initialDocumentParse(inputContent.content, doc=self))  # round-trip the content through initialDocumentParse and strFromNodes
+        inputContent.rawLines = [x + "\n" for x in text.split("\n")]  # NOTE(review): if text ends with "\n", split() yields a trailing "" and so one extra "\n" entry - confirm this is intended
+        return inputContent.lines  # presumably derived from the rawLines assigned above - TODO confirm in InputSource
+
     def checkValidity(self) -> bool:
         return True
 
@@ -188,6 +205,14 @@ def assembleDocument(self) -> Spec:
         self.refs.initializeRefs(doc=self, datablocks=datablocks)
         self.refs.initializeBiblio()
 
+        if "mixed-indents" in self.md.complainAbout:
+            if self.md.indentInfo and self.md.indentInfo.char:
+                checkForMixedIndents(self.lines, self.md.indentInfo)
+            else:
+                m.warn(
+                    "`Complain About: mixed-indents yes` is active, but I couldn't infer the document's indentation. Be more consistent, or turn this lint off.",
+                )
+
         # Deal with further <pre> blocks, and markdown
         self.lines = datablocks.transformDataBlocks(self, self.lines)
 
@@ -446,17 +471,11 @@ def fixText(self, text: str, moreMacros: dict[str, str] | None = None) -> str:
         # Also handle markdown escapes.
         if moreMacros is None:
             moreMacros = {}
-        textFunctor: func.Functor
-        if "markdown" in self.md.markupShorthands:
-            textFunctor = u.MarkdownCodeSpans(text)
-        else:
-            textFunctor = func.Functor(text)
+        textFunctor: func.Functor = func.Functor(text)
 
         macros = dict(self.macros, **moreMacros)
         textFunctor = textFunctor.map(curry(h.replaceMacros, macros=macros))
         textFunctor = textFunctor.map(h.fixTypography)
-        if "css" in self.md.markupShorthands:
-            textFunctor = textFunctor.map(h.replaceAwkwardCSSShorthands)
 
         return t.cast(str, textFunctor.extract())
 
@@ -473,7 +492,7 @@ def printTargets(self) -> None:
     def isOpaqueElement(self, el: t.ElementT) -> bool:
         if el.tag in self.md.opaqueElements:
             return True
-        if el.get("data-opaque") is not None:
+        if el.get("data-opaque") is not None or el.get("bs-opaque") is not None:
             return True
         return False
 
@@ -552,3 +571,17 @@ def addDomintroStyles(doc: Spec) -> None:
         return
 
     doc.extraStyles.setFile("domintro", "Spec-domintro.css")
+
+
+def checkForMixedIndents(lines: t.Sequence[l.Line], info: metadata.IndentInfo) -> None:  # reports via m.lint(); returns nothing
+    badIndentChar = " " if info.char == "\t" else "\t"  # a space when the document indents with tabs, otherwise a tab
+    for line in lines:
+        if not line.text:
+            continue  # empty text: nothing to inspect
+        if line.text.startswith(badIndentChar):
+            if info.char == " ":
+                m.lint(f"Your document appears to use spaces to indent, but line {line.i} starts with tabs.")
+            else:
+                m.lint(f"Your document appears to use tabs to indent, but line {line.i} starts with spaces.")
+        if re.match(r"(\t+ +\t)|( +\t)", line.text):  # separate `if`, so a line can get both lints; re.match anchors at line start: spaces then a tab, optionally preceded by tabs
+            m.lint(f"Line {line.i}'s indent contains tabs after spaces.")
diff --git a/bikeshed/boilerplate.py b/bikeshed/boilerplate.py
@@ -213,7 +213,7 @@ def addAtRisk(doc: t.SpecT) -> None:
         return
     html = "<p>The following features are at-risk, and may be dropped during the CR period:\n<ul>"
     for feature in doc.md.atRisk:
-        html += "<li>" + doc.fixText(feature)
+        html += "<li>" + doc.fixText(h.parseText(feature))
     html += (
         "</ul><p>“At-risk” is a W3C Process term-of-art, and does not necessarily imply that the feature is in danger of being dropped or delayed. "
         + "It means that the WG believes the feature may have difficulty being interoperably implemented in a timely manner, "

diff --git a/bikeshed/cli.py b/bikeshed/cli.py
@@ -371,6 +371,13 @@ def main() -> None:
         action="store_true",
         help="Skip testing the real-world files in the repo, and only run the manually-written ones.",
     )
+    testParser.add_argument(
+        "--folder",
+        dest="folder",
+        default=None,
+        nargs="+",
+        help="Only work on tests whose paths contain any of these folder names.",
+    )
     testParser.add_argument(
         "testFiles",
         default=[],
@@ -650,10 +657,11 @@ def handleTest(options: argparse.Namespace, extras: list[str]) -> None:
     md = metadata.fromCommandLine(extras)
     constants.setErrorLevel("nothing")
     constants.quiet = 100
+    filters = test.TestFilter.fromOptions(options)
     if options.rebase:
-        test.rebase(options.testFiles, md=md)
+        test.rebase(filters, md=md)
     else:
-        result = test.runAllTests(options.testFiles, manualOnly=options.manualOnly, md=md)
+        result = test.runAllTests(filters, md=md)
         sys.exit(0 if result else 1)
 
 

diff --git a/bikeshed/datablocks.py b/bikeshed/datablocks.py
@@ -252,7 +252,7 @@ def transformPre(lines: list[str], tagName: str, firstLine: str, lineNum: int |
 
 
 def transformSimpleDef(lines: list[str], tagName: str, firstLine: str, lineNum: int | None, doc: t.SpecT) -> list[str]:
-    rows = parseDefBlock(lines, "simpledef")
+    rows = parseDefBlock(lines, "simpledef", doc=doc)
     lineNumAttr = ""
     if lineNum is not None:
         lineNumAttr = f" line-number={lineNum}"
@@ -269,7 +269,7 @@ def transformSimpleDef(lines: list[str], tagName: str, firstLine: str, lineNum:
 
 def transformPropdef(lines: list[str], tagName: str, firstLine: str, lineNum: int | None, doc: t.SpecT) -> list[str]:
     attrs: OrderedDict[str, str | None] = OrderedDict()
-    parsedAttrs = parseDefBlock(lines, "propdef")
+    parsedAttrs = parseDefBlock(lines, "propdef", doc=doc)
     # Displays entries in the order specified in attrs,
     # then if there are any unknown parsedAttrs values,
     # they're displayed afterward in the order they were specified.
@@ -382,7 +382,7 @@ def transformDescdef(lines: list[str], tagName: str, firstLine: str, lineNum: in
     lineNumAttr = ""
     if lineNum is not None:
         lineNumAttr = f" line-number={lineNum}"
-    vals = parseDefBlock(lines, "descdef")
+    vals = parseDefBlock(lines, "descdef", doc=doc)
     if "partial" in firstLine or "New values" in vals:
         requiredKeys = ["Name", "For"]
         ret = [
@@ -434,7 +434,7 @@ def transformElementdef(lines: list[str], tagName: str, firstLine: str, lineNum:
     if lineNum is not None:
         lineNumAttr = f" line-number={lineNum}"
     attrs: OrderedDict[str, str | None] = OrderedDict()
-    parsedAttrs = parseDefBlock(lines, "elementdef")
+    parsedAttrs = parseDefBlock(lines, "elementdef", doc=doc)
     if "Attribute groups" in parsedAttrs or "Attributes" in parsedAttrs:
         html = "<ul>"
         if "Attribute groups" in parsedAttrs:
@@ -523,7 +523,7 @@ def transformArgumentdef(
     lineNumAttr = ""
     if lineNum is not None:
         lineNumAttr = f" line-number={lineNum}"
-    attrs = parseDefBlock(lines, "argumentdef", capitalizeKeys=False, lineNum=lineNum)
+    attrs = parseDefBlock(lines, "argumentdef", doc=doc, capitalizeKeys=False, lineNum=lineNum)
     el = h.parseHTML(firstLine + "</pre>")[0]
     if "for" in el.attrib:
         forValue = t.cast(str, el.get("for"))
@@ -585,6 +585,7 @@ def transformArgumentdef(
 def parseDefBlock(
     lines: list[str],
     type: str,
+    doc: t.SpecT,
     capitalizeKeys: bool = True,
     lineNum: int | None = None,
 ) -> OrderedDict[str, str]:
@@ -609,6 +610,8 @@ def parseDefBlock(
             vals[key] += "\n" + val
         else:
             vals[key] = val
+    for key, val in vals.items():
+        vals[key] = h.parseText(val)
     return vals
 
 

diff --git a/bikeshed/h/__init__.py b/bikeshed/h/__init__.py
@@ -58,7 +58,6 @@
     removeAttr,
     removeClass,
     removeNode,
-    replaceAwkwardCSSShorthands,
     replaceContents,
     replaceMacros,
     replaceNode,
@@ -84,6 +83,7 @@
     Result,
     StartTag,
     Stream,
+    initialDocumentParse,
     isASCII,
     isASCIIAlpha,
     isASCIIAlphanum,
@@ -101,14 +101,17 @@
     parseComment,
     parseDoctype,
     parseEndTag,
+    parseLines,
     parseQuotedAttrValue,
     parseScriptToEnd,
     parseStartTag,
     parseStyleToEnd,
     parseTagName,
+    parseText,
     parseUnquotedAttrValue,
     parseWhitespace,
     parseXmpToEnd,
+    strFromNodes,
     test,
 )
 from .serializer import Serializer
diff --git a/bikeshed/h/dom.py b/bikeshed/h/dom.py
@@ -856,32 +856,6 @@ def macroReplacer(match: re.Match) -> str:
     return re.sub(r"(\\|\[)?\[([A-Z0-9-]+)(\??)\]", macroReplacer, text)
 
 
-def replaceAwkwardCSSShorthands(text: str) -> str:
-    # Replace the <<production>> shortcuts, because they won't survive the HTML parser.
-    def replaceProduction(match: re.Match) -> str:
-        syntaxAttr = escapeAttr(match.group(0))
-        escape, text = match.groups()
-        if escape:
-            return escapeHTML(match.group(0)[1:])
-        return f"<fake-production-placeholder class=production bs-autolink-syntax='{syntaxAttr}' data-opaque>{text}</fake-production-placeholder>"
-
-    text = re.sub(r"(\\)?<<([^>\n]+)>>", replaceProduction, text)
-
-    # Replace the ''maybe link'' shortcuts.
-    # They'll survive the HTML parser,
-    # but the current shorthand-recognizer code won't find them if they contain an element.
-    # (The other shortcuts are "atomic" and can't contain elements.)
-    def replaceMaybe(match: re.Match) -> str:
-        syntaxAttr = escapeAttr(match.group(0))
-        escape, text = match.groups()
-        if escape:
-            return escapeHTML(match.group(0)[1:])
-        return f"<fake-maybe-placeholder bs-autolink-syntax='{syntaxAttr}'>{text}</fake-maybe-placeholder>"
-
-    text = re.sub(r"(\\)?''([^=\n]+?)''", replaceMaybe, text)
-    return text
-
-
 def fixupIDs(doc: t.SpecT, els: t.Iterable[t.ElementT]) -> None:
     addOldIDs(els)
     dedupIDs(doc)