Skip to content

Commit

Permalink
Phase 1 of new HTML parser (#2602)
Browse files Browse the repository at this point in the history
  • Loading branch information
tabatkins authored Jul 11, 2023
1 parent 20bfb0f commit ad86d38
Show file tree
Hide file tree
Showing 95 changed files with 1,956 additions and 1,171 deletions.
55 changes: 44 additions & 11 deletions bikeshed/Spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import glob
import json
import os
import re
import sys
from collections import OrderedDict, defaultdict
from datetime import datetime
Expand All @@ -28,7 +29,6 @@
includes,
inlineTags,
language,
line,
lint,
markdown,
mdn,
Expand All @@ -40,6 +40,7 @@
t,
wpt,
)
from . import line as l
from . import messages as m
from . import unsortedJunk as u

Expand Down Expand Up @@ -81,7 +82,7 @@ def __init__(
else:
self.dataFile = fileRequester

self.lines: list[line.Line] = []
self.lines: list[l.Line] = []
self.valid = self.initializeState()

def initializeState(self) -> bool:
Expand Down Expand Up @@ -121,7 +122,7 @@ def initializeState(self) -> bool:

try:
inputContent = self.inputSource.read()
self.lines = inputContent.lines
self.lines = self.earlyParse(inputContent)
if inputContent.date is not None:
self.mdBaseline.addParsedData("Date", inputContent.date)
except FileNotFoundError:
Expand All @@ -133,6 +134,22 @@ def initializeState(self) -> bool:

return True

def earlyParse(self, inputContent: InputSource.InputContent) -> list[l.Line]:
    """
    Do the early, pre-assembly processing of freshly read input.

    Parses the document's metadata, builds `self.md` in two passes
    (local sources only, then again with the "defaults" boilerplate included),
    runs the raw content through the initial document parse,
    and writes the re-serialized text back onto `inputContent` as its raw lines.

    Returns `inputContent.lines` as read after `rawLines` has been replaced.
    """
    _unused, self.mdDocument = metadata.parse(lines=inputContent.lines)

    # Pass 1: only the 'local' metadata sources.
    # This has to happen before the defaults are fetched,
    # since the Group and Status found here determine
    # which defaults.include boilerplate is the correct one.
    localSources = (self.mdBaseline, self.mdDocument, self.mdCommandLine)
    self.md = metadata.join(*localSources)

    defaultsData = retrieve.retrieveBoilerplateFile(self, "defaults", error=True)
    self.mdDefaults = metadata.fromJson(data=defaultsData, source="defaults")

    # Pass 2: same sources, with the defaults slotted in right after the baseline.
    self.md = metadata.join(
        self.mdBaseline,
        self.mdDefaults,
        self.mdDocument,
        self.mdCommandLine,
    )

    parsedNodes = h.initialDocumentParse(inputContent.content, doc=self)
    serialized = h.strFromNodes(parsedNodes)
    # Every piece gets a trailing newline re-attached, including the last one.
    inputContent.rawLines = [f"{piece}\n" for piece in serialized.split("\n")]
    # NOTE(review): presumably `lines` is derived from `rawLines` - confirm in InputContent.
    return inputContent.lines

def checkValidity(self) -> bool:
return True

Expand Down Expand Up @@ -188,6 +205,14 @@ def assembleDocument(self) -> Spec:
self.refs.initializeRefs(doc=self, datablocks=datablocks)
self.refs.initializeBiblio()

if "mixed-indents" in self.md.complainAbout:
if self.md.indentInfo and self.md.indentInfo.char:
checkForMixedIndents(self.lines, self.md.indentInfo)
else:
m.warn(
"`Complain About: mixed-indents yes` is active, but I couldn't infer the document's indentation. Be more consistent, or turn this lint off.",
)

# Deal with further <pre> blocks, and markdown
self.lines = datablocks.transformDataBlocks(self, self.lines)

Expand Down Expand Up @@ -446,17 +471,11 @@ def fixText(self, text: str, moreMacros: dict[str, str] | None = None) -> str:
# Also handle markdown escapes.
if moreMacros is None:
moreMacros = {}
textFunctor: func.Functor
if "markdown" in self.md.markupShorthands:
textFunctor = u.MarkdownCodeSpans(text)
else:
textFunctor = func.Functor(text)
textFunctor: func.Functor = func.Functor(text)

macros = dict(self.macros, **moreMacros)
textFunctor = textFunctor.map(curry(h.replaceMacros, macros=macros))
textFunctor = textFunctor.map(h.fixTypography)
if "css" in self.md.markupShorthands:
textFunctor = textFunctor.map(h.replaceAwkwardCSSShorthands)

return t.cast(str, textFunctor.extract())

Expand All @@ -473,7 +492,7 @@ def printTargets(self) -> None:
def isOpaqueElement(self, el: t.ElementT) -> bool:
    """
    Report whether `el` should be treated as opaque.

    An element is opaque when its tag is listed in `self.md.opaqueElements`,
    or when it carries either a `data-opaque` or a `bs-opaque` attribute
    (the attribute's value is irrelevant; an empty value still counts).

    Returns True for opaque elements, False otherwise.
    """
    if el.tag in self.md.opaqueElements:
        return True
    # A single combined test: the older `data-opaque`-only check must not
    # wrap this one, or `bs-opaque` on its own would never be honored.
    return el.get("data-opaque") is not None or el.get("bs-opaque") is not None

Expand Down Expand Up @@ -552,3 +571,17 @@ def addDomintroStyles(doc: Spec) -> None:
return

doc.extraStyles.setFile("domintro", "Spec-domintro.css")


def checkForMixedIndents(lines: t.Sequence[l.Line], info: metadata.IndentInfo) -> None:
badIndentChar = " " if info.char == "\t" else "\t"
for line in lines:
if not line.text:
continue
if line.text.startswith(badIndentChar):
if info.char == " ":
m.lint(f"Your document appears to use spaces to indent, but line {line.i} starts with tabs.")
else:
m.lint(f"Your document appears to use tabs to indent, but line {line.i} starts with spaces.")
if re.match(r"(\t+ +\t)|( +\t)", line.text):
m.lint(f"Line {line.i}'s indent contains tabs after spaces.")
2 changes: 1 addition & 1 deletion bikeshed/boilerplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def addAtRisk(doc: t.SpecT) -> None:
return
html = "<p>The following features are at-risk, and may be dropped during the CR period:\n<ul>"
for feature in doc.md.atRisk:
html += "<li>" + doc.fixText(feature)
html += "<li>" + doc.fixText(h.parseText(feature))
html += (
"</ul><p>“At-risk” is a W3C Process term-of-art, and does not necessarily imply that the feature is in danger of being dropped or delayed. "
+ "It means that the WG believes the feature may have difficulty being interoperably implemented in a timely manner, "
Expand Down
12 changes: 10 additions & 2 deletions bikeshed/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,13 @@ def main() -> None:
action="store_true",
help="Skip testing the real-world files in the repo, and only run the manually-written ones.",
)
testParser.add_argument(
"--folder",
dest="folder",
default=None,
nargs="+",
help="Only work on tests whose paths contain any of these folder names.",
)
testParser.add_argument(
"testFiles",
default=[],
Expand Down Expand Up @@ -650,10 +657,11 @@ def handleTest(options: argparse.Namespace, extras: list[str]) -> None:
md = metadata.fromCommandLine(extras)
constants.setErrorLevel("nothing")
constants.quiet = 100
filters = test.TestFilter.fromOptions(options)
if options.rebase:
test.rebase(options.testFiles, md=md)
test.rebase(filters, md=md)
else:
result = test.runAllTests(options.testFiles, manualOnly=options.manualOnly, md=md)
result = test.runAllTests(filters, md=md)
sys.exit(0 if result else 1)


Expand Down
13 changes: 8 additions & 5 deletions bikeshed/datablocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def transformPre(lines: list[str], tagName: str, firstLine: str, lineNum: int |


def transformSimpleDef(lines: list[str], tagName: str, firstLine: str, lineNum: int | None, doc: t.SpecT) -> list[str]:
rows = parseDefBlock(lines, "simpledef")
rows = parseDefBlock(lines, "simpledef", doc=doc)
lineNumAttr = ""
if lineNum is not None:
lineNumAttr = f" line-number={lineNum}"
Expand All @@ -269,7 +269,7 @@ def transformSimpleDef(lines: list[str], tagName: str, firstLine: str, lineNum:

def transformPropdef(lines: list[str], tagName: str, firstLine: str, lineNum: int | None, doc: t.SpecT) -> list[str]:
attrs: OrderedDict[str, str | None] = OrderedDict()
parsedAttrs = parseDefBlock(lines, "propdef")
parsedAttrs = parseDefBlock(lines, "propdef", doc=doc)
# Displays entries in the order specified in attrs,
# then if there are any unknown parsedAttrs values,
# they're displayed afterward in the order they were specified.
Expand Down Expand Up @@ -382,7 +382,7 @@ def transformDescdef(lines: list[str], tagName: str, firstLine: str, lineNum: in
lineNumAttr = ""
if lineNum is not None:
lineNumAttr = f" line-number={lineNum}"
vals = parseDefBlock(lines, "descdef")
vals = parseDefBlock(lines, "descdef", doc=doc)
if "partial" in firstLine or "New values" in vals:
requiredKeys = ["Name", "For"]
ret = [
Expand Down Expand Up @@ -434,7 +434,7 @@ def transformElementdef(lines: list[str], tagName: str, firstLine: str, lineNum:
if lineNum is not None:
lineNumAttr = f" line-number={lineNum}"
attrs: OrderedDict[str, str | None] = OrderedDict()
parsedAttrs = parseDefBlock(lines, "elementdef")
parsedAttrs = parseDefBlock(lines, "elementdef", doc=doc)
if "Attribute groups" in parsedAttrs or "Attributes" in parsedAttrs:
html = "<ul>"
if "Attribute groups" in parsedAttrs:
Expand Down Expand Up @@ -523,7 +523,7 @@ def transformArgumentdef(
lineNumAttr = ""
if lineNum is not None:
lineNumAttr = f" line-number={lineNum}"
attrs = parseDefBlock(lines, "argumentdef", capitalizeKeys=False, lineNum=lineNum)
attrs = parseDefBlock(lines, "argumentdef", doc=doc, capitalizeKeys=False, lineNum=lineNum)
el = h.parseHTML(firstLine + "</pre>")[0]
if "for" in el.attrib:
forValue = t.cast(str, el.get("for"))
Expand Down Expand Up @@ -585,6 +585,7 @@ def transformArgumentdef(
def parseDefBlock(
lines: list[str],
type: str,
doc: t.SpecT,
capitalizeKeys: bool = True,
lineNum: int | None = None,
) -> OrderedDict[str, str]:
Expand All @@ -609,6 +610,8 @@ def parseDefBlock(
vals[key] += "\n" + val
else:
vals[key] = val
for key, val in vals.items():
vals[key] = h.parseText(val)
return vals


Expand Down
5 changes: 4 additions & 1 deletion bikeshed/h/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@
removeAttr,
removeClass,
removeNode,
replaceAwkwardCSSShorthands,
replaceContents,
replaceMacros,
replaceNode,
Expand All @@ -84,6 +83,7 @@
Result,
StartTag,
Stream,
initialDocumentParse,
isASCII,
isASCIIAlpha,
isASCIIAlphanum,
Expand All @@ -101,14 +101,17 @@
parseComment,
parseDoctype,
parseEndTag,
parseLines,
parseQuotedAttrValue,
parseScriptToEnd,
parseStartTag,
parseStyleToEnd,
parseTagName,
parseText,
parseUnquotedAttrValue,
parseWhitespace,
parseXmpToEnd,
strFromNodes,
test,
)
from .serializer import Serializer
26 changes: 0 additions & 26 deletions bikeshed/h/dom.py
Original file line number Diff line number Diff line change
Expand Up @@ -856,32 +856,6 @@ def macroReplacer(match: re.Match) -> str:
return re.sub(r"(\\|\[)?\[([A-Z0-9-]+)(\??)\]", macroReplacer, text)


def replaceAwkwardCSSShorthands(text: str) -> str:
    """
    Swap the CSS shorthands that are awkward for the HTML parser for placeholder elements.

    Two shorthands are rewritten, in this order:

    * `<<production>>` becomes a `<fake-production-placeholder>` element,
      since the raw form won't survive the HTML parser;
    * `''maybe link''` becomes a `<fake-maybe-placeholder>` element.
      It would survive the parser, but the shorthand recognizer
      can't find it once it contains an element.
      (The remaining shorthands are "atomic" and can't contain elements.)

    In both cases the original shorthand text is kept, attribute-escaped,
    in `bs-autolink-syntax`. A shorthand preceded by a backslash is not converted;
    it is emitted HTML-escaped with the backslash dropped.
    """

    def productionPlaceholder(found: re.Match) -> str:
        originalSyntax = escapeAttr(found.group(0))
        backslash, inner = found.group(1, 2)
        if not backslash:
            return f"<fake-production-placeholder class=production bs-autolink-syntax='{originalSyntax}' data-opaque>{inner}</fake-production-placeholder>"
        # Escaped: drop the leading backslash and emit the rest literally.
        return escapeHTML(found.group(0)[1:])

    def maybePlaceholder(found: re.Match) -> str:
        originalSyntax = escapeAttr(found.group(0))
        backslash, inner = found.group(1, 2)
        if not backslash:
            return f"<fake-maybe-placeholder bs-autolink-syntax='{originalSyntax}'>{inner}</fake-maybe-placeholder>"
        # Escaped: drop the leading backslash and emit the rest literally.
        return escapeHTML(found.group(0)[1:])

    withProductions = re.sub(r"(\\)?<<([^>\n]+)>>", productionPlaceholder, text)
    return re.sub(r"(\\)?''([^=\n]+?)''", maybePlaceholder, withProductions)


def fixupIDs(doc: t.SpecT, els: t.Iterable[t.ElementT]) -> None:
addOldIDs(els)
dedupIDs(doc)
Expand Down
Loading

0 comments on commit ad86d38

Please sign in to comment.