Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Phase 1 of new HTML parser #2602

Merged
merged 17 commits into from
Jul 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 44 additions & 11 deletions bikeshed/Spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import glob
import json
import os
import re
import sys
from collections import OrderedDict, defaultdict
from datetime import datetime
Expand All @@ -28,7 +29,6 @@
includes,
inlineTags,
language,
line,
lint,
markdown,
mdn,
Expand All @@ -40,6 +40,7 @@
t,
wpt,
)
from . import line as l
from . import messages as m
from . import unsortedJunk as u

Expand Down Expand Up @@ -81,7 +82,7 @@ def __init__(
else:
self.dataFile = fileRequester

self.lines: list[line.Line] = []
self.lines: list[l.Line] = []
self.valid = self.initializeState()

def initializeState(self) -> bool:
Expand Down Expand Up @@ -121,7 +122,7 @@ def initializeState(self) -> bool:

try:
inputContent = self.inputSource.read()
self.lines = inputContent.lines
self.lines = self.earlyParse(inputContent)
if inputContent.date is not None:
self.mdBaseline.addParsedData("Date", inputContent.date)
except FileNotFoundError:
Expand All @@ -133,6 +134,22 @@ def initializeState(self) -> bool:

return True

def earlyParse(self, inputContent: InputSource.InputContent) -> list[l.Line]:
    """Resolve the document's metadata, then run the initial parse over its raw text.

    Side effects: sets ``self.mdDocument``, ``self.mdDefaults`` and ``self.md``,
    and overwrites ``inputContent.rawLines`` with the re-serialized parse result.

    Args:
        inputContent: The content read from the input source. Its ``lines`` and
            ``content`` attributes are read; its ``rawLines`` attribute is replaced.

    Returns:
        ``inputContent.lines`` as read *after* ``rawLines`` has been replaced.
        NOTE(review): presumably ``lines`` is derived from ``rawLines`` -- confirm
        against ``InputSource.InputContent``.
    """
    # Only the second element of the parse result (the document's metadata) is kept.
    _, self.mdDocument = metadata.parse(lines=inputContent.lines)

    # First load the metadata sources from 'local' data
    self.md = metadata.join(self.mdBaseline, self.mdDocument, self.mdCommandLine)
    # Using that to determine the Group and Status, load the correct defaults.include boilerplate
    self.mdDefaults = metadata.fromJson(
        data=retrieve.retrieveBoilerplateFile(self, "defaults", error=True),
        source="defaults",
    )
    # Re-join now that the defaults are known; they are passed between the
    # baseline and the document's own metadata.
    self.md = metadata.join(self.mdBaseline, self.mdDefaults, self.mdDocument, self.mdCommandLine)

    # Parse the raw content, serialize the resulting nodes back to a string,
    # and store that string as newline-terminated lines.
    text = h.strFromNodes(h.initialDocumentParse(inputContent.content, doc=self))
    inputContent.rawLines = [x + "\n" for x in text.split("\n")]
    return inputContent.lines

def checkValidity(self) -> bool:
    """Report whether the spec is valid; this implementation always returns ``True``."""
    return True

Expand Down Expand Up @@ -188,6 +205,14 @@ def assembleDocument(self) -> Spec:
self.refs.initializeRefs(doc=self, datablocks=datablocks)
self.refs.initializeBiblio()

if "mixed-indents" in self.md.complainAbout:
if self.md.indentInfo and self.md.indentInfo.char:
checkForMixedIndents(self.lines, self.md.indentInfo)
else:
m.warn(
"`Complain About: mixed-indents yes` is active, but I couldn't infer the document's indentation. Be more consistent, or turn this lint off.",
)

# Deal with further <pre> blocks, and markdown
self.lines = datablocks.transformDataBlocks(self, self.lines)

Expand Down Expand Up @@ -446,17 +471,11 @@ def fixText(self, text: str, moreMacros: dict[str, str] | None = None) -> str:
# Also handle markdown escapes.
if moreMacros is None:
moreMacros = {}
textFunctor: func.Functor
if "markdown" in self.md.markupShorthands:
textFunctor = u.MarkdownCodeSpans(text)
else:
textFunctor = func.Functor(text)
textFunctor: func.Functor = func.Functor(text)

macros = dict(self.macros, **moreMacros)
textFunctor = textFunctor.map(curry(h.replaceMacros, macros=macros))
textFunctor = textFunctor.map(h.fixTypography)
if "css" in self.md.markupShorthands:
textFunctor = textFunctor.map(h.replaceAwkwardCSSShorthands)

return t.cast(str, textFunctor.extract())

Expand All @@ -473,7 +492,7 @@ def printTargets(self) -> None:
def isOpaqueElement(self, el: t.ElementT) -> bool:
    """Return whether ``el`` counts as an opaque element.

    An element is opaque when its tag is listed in ``self.md.opaqueElements``,
    or when ``el.get`` returns something other than ``None`` for either the
    ``data-opaque`` or the ``bs-opaque`` attribute.

    Args:
        el: The element to inspect; its ``tag`` attribute and ``get`` method are used.

    Returns:
        ``True`` if the element is opaque, ``False`` otherwise.
    """
    if el.tag in self.md.opaqueElements:
        return True
    # The check is against None, so an attribute set to the empty string still counts.
    return el.get("data-opaque") is not None or el.get("bs-opaque") is not None

Expand Down Expand Up @@ -552,3 +571,17 @@ def addDomintroStyles(doc: Spec) -> None:
return

doc.extraStyles.setFile("domintro", "Spec-domintro.css")


def checkForMixedIndents(lines: t.Sequence[l.Line], info: metadata.IndentInfo) -> None:
    """Emit lint messages for lines whose leading whitespace is inconsistent.

    Two independent checks run on every non-empty line:

    * the line starts with the "other" indent character (a tab when
      ``info.char`` is a space, a space otherwise);
    * the line's leading whitespace has a tab following one or more spaces.

    Args:
        lines: The lines to check; each line's ``text`` and ``i`` attributes are read.
        info: The indentation info; only its ``char`` attribute is read.

    Returns:
        None. Findings are reported through ``m.lint``.
    """
    badIndentChar = " " if info.char == "\t" else "\t"
    # Compiled once, outside the loop; .match() anchors at the start of the line.
    tabsAfterSpaces = re.compile(r"(\t+ +\t)|( +\t)")
    for line in lines:
        if not line.text:
            continue
        if line.text.startswith(badIndentChar):
            if info.char == " ":
                m.lint(f"Your document appears to use spaces to indent, but line {line.i} starts with tabs.")
            else:
                m.lint(f"Your document appears to use tabs to indent, but line {line.i} starts with spaces.")
        if tabsAfterSpaces.match(line.text):
            m.lint(f"Line {line.i}'s indent contains tabs after spaces.")
2 changes: 1 addition & 1 deletion bikeshed/boilerplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def addAtRisk(doc: t.SpecT) -> None:
return
html = "<p>The following features are at-risk, and may be dropped during the CR period:\n<ul>"
for feature in doc.md.atRisk:
html += "<li>" + doc.fixText(feature)
html += "<li>" + doc.fixText(h.parseText(feature))
html += (
"</ul><p>“At-risk” is a W3C Process term-of-art, and does not necessarily imply that the feature is in danger of being dropped or delayed. "
+ "It means that the WG believes the feature may have difficulty being interoperably implemented in a timely manner, "
Expand Down
12 changes: 10 additions & 2 deletions bikeshed/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,13 @@ def main() -> None:
action="store_true",
help="Skip testing the real-world files in the repo, and only run the manually-written ones.",
)
testParser.add_argument(
"--folder",
dest="folder",
default=None,
nargs="+",
help="Only work on tests whose paths contain any of these folder names.",
)
testParser.add_argument(
"testFiles",
default=[],
Expand Down Expand Up @@ -650,10 +657,11 @@ def handleTest(options: argparse.Namespace, extras: list[str]) -> None:
md = metadata.fromCommandLine(extras)
constants.setErrorLevel("nothing")
constants.quiet = 100
filters = test.TestFilter.fromOptions(options)
if options.rebase:
test.rebase(options.testFiles, md=md)
test.rebase(filters, md=md)
else:
result = test.runAllTests(options.testFiles, manualOnly=options.manualOnly, md=md)
result = test.runAllTests(filters, md=md)
sys.exit(0 if result else 1)


Expand Down
13 changes: 8 additions & 5 deletions bikeshed/datablocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def transformPre(lines: list[str], tagName: str, firstLine: str, lineNum: int |


def transformSimpleDef(lines: list[str], tagName: str, firstLine: str, lineNum: int | None, doc: t.SpecT) -> list[str]:
rows = parseDefBlock(lines, "simpledef")
rows = parseDefBlock(lines, "simpledef", doc=doc)
lineNumAttr = ""
if lineNum is not None:
lineNumAttr = f" line-number={lineNum}"
Expand All @@ -269,7 +269,7 @@ def transformSimpleDef(lines: list[str], tagName: str, firstLine: str, lineNum:

def transformPropdef(lines: list[str], tagName: str, firstLine: str, lineNum: int | None, doc: t.SpecT) -> list[str]:
attrs: OrderedDict[str, str | None] = OrderedDict()
parsedAttrs = parseDefBlock(lines, "propdef")
parsedAttrs = parseDefBlock(lines, "propdef", doc=doc)
# Displays entries in the order specified in attrs,
# then if there are any unknown parsedAttrs values,
# they're displayed afterward in the order they were specified.
Expand Down Expand Up @@ -382,7 +382,7 @@ def transformDescdef(lines: list[str], tagName: str, firstLine: str, lineNum: in
lineNumAttr = ""
if lineNum is not None:
lineNumAttr = f" line-number={lineNum}"
vals = parseDefBlock(lines, "descdef")
vals = parseDefBlock(lines, "descdef", doc=doc)
if "partial" in firstLine or "New values" in vals:
requiredKeys = ["Name", "For"]
ret = [
Expand Down Expand Up @@ -434,7 +434,7 @@ def transformElementdef(lines: list[str], tagName: str, firstLine: str, lineNum:
if lineNum is not None:
lineNumAttr = f" line-number={lineNum}"
attrs: OrderedDict[str, str | None] = OrderedDict()
parsedAttrs = parseDefBlock(lines, "elementdef")
parsedAttrs = parseDefBlock(lines, "elementdef", doc=doc)
if "Attribute groups" in parsedAttrs or "Attributes" in parsedAttrs:
html = "<ul>"
if "Attribute groups" in parsedAttrs:
Expand Down Expand Up @@ -523,7 +523,7 @@ def transformArgumentdef(
lineNumAttr = ""
if lineNum is not None:
lineNumAttr = f" line-number={lineNum}"
attrs = parseDefBlock(lines, "argumentdef", capitalizeKeys=False, lineNum=lineNum)
attrs = parseDefBlock(lines, "argumentdef", doc=doc, capitalizeKeys=False, lineNum=lineNum)
el = h.parseHTML(firstLine + "</pre>")[0]
if "for" in el.attrib:
forValue = t.cast(str, el.get("for"))
Expand Down Expand Up @@ -585,6 +585,7 @@ def transformArgumentdef(
def parseDefBlock(
lines: list[str],
type: str,
doc: t.SpecT,
capitalizeKeys: bool = True,
lineNum: int | None = None,
) -> OrderedDict[str, str]:
Expand All @@ -609,6 +610,8 @@ def parseDefBlock(
vals[key] += "\n" + val
else:
vals[key] = val
for key, val in vals.items():
vals[key] = h.parseText(val)
return vals


Expand Down
5 changes: 4 additions & 1 deletion bikeshed/h/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@
removeAttr,
removeClass,
removeNode,
replaceAwkwardCSSShorthands,
replaceContents,
replaceMacros,
replaceNode,
Expand All @@ -84,6 +83,7 @@
Result,
StartTag,
Stream,
initialDocumentParse,
isASCII,
isASCIIAlpha,
isASCIIAlphanum,
Expand All @@ -101,14 +101,17 @@
parseComment,
parseDoctype,
parseEndTag,
parseLines,
parseQuotedAttrValue,
parseScriptToEnd,
parseStartTag,
parseStyleToEnd,
parseTagName,
parseText,
parseUnquotedAttrValue,
parseWhitespace,
parseXmpToEnd,
strFromNodes,
test,
)
from .serializer import Serializer
26 changes: 0 additions & 26 deletions bikeshed/h/dom.py
Original file line number Diff line number Diff line change
Expand Up @@ -856,32 +856,6 @@ def macroReplacer(match: re.Match) -> str:
return re.sub(r"(\\|\[)?\[([A-Z0-9-]+)(\??)\]", macroReplacer, text)


def replaceAwkwardCSSShorthands(text: str) -> str:
    """Rewrite the ``<<production>>`` and ``''maybe link''`` shorthands as placeholder elements.

    A shorthand preceded by a backslash is not turned into a placeholder;
    instead the backslash is dropped and the remainder is passed through
    ``escapeHTML``.

    Args:
        text: The source text to scan.

    Returns:
        The text with every matched shorthand replaced. Text containing no
        shorthand is returned unchanged.
    """

    # Replace the <<production>> shortcuts, because they won't survive the HTML parser.
    def replaceProduction(match: re.Match) -> str:
        syntaxAttr = escapeAttr(match.group(0))
        escape, inner = match.groups()
        if escape:
            # Drop the leading backslash and emit the rest as literal text.
            return escapeHTML(match.group(0)[1:])
        return f"<fake-production-placeholder class=production bs-autolink-syntax='{syntaxAttr}' data-opaque>{inner}</fake-production-placeholder>"

    text = re.sub(r"(\\)?<<([^>\n]+)>>", replaceProduction, text)

    # Replace the ''maybe link'' shortcuts.
    # They'll survive the HTML parser,
    # but the current shorthand-recognizer code won't find them if they contain an element.
    # (The other shortcuts are "atomic" and can't contain elements.)
    def replaceMaybe(match: re.Match) -> str:
        syntaxAttr = escapeAttr(match.group(0))
        escape, inner = match.groups()
        if escape:
            # Drop the leading backslash and emit the rest as literal text.
            return escapeHTML(match.group(0)[1:])
        return f"<fake-maybe-placeholder bs-autolink-syntax='{syntaxAttr}'>{inner}</fake-maybe-placeholder>"

    text = re.sub(r"(\\)?''([^=\n]+?)''", replaceMaybe, text)
    return text


def fixupIDs(doc: t.SpecT, els: t.Iterable[t.ElementT]) -> None:
addOldIDs(els)
dedupIDs(doc)
Expand Down
Loading