Commit: Add sources for new parser and tokenizer
Showing 2 changed files with 355 additions and 0 deletions.
@@ -0,0 +1,201 @@
from typing import Any, List, NamedTuple, Tuple, Union

from ._tokenizer import Tokenizer


class Node:
    def __init__(self, value: str) -> None:
        self.value = value

    def __str__(self) -> str:
        return str(self.value)

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}('{self}')>"

    def serialize(self) -> str:
        raise NotImplementedError


class Variable(Node):
    def serialize(self) -> str:
        return str(self)


class Value(Node):
    def serialize(self) -> str:
        return f'"{self}"'


class Op(Node):
    def serialize(self) -> str:
        return str(self)


MarkerVar = Union[Variable, Value]
MarkerItem = Tuple[MarkerVar, Op, MarkerVar]
# MarkerAtom = Union[MarkerItem, List["MarkerAtom"]]
# MarkerList = List[Union["MarkerList", MarkerAtom, str]]
# mypy does not support recursive type definitions
# https://github.com/python/mypy/issues/731
MarkerAtom = Any
MarkerList = List[Any]


class Requirement(NamedTuple):
    name: str
    url: str
    extras: List[str]
    specifier: str
    marker: str


def parse_named_requirement(requirement: str) -> Requirement:
    """
    NAMED_REQUIREMENT: NAME EXTRAS* URL_SPEC (SEMICOLON + MARKER)*
    """
    tokens = Tokenizer(requirement)
    name = tokens.read("IDENTIFIER").text
    extras = parse_extras(tokens)
    specifier = ""
    url = ""
    if tokens.match("URL_SPEC"):
        url = tokens.read().text[1:].strip()
    elif not tokens.match("stringEnd"):
        specifier = parse_specifier(tokens)
    if tokens.match("SEMICOLON"):
        marker = ""
        while not tokens.match("stringEnd"):
            # markers are not validated here; that happens later, as part of
            # packaging/requirements.py
            marker += tokens.read().text
    else:
        marker = ""
    tokens.expect("stringEnd")
    return Requirement(name, url, extras, specifier, marker)
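
# A rough usage sketch (assuming the Tokenizer defined in the second file of
# this commit): calling
#
#     parse_named_requirement("mypkg[extra1,extra2] >= 1.0, < 2.0")
#
# would return roughly
#
#     Requirement(name="mypkg", url="", extras=["extra1", "extra2"],
#                 specifier=">=1.0,<2.0", marker="")
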
def parse_extras(tokens: Tokenizer) -> List[str]:
    """
    EXTRAS: (LBRACKET + IDENTIFIER + (COLON + IDENTIFIER)* + RBRACKET)*
    """
    extras = []
    if tokens.try_read("LBRACKET"):
        while tokens.match("IDENTIFIER"):
            extras.append(tokens.read("IDENTIFIER").text)
            tokens.try_read("COLON")
        if not tokens.try_read("RBRACKET"):
            tokens.raise_syntax_error(message="Closing square bracket is missing")
    return extras


def parse_specifier(tokens: Tokenizer) -> str:
    """
    SPECIFIER: LPAREN (OP + VERSION + COLON)+ RPAREN | OP + VERSION
    """
    parsed_specifiers = ""
    lparen = False
    if tokens.try_read("LPAREN"):
        lparen = True
    while tokens.match("OP"):
        parsed_specifiers += tokens.read("OP").text
        if tokens.match("VERSION"):
            parsed_specifiers += tokens.read("VERSION").text
        else:
            tokens.raise_syntax_error(message="Missing version")
        if tokens.match("COLON"):
            parsed_specifiers += tokens.read("COLON").text
    if lparen and not tokens.try_read("RPAREN"):
        tokens.raise_syntax_error(message="Closing right parenthesis is missing")
    return parsed_specifiers
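
# Sketch: because whitespace is never returned as a token, parse_specifier on
# the token stream of "(>= 1.0, < 2.0)" should yield ">=1.0,<2.0".
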
def parse_quoted_marker(tokens: Tokenizer) -> MarkerList:
    tokens.try_read("SEMICOLON")
    return parse_marker_expr(tokens)


def parse_marker_expr(tokens: Tokenizer) -> MarkerList:
    """
    MARKER_EXPR: MARKER_ATOM (BOOLOP + MARKER_ATOM)*
    """
    expression = [parse_marker_atom(tokens)]
    while tokens.match("BOOLOP"):
        tok = tokens.read("BOOLOP")
        expr_right = parse_marker_atom(tokens)
        expression.extend((tok.text, expr_right))
    return expression
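
# Sketch: for a marker such as  os_name == "nt" and python_version >= "3.7",
# parse_marker_expr returns a flat list interleaving atoms and boolean
# operators, roughly:
#
#     [(Variable("os_name"), Op("=="), Value("nt")),
#      "and",
#      (Variable("python_version"), Op(">="), Value("3.7"))]
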
def parse_marker_atom(tokens: Tokenizer) -> MarkerAtom:
    """
    MARKER_ATOM: LPAREN MARKER_EXPR RPAREN | MARKER_ITEM
    """
    if tokens.try_read("LPAREN"):
        marker = parse_marker_expr(tokens)
        if not tokens.try_read("RPAREN"):
            tokens.raise_syntax_error(message="Closing right parenthesis is missing")
        return marker
    else:
        return parse_marker_item(tokens)


def parse_marker_item(tokens: Tokenizer) -> MarkerItem:
    """
    MARKER_ITEM: MARKER_VAR MARKER_OP MARKER_VAR
    """
    marker_var_left = parse_marker_var(tokens)
    marker_op = parse_marker_op(tokens)
    marker_var_right = parse_marker_var(tokens)
    return (marker_var_left, marker_op, marker_var_right)


def parse_marker_var(tokens: Tokenizer) -> MarkerVar:
    """
    MARKER_VAR: VARIABLE | PYTHON_STR
    """
    if tokens.match("VARIABLE"):
        return parse_variable(tokens)
    else:
        return parse_python_str(tokens)


def parse_variable(tokens: Tokenizer) -> Variable:
    # normalize dotted spellings (e.g. os.name -> os_name) and fold
    # python_implementation into platform_python_implementation
    env_var = tokens.read("VARIABLE").text.replace(".", "_")
    if (
        env_var == "platform_python_implementation"
        or env_var == "python_implementation"
    ):
        return Variable("platform_python_implementation")
    else:
        return Variable(env_var)


def parse_python_str(tokens: Tokenizer) -> Value:
    if tokens.match("QUOTED_STRING"):
        python_str = tokens.read().text.strip("'\"")
        return Value(str(python_str))
    else:
        return tokens.raise_syntax_error(
            message="Expected a string beginning with a single or double quote"
        )


def parse_marker_op(tokens: Tokenizer) -> Op:
    if tokens.try_read("IN"):
        return Op("in")
    elif tokens.try_read("NOT"):
        tokens.read("IN")
        return Op("not in")
    elif tokens.match("OP"):
        return Op(tokens.read().text)
    else:
        return tokens.raise_syntax_error(
            message=(
                "Couldn't parse marker operator. Expected one of "
                '"<=, <, !=, ==, >=, >, ~=, ===, in, not in"'
            )
        )
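
A minimal usage sketch for the parser above. The diff does not name the files; this assumes the first file is importable as packaging._parser, alongside the _tokenizer module it imports:

    # Hypothetical import path; the commit does not show where this file lives.
    from packaging._parser import parse_named_requirement

    req = parse_named_requirement("requests[security] >= 2.8.1")
    print(req.name)       # requests
    print(req.extras)     # ['security']
    print(req.specifier)  # >=2.8.1
    print(req.url)        # '' (no "@ <url>" part present)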
@@ -0,0 +1,154 @@
import re
from typing import Dict, Generator, NoReturn, Optional

from .specifiers import Specifier


class Token:
    def __init__(self, name: str, text: str, position: int) -> None:
        self.name = name
        self.text = text
        self.position = position

    def matches(self, name: str = "", text: str = "") -> bool:
        if name and self.name != name:
            return False
        if text and self.text != text:
            return False
        return True


class ParseException(Exception):
    """Parsing failed"""

    def __init__(self, message: str, position: int) -> None:
        super().__init__(message)
        self.position = position


DEFAULT_RULES = {
    None: r"[ \t]+",  # whitespace: not returned as tokens
    "LPAREN": r"\(",
    "RPAREN": r"\)",
    "LBRACKET": r"\[",
    "RBRACKET": r"\]",
    "SEMICOLON": r";",
    "COLON": r",",  # note: despite the name, this token matches a comma
    "QUOTED_STRING": re.compile(
        r"""
            ('[^']*')
            |
            ("[^"]*")
        """,
        re.VERBOSE,
    ),
    "OP": r"===|==|~=|!=|<=|>=|<|>",
    "VERSION": re.compile(Specifier._version_regex_str, re.VERBOSE | re.IGNORECASE),
    "BOOLOP": r"or|and",
    "IN": r"in",
    "NOT": r"not",
    "VARIABLE": re.compile(
        r"""
            python_version
            |python_full_version
            |os[._]name
            |sys[._]platform
            |platform_(release|system)
            |platform[._](version|machine|python_implementation)
            |python_implementation
            |implementation_(name|version)
            |extra
        """,
        re.VERBOSE,
    ),
    "URL_SPEC": r"@ *[^ ]+",
    "IDENTIFIER": r"([a-zA-Z0-9]|-|_|\.)+",
}


class Tokenizer:
    """Stream of tokens for an LL(1) parser.

    Provides methods to examine the next token to be read, and to read it
    (advance to the next token).
    """

    next_token: Optional[Token]

    def __init__(
        self, source: str, rules: Dict[Optional[str], object] = DEFAULT_RULES
    ) -> None:
        self.source = source
        self.rules = {name: re.compile(pattern) for name, pattern in rules.items()}
        self.next_token = None
        self.generator = self._tokenize()
        self.position = 0

    def peek(self) -> Token:
        """Return the next token to be read."""
        if not self.next_token:
            self.next_token = next(self.generator)
        return self.next_token

    def match(self, *match_args: str) -> bool:
        """Return True if the next token matches the given arguments."""
        token = self.peek()
        return token.matches(*match_args)

    def expect(self, *match_args: str, **match_kwargs: str) -> Token:
        """Raise ParseException if the next token doesn't match the given arguments."""
        token = self.peek()
        if not token.matches(*match_args, **match_kwargs):
            exp = " ".join(
                v
                for v in match_args
                + tuple(f"{k}={v!r}" for k, v in match_kwargs.items())
                if v
            )
            self.raise_syntax_error(message=f"Expected {exp}")
        return token

    def read(self, *match_args: str) -> Token:
        """Return the next token and advance past it.

        Raise ParseException if the token doesn't match.
        """
        result = self.expect(*match_args)
        self.next_token = None
        return result

    def try_read(self, *match_args: str) -> Optional[Token]:
        """read() if the next token matches the given arguments;
        return None if it does not match.
        """
        if self.match(*match_args):
            return self.read()
        return None

    def raise_syntax_error(self, *, message: str) -> NoReturn:
        """Raise ParseException at the current position in the marker."""
        at = f"at position {self.position}:"
        marker = " " * self.position + "^"
        raise ParseException(
            f"{message}\n{at}\n {self.source}\n {marker}",
            self.position,
        )

    def _make_token(self, name: str, text: str) -> Token:
        """Make a token with the current position."""
        return Token(name, text, self.position)

    def _tokenize(self) -> Generator[Token, Token, None]:
        """The main generator of tokens."""
        while self.position < len(self.source):
            for name, expression in self.rules.items():
                match = expression.match(self.source, self.position)
                if match:
                    token_text = match[0]

                    # the None rule (whitespace) produces no token
                    if name:
                        yield self._make_token(name, token_text)
                    self.position += len(token_text)
                    break
            else:
                self.raise_syntax_error(message="Unrecognized token")
        yield self._make_token("stringEnd", "")
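
And a sketch of driving the tokenizer directly, assuming only the default rules above:

    # Walk the token stream of a marker expression by hand.
    tokens = Tokenizer("python_version >= '3.7'")
    while not tokens.match("stringEnd"):
        token = tokens.read()
        print(token.name, token.text)
    # Expected output, roughly:
    #   VARIABLE python_version
    #   OP >=
    #   QUOTED_STRING '3.7'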