From 9ab90ae8e4122bd88cd9649f415059e57225ee6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Hrn=C4=8Diar?=
Date: Wed, 3 Nov 2021 08:42:21 +0100
Subject: [PATCH] Add sources for new parser and tokenizer

---
 packaging/_parser.py    | 221 ++++++++++++++++++++++++++++++++++++++++
 packaging/_tokenizer.py | 160 +++++++++++++++++++++++++++++
 2 files changed, 381 insertions(+)
 create mode 100644 packaging/_parser.py
 create mode 100644 packaging/_tokenizer.py

diff --git a/packaging/_parser.py b/packaging/_parser.py
new file mode 100644
index 000000000..8f614711b
--- /dev/null
+++ b/packaging/_parser.py
@@ -0,0 +1,221 @@
+# The docstring for each parse function contains the grammar for the rule.
+# The grammar uses a simple EBNF-inspired syntax:
+#
+# - Uppercase names are tokens
+# - Lowercase names are rules (parsed with a parse_* function)
+# - Parentheses are used for grouping
+# - A | means either-or
+# - A * means 0 or more
+# - A + means 1 or more
+# - A ? means 0 or 1
+
+from ast import literal_eval
+from typing import Any, List, NamedTuple, Tuple, Union
+
+from ._tokenizer import Tokenizer
+
+
+class Node:
+    def __init__(self, value: str) -> None:
+        self.value = value
+
+    def __str__(self) -> str:
+        return str(self.value)
+
+    def __repr__(self) -> str:
+        return f"<{self.__class__.__name__}('{self}')>"
+
+    def serialize(self) -> str:
+        raise NotImplementedError
+
+
+class Variable(Node):
+    def serialize(self) -> str:
+        return str(self)
+
+
+class Value(Node):
+    def serialize(self) -> str:
+        return f'"{self}"'
+
+
+class Op(Node):
+    def serialize(self) -> str:
+        return str(self)
+
+
+MarkerVar = Union[Variable, Value]
+MarkerItem = Tuple[MarkerVar, Op, MarkerVar]
+# MarkerAtom = Union[MarkerItem, List["MarkerAtom"]]
+# MarkerList = List[Union["MarkerList", MarkerAtom, str]]
+# mypy does not support recursive type definition
+# https://github.com/python/mypy/issues/731
+MarkerAtom = Any
+MarkerList = List[Any]
+
+
+class Requirement(NamedTuple):
+    name: str
+    url: str
+    extras: List[str]
+    specifier: str
+    marker: str
+
+
+def parse_named_requirement(requirement: str) -> Requirement:
+    """
+    named_requirement:
+        IDENTIFIER extras (URL_SPEC | specifier) (SEMICOLON marker_expr)? END
+    """
+    tokens = Tokenizer(requirement)
+    tokens.expect("IDENTIFIER", error_message="Expression must begin with package name")
+    name = tokens.read("IDENTIFIER").text
+    extras = parse_extras(tokens)
+    specifier = ""
+    url = ""
+    if tokens.match("URL_SPEC"):
+        url = tokens.read().text[1:].strip()
+    elif not tokens.match("END"):
+        specifier = parse_specifier(tokens)
+    if tokens.try_read("SEMICOLON"):
+        marker = ""
+        while not tokens.match("END"):
+            # we don't validate markers here, it's done later as part of
+            # packaging/requirements.py
+            marker += tokens.read().text
+    else:
+        marker = ""
+        tokens.expect(
+            "END",
+            error_message="Expected semicolon (followed by markers) or end of string",
+        )
+    return Requirement(name, url, extras, specifier, marker)
+
+
+def parse_extras(tokens: Tokenizer) -> List[str]:
+    """
+    extras: LBRACKET (IDENTIFIER (COMMA IDENTIFIER)*)? RBRACKET
+    """
+    extras = []
+    if tokens.try_read("LBRACKET"):
+        while tokens.match("IDENTIFIER"):
+            extras.append(tokens.read("IDENTIFIER").text)
+            if not tokens.match("RBRACKET"):
+                tokens.read("COMMA", error_message="Missing comma after extra")
+            if not tokens.match("COMMA") and tokens.match("RBRACKET"):
+                break
+        tokens.read("RBRACKET", error_message="Closing square bracket is missing")
+    return extras
+
+
+def parse_specifier(tokens: Tokenizer) -> str:
+    """
+    specifier:
+        LPAREN (OP VERSION (COMMA OP VERSION)*)* RPAREN |
+        (OP VERSION (COMMA OP VERSION)*)*
+    """
+    parsed_specifiers = ""
+    lparen = False
+    if tokens.try_read("LPAREN"):
+        lparen = True
+    while tokens.match("OP"):
+        parsed_specifiers += tokens.read("OP").text
+        if tokens.match("VERSION"):
+            parsed_specifiers += tokens.read("VERSION").text
+        else:
+            tokens.raise_syntax_error(message="Missing version")
+        if not tokens.match("COMMA"):
+            break
+        tokens.expect("COMMA", error_message="Missing comma after version")
+        parsed_specifiers += tokens.read("COMMA").text
+    if lparen and not tokens.try_read("RPAREN"):
+        tokens.raise_syntax_error(message="Closing right parenthesis is missing")
+    return parsed_specifiers
+
+
+def parse_marker_expr(tokens: Tokenizer) -> MarkerList:
+    """
+    marker_expr: marker_atom (BOOLOP marker_atom)*
+    """
+    expression = [parse_marker_atom(tokens)]
+    while tokens.match("BOOLOP"):
+        tok = tokens.read("BOOLOP")
+        expr_right = parse_marker_atom(tokens)
+        expression.extend((tok.text, expr_right))
+    return expression
+
+
+def parse_marker_atom(tokens: Tokenizer) -> MarkerAtom:
+    """
+    marker_atom: LPAREN marker_expr RPAREN | marker_item
+    """
+    if tokens.try_read("LPAREN"):
+        marker = parse_marker_expr(tokens)
+        tokens.read("RPAREN", error_message="Closing right parenthesis is missing")
+        return marker
+    else:
+        return parse_marker_item(tokens)
+
+
+def parse_marker_item(tokens: Tokenizer) -> MarkerItem:
+    """
+    marker_item: marker_var marker_op marker_var
+    """
+    marker_var_left = parse_marker_var(tokens)
+    marker_op = parse_marker_op(tokens)
+    marker_var_right = parse_marker_var(tokens)
+    return (marker_var_left, marker_op, marker_var_right)
+
+
+def parse_marker_var(tokens: Tokenizer) -> MarkerVar:
+    """
+    marker_var: env_var | python_str
+    """
+    if tokens.match("VARIABLE"):
+        return parse_env_var(tokens)
+    else:
+        return parse_python_str(tokens)
+
+
+def parse_env_var(tokens: Tokenizer) -> Variable:
+    """
+    env_var: VARIABLE
+    """
+    env_var = tokens.read("VARIABLE").text.replace(".", "_")
+    if (
+        env_var == "platform_python_implementation"
+        or env_var == "python_implementation"
+    ):
+        return Variable("platform_python_implementation")
+    else:
+        return Variable(env_var)
+
+
+def parse_python_str(tokens: Tokenizer) -> Value:
+    """
+    python_str: QUOTED_STRING
+    """
+    token = tokens.read(
+        "QUOTED_STRING",
+        error_message="String with single or double quote at the beginning is expected",
+    ).text
+    python_str = literal_eval(token)
+    return Value(str(python_str))
+
+
+def parse_marker_op(tokens: Tokenizer) -> Op:
+    """
+    marker_op: IN | NOT IN | OP
+    """
+    if tokens.try_read("IN"):
+        return Op("in")
+    elif tokens.try_read("NOT"):
+        tokens.read("IN", error_message="NOT token must be followed by IN token")
+        return Op("not in")
+    elif tokens.match("OP"):
+        return Op(tokens.read().text)
+    else:
+        return tokens.raise_syntax_error(
+            message='Couldn\'t parse marker operator. Expecting one of \
+"<=, <, !=, ==, >=, >, ~=, ===, not, not in"'
+        )
diff --git a/packaging/_tokenizer.py b/packaging/_tokenizer.py
new file mode 100644
index 000000000..d911cac7a
--- /dev/null
+++ b/packaging/_tokenizer.py
@@ -0,0 +1,160 @@
+import re
+from typing import Dict, Generator, NoReturn, Optional
+
+from .specifiers import Specifier
+
+
+class Token:
+    def __init__(self, name: str, text: str, position: int) -> None:
+        self.name = name
+        self.text = text
+        self.position = position
+
+    def matches(self, name: str = "") -> bool:
+        if name and self.name != name:
+            return False
+        return True
+
+
+class ParseException(Exception):
+    """
+    Parsing failed.
+    """
+
+    def __init__(self, message: str, position: int) -> None:
+        super().__init__(message)
+        self.position = position
+
+
+DEFAULT_RULES = {
+    "LPAREN": r"\s*\(",
+    "RPAREN": r"\s*\)",
+    "LBRACKET": r"\s*\[",
+    "RBRACKET": r"\s*\]",
+    "SEMICOLON": r"\s*;",
+    "COMMA": r"\s*,",
+    "QUOTED_STRING": re.compile(
+        r"""
+            \s*
+            (
+                ('[^']*')
+                |
+                ("[^"]*")
+            )
+        """,
+        re.VERBOSE,
+    ),
+    "OP": r"\s*(===|==|~=|!=|<=|>=|<|>)",
+    "BOOLOP": r"\s*(or|and)",
+    "IN": r"\s*in",
+    "NOT": r"\s*not",
+    "VARIABLE": re.compile(
+        r"""
+            \s*
+            (
+                python_version
+                |python_full_version
+                |os[._]name
+                |sys[._]platform
+                |platform_(release|system)
+                |platform[._](version|machine|python_implementation)
+                |python_implementation
+                |implementation_(name|version)
+                |extra
+            )
+        """,
+        re.VERBOSE,
+    ),
+    "VERSION": re.compile(Specifier._version_regex_str, re.VERBOSE | re.IGNORECASE),
+    "URL_SPEC": r"\s*@ *[^ ]+",
+    "IDENTIFIER": r"\s*[a-zA-Z0-9._-]+",
+}
+
+
+class Tokenizer:
+    """Stream of tokens for an LL(1) parser.
+
+    Provides methods to examine the next token to be read, and to read it
+    (advance to the next token).
+    """
+
+    next_token: Optional[Token]
+
+    def __init__(
+        self, source: str, rules: Dict[Optional[str], object] = DEFAULT_RULES
+    ) -> None:
+        self.source = source
+        self.rules = {name: re.compile(pattern) for name, pattern in rules.items()}
+        self.next_token = None
+        self.generator = self._tokenize()
+        self.position = 0
+
+    def peek(self) -> Token:
+        """
+        Return the next token to be read.
+        """
+        if not self.next_token:
+            self.next_token = next(self.generator)
+        return self.next_token
+
+    def match(self, *name: str) -> bool:
+        """
+        Return True if the next token matches the given arguments.
+        """
+        token = self.peek()
+        return token.matches(*name)
+
+    def expect(self, *name: str, error_message: str) -> Token:
+        """
+        Raise SyntaxError if the next token doesn't match given arguments.
+        """
+        token = self.peek()
+        if not token.matches(*name):
+            raise self.raise_syntax_error(message=error_message)
+        return token
+
+    def read(self, *name: str, error_message: str = "") -> Token:
+        """Return the next token and advance to the next token.
+
+        Raise SyntaxError if the token doesn't match.
+        """
+        result = self.expect(*name, error_message=error_message)
+        self.next_token = None
+        return result
+
+    def try_read(self, *name: str) -> Optional[Token]:
+        """read() if the next token matches the given arguments
+
+        Do nothing if it does not match.
+        """
+        if self.match(*name):
+            return self.read()
+        return None
+
+    def raise_syntax_error(self, *, message: str) -> NoReturn:
+        """Raise SyntaxError at the given position in the marker"""
+        at = f"at position {self.position}:"
+        marker = " " * self.position + "^"
+        raise ParseException(
+            f"{message}\n{at}\n {self.source}\n {marker}",
+            self.position,
+        )
+
+    def _make_token(self, name: str, text: str) -> Token:
+        """Make a token with the current position"""
+        return Token(name, text, self.position)
+
+    def _tokenize(self) -> Generator[Token, Token, None]:
+        """The main generator of tokens"""
+        while self.position < len(self.source):
+            for name, expression in self.rules.items():
+                match = expression.match(self.source, self.position)
+                if match:
+                    token_text = match[0]
+
+                    yield self._make_token(name, token_text.strip())
+                    self.position += len(token_text)
+                    break
+            else:
+                raise self.raise_syntax_error(message="Unrecognized token")
+        yield self._make_token("END", "")
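
For review, a rough usage sketch of the new modules (not part of the patch). It assumes Specifier._version_regex_str exists in packaging/specifiers.py, which _tokenizer.py imports at module load but which this patch does not add; the printed values illustrate the intended behaviour rather than verified output.

    from packaging._parser import parse_named_requirement
    from packaging._tokenizer import ParseException, Tokenizer

    # Parse a PEP 508-style requirement string into its named parts.
    req = parse_named_requirement('mypkg[extra1,extra2]>=1.0; python_version<"3.6"')
    print(req.name)    # mypkg
    print(req.extras)  # ['extra1', 'extra2']

    # Malformed input raises ParseException, which carries the offending
    # position used for the caret diagnostic built in raise_syntax_error().
    try:
        parse_named_requirement("mypkg[extra1")  # missing closing bracket
    except ParseException as exc:
        print(exc)

    # The tokenizer can also be driven directly by an LL(1)-style consumer.
    tokens = Tokenizer('python_version<"3.6"')
    print(tokens.peek().name)  # VARIABLE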