Add sources for new parser and tokenizer
hrnciar committed Nov 18, 2021
1 parent 1c1a4eb commit c6cf9fa
Showing 2 changed files with 355 additions and 0 deletions.
201 changes: 201 additions & 0 deletions packaging/_parser.py
@@ -0,0 +1,201 @@
from typing import Any, List, NamedTuple, Tuple, Union

from ._tokenizer import Tokenizer


class Node:
def __init__(self, value: str) -> None:
self.value = value

def __str__(self) -> str:
return str(self.value)

def __repr__(self) -> str:
return f"<{self.__class__.__name__}('{self}')>"

def serialize(self) -> str:
raise NotImplementedError


class Variable(Node):
def serialize(self) -> str:
return str(self)


class Value(Node):
def serialize(self) -> str:
return f'"{self}"'


class Op(Node):
def serialize(self) -> str:
return str(self)


MarkerVar = Union[Variable, Value]
MarkerItem = Tuple[MarkerVar, Op, MarkerVar]
# MarkerAtom = Union[MarkerItem, List["MarkerAtom"]]
# MarkerList = List[Union["MarkerList", MarkerAtom, str]]
# mypy does not support recursive type definitions
# https://github.com/python/mypy/issues/731
MarkerAtom = Any
MarkerList = List[Any]
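# For illustration (shape only, following the parse_marker_* functions below):
# a marker such as 'os_name == "posix" and python_version >= "3.6"' parses to a
# flat list alternating atoms and boolean operators:
#   [(Variable('os_name'), Op('=='), Value('posix')),
#    'and',
#    (Variable('python_version'), Op('>='), Value('3.6'))]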


class Requirement(NamedTuple):
name: str
url: str
extras: List[str]
specifier: str
marker: str


def parse_named_requirement(requirement: str) -> Requirement:
"""
NAMED_REQUIREMENT: NAME EXTRAS* URL_SPEC (SEMICOLON + MARKER)*
"""
tokens = Tokenizer(requirement)
name = tokens.read("IDENTIFIER").text
extras = parse_extras(tokens)
specifier = ""
url = ""
if tokens.match("URL_SPEC"):
url = tokens.read().text[1:].strip()
elif not tokens.match("stringEnd"):
specifier = parse_specifier(tokens)
if tokens.match("SEMICOLON"):
marker = ""
while not tokens.match("stringEnd"):
# we don't validate markers here, it's done later as part of
# packaging/requirements.py
marker += tokens.read().text
else:
marker = ""
tokens.expect("stringEnd")
return Requirement(name, url, extras, specifier, marker)


def parse_extras(tokens: Tokenizer) -> List[str]:
"""
EXTRAS: (LBRACKET + IDENTIFIER + (COLON + IDENTIFIER)* + RBRACKET)*
"""
extras = []
if tokens.try_read("LBRACKET"):
while tokens.match("IDENTIFIER"):
extras.append(tokens.read("IDENTIFIER").text)
tokens.try_read("COLON")
if not tokens.try_read("RBRACKET"):
tokens.raise_syntax_error(message="Closing square bracket is missing")
return extras


def parse_specifier(tokens: Tokenizer) -> str:
"""
SPECIFIER: LPAREN (OP + VERSION + COLON)+ RPAREN | OP + VERSION
"""
parsed_specifiers = ""
lparen = False
if tokens.try_read("LPAREN"):
lparen = True
while tokens.match("OP"):
parsed_specifiers += tokens.read("OP").text
if tokens.match("VERSION"):
parsed_specifiers += tokens.read("VERSION").text
else:
tokens.raise_syntax_error(message="Missing version")
if tokens.match("COLON"):
parsed_specifiers += tokens.read("COLON").text
if lparen and not tokens.try_read("RPAREN"):
tokens.raise_syntax_error(message="Closing right parenthesis is missing")
return parsed_specifiers


def parse_quoted_marker(tokens: Tokenizer) -> MarkerList:
tokens.try_read("SEMICOLON")
return parse_marker_expr(tokens)


def parse_marker_expr(tokens: Tokenizer) -> MarkerList:
"""
    MARKER_EXPR: MARKER_ATOM (BOOLOP + MARKER_ATOM)*
"""
expression = [parse_marker_atom(tokens)]
while tokens.match("BOOLOP"):
tok = tokens.read("BOOLOP")
expr_right = parse_marker_atom(tokens)
expression.extend((tok.text, expr_right))
return expression


def parse_marker_atom(tokens: Tokenizer) -> MarkerAtom:
"""
MARKER_ATOM: LPAREN MARKER_EXPR RPAREN | MARKER_ITEM
"""
if tokens.try_read("LPAREN"):
marker = parse_marker_expr(tokens)
if not tokens.try_read("RPAREN"):
tokens.raise_syntax_error(message="Closing right parenthesis is missing")
return marker
else:
return parse_marker_item(tokens)


def parse_marker_item(
tokens: Tokenizer,
) -> MarkerItem:
"""
MARKER_ITEM: MARKER_VAR MARKER_OP MARKER_VAR
"""
marker_var_left = parse_marker_var(tokens)
marker_op = parse_marker_op(tokens)
marker_var_right = parse_marker_var(tokens)
return (marker_var_left, marker_op, marker_var_right)


def parse_marker_var(tokens: Tokenizer) -> MarkerVar:
"""
    MARKER_VAR: VARIABLE | QUOTED_STRING
"""
if tokens.match("VARIABLE"):
return parse_variable(tokens)
else:
return parse_python_str(tokens)


def parse_variable(tokens: Tokenizer) -> Variable:

env_var = tokens.read("VARIABLE").text.replace(".", "_")
if (
env_var == "platform_python_implementation"
or env_var == "python_implementation"
):
return Variable("platform_python_implementation")
else:
return Variable(env_var)


def parse_python_str(tokens: Tokenizer) -> Value:

if tokens.match("QUOTED_STRING"):
python_str = tokens.read().text.strip("'\"")
return Value(str(python_str))
else:
return tokens.raise_syntax_error(
message="String with single or double quote at the beginning is expected"
)


def parse_marker_op(tokens: Tokenizer) -> Op:

if tokens.try_read("IN"):
return Op("in")
elif tokens.try_read("NOT"):
tokens.read("IN")
return Op("not in")
elif tokens.match("OP"):
return Op(tokens.read().text)
    else:
        return tokens.raise_syntax_error(
            message=(
                "Couldn't parse marker operator. Expecting one of "
                '"<=, <, !=, ==, >=, >, ~=, ===, not, not in"'
            )
        )
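
For context, a minimal sketch of how the new parser might be exercised (illustrative only; the requirement string and the commented results are examples, and packaging._parser is a private helper rather than a public API):

from packaging._parser import parse_named_requirement

req = parse_named_requirement('requests[security] >= 2.8.1 ; python_version < "2.7"')
print(req.name)       # the project name, e.g. "requests"
print(req.extras)     # the parsed extras, e.g. ["security"]
print(req.specifier)  # the version specifier text
print(req.marker)     # the raw marker text (validated later in packaging/requirements.py)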
154 changes: 154 additions & 0 deletions packaging/_tokenizer.py
@@ -0,0 +1,154 @@
import re
from typing import Dict, Generator, NoReturn, Optional

from .specifiers import Specifier


class Token:
def __init__(self, name: str, text: str, position: int) -> None:
self.name = name
self.text = text
self.position = position

    def matches(self, name: str = "", text: str = "") -> bool:
        if name and self.name != name:
            return False
        if text and self.text != text:
            return False
        return True


class ParseException(Exception):
"""Parsing failed"""

def __init__(self, message: str, position: int) -> None:
super().__init__(message)
self.position = position


DEFAULT_RULES = {
None: r"[ \t]+", # whitespace: not returned as tokens
"LPAREN": r"\(",
"RPAREN": r"\)",
"LBRACKET": r"\[",
"RBRACKET": r"\]",
"SEMICOLON": r";",
"COLON": r",",
"QUOTED_STRING": re.compile(
r"""
('[^']*')
|
("[^"]*")
""",
re.VERBOSE,
),
"OP": r"===|==|~=|!=|<=|>=|<|>",
"VERSION": re.compile(Specifier._version_regex_str, re.VERBOSE | re.IGNORECASE),
"BOOLOP": r"or|and",
"IN": r"in",
"NOT": r"not",
"VARIABLE": re.compile(
r"""
python_version
|python_full_version
|os[._]name
|sys[._]platform
|platform_(release|system)
|platform[._](version|machine|python_implementation)
|python_implementation
|implementation_(name|version)
|extra
""",
re.VERBOSE,
),
"URL_SPEC": "@ *[^ ]+",
"IDENTIFIER": r"([a-zA-Z0-9]|-|_|\.)+",
}
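# Note: _tokenize() below tries these rules in insertion order and takes the
# first match, so the more specific patterns (OP, VERSION, VARIABLE, ...) are
# listed ahead of the catch-all IDENTIFIER.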


class Tokenizer:
"""Stream of tokens for a LL(1) parser.
Provides methods to examine the next token to be read, and to read it
(advance to the next token).
"""

next_token: Optional[Token]

def __init__(
self, source: str, rules: Dict[Optional[str], object] = DEFAULT_RULES
) -> None:
self.source = source
self.rules = {name: re.compile(pattern) for name, pattern in rules.items()}
self.next_token = None
self.generator = self._tokenize()
self.position = 0

def peek(self) -> Token:
"""Return the next token to be read"""
if not self.next_token:
self.next_token = next(self.generator)
return self.next_token

def match(self, *match_args: str) -> bool:
"""Return True if the next token matches the given arguments"""
token = self.peek()
return token.matches(*match_args)

def expect(self, *match_args: str, **match_kwargs: str) -> Token:
"""Raise SyntaxError if the next token doesn't match given arguments"""
token = self.peek()
if not token.matches(*match_args, **match_kwargs):
exp = " ".join(
v
for v in match_args
+ tuple(f"{k}={v!r}" for k, v in match_kwargs.items())
if v
)
raise self.raise_syntax_error(message=f"Expected {exp}")
return token

def read(self, *match_args: str) -> Token:
"""Return the next token and advance to the next token
Raise SyntaxError if the token doesn't match.
"""
result = self.expect(*match_args)
self.next_token = None
return result

def try_read(self, *match_args: str) -> Optional[Token]:
"""read() if the next token matches the given arguments
Do nothing if it does not match.
"""
if self.match(*match_args):
return self.read()
return None

def raise_syntax_error(self, *, message: str) -> NoReturn:
"""Raise SyntaxError at the given position in the marker"""
at = f"at position {self.position}:"
marker = " " * self.position + "^"
raise ParseException(
f"{message}\n{at}\n {self.source}\n {marker}",
self.position,
)

def _make_token(self, name: str, text: str) -> Token:
"""Make a token with the current position"""
return Token(name, text, self.position)

def _tokenize(self) -> Generator[Token, Token, None]:
"""The main generator of tokens"""
while self.position < len(self.source):
for name, expression in self.rules.items():
match = expression.match(self.source, self.position)
if match:
token_text = match[0]

if name:
yield self._make_token(name, token_text)
self.position += len(token_text)
break
else:
raise self.raise_syntax_error(message="Unrecognized token")
yield self._make_token("stringEnd", "")
