"""Handwritten parser of dependency specifiers. The docstring for each __parse_* function contains ENBF-inspired grammar representing the implementation. """ from __future__ import annotations import ast from typing import Any from typing import List from typing import NamedTuple from typing import Optional from typing import Tuple from typing import Union from ._tokenizer import DEFAULT_RULES from ._tokenizer import Tokenizer class Node: def __init__(self, value: str) -> None: self.value = value def __str__(self) -> str: return self.value def __repr__(self) -> str: return f"<{self.__class__.__name__}('{self}')>" def serialize(self) -> str: raise NotImplementedError class Variable(Node): def serialize(self) -> str: return str(self) class Value(Node): def serialize(self) -> str: return f'"{self}"' class Op(Node): def serialize(self) -> str: return str(self) MarkerVar = Union[Variable, Value] MarkerItem = Tuple[MarkerVar, Op, MarkerVar] # MarkerAtom = Union[MarkerItem, List["MarkerAtom"]] # MarkerList = List[Union["MarkerList", MarkerAtom, str]] # mypy does not support recursive type definition # https://github.com/python/mypy/issues/731 MarkerAtom = Any MarkerList = List[Any] class ParsedRequirement(NamedTuple): name: str url: str extras: list[str] specifier: str marker: MarkerList | None # -------------------------------------------------------------------------------------- # Recursive descent parser for dependency specifier # -------------------------------------------------------------------------------------- def parse_requirement(source: str) -> ParsedRequirement: return _parse_requirement(Tokenizer(source, rules=DEFAULT_RULES)) def _parse_requirement(tokenizer: Tokenizer) -> ParsedRequirement: """ requirement = WS? IDENTIFIER WS? extras WS? requirement_details """ tokenizer.consume('WS') name_token = tokenizer.expect( 'IDENTIFIER', expected='package name at the start of dependency specifier', ) name = name_token.text tokenizer.consume('WS') extras = _parse_extras(tokenizer) tokenizer.consume('WS') url, specifier, marker = _parse_requirement_details(tokenizer) tokenizer.expect('END', expected='end of dependency specifier') return ParsedRequirement(name, url, extras, specifier, marker) def _parse_requirement_details( tokenizer: Tokenizer, ) -> tuple[str, str, MarkerList | None]: """ requirement_details = AT URL (WS requirement_marker?)? | specifier WS? (requirement_marker)? """ specifier = '' url = '' marker = None if tokenizer.check('AT'): tokenizer.read() tokenizer.consume('WS') url_start = tokenizer.position url = tokenizer.expect('URL', expected='URL after @').text if tokenizer.check('END', peek=True): return (url, specifier, marker) tokenizer.expect('WS', expected='whitespace after URL') # The input might end after whitespace. if tokenizer.check('END', peek=True): return (url, specifier, marker) marker = _parse_requirement_marker( tokenizer, span_start=url_start, after='URL and whitespace', ) else: specifier_start = tokenizer.position specifier = _parse_specifier(tokenizer) tokenizer.consume('WS') if tokenizer.check('END', peek=True): return (url, specifier, marker) marker = _parse_requirement_marker( tokenizer, span_start=specifier_start, after=( 'version specifier' if specifier else 'name and no valid version specifier' ), ) return (url, specifier, marker) def _parse_requirement_marker( tokenizer: Tokenizer, *, span_start: int, after: str, ) -> MarkerList: """ requirement_marker = SEMICOLON marker WS? """ if not tokenizer.check('SEMICOLON'): tokenizer.raise_syntax_error( f'Expected end or semicolon (after {after})', span_start=span_start, ) tokenizer.read() marker = _parse_marker(tokenizer) tokenizer.consume('WS') return marker def _parse_extras(tokenizer: Tokenizer) -> list[str]: """ extras = (LEFT_BRACKET wsp* extras_list? wsp* RIGHT_BRACKET)? """ if not tokenizer.check('LEFT_BRACKET', peek=True): return [] with tokenizer.enclosing_tokens( 'LEFT_BRACKET', 'RIGHT_BRACKET', around='extras', ): tokenizer.consume('WS') extras = _parse_extras_list(tokenizer) tokenizer.consume('WS') return extras def _parse_extras_list(tokenizer: Tokenizer) -> list[str]: """ extras_list = identifier (wsp* ',' wsp* identifier)* """ extras: list[str] = [] if not tokenizer.check('IDENTIFIER'): return extras extras.append(tokenizer.read().text) while True: tokenizer.consume('WS') if tokenizer.check('IDENTIFIER', peek=True): tokenizer.raise_syntax_error('Expected comma between extra names') elif not tokenizer.check('COMMA'): break tokenizer.read() tokenizer.consume('WS') extra_token = tokenizer.expect('IDENTIFIER', expected='extra name after comma') extras.append(extra_token.text) return extras def _parse_specifier(tokenizer: Tokenizer) -> str: """ specifier = LEFT_PARENTHESIS WS? version_many WS? RIGHT_PARENTHESIS | WS? version_many WS? """ with tokenizer.enclosing_tokens( 'LEFT_PARENTHESIS', 'RIGHT_PARENTHESIS', around='version specifier', ): tokenizer.consume('WS') parsed_specifiers = _parse_version_many(tokenizer) tokenizer.consume('WS') return parsed_specifiers def _parse_version_many(tokenizer: Tokenizer) -> str: """ version_many = (SPECIFIER (WS? COMMA WS? SPECIFIER)*)? """ parsed_specifiers = '' while tokenizer.check('SPECIFIER'): span_start = tokenizer.position parsed_specifiers += tokenizer.read().text if tokenizer.check('VERSION_PREFIX_TRAIL', peek=True): tokenizer.raise_syntax_error( '.* suffix can only be used with `==` or `!=` operators', span_start=span_start, span_end=tokenizer.position + 1, ) if tokenizer.check('VERSION_LOCAL_LABEL_TRAIL', peek=True): tokenizer.raise_syntax_error( 'Local version label can only be used with `==` or `!=` operators', span_start=span_start, span_end=tokenizer.position, ) tokenizer.consume('WS') if not tokenizer.check('COMMA'): break parsed_specifiers += tokenizer.read().text tokenizer.consume('WS') return parsed_specifiers # -------------------------------------------------------------------------------------- # Recursive descent parser for marker expression # -------------------------------------------------------------------------------------- def parse_marker(source: str) -> MarkerList: return _parse_full_marker(Tokenizer(source, rules=DEFAULT_RULES)) def _parse_full_marker(tokenizer: Tokenizer) -> MarkerList: retval = _parse_marker(tokenizer) tokenizer.expect('END', expected='end of marker expression') return retval def _parse_marker(tokenizer: Tokenizer) -> MarkerList: """ marker = marker_atom (BOOLOP marker_atom)+ """ expression = [_parse_marker_atom(tokenizer)] while tokenizer.check('BOOLOP'): token = tokenizer.read() expr_right = _parse_marker_atom(tokenizer) expression.extend((token.text, expr_right)) return expression def _parse_marker_atom(tokenizer: Tokenizer) -> MarkerAtom: """ marker_atom = WS? LEFT_PARENTHESIS WS? marker WS? RIGHT_PARENTHESIS WS? | WS? marker_item WS? """ tokenizer.consume('WS') if tokenizer.check('LEFT_PARENTHESIS', peek=True): with tokenizer.enclosing_tokens( 'LEFT_PARENTHESIS', 'RIGHT_PARENTHESIS', around='marker expression', ): tokenizer.consume('WS') marker: MarkerAtom = _parse_marker(tokenizer) tokenizer.consume('WS') else: marker = _parse_marker_item(tokenizer) tokenizer.consume('WS') return marker def _parse_marker_item(tokenizer: Tokenizer) -> MarkerItem: """ marker_item = WS? marker_var WS? marker_op WS? marker_var WS? """ tokenizer.consume('WS') marker_var_left = _parse_marker_var(tokenizer) tokenizer.consume('WS') marker_op = _parse_marker_op(tokenizer) tokenizer.consume('WS') marker_var_right = _parse_marker_var(tokenizer) tokenizer.consume('WS') return (marker_var_left, marker_op, marker_var_right) def _parse_marker_var(tokenizer: Tokenizer) -> MarkerVar: """ marker_var = VARIABLE | QUOTED_STRING """ if tokenizer.check('VARIABLE'): return process_env_var(tokenizer.read().text.replace('.', '_')) elif tokenizer.check('QUOTED_STRING'): return process_python_str(tokenizer.read().text) else: tokenizer.raise_syntax_error( message='Expected a marker variable or quoted string', ) def process_env_var(env_var: str) -> Variable: if env_var in ('platform_python_implementation', 'python_implementation'): return Variable('platform_python_implementation') else: return Variable(env_var) def process_python_str(python_str: str) -> Value: value = ast.literal_eval(python_str) return Value(str(value)) def _parse_marker_op(tokenizer: Tokenizer) -> Op: """ marker_op = IN | NOT IN | OP """ if tokenizer.check('IN'): tokenizer.read() return Op('in') elif tokenizer.check('NOT'): tokenizer.read() tokenizer.expect('WS', expected="whitespace after 'not'") tokenizer.expect('IN', expected="'in' after 'not'") return Op('not in') elif tokenizer.check('OP'): return Op(tokenizer.read().text) else: return tokenizer.raise_syntax_error( 'Expected marker operator, one of ' '<=, <, !=, ==, >=, >, ~=, ===, in, not in', )