diff --git a/grammar.py b/grammar.py index bffc037..69f4de8 100644 --- a/grammar.py +++ b/grammar.py @@ -12,6 +12,7 @@ from parser import ( Terminal, Re, ) +from parser.parser import compile_lexer, dump_lexer_table class FineGrammar(Grammar): @@ -355,20 +356,7 @@ class FineGrammar(Grammar): RCURLY = Terminal("}") RETURN = Terminal("return") SEMICOLON = Terminal(";") - STRING = Terminal( - # Double-quoted string. - Re.seq( - Re.literal('"'), - (~Re.set('"', "\\") | (Re.set("\\") + Re.any())).star(), - Re.literal('"'), - ) - # Single-quoted string. - | Re.seq( - Re.literal("'"), - (~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(), - Re.literal("'"), - ) - ) + STRING = Terminal('""') # TODO WHILE = Terminal("while") EQUAL = Terminal("=") LPAREN = Terminal("(") @@ -388,20 +376,7 @@ class FineGrammar(Grammar): MINUS = Terminal("-") STAR = Terminal("*") SLASH = Terminal("/") - NUMBER = Terminal( - Re.seq( - Re.set(("0", "9")).plus(), - Re.seq( - Re.literal("."), - Re.set(("0", "9")), - Re.seq( - Re.set("e", "E"), - Re.set("+", "-").question(), - Re.set(("0", "9")).plus(), - ).question(), - ).question(), - ) - ) + NUMBER = Terminal(Re.set(("0", "9")).plus()) TRUE = Terminal("true") FALSE = Terminal("false") BANG = Terminal("!") @@ -595,8 +570,6 @@ class FineTokens: if __name__ == "__main__": - from parser.parser import compile_lexer, dump_lexer_table - grammar = FineGrammar() grammar.build_table() diff --git a/parser/parser.py b/parser/parser.py index cb29d10..8a23d4e 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -1609,7 +1609,7 @@ class Terminal(Rule): value: str | None pattern: "str | Re" - def __init__(self, pattern, *, name=None): + def __init__(self, pattern, name=None): self.value = name self.pattern = pattern @@ -2180,46 +2180,36 @@ class Re: def seq(cls, *values: "Re") -> "Re": result = values[0] for v in values[1:]: - result = ReSeq(result, v) + result = RegexSequence(result, v) return result @classmethod def literal(cls, value: str) -> "Re": - return cls.seq(*[ReSet.from_ranges(c) for c in value]) + return cls.seq(*[RegexLiteral.from_ranges(c) for c in value]) @classmethod - def set(cls, *args: str | tuple[str, str]) -> "ReSet": - return ReSet.from_ranges(*args) - - @classmethod - def any(cls) -> "ReSet": - return ReSet.any() + def set(cls, *args: str | tuple[str, str]) -> "Re": + return RegexLiteral.from_ranges(*args) def plus(self) -> "Re": - return RePlus(self) + return RegexPlus(self) def star(self) -> "Re": - return ReStar(self) + return RegexStar(self) def question(self) -> "Re": - return ReQuestion(self) + return RegexQuestion(self) def __or__(self, value: "Re", /) -> "Re": - return ReAlt(self, value) - - def __add__(self, value: "Re") -> "Re": - return ReSeq(self, value) - - -UNICODE_MAX_CP = 1114112 + return RegexAlternation(self, value) @dataclasses.dataclass -class ReSet(Re): +class RegexLiteral(Re): values: list[Span] @classmethod - def from_ranges(cls, *args: str | tuple[str, str]) -> "ReSet": + def from_ranges(cls, *args: str | tuple[str, str]) -> "RegexLiteral": values = [] for a in args: if isinstance(a, str): @@ -2227,36 +2217,7 @@ class ReSet(Re): else: values.append(Span.from_str(a[0], a[1])) - return ReSet(values) - - @classmethod - def any(cls) -> "ReSet": - return ReSet(values=[Span(0, UNICODE_MAX_CP)]) - - def invert(self) -> "ReSet": - spans = [] - lower = 0 - for span in self.values: - upper = span.lower - if upper != lower: - assert lower < upper - spans.append(Span(lower, upper)) - lower = span.upper - - # What... is.... the top end here? Are we dealing with bytes? Are we - # dealing with unicode character ranges? In python we're dealing with - # "ord". I feel like this... here... is correct but might need to - # change when the state machine is converted for other languages. - # - upper = UNICODE_MAX_CP - if upper != lower: - assert lower < upper - spans.append(Span(lower, upper)) - - return ReSet(spans) - - def __invert__(self) -> "ReSet": - return self.invert() + return RegexLiteral(values) def to_nfa(self, start: NFAState) -> NFAState: end = NFAState() @@ -2282,7 +2243,7 @@ class ReSet(Re): @dataclasses.dataclass -class RePlus(Re): +class RegexPlus(Re): child: Re def to_nfa(self, start: NFAState) -> NFAState: @@ -2295,7 +2256,7 @@ class RePlus(Re): @dataclasses.dataclass -class ReStar(Re): +class RegexStar(Re): child: Re def to_nfa(self, start: NFAState) -> NFAState: @@ -2309,7 +2270,7 @@ class ReStar(Re): @dataclasses.dataclass -class ReQuestion(Re): +class RegexQuestion(Re): child: Re def to_nfa(self, start: NFAState) -> NFAState: @@ -2322,7 +2283,7 @@ class ReQuestion(Re): @dataclasses.dataclass -class ReSeq(Re): +class RegexSequence(Re): left: Re right: Re @@ -2335,7 +2296,7 @@ class ReSeq(Re): @dataclasses.dataclass -class ReAlt(Re): +class RegexAlternation(Re): left: Re right: Re diff --git a/tests/test_grammar.py b/tests/test_grammar.py index ee28ed6..26e5057 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -1,3 +1,5 @@ +import typing + import pytest import parser diff --git a/tests/test_lexer.py b/tests/test_lexer.py index 79d8d98..fe442d8 100644 --- a/tests/test_lexer.py +++ b/tests/test_lexer.py @@ -1,8 +1,9 @@ import collections from hypothesis import assume, example, given -from hypothesis.strategies import integers, lists +from hypothesis.strategies import integers, lists, tuples +import pytest from parser import ( EdgeList,