From 6d6aabdeb347686f83f837927c07b3b1f4e75d4d Mon Sep 17 00:00:00 2001 From: John Doty Date: Sat, 24 Aug 2024 08:35:10 -0700 Subject: [PATCH 1/4] Terminal name must be explicit on construction --- parser/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser/parser.py b/parser/parser.py index 8a23d4e..3c93d61 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -1609,7 +1609,7 @@ class Terminal(Rule): value: str | None pattern: "str | Re" - def __init__(self, pattern, name=None): + def __init__(self, pattern, *, name=None): self.value = name self.pattern = pattern From 454e6fd6fd0a2ef5dd751db74876d5d7e932b244 Mon Sep 17 00:00:00 2001 From: John Doty Date: Sat, 24 Aug 2024 08:35:45 -0700 Subject: [PATCH 2/4] Regex API "improvements" I mean, is it better than a regex parser? No, probably not. --- parser/parser.py | 71 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/parser/parser.py b/parser/parser.py index 3c93d61..cb29d10 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -2180,36 +2180,46 @@ class Re: def seq(cls, *values: "Re") -> "Re": result = values[0] for v in values[1:]: - result = RegexSequence(result, v) + result = ReSeq(result, v) return result @classmethod def literal(cls, value: str) -> "Re": - return cls.seq(*[RegexLiteral.from_ranges(c) for c in value]) + return cls.seq(*[ReSet.from_ranges(c) for c in value]) @classmethod - def set(cls, *args: str | tuple[str, str]) -> "Re": - return RegexLiteral.from_ranges(*args) + def set(cls, *args: str | tuple[str, str]) -> "ReSet": + return ReSet.from_ranges(*args) + + @classmethod + def any(cls) -> "ReSet": + return ReSet.any() def plus(self) -> "Re": - return RegexPlus(self) + return RePlus(self) def star(self) -> "Re": - return RegexStar(self) + return ReStar(self) def question(self) -> "Re": - return RegexQuestion(self) + return ReQuestion(self) def __or__(self, value: "Re", /) -> "Re": - return 
RegexAlternation(self, value) + return ReAlt(self, value) + + def __add__(self, value: "Re") -> "Re": + return ReSeq(self, value) + + +UNICODE_MAX_CP = 1114112 @dataclasses.dataclass -class RegexLiteral(Re): +class ReSet(Re): values: list[Span] @classmethod - def from_ranges(cls, *args: str | tuple[str, str]) -> "RegexLiteral": + def from_ranges(cls, *args: str | tuple[str, str]) -> "ReSet": values = [] for a in args: if isinstance(a, str): @@ -2217,7 +2227,36 @@ class RegexLiteral(Re): else: values.append(Span.from_str(a[0], a[1])) - return RegexLiteral(values) + return ReSet(values) + + @classmethod + def any(cls) -> "ReSet": + return ReSet(values=[Span(0, UNICODE_MAX_CP)]) + + def invert(self) -> "ReSet": + spans = [] + lower = 0 + for span in self.values: + upper = span.lower + if upper != lower: + assert lower < upper + spans.append(Span(lower, upper)) + lower = span.upper + + # What... is.... the top end here? Are we dealing with bytes? Are we + # dealing with unicode character ranges? In python we're dealing with + # "ord". I feel like this... here... is correct but might need to + # change when the state machine is converted for other languages. 
+ # + upper = UNICODE_MAX_CP + if upper != lower: + assert lower < upper + spans.append(Span(lower, upper)) + + return ReSet(spans) + + def __invert__(self) -> "ReSet": + return self.invert() def to_nfa(self, start: NFAState) -> NFAState: end = NFAState() @@ -2243,7 +2282,7 @@ class RegexLiteral(Re): @dataclasses.dataclass -class RegexPlus(Re): +class RePlus(Re): child: Re def to_nfa(self, start: NFAState) -> NFAState: @@ -2256,7 +2295,7 @@ class RegexPlus(Re): @dataclasses.dataclass -class RegexStar(Re): +class ReStar(Re): child: Re def to_nfa(self, start: NFAState) -> NFAState: @@ -2270,7 +2309,7 @@ class RegexStar(Re): @dataclasses.dataclass -class RegexQuestion(Re): +class ReQuestion(Re): child: Re def to_nfa(self, start: NFAState) -> NFAState: @@ -2283,7 +2322,7 @@ class RegexQuestion(Re): @dataclasses.dataclass -class RegexSequence(Re): +class ReSeq(Re): left: Re right: Re @@ -2296,7 +2335,7 @@ class RegexSequence(Re): @dataclasses.dataclass -class RegexAlternation(Re): +class ReAlt(Re): left: Re right: Re From c0b623bd6dfd32421a9afc388f9d8a56a353bba9 Mon Sep 17 00:00:00 2001 From: John Doty Date: Sat, 24 Aug 2024 08:36:20 -0700 Subject: [PATCH 3/4] Remove unused imports --- tests/test_grammar.py | 2 -- tests/test_lexer.py | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 26e5057..ee28ed6 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -1,5 +1,3 @@ -import typing - import pytest import parser diff --git a/tests/test_lexer.py b/tests/test_lexer.py index fe442d8..79d8d98 100644 --- a/tests/test_lexer.py +++ b/tests/test_lexer.py @@ -1,9 +1,8 @@ import collections from hypothesis import assume, example, given -from hypothesis.strategies import integers, lists, tuples +from hypothesis.strategies import integers, lists -import pytest from parser import ( EdgeList, From 30f7798719d6767edca3325ced3a0a73c480f28e Mon Sep 17 00:00:00 2001 From: John Doty Date: Sat, 24 Aug 2024 
08:36:28 -0700 Subject: [PATCH 4/4] Actual strings and floats Using the new regex features --- grammar.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/grammar.py b/grammar.py index 69f4de8..bffc037 100644 --- a/grammar.py +++ b/grammar.py @@ -12,7 +12,6 @@ from parser import ( Terminal, Re, ) -from parser.parser import compile_lexer, dump_lexer_table class FineGrammar(Grammar): @@ -356,7 +355,20 @@ class FineGrammar(Grammar): RCURLY = Terminal("}") RETURN = Terminal("return") SEMICOLON = Terminal(";") - STRING = Terminal('""') # TODO + STRING = Terminal( + # Double-quoted string. + Re.seq( + Re.literal('"'), + (~Re.set('"', "\\") | (Re.set("\\") + Re.any())).star(), + Re.literal('"'), + ) + # Single-quoted string. + | Re.seq( + Re.literal("'"), + (~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(), + Re.literal("'"), + ) + ) WHILE = Terminal("while") EQUAL = Terminal("=") LPAREN = Terminal("(") @@ -376,7 +388,20 @@ class FineGrammar(Grammar): MINUS = Terminal("-") STAR = Terminal("*") SLASH = Terminal("/") - NUMBER = Terminal(Re.set(("0", "9")).plus()) + NUMBER = Terminal( + Re.seq( + Re.set(("0", "9")).plus(), + Re.seq( + Re.literal("."), + Re.set(("0", "9")).plus(), + Re.seq( + Re.set("e", "E"), + Re.set("+", "-").question(), + Re.set(("0", "9")).plus(), + ).question(), + ).question(), + ) + ) TRUE = Terminal("true") FALSE = Terminal("false") BANG = Terminal("!") @@ -570,6 +595,8 @@ class FineTokens: if __name__ == "__main__": + from parser.parser import compile_lexer, dump_lexer_table + grammar = FineGrammar() grammar.build_table()