Compare commits

...

4 commits

Author SHA1 Message Date
30f7798719 Actual strings and floats
Using the new regex features
2024-08-24 08:36:28 -07:00
c0b623bd6d Remove unused imports 2024-08-24 08:36:20 -07:00
454e6fd6fd Regex API "improvements"
I mean, is it better than a regex parser? No, probably not.
2024-08-24 08:35:45 -07:00
6d6aabdeb3 Terminal name must be explicit on construction 2024-08-24 08:35:10 -07:00
4 changed files with 87 additions and 24 deletions

View file

@@ -12,7 +12,6 @@ from parser import (
Terminal, Terminal,
Re, Re,
) )
from parser.parser import compile_lexer, dump_lexer_table
class FineGrammar(Grammar): class FineGrammar(Grammar):
@@ -356,7 +355,20 @@ class FineGrammar(Grammar):
RCURLY = Terminal("}") RCURLY = Terminal("}")
RETURN = Terminal("return") RETURN = Terminal("return")
SEMICOLON = Terminal(";") SEMICOLON = Terminal(";")
STRING = Terminal('""') # TODO STRING = Terminal(
# Double-quoted string.
Re.seq(
Re.literal('"'),
(~Re.set('"', "\\") | (Re.set("\\") + Re.any())).star(),
Re.literal('"'),
)
# Single-quoted string.
| Re.seq(
Re.literal("'"),
(~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(),
Re.literal("'"),
)
)
WHILE = Terminal("while") WHILE = Terminal("while")
EQUAL = Terminal("=") EQUAL = Terminal("=")
LPAREN = Terminal("(") LPAREN = Terminal("(")
@@ -376,7 +388,20 @@ class FineGrammar(Grammar):
MINUS = Terminal("-") MINUS = Terminal("-")
STAR = Terminal("*") STAR = Terminal("*")
SLASH = Terminal("/") SLASH = Terminal("/")
NUMBER = Terminal(Re.set(("0", "9")).plus()) NUMBER = Terminal(
Re.seq(
Re.set(("0", "9")).plus(),
Re.seq(
Re.literal("."),
Re.set(("0", "9")),
Re.seq(
Re.set("e", "E"),
Re.set("+", "-").question(),
Re.set(("0", "9")).plus(),
).question(),
).question(),
)
)
TRUE = Terminal("true") TRUE = Terminal("true")
FALSE = Terminal("false") FALSE = Terminal("false")
BANG = Terminal("!") BANG = Terminal("!")
@@ -570,6 +595,8 @@ class FineTokens:
if __name__ == "__main__": if __name__ == "__main__":
from parser.parser import compile_lexer, dump_lexer_table
grammar = FineGrammar() grammar = FineGrammar()
grammar.build_table() grammar.build_table()

View file

@@ -1609,7 +1609,7 @@ class Terminal(Rule):
value: str | None value: str | None
pattern: "str | Re" pattern: "str | Re"
def __init__(self, pattern, name=None): def __init__(self, pattern, *, name=None):
self.value = name self.value = name
self.pattern = pattern self.pattern = pattern
@@ -2180,36 +2180,46 @@ class Re:
def seq(cls, *values: "Re") -> "Re": def seq(cls, *values: "Re") -> "Re":
result = values[0] result = values[0]
for v in values[1:]: for v in values[1:]:
result = RegexSequence(result, v) result = ReSeq(result, v)
return result return result
@classmethod @classmethod
def literal(cls, value: str) -> "Re": def literal(cls, value: str) -> "Re":
return cls.seq(*[RegexLiteral.from_ranges(c) for c in value]) return cls.seq(*[ReSet.from_ranges(c) for c in value])
@classmethod @classmethod
def set(cls, *args: str | tuple[str, str]) -> "Re": def set(cls, *args: str | tuple[str, str]) -> "ReSet":
return RegexLiteral.from_ranges(*args) return ReSet.from_ranges(*args)
@classmethod
def any(cls) -> "ReSet":
return ReSet.any()
def plus(self) -> "Re": def plus(self) -> "Re":
return RegexPlus(self) return RePlus(self)
def star(self) -> "Re": def star(self) -> "Re":
return RegexStar(self) return ReStar(self)
def question(self) -> "Re": def question(self) -> "Re":
return RegexQuestion(self) return ReQuestion(self)
def __or__(self, value: "Re", /) -> "Re": def __or__(self, value: "Re", /) -> "Re":
return RegexAlternation(self, value) return ReAlt(self, value)
def __add__(self, value: "Re") -> "Re":
return ReSeq(self, value)
UNICODE_MAX_CP = 1114112
@dataclasses.dataclass @dataclasses.dataclass
class RegexLiteral(Re): class ReSet(Re):
values: list[Span] values: list[Span]
@classmethod @classmethod
def from_ranges(cls, *args: str | tuple[str, str]) -> "RegexLiteral": def from_ranges(cls, *args: str | tuple[str, str]) -> "ReSet":
values = [] values = []
for a in args: for a in args:
if isinstance(a, str): if isinstance(a, str):
@@ -2217,7 +2227,36 @@ class RegexLiteral(Re):
else: else:
values.append(Span.from_str(a[0], a[1])) values.append(Span.from_str(a[0], a[1]))
return RegexLiteral(values) return ReSet(values)
@classmethod
def any(cls) -> "ReSet":
return ReSet(values=[Span(0, UNICODE_MAX_CP)])
def invert(self) -> "ReSet":
spans = []
lower = 0
for span in self.values:
upper = span.lower
if upper != lower:
assert lower < upper
spans.append(Span(lower, upper))
lower = span.upper
# What... is.... the top end here? Are we dealing with bytes? Are we
# dealing with unicode character ranges? In python we're dealing with
# "ord". I feel like this... here... is correct but might need to
# change when the state machine is converted for other languages.
#
upper = UNICODE_MAX_CP
if upper != lower:
assert lower < upper
spans.append(Span(lower, upper))
return ReSet(spans)
def __invert__(self) -> "ReSet":
return self.invert()
def to_nfa(self, start: NFAState) -> NFAState: def to_nfa(self, start: NFAState) -> NFAState:
end = NFAState() end = NFAState()
@@ -2243,7 +2282,7 @@ class RegexLiteral(Re):
@dataclasses.dataclass @dataclasses.dataclass
class RegexPlus(Re): class RePlus(Re):
child: Re child: Re
def to_nfa(self, start: NFAState) -> NFAState: def to_nfa(self, start: NFAState) -> NFAState:
@@ -2256,7 +2295,7 @@ class RegexPlus(Re):
@dataclasses.dataclass @dataclasses.dataclass
class RegexStar(Re): class ReStar(Re):
child: Re child: Re
def to_nfa(self, start: NFAState) -> NFAState: def to_nfa(self, start: NFAState) -> NFAState:
@@ -2270,7 +2309,7 @@ class RegexStar(Re):
@dataclasses.dataclass @dataclasses.dataclass
class RegexQuestion(Re): class ReQuestion(Re):
child: Re child: Re
def to_nfa(self, start: NFAState) -> NFAState: def to_nfa(self, start: NFAState) -> NFAState:
@@ -2283,7 +2322,7 @@ class RegexQuestion(Re):
@dataclasses.dataclass @dataclasses.dataclass
class RegexSequence(Re): class ReSeq(Re):
left: Re left: Re
right: Re right: Re
@@ -2296,7 +2335,7 @@ class RegexSequence(Re):
@dataclasses.dataclass @dataclasses.dataclass
class RegexAlternation(Re): class ReAlt(Re):
left: Re left: Re
right: Re right: Re

View file

@@ -1,5 +1,3 @@
import typing
import pytest import pytest
import parser import parser

View file

@@ -1,9 +1,8 @@
import collections import collections
from hypothesis import assume, example, given from hypothesis import assume, example, given
from hypothesis.strategies import integers, lists, tuples from hypothesis.strategies import integers, lists
import pytest
from parser import ( from parser import (
EdgeList, EdgeList,