Compare commits
5 commits
dc03bf7373
...
066d2d8439
| Author | SHA1 | Date | |
|---|---|---|---|
| 066d2d8439 | |||
| 2d87207b54 | |||
| 80d932b36a | |||
| f8b62bf4a4 | |||
| 344dde51be |
4 changed files with 475 additions and 272 deletions
534
parser/parser.py
534
parser/parser.py
|
|
@ -1607,13 +1607,13 @@ class Rule:
|
|||
class Terminal(Rule):
|
||||
"""A token, or terminal symbol in the grammar."""
|
||||
|
||||
value: str | None
|
||||
name: str | None
|
||||
pattern: "str | Re"
|
||||
meta: dict[str, typing.Any]
|
||||
regex: bool
|
||||
|
||||
def __init__(self, pattern: "str|Re", *, name: str | None = None, **kwargs):
|
||||
self.value = name
|
||||
self.name = name
|
||||
self.pattern = pattern
|
||||
self.meta = kwargs
|
||||
self.regex = isinstance(pattern, Re)
|
||||
|
|
@ -1623,7 +1623,7 @@ class Terminal(Rule):
|
|||
yield [self]
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return self.value or "???"
|
||||
return self.name or "<Unknown terminal>"
|
||||
|
||||
|
||||
class NonTerminal(Rule):
|
||||
|
|
@ -1782,217 +1782,6 @@ def rule(
|
|||
return wrapper
|
||||
|
||||
|
||||
PrecedenceList = list[typing.Tuple[Assoc, list[Rule]]]
|
||||
|
||||
|
||||
class Grammar:
|
||||
"""The base class for defining a grammar.
|
||||
|
||||
Inherit from this, define members for your nonterminals, and then
|
||||
use the `build_table` method to construct the parse tables.
|
||||
|
||||
|
||||
Here's an example of a simple grammar:
|
||||
|
||||
class SimpleGrammar(Grammar):
|
||||
@rule
|
||||
def expression(self):
|
||||
return seq(self.expression, self.PLUS, self.term) | self.term
|
||||
|
||||
@rule
|
||||
def term(self):
|
||||
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
|
||||
|
||||
PLUS = Terminal('+')
|
||||
LPAREN = Terminal('(')
|
||||
RPAREN = Terminal(')')
|
||||
ID = Terminal('id')
|
||||
|
||||
|
||||
Not very exciting, perhaps, but it's something.
|
||||
"""
|
||||
|
||||
_precedence: dict[str, typing.Tuple[Assoc, int]]
|
||||
_start: str
|
||||
_generator: type[GenerateLR0]
|
||||
_terminals: list[Terminal]
|
||||
_trivia: list[Terminal]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
start: str | None = None,
|
||||
precedence: PrecedenceList | None = None,
|
||||
generator: type[GenerateLR0] | None = None,
|
||||
trivia: list[str | Terminal] | None = None,
|
||||
name: str | None = None,
|
||||
):
|
||||
if start is None:
|
||||
start = getattr(self, "start", None)
|
||||
if start is None:
|
||||
raise ValueError(
|
||||
"The default start rule must either be specified in the constructor or as an "
|
||||
"attribute in the class."
|
||||
)
|
||||
|
||||
if precedence is None:
|
||||
precedence = getattr(self, "precedence", [])
|
||||
assert precedence is not None
|
||||
|
||||
if generator is None:
|
||||
generator = getattr(self, "generator", GenerateLALR)
|
||||
assert generator is not None
|
||||
|
||||
if trivia is None:
|
||||
trivia = getattr(self, "trivia", [])
|
||||
assert trivia is not None
|
||||
|
||||
# Fixup terminal names with the name of the member that declared it.
|
||||
terminals = {}
|
||||
for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)):
|
||||
if t.value is None:
|
||||
t.value = n
|
||||
|
||||
if n in terminals:
|
||||
raise ValueError(f"More than one terminal has the name '{n}'")
|
||||
terminals[n] = t
|
||||
|
||||
# Resolve the trivia declarations correctly.
|
||||
resolved_trivia: list[Terminal] = []
|
||||
for t in trivia:
|
||||
if isinstance(t, str):
|
||||
resolved = terminals.get(t)
|
||||
if resolved is None:
|
||||
raise ValueError(f"The trivia '{t}' is not a terminal name")
|
||||
resolved_trivia.append(resolved)
|
||||
else:
|
||||
resolved_trivia.append(t)
|
||||
|
||||
# Fix up the precedence table.
|
||||
precedence_table = {}
|
||||
for prec, (associativity, symbols) in enumerate(precedence):
|
||||
for symbol in symbols:
|
||||
if isinstance(symbol, Terminal):
|
||||
key = symbol.value
|
||||
elif isinstance(symbol, NonTerminal):
|
||||
key = symbol.name
|
||||
else:
|
||||
raise ValueError(f"{symbol} must be either a Token or a NonTerminal")
|
||||
|
||||
precedence_table[key] = (associativity, prec + 1)
|
||||
|
||||
if name is None:
|
||||
name = getattr(self, "name", None)
|
||||
if name is None:
|
||||
name = self.__class__.__name__.removesuffix("Grammar").lower()
|
||||
|
||||
self._precedence = precedence_table
|
||||
self._start = start
|
||||
self._generator = generator
|
||||
self._terminals = list(terminals.values())
|
||||
self._trivia = resolved_trivia
|
||||
self.name = name
|
||||
|
||||
@property
|
||||
def terminals(self) -> list[Terminal]:
|
||||
return self._terminals
|
||||
|
||||
@property
|
||||
def resolved_trivia(self) -> list[Terminal]:
|
||||
return self._trivia
|
||||
|
||||
def generate_nonterminal_dict(
|
||||
self, start: str | None = None
|
||||
) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]:
|
||||
"""Convert the rules into a dictionary of productions.
|
||||
|
||||
Our table generators work on a very flat set of productions. This is the
|
||||
first step in flattening the productions from the members: walk the rules
|
||||
starting from the given start rule and flatten them, one by one, into a
|
||||
dictionary that maps nonterminal rule name to its associated list of
|
||||
productions.
|
||||
"""
|
||||
if start is None:
|
||||
start = self._start
|
||||
|
||||
rules = inspect.getmembers(self, lambda x: isinstance(x, NonTerminal))
|
||||
nonterminals = {rule.name: rule for _, rule in rules}
|
||||
transparents = {rule.name for _, rule in rules if rule.transparent}
|
||||
|
||||
grammar = {}
|
||||
|
||||
rule = nonterminals.get(start)
|
||||
if rule is None:
|
||||
raise ValueError(f"Cannot find a rule named '{start}'")
|
||||
queue = [rule]
|
||||
while len(queue) > 0:
|
||||
rule = queue.pop()
|
||||
if rule.name in grammar:
|
||||
continue
|
||||
|
||||
body = rule.generate_body(self)
|
||||
for clause in body:
|
||||
for symbol in clause:
|
||||
if not isinstance(symbol, Terminal):
|
||||
assert isinstance(symbol, str)
|
||||
nonterminal = nonterminals.get(symbol)
|
||||
if nonterminal is None:
|
||||
raise ValueError(f"While processing {rule.name}: cannot find {symbol}")
|
||||
queue.append(nonterminal)
|
||||
|
||||
grammar[rule.name] = body
|
||||
|
||||
return (grammar, transparents)
|
||||
|
||||
def desugar(
|
||||
self, start: str | None = None
|
||||
) -> typing.Tuple[list[typing.Tuple[str, list[str]]], set[str]]:
|
||||
"""Convert the rules into a flat list of productions.
|
||||
|
||||
Our table generators work from a very flat set of productions. The form
|
||||
produced by this function is one level flatter than the one produced by
|
||||
generate_nonterminal_dict- less useful to people, probably, but it is
|
||||
the input form needed by the Generator.
|
||||
"""
|
||||
temp_grammar, transparents = self.generate_nonterminal_dict(start)
|
||||
|
||||
grammar = []
|
||||
for rule_name, clauses in temp_grammar.items():
|
||||
for clause in clauses:
|
||||
new_clause = []
|
||||
for symbol in clause:
|
||||
if isinstance(symbol, Terminal):
|
||||
if symbol.value in temp_grammar:
|
||||
raise ValueError(
|
||||
f"'{symbol.value}' is the name of both a Terminal and a NonTerminal rule. This will cause problems."
|
||||
)
|
||||
new_clause.append(symbol.value)
|
||||
else:
|
||||
new_clause.append(symbol)
|
||||
|
||||
grammar.append((rule_name, new_clause))
|
||||
|
||||
return grammar, transparents
|
||||
|
||||
def build_table(self, start: str | None = None, generator=None) -> ParseTable:
|
||||
"""Construct a parse table for this grammar, starting at the named
|
||||
nonterminal rule.
|
||||
"""
|
||||
if start is None:
|
||||
start = self._start
|
||||
desugared, transparents = self.desugar(start)
|
||||
|
||||
if generator is None:
|
||||
generator = self._generator
|
||||
gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
|
||||
table = gen.gen_table()
|
||||
|
||||
for t in self._trivia:
|
||||
assert t.value is not None
|
||||
table.trivia.add(t.value)
|
||||
|
||||
return table
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Lexer support
|
||||
###############################################################################
|
||||
|
|
@ -2211,7 +2000,7 @@ class NFAState:
|
|||
continue
|
||||
visited.add(state)
|
||||
|
||||
label = state.accept.value if state.accept is not None else ""
|
||||
label = state.accept.name if state.accept is not None else ""
|
||||
f.write(f' {id(state)} [label="{label}"];\n')
|
||||
for target in state.epsilons:
|
||||
stack.append(target)
|
||||
|
|
@ -2275,6 +2064,7 @@ UNICODE_MAX_CP = 1114112
|
|||
@dataclasses.dataclass
|
||||
class ReSet(Re):
|
||||
values: list[Span]
|
||||
inversion: bool = False # No semantic meaning, just pretty.
|
||||
|
||||
@classmethod
|
||||
def from_ranges(cls, *args: str | tuple[str, str]) -> "ReSet":
|
||||
|
|
@ -2311,7 +2101,7 @@ class ReSet(Re):
|
|||
assert lower < upper
|
||||
spans.append(Span(lower, upper))
|
||||
|
||||
return ReSet(spans)
|
||||
return ReSet(spans, inversion=not self.inversion)
|
||||
|
||||
def __invert__(self) -> "ReSet":
|
||||
return self.invert()
|
||||
|
|
@ -2495,69 +2285,24 @@ class NFASuperState:
|
|||
|
||||
if accept is None:
|
||||
accept = st.accept
|
||||
elif accept.value != st.accept.value:
|
||||
elif accept.name != st.accept.name:
|
||||
if accept.regex and not st.accept.regex:
|
||||
accept = st.accept
|
||||
elif st.accept.regex and not accept.regex:
|
||||
pass
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {st.accept.value} ('{st.accept.pattern}')"
|
||||
f"Lexer is ambiguous: cannot distinguish between {accept.name} ('{accept.pattern}') and {st.accept.name} ('{st.accept.pattern}')"
|
||||
)
|
||||
|
||||
return accept
|
||||
|
||||
|
||||
def compile_lexer(grammar: Grammar) -> LexerTable:
|
||||
# Parse the terminals all together into a big NFA rooted at `NFA`.
|
||||
NFA = NFAState()
|
||||
for terminal in grammar.terminals:
|
||||
pattern = terminal.pattern
|
||||
if isinstance(pattern, Re):
|
||||
start, ends = pattern.to_nfa()
|
||||
for end in ends:
|
||||
end.accept = terminal
|
||||
NFA.epsilons.append(start)
|
||||
|
||||
else:
|
||||
start = end = NFAState()
|
||||
for c in pattern:
|
||||
end = end.add_edge(Span.from_str(c), NFAState())
|
||||
end.accept = terminal
|
||||
NFA.epsilons.append(start)
|
||||
|
||||
NFA.dump_graph()
|
||||
|
||||
# Convert the NFA into a DFA in the most straightforward way (by tracking
|
||||
# sets of state closures, called SuperStates.)
|
||||
DFA: dict[NFASuperState, tuple[int, list[tuple[Span, NFASuperState]]]] = {}
|
||||
|
||||
stack = [NFASuperState([NFA])]
|
||||
while len(stack) > 0:
|
||||
ss = stack.pop()
|
||||
if ss in DFA:
|
||||
continue
|
||||
|
||||
edges = ss.edges()
|
||||
|
||||
DFA[ss] = (len(DFA), edges)
|
||||
for _, target in edges:
|
||||
stack.append(target)
|
||||
|
||||
return [
|
||||
(
|
||||
ss.accept_terminal(),
|
||||
[(k, DFA[v][0]) for k, v in edges],
|
||||
)
|
||||
for ss, (_, edges) in DFA.items()
|
||||
]
|
||||
|
||||
|
||||
def dump_lexer_table(table: LexerTable, name: str = "lexer.dot"):
|
||||
with open(name, "w", encoding="utf-8") as f:
|
||||
f.write("digraph G {\n")
|
||||
for index, (accept, edges) in enumerate(table):
|
||||
label = accept.value if accept is not None else ""
|
||||
label = accept.name if accept is not None else ""
|
||||
f.write(f' {index} [label="{label}"];\n')
|
||||
for span, target in edges:
|
||||
label = str(span).replace('"', '\\"')
|
||||
|
|
@ -2661,3 +2406,264 @@ class Highlight(SyntaxMeta):
|
|||
class Variable(SyntaxMeta):
|
||||
class Language(SyntaxMeta):
|
||||
pass
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Finally, the base class for grammars
|
||||
###############################################################################
|
||||
|
||||
PrecedenceList = list[typing.Tuple[Assoc, list[Rule]]]
|
||||
|
||||
|
||||
class Grammar:
    """The base class for defining a grammar.

    Inherit from this, define members for your nonterminals and terminals,
    and then use the `build_table` method to construct the parse tables
    (and `compile_lexer` to construct the lexer table).

    Here's an example of a simple grammar:

        class SimpleGrammar(Grammar):
            @rule
            def expression(self):
                return seq(self.expression, self.PLUS, self.term) | self.term

            @rule
            def term(self):
                return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID

            PLUS = Terminal('+')
            LPAREN = Terminal('(')
            RPAREN = Terminal(')')
            ID = Terminal('id')

    Not very exciting, perhaps, but it's something.

    Every constructor argument that is left as None is looked up as an
    attribute of the same name on the instance (`start`, `precedence`,
    `generator`, `trivia`, `name`), so a subclass can declare them as class
    attributes instead of passing them in.
    """

    _precedence: dict[str, typing.Tuple[Assoc, int]]
    _generator: type[GenerateLR0]
    _terminals: list[Terminal]
    _trivia: list[Terminal]

    def __init__(
        self,
        start: str | None = None,
        precedence: PrecedenceList | None = None,
        generator: type[GenerateLR0] | None = None,
        trivia: list[str | Terminal] | None = None,
        name: str | None = None,
    ):
        """Collect the terminals, trivia and precedence for this grammar.

        Args:
            start: Name of the default start rule.
            precedence: Associativity/symbol groups; the position of a group
                in the list (plus one) becomes the precedence of its symbols.
            generator: Table generator class; defaults to `GenerateLALR`.
            trivia: Terminals, or names of terminal members, to record as
                trivia.
            name: Name of the grammar; defaults to the lower-cased class name
                with a trailing "Grammar" removed.

        Raises:
            ValueError: if no start rule can be found, if two different
                terminals share a name, if a trivia name is not a terminal
                member, or if a precedence symbol is neither a Terminal nor
                a NonTerminal.
        """
        if start is None:
            start = getattr(self, "start", None)
            if start is None:
                raise ValueError(
                    "The default start rule must either be specified in the constructor or as an "
                    "attribute in the class."
                )

        if precedence is None:
            precedence = getattr(self, "precedence", [])
        assert precedence is not None

        if generator is None:
            generator = getattr(self, "generator", GenerateLALR)
        assert generator is not None

        if trivia is None:
            trivia = getattr(self, "trivia", [])
        assert trivia is not None

        # Fixup terminal names with the name of the member that declared it.
        terminals: dict[str, Terminal] = {}
        by_name: dict[str, Terminal] = {}
        for member, terminal in inspect.getmembers(self, lambda x: isinstance(x, Terminal)):
            if terminal.name is None:
                terminal.name = member

            # Member names are unique by construction, so the clash worth
            # detecting is two *different* terminals ending up with the same
            # terminal name (e.g. through an explicit `name=` argument).
            if by_name.setdefault(terminal.name, terminal) is not terminal:
                raise ValueError(f"More than one terminal has the name '{terminal.name}'")
            terminals[member] = terminal

        # Resolve the trivia declarations correctly: strings are looked up as
        # member names, Terminal objects are taken as they are.
        resolved_trivia: list[Terminal] = []
        for t in trivia:
            if isinstance(t, str):
                resolved = terminals.get(t)
                if resolved is None:
                    raise ValueError(f"The trivia '{t}' is not a terminal name")
                resolved_trivia.append(resolved)
            else:
                resolved_trivia.append(t)

        # Fix up the precedence table: later groups get higher numbers.
        precedence_table = {}
        for prec, (associativity, symbols) in enumerate(precedence):
            for symbol in symbols:
                if not isinstance(symbol, (Terminal, NonTerminal)):
                    raise ValueError(f"{symbol} must be either a Token or a NonTerminal")
                precedence_table[symbol.name] = (associativity, prec + 1)

        if name is None:
            name = getattr(self, "name", None)
            if name is None:
                name = self.__class__.__name__.removesuffix("Grammar").lower()

        self._precedence = precedence_table
        self.start = start
        self._generator = generator
        self._terminals = list(terminals.values())
        self._trivia = resolved_trivia
        self.name = name

    def terminals(self) -> list[Terminal]:
        """Return the terminals found as members of this grammar."""
        return self._terminals

    @property
    def resolved_trivia(self) -> list[Terminal]:
        """The trivia declarations, resolved to Terminal objects."""
        return self._trivia

    def non_terminals(self) -> list[NonTerminal]:
        """Return every NonTerminal member of this grammar."""
        return [nt for _, nt in inspect.getmembers(self, lambda x: isinstance(x, NonTerminal))]

    def generate_nonterminal_dict(
        self, start: str | None = None
    ) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]:
        """Convert the rules into a dictionary of productions.

        Our table generators work on a very flat set of productions. This is the
        first step in flattening the productions from the members: walk the rules
        starting from the given start rule and flatten them, one by one, into a
        dictionary that maps nonterminal rule name to its associated list of
        productions.

        Returns the dictionary together with the set of names of the rules
        marked transparent.

        Raises:
            ValueError: if the start rule, or a rule referenced from a
                production, is not a member of this grammar.
        """
        if start is None:
            start = self.start

        rules = self.non_terminals()
        nonterminals = {rule.name: rule for rule in rules}
        transparents = {rule.name for rule in rules if rule.transparent}

        grammar = {}

        rule = nonterminals.get(start)
        if rule is None:
            raise ValueError(f"Cannot find a rule named '{start}'")

        # Work list of rules still to flatten; only rules reachable from the
        # start rule ever get onto it.
        queue = [rule]
        while queue:
            rule = queue.pop()
            if rule.name in grammar:
                continue

            body = rule.generate_body(self)
            for clause in body:
                for symbol in clause:
                    if not isinstance(symbol, Terminal):
                        # Anything that is not a Terminal is the name of a rule.
                        assert isinstance(symbol, str)
                        nonterminal = nonterminals.get(symbol)
                        if nonterminal is None:
                            raise ValueError(f"While processing {rule.name}: cannot find {symbol}")
                        queue.append(nonterminal)

            grammar[rule.name] = body

        return (grammar, transparents)

    def desugar(
        self, start: str | None = None
    ) -> typing.Tuple[list[typing.Tuple[str, list[str]]], set[str]]:
        """Convert the rules into a flat list of productions.

        Our table generators work from a very flat set of productions. The form
        produced by this function is one level flatter than the one produced by
        generate_nonterminal_dict: less useful to people, probably, but it is
        the input form needed by the Generator.

        Raises:
            ValueError: if a terminal used in a production has the same name
                as one of the nonterminal rules.
        """
        temp_grammar, transparents = self.generate_nonterminal_dict(start)

        grammar = []
        for rule_name, clauses in temp_grammar.items():
            for clause in clauses:
                new_clause = []
                for symbol in clause:
                    if isinstance(symbol, Terminal):
                        if symbol.name in temp_grammar:
                            raise ValueError(
                                f"'{symbol.name}' is the name of both a Terminal and a NonTerminal rule. This will cause problems."
                            )
                        new_clause.append(symbol.name)
                    else:
                        new_clause.append(symbol)

                grammar.append((rule_name, new_clause))

        return grammar, transparents

    def build_table(self, start: str | None = None, generator=None) -> ParseTable:
        """Construct a parse table for this grammar, starting at the named
        nonterminal rule.

        `start` and `generator` fall back to the values chosen in the
        constructor. The names of the trivia terminals are added to the
        table's `trivia` set before it is returned.
        """
        if start is None:
            start = self.start
        desugared, transparents = self.desugar(start)

        if generator is None:
            generator = self._generator
        gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
        table = gen.gen_table()

        for t in self._trivia:
            assert t.name is not None
            table.trivia.add(t.name)

        return table

    def compile_lexer(self) -> LexerTable:
        """Construct a lexer table for this grammar.

        The table has one entry per DFA state: the terminal that state
        accepts (as reported by `accept_terminal()`), and the outgoing edges
        as (span, index of target state) pairs.
        """
        # Parse the terminals all together into a big NFA rooted at `NFA`.
        NFA = NFAState()
        for terminal in self.terminals():
            pattern = terminal.pattern
            if isinstance(pattern, Re):
                start, ends = pattern.to_nfa()
                for end in ends:
                    end.accept = terminal
                NFA.epsilons.append(start)

            else:
                # A plain string: a chain of states, one edge per character.
                start = end = NFAState()
                for c in pattern:
                    end = end.add_edge(Span.from_str(c), NFAState())
                end.accept = terminal
                NFA.epsilons.append(start)

        # NFA.dump_graph()

        # Convert the NFA into a DFA in the most straightforward way (by tracking
        # sets of state closures, called SuperStates.)
        DFA: dict[NFASuperState, tuple[int, list[tuple[Span, NFASuperState]]]] = {}

        stack = [NFASuperState([NFA])]
        while stack:
            ss = stack.pop()
            if ss in DFA:
                continue

            edges = ss.edges()

            # States are numbered in the order they are first reached.
            DFA[ss] = (len(DFA), edges)
            for _, target in edges:
                stack.append(target)

        return [
            (
                ss.accept_terminal(),
                [(k, DFA[v][0]) for k, v in edges],
            )
            for ss, (_, edges) in DFA.items()
        ]
|
||||
|
|
|
|||
|
|
@ -292,9 +292,9 @@ class Parser:
|
|||
# accessible in the tree.
|
||||
input_tokens = tokens.tokens()
|
||||
input: list[TokenValue] = [
|
||||
TokenValue(kind=kind.value, start=start, end=start + length)
|
||||
TokenValue(kind=kind.name, start=start, end=start + length)
|
||||
for (kind, start, length) in input_tokens
|
||||
if kind.value is not None and kind.value not in self.table.trivia
|
||||
if kind.name is not None and kind.name not in self.table.trivia
|
||||
]
|
||||
|
||||
eof = 0 if len(input) == 0 else input[-1].end
|
||||
|
|
@ -514,9 +514,9 @@ class GenericTokenStream:
|
|||
end = len(self._tokens)
|
||||
|
||||
max_terminal_name = max(
|
||||
len(terminal.value)
|
||||
len(terminal.name)
|
||||
for terminal, _ in self.lexer
|
||||
if terminal is not None and terminal.value is not None
|
||||
if terminal is not None and terminal.name is not None
|
||||
)
|
||||
max_offset_len = len(str(len(self.src)))
|
||||
|
||||
|
|
@ -539,6 +539,6 @@ class GenericTokenStream:
|
|||
else:
|
||||
line_part = " |"
|
||||
|
||||
line = f"{start:{max_offset_len}} {line_part} {column_index:3} {kind.value:{max_terminal_name}} {repr(value)}"
|
||||
line = f"{start:{max_offset_len}} {line_part} {column_index:3} {kind.name:{max_terminal_name}} {repr(value)}"
|
||||
lines.append(line)
|
||||
return lines
|
||||
|
|
|
|||
198
parser/tree_sitter.py
Normal file
198
parser/tree_sitter.py
Normal file
|
|
@ -0,0 +1,198 @@
|
|||
import json
|
||||
import pathlib
|
||||
|
||||
from . import parser
|
||||
|
||||
|
||||
def to_js_string(s: str) -> str:
    """Return `s` escaped the way JSON escapes a string, minus the quotes.

    The surrounding double quotes produced by `json.dumps` are stripped, and
    escaped double quotes (`\\"`) are turned back into plain `"`. The result
    is therefore not safe to paste between double quotes as it stands.
    """
    quoted = json.dumps(s)
    inner = quoted[1:-1]
    # JSON escapes double-quotes but we don't need to in our context.
    return inner.replace('\\"', '"')
|
||||
|
||||
|
||||
# Characters that are syntax in a JavaScript regular expression (or that end
# a `/.../` literal) and so need a backslash to be matched literally.
_REGEX_METACHARACTERS = frozenset(".^$*+?()[]{}|/-")


def _regex_char(code_point: int) -> str:
    """Render one code point so that a regex matches it literally."""
    char = chr(code_point)
    text = to_js_string(char)
    if char in _REGEX_METACHARACTERS:
        text = "\\" + text
    return text


def _flatten_regex(re: parser.Re, kind: type) -> list:
    """Flatten nested `kind` nodes (via `.left`/`.right`) into a left-to-right list."""
    leaves = []
    pending = [re]
    while pending:
        part = pending.pop()
        if isinstance(part, kind):
            # Push right first so that left is popped (and emitted) first.
            pending.append(part.right)
            pending.append(part.left)
        else:
            leaves.append(part)
    return leaves


def to_javascript_regex(re: parser.Re) -> str:
    """Render a `parser.Re` tree as the body of a JavaScript regex.

    The surrounding slashes are not included. Literal characters that are
    regex syntax are backslash-escaped, and a quantifier applied directly to
    another quantifier is parenthesised so that, for example, "optional
    one-or-more" is not read as the lazy quantifier `+?`.

    Raises:
        Exception: if `re` is not one of the node types handled here.
    """
    # NOTE: In general it's bad to introduce parenthesis into regular
    #       expressions where they're not required because they also create
    #       capture groups, but I think it doesn't apply to tree-sitter
    #       regular expressions (and it doesn't mean anything to me either.)
    if isinstance(re, parser.ReSeq):
        members = _flatten_regex(re, parser.ReSeq)
        s = "".join(to_javascript_regex(p) for p in members)
        if len(members) > 1:
            s = f"({s})"
        return s

    if isinstance(re, parser.ReAlt):
        members = _flatten_regex(re, parser.ReAlt)
        s = "|".join(to_javascript_regex(p) for p in members)
        if len(members) > 1:
            s = f"({s})"
        return s

    quantifiers = (parser.ReQuestion, parser.RePlus, parser.ReStar)
    if isinstance(re, quantifiers):
        if isinstance(re, parser.ReQuestion):
            suffix = "?"
        elif isinstance(re, parser.RePlus):
            suffix = "+"
        else:
            suffix = "*"

        s = to_javascript_regex(re.child)
        if isinstance(re.child, quantifiers):
            # Two quantifiers in a row are either invalid or mean "lazy".
            s = f"({s})"
        return f"{s}{suffix}"

    if isinstance(re, parser.ReSet):
        if (
            len(re.values) == 1
            and re.values[0].lower == 0
            and re.values[0].upper == parser.UNICODE_MAX_CP
        ):
            return "."

        inverted = re.inversion
        if inverted:
            # Print the spans of the set this one was inverted from, behind
            # a `^`, rather than the (usually much longer) inverted spans.
            re = re.invert()

        parts = []
        for value in re.values:
            if len(value) == 1:
                parts.append(_regex_char(value.lower))
            else:
                # The last member of the span is `upper - 1`.
                parts.append(f"{_regex_char(value.lower)}-{_regex_char(value.upper - 1)}")

        s = "".join(parts)
        if inverted:
            s = "^" + s
        if inverted or len(s) > 1:
            # The only time this isn't a "set" is if this is a set of one
            # range that is one character long, in which case it's better
            # represented as a literal. (An inverted set always needs the
            # brackets, otherwise the `^` would be read as an anchor.)
            s = f"[{s}]"
        return s

    raise Exception(f"Regex node {re} not supported for tree-sitter")
|
||||
|
||||
|
||||
def convert_to_tree_sitter(rule: parser.Rule, grammar: parser.Grammar) -> str:
    """Render `rule` as a tree-sitter grammar DSL expression.

    A rule object that has its own `convert_to_tree_sitter` attribute is
    asked to render itself. Otherwise terminals become regex or string
    literals, alternatives become `choice(...)` (wrapped in `optional(...)`
    when one of the alternatives is a NothingRule), sequences become
    `seq(...)`, nonterminals become `$['name']` references and metadata
    wrappers are rendered as the rule they wrap.

    Raises:
        Exception: for an alternative or sequence with nothing in it.
        ValueError: for a rule type that is not handled here.
    """
    # TODO: Precedence?

    method = getattr(rule, "convert_to_tree_sitter", None)
    if method is not None:
        return method(grammar)

    if isinstance(rule, parser.Terminal):
        if isinstance(rule.pattern, parser.Re):
            regex = to_javascript_regex(rule.pattern)
            return f"/{regex}/"

        # json.dumps produces a complete double-quoted literal with any
        # embedded double quotes still escaped.
        return json.dumps(rule.pattern)

    elif isinstance(rule, parser.AlternativeRule):
        final = []
        has_nothing = False
        queue = [rule]
        while queue:
            part = queue.pop()
            if isinstance(part, parser.AlternativeRule):
                # Push right first so that left is popped (and emitted) first.
                queue.append(part.right)
                queue.append(part.left)
            elif isinstance(part, parser.NothingRule):
                has_nothing = True
            else:
                final.append(part)

        if not final:
            raise Exception("Unsupported rule: empty alternative")

        result = ", ".join(convert_to_tree_sitter(r, grammar) for r in final)
        if len(final) > 1:
            result = f"choice({result})"
        if has_nothing:
            # The tree-sitter DSL spells this `optional`.
            result = f"optional({result})"
        return result

    elif isinstance(rule, parser.SequenceRule):
        final = []
        queue = [rule]
        while queue:
            part = queue.pop()
            if isinstance(part, parser.SequenceRule):
                queue.append(part.second)
                queue.append(part.first)
            elif isinstance(part, parser.NothingRule):
                # Nothing contributes no element to a sequence.
                pass
            else:
                final.append(part)

        if not final:
            raise Exception("Unsupported rule: empty sequence")

        result = ", ".join(convert_to_tree_sitter(r, grammar) for r in final)
        if len(final) > 1:
            result = f"seq({result})"
        return result

    elif isinstance(rule, parser.NonTerminal):
        # Transparent rules are emitted under a name with a leading
        # underscore, so references to them must use that name too.
        rule_name = rule.name
        if rule.transparent:
            rule_name = "_" + rule_name
        return f"$['{rule_name}']"

    elif isinstance(rule, parser.MetadataRule):
        return convert_to_tree_sitter(rule.rule, grammar)

    else:
        raise ValueError(f"Rule {rule} not supported for tree-sitter")
|
||||
|
||||
|
||||
# https://tree-sitter.github.io/tree-sitter/creating-parsers
|
||||
def emit_tree_sitter_grammar(grammar: parser.Grammar, path: pathlib.Path | str):
    """Write `grammar` as a tree-sitter `grammar.js` file in directory `path`.

    The file declares a `source_file` rule that refers to the grammar's start
    rule, followed by one rule per nonterminal of the grammar. Transparent
    rules are written with a leading underscore on their name, and the
    `source_file` reference follows the same convention.
    """
    # TODO: PRECEDENCE
    path = pathlib.Path(path) / "grammar.js"

    rules = grammar.non_terminals()

    # The start rule must be referenced under the name it is defined with.
    start_name = grammar.start
    if any(rule.transparent and rule.name == start_name for rule in rules):
        start_name = "_" + start_name

    with open(path, "w", encoding="utf-8") as f:
        f.write('/// <reference types="tree-sitter-cli/dsl" />\n')
        f.write("// @ts-check\n")
        f.write("\n")
        f.write("module.exports = grammar({\n")
        f.write(f" name: '{grammar.name}',\n")
        f.write(" rules: {\n")
        f.write(f" source_file: $ => $['{start_name}'],\n")
        for rule in rules:
            f.write("\n")

            rule_name = rule.name
            if rule.transparent:
                rule_name = "_" + rule_name

            body = rule.fn(grammar)
            rule_definition = convert_to_tree_sitter(body, grammar)
            # End the line here so the closing brace lands on its own line.
            f.write(f" '{rule_name}': $ => {rule_definition},\n")

        f.write(" }\n")
        f.write("});\n")
|
||||
|
|
@ -11,7 +11,6 @@ from parser import (
|
|||
Grammar,
|
||||
rule,
|
||||
Terminal,
|
||||
compile_lexer,
|
||||
dump_lexer_table,
|
||||
Re,
|
||||
)
|
||||
|
|
@ -372,7 +371,7 @@ def test_lexer_compile():
|
|||
)
|
||||
BLANKS = Terminal(Re.set("\r", "\n", "\t", " ").plus())
|
||||
|
||||
lexer = compile_lexer(LexTest())
|
||||
lexer = LexTest().compile_lexer()
|
||||
dump_lexer_table(lexer)
|
||||
tokens = list(generic_tokenize("xy is ass", lexer))
|
||||
assert tokens == [
|
||||
|
|
@ -410,7 +409,7 @@ def test_lexer_numbers(n: float):
|
|||
)
|
||||
)
|
||||
|
||||
lexer = compile_lexer(LexTest())
|
||||
lexer = LexTest().compile_lexer()
|
||||
dump_lexer_table(lexer)
|
||||
|
||||
number_string = str(n)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue