From 0be0075cfe44a57e6d403a028b776afffc5268b7 Mon Sep 17 00:00:00 2001
From: John Doty
Date: Tue, 27 Aug 2024 16:47:26 -0700
Subject: [PATCH 1/3] Generic token stream

Compatible with the harness
---
 parser/runtime.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/parser/runtime.py b/parser/runtime.py
index 2aefd7b..24e617f 100644
--- a/parser/runtime.py
+++ b/parser/runtime.py
@@ -1,7 +1,7 @@
 import bisect
 import enum
-import enum
 import logging
+import re
 import typing
 
 from dataclasses import dataclass
@@ -490,3 +490,55 @@ def generic_tokenize(
             pos = last_accept_pos
             start = pos
             state = 0
+
+
+class GenericTokenStream:
+    def __init__(self, src: str, lexer: parser.LexerTable):
+        self.src = src
+        self.lexer = lexer
+        self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list(
+            generic_tokenize(src, lexer)
+        )
+        self._lines = [m.start() for m in re.finditer("\n", src)]
+
+    def tokens(self):
+        return self._tokens
+
+    def lines(self):
+        return self._lines
+
+    def dump(self, *, start=None, end=None) -> list[str]:
+        if start is None:
+            start = 0
+        if end is None:
+            end = len(self._tokens)
+
+        max_terminal_name = max(
+            len(terminal.value)
+            for terminal, _ in self.lexer
+            if terminal is not None and terminal.value is not None
+        )
+        max_offset_len = len(str(len(self.src)))
+
+        prev_line = None
+        lines = []
+        for token in self._tokens[start:end]:
+            (kind, start, length) = token
+            line_index = bisect.bisect_left(self._lines, start)
+            if line_index == 0:
+                col_start = 0
+            else:
+                col_start = self._lines[line_index - 1] + 1
+            column_index = start - col_start
+            value = self.src[start : start + length]
+
+            line_number = line_index + 1
+            if line_number != prev_line:
+                line_part = f"{line_number:4}"
+                prev_line = line_number
+            else:
+                line_part = "   |"
+
+            line = f"{start:{max_offset_len}} {line_part} {column_index:3} {kind.value:{max_terminal_name}} {repr(value)}"
+            lines.append(line)
+        return lines

From d03dc6e3d9c491d3db2f8a6a2d71092c585927c5 Mon Sep 17 00:00:00 2001
From: John Doty
Date: Tue, 27 Aug 2024 16:47:42 -0700
Subject: [PATCH 2/3] Harness uses grammar-generated token stream

---
 harness.py | 81 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 67 insertions(+), 14 deletions(-)

diff --git a/harness.py b/harness.py
index 90cee9c..4e94866 100644
--- a/harness.py
+++ b/harness.py
@@ -22,11 +22,6 @@ from parser import runtime
 
 # from parser import Token, Grammar, rule, seq
 
-###############################################################################
-# Parsing Stuff
-###############################################################################
-
-
 ###############################################################################
 # Screen Stuff
 ###############################################################################
@@ -84,13 +79,18 @@ def goto_cursor(x: int, y: int):
 # Dynamic Modules: Detect and Reload Modules when they Change
 ###############################################################################
 
+VERSION = 0
 
-class DynamicModule:
+MT = typing.TypeVar("MT")
+
+
+class DynamicModule[MT]:
     file_name: str
     member_name: str | None
 
     last_time: float | None
     module: types.ModuleType | None
+    value: MT | None
 
     def __init__(self, file_name, member_name):
         self.file_name = file_name
@@ -110,15 +110,18 @@ class DynamicModule:
 
         return True
 
-    def _transform(self, value):
+    def _transform(self, value) -> MT:
         return value
 
-    def get(self):
+    def get(self) -> MT:
         st = os.stat(self.file_name)
         if self.last_time == st.st_mtime:
             assert self.value is not None
             return self.value
 
+        global VERSION
+        VERSION += 1
+
         self.value = None
 
         if self.module is None:
@@ -150,7 +153,7 @@ class DynamicModule:
         return self.value
 
 
-class DynamicGrammarModule(DynamicModule):
+class DynamicGrammarModule(DynamicModule[parser.ParseTable]):
    def __init__(self, file_name, member_name, start_rule):
         super().__init__(file_name, member_name)
 
@@ -169,16 +172,24 @@
         return value().build_table(start=self.start_rule)
 
 
-class DynamicLexerModule(DynamicModule):
+class DynamicLexerModule(DynamicModule[typing.Callable[[str], runtime.TokenStream]]):
     def _predicate(self, member) -> bool:
         if not super()._predicate(member):
             return False
 
-        if getattr(member, "tokens", None):
+        if getattr(member, "terminals", None):
             return True
 
         return False
 
+    def _transform(self, value):
+        lexer_table = parser.compile_lexer(value())
+
+        def get_tokens(src: str) -> runtime.TokenStream:
+            return runtime.GenericTokenStream(src, lexer_table)
+
+        return get_tokens
+
 
 class DisplayMode(enum.Enum):
     TREE = 0
@@ -220,6 +231,8 @@ class Harness:
     line_start: int
     last_cols: int
 
+    last_version: int
+
     def __init__(
         self, grammar_file, grammar_member, lexer_file, lexer_member, start_rule, source_path
     ):
@@ -230,9 +243,12 @@ class Harness:
         self.start_rule = start_rule
         self.source_path = source_path
 
+        self.last_version = -1
+
         self.mode = DisplayMode.TREE
 
         self.source = None
+        self.table = None
         self.tokens = None
         self.tree = None
 
@@ -250,7 +266,7 @@ class Harness:
             self.grammar_file, self.grammar_member, self.start_rule
         )
 
-        self.lexer_module = DynamicLexerModule(self.lexer_file, self.lexer_member)
+        self.lexer_module = DynamicLexerModule(self.lexer_file, self.grammar_member)
 
         self.log_handler = ListHandler()
         logging.basicConfig(level=logging.INFO, handlers=[self.log_handler])
@@ -286,15 +302,25 @@ class Harness:
         return self.grammar_module.get()
 
     def update(self):
-        self.log_handler.clear()
+        global VERSION
+
         start_time = time.time()
         try:
             table = self.load_grammar()
             lexer_func = self.lexer_module.get()
             with open(self.source_path, "r", encoding="utf-8") as f:
-                self.source = f.read()
+                source = f.read()
+                if source != self.source:
+                    VERSION += 1
+            self.source = source
+
+            if VERSION == self.last_version:
+                return  # Just stop, do nothing, it's all the same.
+            self.last_version = VERSION
+            assert self.source is not None
 
+            self.log_handler.clear()
             self.tokens = lexer_func(self.source)
 
             lex_time = time.time()
@@ -321,6 +347,33 @@ class Harness:
             self.average_entries = 0
             self.max_entries = 0
 
+        # WHAT
+        try:
+            with open("tree.txt", "w", encoding="utf-8") as f:
+                lines = []
+                if self.tree is not None:
+                    self.format_node(lines, self.tree)
+                f.writelines([f"{l}\n" for l in lines])
+        except Exception as e:
+            self.errors.extend([f"Unable to write tree.txt: {e}"])
+
+        try:
+            with open("errors.txt", "w", encoding="utf-8") as f:
+                f.writelines([f"{l}\n" for l in self.errors])
+        except Exception as e:
+            self.errors.extend([f"Unable to write errors.txt: {e}"])
+
+        try:
+            with open("parse.log", "w", encoding="utf-8") as f:
+                f.writelines([f"{l}\n" for l in self.log_handler.logs])
+        except Exception as e:
+            self.errors.extend([f"Unable to write parse.log: {e}"])
+
+        if hasattr(self.tokens, "dump"):
+            lines = self.tokens.dump()
+            with open("tokens.txt", "w", encoding="utf-8") as f:
+                f.writelines([f"{l}\n" for l in lines])
+
     def render(self):
         sys.stdout.buffer.write(CLEAR)
         rows, cols = termios.tcgetwinsize(sys.stdout.fileno())

From d62076f3c4ded23e1f3a5569f778ad15e7483eed Mon Sep 17 00:00:00 2001
From: John Doty
Date: Tue, 27 Aug 2024 16:47:58 -0700
Subject: [PATCH 3/3] Fix a bug in terminal declaration

whoops. now it parses correctly
---
 grammar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/grammar.py b/grammar.py
index 0912700..90d88ef 100644
--- a/grammar.py
+++ b/grammar.py
@@ -349,7 +349,7 @@ class FineGrammar(Grammar):
     IN = Terminal("in", kind=TerminalKind.Keyword.Operator)
     LCURLY = Terminal("{", kind=TerminalKind.Punctuation.CurlyBrace.Open)
     RCURLY = Terminal("}", kind=TerminalKind.Punctuation.CurlyBrace.Close)
-    LET = Terminal("Let", kind=TerminalKind.Keyword.Other)
+    LET = Terminal("let", kind=TerminalKind.Keyword.Other)
     RETURN = Terminal("return", kind=TerminalKind.Keyword.Control)
     SEMICOLON = Terminal(";", kind=TerminalKind.Punctuation.Separator)
     STRING = Terminal(