diff --git a/README.md b/README.md index c0a12eb..44457ee 100644 --- a/README.md +++ b/README.md @@ -17,31 +17,31 @@ class. Create one method per non-terminal, decorated with the `rule` decorator. Here's an example: ```python -from parser import * + from parser import * -@rule -def expression(): - return seq(expression, PLUS, term) | term + @rule + def expression(): + return seq(expression, PLUS, term) | term -@rule -def term(): - return seq(LPAREN, expression, RPAREN) | ID + @rule + def term(): + return seq(LPAREN, expression, RPAREN) | ID -PLUS = Terminal('PLUS', '+') -LPAREN = Terminal('LPAREN', '(') -RPAREN = Terminal('RPAREN', ')') -ID = Terminal( - 'ID', - Re.seq( - Re.set(("a", "z"), ("A", "Z"), "_"), - Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), - ), -) + PLUS = Terminal('PLUS', '+') + LPAREN = Terminal('LPAREN', '(') + RPAREN = Terminal('RPAREN', ')') + ID = Terminal( + 'ID', + Re.seq( + Re.set(("a", "z"), ("A", "Z"), "_"), + Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), + ), + ) -SimpleGrammar = Grammar( - name="Simple", - start=expression, -) + SimpleGrammar = Grammar( + name="Simple", + start=expression, + ) ``` Terminals can be plain strings or regular expressions constructed with @@ -59,17 +59,17 @@ There are no helpers (yet!) for consuming lists, so they need to be constructed in the classic context-free grammar way: ```python -@rule -def list(): - return NUMBER | (list + COMMA + NUMBER) + @rule + def list(): + return NUMBER | (list + COMMA + NUMBER) -NUMBER = Terminal(Re.set(("0", "9")).plus()) -COMMA = Terminal(',') + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') -NumberList = Grammar( - name="NumberList", - start=list, -) + NumberList = Grammar( + name="NumberList", + start=list, + ) ``` (Unlike with PEGs, you can write grammars with left or right-recursion, @@ -95,23 +95,23 @@ which means they don't generate nodes in the tree and just dump their contents into the parent node instead. ```python -@rule -def list(): - # The starting rule can't be transparent: there has to be something to - # hold on to! - return transparent_list + @rule + def list(): + # The starting rule can't be transparent: there has to be something to + # hold on to! + return transparent_list -@rule(transparent=True) -def transparent_list() -> Rule: - return NUMBER | (transparent_list + COMMA + NUMBER) + @rule(transparent=True) + def transparent_list() -> Rule: + return NUMBER | (transparent_list + COMMA + NUMBER) -NUMBER = Terminal(Re.set(("0", "9")).plus()) -COMMA = Terminal(',') + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') -NumberList = Grammar( - name="NumberList", - start=list, -) + NumberList = Grammar( + name="NumberList", + start=list, + ) ``` This grammar will generate the far more useful tree: @@ -130,23 +130,23 @@ following the lead set by tree-sitter, and so the grammar above is probably better-written as: ```python -@rule -def list(): - # The starting rule can't be transparent: there has to be something to - # hold on to! - return transparent_list + @rule + def list(): + # The starting rule can't be transparent: there has to be something to + # hold on to! + return transparent_list -@rule -def _list() -> Rule: - return NUMBER | (_list + COMMA + NUMBER) + @rule + def _list() -> Rule: + return NUMBER | (_list + COMMA + NUMBER) -NUMBER = Terminal(Re.set(("0", "9")).plus()) -COMMA = Terminal(',') + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') -NumberList = Grammar( - name="NumberList", - start=list, -) + NumberList = Grammar( + name="NumberList", + start=list, + ) ``` That will generate the same tree, but a little more succinctly. @@ -155,17 +155,17 @@ Of course, it's a lot of work to write these transparent recursive rules by hand all the time, so there are helpers that do it for you: ```python -@rule -def list(): - return zero_or_more(NUMBER, COMMA) + NUMBER + @rule + def list(): + return zero_or_more(NUMBER, COMMA) + NUMBER -NUMBER = Terminal(Re.set(("0", "9")).plus()) -COMMA = Terminal(',') + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') -NumberList = Grammar( - name="NumberList", - start=list, -) + NumberList = Grammar( + name="NumberList", + start=list, + ) ``` Much better. @@ -180,20 +180,20 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in our number lists, we would modify the grammar as follows: ```python -@rule -def list(): - return zero_or_more(NUMBER, COMMA) + NUMBER + @rule + def list(): + return zero_or_more(NUMBER, COMMA) + NUMBER -NUMBER = Terminal(Re.set(("0", "9")).plus()) -COMMA = Terminal(',') + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') -BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) + BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) -NumberList = Grammar( - name="NumberList", - start=list, - trivia=[BLANKS], -) + NumberList = Grammar( + name="NumberList", + start=list, + trivia=[BLANKS], + ) ``` Now we can parse a list with spaces! "1 , 2, 3" will parse happily diff --git a/dingus/about.md b/dingus/about.md index b677cfc..890eaa4 100644 --- a/dingus/about.md +++ b/dingus/about.md @@ -20,31 +20,31 @@ object. Here's an example: ```python {.numberLines} -from parser import * + from parser import * -@rule -def expression(): - return seq(expression, PLUS, term) | term + @rule + def expression(): + return seq(expression, PLUS, term) | term -@rule -def term(): - return seq(LPAREN, expression, RPAREN) | ID + @rule + def term(): + return seq(LPAREN, expression, RPAREN) | ID -PLUS = Terminal('PLUS', '+') -LPAREN = Terminal('LPAREN', '(') -RPAREN = Terminal('RPAREN', ')') -ID = Terminal( - 'ID', - Re.seq( - Re.set(("a", "z"), ("A", "Z"), "_"), - Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), - ), -) + PLUS = Terminal('PLUS', '+') + LPAREN = Terminal('LPAREN', '(') + RPAREN = Terminal('RPAREN', ')') + ID = Terminal( + 'ID', + Re.seq( + Re.set(("a", "z"), ("A", "Z"), "_"), + Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), + ), + ) -SimpleGrammar = Grammar( - name="Simple", - start=expression, -) + SimpleGrammar = Grammar( + name="Simple", + start=expression, + ) ``` Terminal patterns can be plain strings or regular expressions @@ -61,17 +61,17 @@ things can be freely nested, as desired. You can make lists in the classic context-free grammar way: ```python {.numberLines} -@rule -def list(): - return NUMBER | (list + COMMA + NUMBER) + @rule + def list(): + return NUMBER | (list + COMMA + NUMBER) -NUMBER = Terminal(Re.set(("0", "9")).plus()) -COMMA = Terminal(',') + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') -NumberList = Grammar( - name="NumberList", - start=list, -) + NumberList = Grammar( + name="NumberList", + start=list, + ) ``` (Unlike with PEGs, you can write grammars with left or right-recursion, @@ -97,23 +97,23 @@ which means they don't generate nodes in the tree and just dump their contents into the parent node instead. ```python {.numberLines} -@rule -def list(): - # The starting rule can't be transparent: there has to be something to - # hold on to! - return transparent_list + @rule + def list(): + # The starting rule can't be transparent: there has to be something to + # hold on to! + return transparent_list -@rule(transparent=True) -def transparent_list() -> Rule: - return NUMBER | (transparent_list + COMMA + NUMBER) + @rule(transparent=True) + def transparent_list() -> Rule: + return NUMBER | (transparent_list + COMMA + NUMBER) -NUMBER = Terminal(Re.set(("0", "9")).plus()) -COMMA = Terminal(',') + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') -NumberList = Grammar( - name="NumberList", - start=list, -) + NumberList = Grammar( + name="NumberList", + start=list, + ) ``` This grammar will generate the far more useful tree: @@ -132,23 +132,23 @@ following the lead set by tree-sitter, and so the grammar above is probably better-written as: ```python {.numberLines} -@rule -def list(): - # The starting rule can't be transparent: there has to be something to - # hold on to! - return transparent_list + @rule + def list(): + # The starting rule can't be transparent: there has to be something to + # hold on to! + return transparent_list -@rule -def _list() -> Rule: - return NUMBER | (_list + COMMA + NUMBER) + @rule + def _list() -> Rule: + return NUMBER | (_list + COMMA + NUMBER) -NUMBER = Terminal(Re.set(("0", "9")).plus()) -COMMA = Terminal(',') + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') -NumberList = Grammar( - name="NumberList", - start=list, -) + NumberList = Grammar( + name="NumberList", + start=list, + ) ``` That will generate the same tree, but a little more succinctly. @@ -157,17 +157,17 @@ Of course, it's a lot of work to write these transparent recursive rules by hand all the time, so there are helpers that do it for you: ```python {.numberLines} -@rule -def list(): - return zero_or_more(NUMBER, COMMA) + NUMBER + @rule + def list(): + return zero_or_more(NUMBER, COMMA) + NUMBER -NUMBER = Terminal(Re.set(("0", "9")).plus()) -COMMA = Terminal(',') + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') -NumberList = Grammar( - name="NumberList", - start=list, -) + NumberList = Grammar( + name="NumberList", + start=list, + ) ``` Much better. @@ -182,21 +182,21 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in our number lists, we would modify the grammar as follows: ```python {.numberLines} -@rule -def list(): - return zero_or_more(NUMBER, COMMA) + NUMBER + @rule + def list(): + return zero_or_more(NUMBER, COMMA) + NUMBER -NUMBER = Terminal(Re.set(("0", "9")).plus()) -COMMA = Terminal(',') + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') -BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) -# ^ and add a new terminal to describe what we're ignoring... + BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) + # ^ and add a new terminal to describe what we're ignoring... -NumberList = Grammar( - name="NumberList", - start=list, - trivia=[BLANKS], -) + NumberList = Grammar( + name="NumberList", + start=list, + trivia=[BLANKS], + ) ``` Now we can parse a list with spaces! "1 , 2, 3" will parse happily diff --git a/parser/runtime.py b/parser/runtime.py index c7fc9db..a74276a 100644 --- a/parser/runtime.py +++ b/parser/runtime.py @@ -8,22 +8,6 @@ from dataclasses import dataclass from . import parser -def offset_to_line_column(lines: list[int], pos: int) -> tuple[int, int]: - """Convert a text offset to a line number and column number given a list - of line break positions. This is used to make errors intelligible. Lines - are 1-based, and columns are 0-based, in accordance with editor - traditions. - """ - line_index = bisect.bisect_left(lines, pos) - if line_index == 0: - col_start = 0 - else: - col_start = lines[line_index - 1] + 1 - column_index = pos - col_start - line_index += 1 - return (line_index, column_index) - - @dataclass class TokenValue: kind: str @@ -613,16 +597,21 @@ class Parser: if errors: lines = tokens.lines() for parse_error in errors: - line_index, column_index = offset_to_line_column(lines, parse_error.start) + line_index = bisect.bisect_left(lines, parse_error.start) + if line_index == 0: + col_start = 0 + else: + col_start = lines[line_index - 1] + 1 + column_index = parse_error.start - col_start + line_index += 1 + error_strings.append(f"{line_index}:{column_index}: {parse_error.message}") return (result, error_strings) def generic_tokenize( - src: str, - table: parser.LexerTable, - lines: list[int], + src: str, table: parser.LexerTable ) -> typing.Iterable[tuple[parser.Terminal, int, int]]: pos = 0 state = 0 @@ -658,8 +647,7 @@ def generic_tokenize( pass if last_accept is None: - line_index, column_index = offset_to_line_column(lines, pos) - raise Exception(f"{line_index}:{column_index}: Unexpected character '{src[pos]}'") + raise Exception(f"Token error at {pos}") yield (last_accept, start, last_accept_pos - start) @@ -673,10 +661,10 @@ class GenericTokenStream: def __init__(self, src: str, lexer: parser.LexerTable): self.src = src self.lexer = lexer - self._lines = [m.start() for m in re.finditer("\n", src)] self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list( - generic_tokenize(src, lexer, self._lines) + generic_tokenize(src, lexer) ) + self._lines = [m.start() for m in re.finditer("\n", src)] def tokens(self): return self._tokens