From dca03b9d99f4d92e18187fa87791af841acd6723 Mon Sep 17 00:00:00 2001 From: Jacob Errington Date: Sat, 18 Jan 2025 21:46:21 -0500 Subject: [PATCH 1/4] add (Byte|Str)Stream for sources on input data For backwards compatibility, these are only used to wrap input data when a source is given to `parse()` or `parse_partial()`. When a source is given, the following behaviours change: - the primitive `line_info` parser returns a 3-tuple instead of a 2-tuple - ParseError objects will include the source --- src/parsy/__init__.py | 72 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/src/parsy/__init__.py b/src/parsy/__init__.py index cf96e5c..f318655 100644 --- a/src/parsy/__init__.py +++ b/src/parsy/__init__.py @@ -12,13 +12,54 @@ noop = lambda x: x +class StrStream(str): + """String data to parse, possibly equipped with a name for the source it's + from, e.g. a file path.""" + + def __new__(cls, string, source): + instance = super().__new__(cls, string) + instance.source = source + return instance + + +class ByteStream(bytes): + """Bytes data to parse, possibly equipped with a name for the source it's + from, e.g. a file path.""" + + def __new__(cls, bs, source): + instance = super().__new__(cls, bs) + instance.source = source + return instance + + +def make_stream(data: str | bytes, source: Any): + """Wraps `data` in the stream type matching its datatype (StrStream for + str, ByteStream for bytes) so that it carries `source` along with it. + Other datatypes, such as lists of tokens, are not supported and raise a + RuntimeError.""" + if isinstance(data, str): + return StrStream(data, source) + + if isinstance(data, bytes): + return ByteStream(data, source) + + raise RuntimeError( + "A Parsy stream can be formed only on str and bytes, but the given " + f"data has type {type(data)}. 
If you are separately tokenizing the " + "data to parse, consider instead equipping the tokens with source " + "location metadata.", + ) + def line_info_at(stream, index): if index > len(stream): raise ValueError("invalid index") line = stream.count("\n", 0, index) last_nl = stream.rfind("\n", 0, index) col = index - (last_nl + 1) - return (line, col) + if hasattr(stream, "source"): + return (line, col, stream.source) + else: + return (line, col) class ParseError(RuntimeError): @@ -29,7 +70,15 @@ def __init__(self, expected, stream, index): def line_info(self): try: - return "{}:{}".format(*line_info_at(self.stream, self.index)) + info = line_info_at(self.stream, self.index) + if len(info) == 2: + row, col = info + return f"{row}:{col}" + elif len(info) == 3: + row, col, source = info + return f"{source}:{row}:{col}" + else: + raise RuntimeError("Internal line_info_at violates length expectation.") except (TypeError, AttributeError): # not a str return str(self.index) @@ -90,20 +139,23 @@ def __init__(self, wrapped_fn: Callable[[str | bytes | list, int], Result]): """ self.wrapped_fn = wrapped_fn - def __call__(self, stream: str | bytes | list, index: int): + def __call__(self, stream, index: int): return self.wrapped_fn(stream, index) - def parse(self, stream: str | bytes | list) -> Any: + def parse(self, stream, source=None) -> Any: """Parses a string or list of tokens and returns the result or raise a ParseError.""" - (result, _) = (self << eof).parse_partial(stream) + (result, _) = (self << eof).parse_partial(stream, source) return result - def parse_partial(self, stream: str | bytes | list) -> tuple[Any, str | bytes | list]: + def parse_partial(self, stream, source=None) -> tuple[Any, str | bytes | list]: """ Parses the longest possible prefix of a given string. 
Returns a tuple of the result and the unparsed remainder, or raises ParseError """ + if source is not None: + stream = make_stream(stream, source) + result = self(stream, 0) if result.status: @@ -346,6 +398,11 @@ def marked(): start = yield line_info body = yield self end = yield line_info + # line_info returns a 3-tuple including the source when a source + # was given to `parse`, but older programs expect these tuples to + # have length 2, consisting of just row and col + start = start[:2] + end = end[:2] return (start, body, end) return marked @@ -578,8 +635,7 @@ def test_item(func: Callable[..., bool], description: str) -> Parser: def test_item_parser(stream, index): if index < len(stream): if isinstance(stream, bytes): - # Subscripting bytes with `[index]` instead of - # `[index:index + 1]` returns an int + # Otherwise directly indexing a bytes gives `int` item = stream[index : index + 1] else: item = stream[index] From e8b252a0b075398d28721a72b1d61850900a2eb0 Mon Sep 17 00:00:00 2001 From: Jacob Errington Date: Sat, 18 Jan 2025 21:49:27 -0500 Subject: [PATCH 2/4] add .span() method and test for it --- src/parsy/__init__.py | 41 +++++++++++++++++++++++++++++++++++++++++ tests/test_parsy.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/src/parsy/__init__.py b/src/parsy/__init__.py index f318655..17f2630 100644 --- a/src/parsy/__init__.py +++ b/src/parsy/__init__.py @@ -50,6 +50,22 @@ def make_stream(data: str | bytes, source: Any): "location metadata.", ) + +@dataclass +class SourceSpan: + """Identifies a span of material from the data to parse. + + Attributes: + source (str | None): the source of the data, e.g. a file path. + start (tuple[int, int]): the start row and column of the span. + end (tuple[int, int]): the end row and column of the span. 
+ """ + + source: str | None + start: tuple[int, int] + end: tuple[int, int] + + def line_info_at(stream, index): if index > len(stream): raise ValueError("invalid index") @@ -391,6 +407,9 @@ def mark(self) -> Parser: ((start_row, start_column), original_value, (end_row, end_column)) + + ``.span()`` is a more powerful version of this combinator, returning a + SourceSpan. """ @generate @@ -407,6 +426,28 @@ def marked(): return marked + def span(self) -> Parser: + """ + Returns a parser that augments the initial parser's result with a + SourceSpan capturing where that parser started and stopped. + The new value is a tuple: + + (source_span, original_value) + """ + + @generate + def marked(): + start = yield line_info + body = yield self + end = yield line_info + try: + source = start[2] + except IndexError: + source = None + return (SourceSpan(source, start[:2], end[:2]), body) + + return marked + def tag(self, name: str) -> Parser: """ Returns a parser that wraps the produced value of the initial parser in a diff --git a/tests/test_parsy.py b/tests/test_parsy.py index f699508..b739272 100644 --- a/tests/test_parsy.py +++ b/tests/test_parsy.py @@ -7,6 +7,7 @@ from parsy import ( ParseError, + SourceSpan, alt, any_char, char_from, @@ -208,6 +209,35 @@ def test_mark(self): self.assertEqual(letters, ["q", "w", "e", "r"]) self.assertEqual(end, (1, 4)) + def test_span(self): + parser = (letter.many().span() << string("\n")).many() + source = "sample" + + lines = parser.parse("asdf\nqwer\n", source=source) + + self.assertEqual(len(lines), 2) + + (span, letters) = lines[0] + self.assertEqual(span, SourceSpan(source, (0, 0), (0, 4))) + self.assertEqual(letters, ["a", "s", "d", "f"]) + + (span, letters) = lines[1] + self.assertEqual(span, SourceSpan(source, (1, 0), (1, 4))) + + def test_span_no_source(self): + parser = (letter.many().span() << string("\n")).many() + + lines = parser.parse("asdf\nqwer\n") + + self.assertEqual(len(lines), 2) + + (span, letters) = lines[0] + 
self.assertEqual(span, SourceSpan(None, (0, 0), (0, 4))) + self.assertEqual(letters, ["a", "s", "d", "f"]) + + (span, letters) = lines[1] + self.assertEqual(span, SourceSpan(None, (1, 0), (1, 4))) + def test_tag(self): parser = letter.many().concat().tag("word") self.assertEqual( @@ -692,6 +722,7 @@ def test_line_info_at(self): self.assertRaises(ValueError, lambda: line_info_at(text, 8)) + class TestForwardDeclaration(unittest.TestCase): def test_forward_declaration_1(self): # This is the example from the docs From 381afac93115a6938081d1ebad44ab56d1c0c481 Mon Sep 17 00:00:00 2001 From: Jacob Errington Date: Sun, 19 Jan 2025 08:26:13 -0500 Subject: [PATCH 3/4] adjust line_info tests to include sources --- tests/test_parsy.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_parsy.py b/tests/test_parsy.py index b739272..e07c61f 100644 --- a/tests/test_parsy.py +++ b/tests/test_parsy.py @@ -20,6 +20,7 @@ letter, line_info, line_info_at, + make_stream, match_item, peek, regex, @@ -619,6 +620,18 @@ def foo(): ], ) + source = "aaaaa" + self.assertEqual( + foo.many().parse("AB\nCD", source=source), + [ + ("A", (0, 0, source)), + ("B", (0, 1, source)), + ("\n", (0, 2, source)), + ("C", (1, 0, source)), + ("D", (1, 1, source)), + ], + ) + def test_should_fail(self): not_a_digit = digit.should_fail("not a digit") >> regex(r".*") @@ -713,14 +726,24 @@ def foo(): class TestUtils(unittest.TestCase): def test_line_info_at(self): + text = "abc\ndef" self.assertEqual(line_info_at(text, 0), (0, 0)) self.assertEqual(line_info_at(text, 2), (0, 2)) self.assertEqual(line_info_at(text, 3), (0, 3)) self.assertEqual(line_info_at(text, 4), (1, 0)) self.assertEqual(line_info_at(text, 7), (1, 3)) + self.assertRaises(ValueError, lambda: line_info_at(text, 8)) + text = make_stream("abc\ndef", source="aaaa") + self.assertEqual(line_info_at(text, 0), (0, 0, "aaaa")) + self.assertEqual(line_info_at(text, 2), (0, 2, "aaaa")) + self.assertEqual(line_info_at(text, 
3), (0, 3, "aaaa")) + self.assertEqual(line_info_at(text, 4), (1, 0, "aaaa")) + self.assertEqual(line_info_at(text, 7), (1, 3, "aaaa")) + + self.assertRaises(ValueError, lambda: line_info_at(text, 8)) class TestForwardDeclaration(unittest.TestCase): From 5c5b293a8d820672087af2722e11849781addc2e Mon Sep 17 00:00:00 2001 From: Jacob Errington Date: Sun, 19 Jan 2025 08:18:44 -0500 Subject: [PATCH 4/4] document .span() and SourceSpan --- docs/ref/methods_and_combinators.rst | 45 ++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/docs/ref/methods_and_combinators.rst b/docs/ref/methods_and_combinators.rst index 5ef0a2e..7d2375b 100644 --- a/docs/ref/methods_and_combinators.rst +++ b/docs/ref/methods_and_combinators.rst @@ -23,7 +23,7 @@ can be used and manipulated as below. The following methods are for actually **using** the parsers that you have created: - .. method:: parse(string_or_list) + .. method:: parse(string_or_list[, source=None]) Attempts to parse the given string (or list). If the parse is successful and consumes the entire string, the result is returned - otherwise, a @@ -36,7 +36,11 @@ can be used and manipulated as below. library will work with tokens just as well. See :doc:`/howto/lexing` for more information. - .. method:: parse_partial(string_or_list) + When a non-None ``source`` is given, this name is reported automatically + in parse errors. Typically, this is the file path or URL where the data + to parse originates from. + + .. method:: parse_partial(string_or_list[, source=None]) Similar to ``parse``, except that it does not require the entire string (or list) to be consumed. Returns a tuple of @@ -401,6 +405,20 @@ can be used and manipulated as below. ` and want subsequent parsing of the token stream to be able to report original positions in error messages etc. + .. 
method:: span() + + Returns a parser that augments the initial parser's result with a :class:`SourceSpan` + containing information about where that parser started and stopped within the + source data. The new value is a tuple: + + .. code:: python + + (source_span, original_value) + + This enables reporting of custom errors involving source locations, such as when + using parsy as a :doc:`lexer </howto/lexing>` or when building a syntax tree that will be + further analyzed. + .. _operators: Parser operators @@ -594,3 +612,26 @@ Parsy does not try to include every possible combinator - there is no reason why you cannot create your own for your needs using the built-in combinators and primitives. If you find something that is very generic and would be very useful to have as a built-in, please :doc:`submit </contributing>` as a PR! + +Auxiliary data structures +========================= + +.. class:: SourceSpan + + Identifies a span of material from the data being parsed by its start row and column and its end + row and column. If the data stream was equipped with a source, that value is also available in + this object. + + .. attribute:: start + + The starting position of this span as a tuple (row, col). + + .. attribute:: end + + The stopping position of this span as a tuple (row, col). + + .. attribute:: source + + The name of the source the data was parsed from. This is the same value + that was passed to :meth:`Parser.parse` or :meth:`Parser.parse_partial`, + or ``None`` if no value was given.