From 37e36db714bc44c6b3ba6b128243bdacdbc9a58e Mon Sep 17 00:00:00 2001 From: Damien Daspit Date: Wed, 23 Apr 2025 14:12:42 -0500 Subject: [PATCH] Fix handling of implicitly closed char styles when updating USFM - only skip/collect tokens if the marker is explicitly closed - fm markers are not footnote elements - only end note text when a ft marker is closed --- .../scripture_ref_usfm_parser_handler.py | 9 ++++-- machine/corpora/update_usfm_parser_handler.py | 21 +++++++++----- .../test_update_usfm_parser_handler.py | 29 +++++++++++++++++-- 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/machine/corpora/scripture_ref_usfm_parser_handler.py b/machine/corpora/scripture_ref_usfm_parser_handler.py index 6583c473..f9bd263d 100644 --- a/machine/corpora/scripture_ref_usfm_parser_handler.py +++ b/machine/corpora/scripture_ref_usfm_parser_handler.py @@ -170,7 +170,7 @@ def end_char( if self._is_embed_part_style(marker): if self._in_nested_embed: self._in_nested_embed = False - else: + elif self._is_note_text(marker): self._end_note_text_wrapper(state) if self._is_embed_style(marker): self._end_embed(state, marker, attributes, closed) @@ -272,14 +272,17 @@ def _is_in_embed(self, marker: Optional[str]) -> bool: def _is_in_nested_embed(self, marker: Optional[str]) -> bool: return self._in_nested_embed or ( - marker is not None and marker.startswith("+") and marker[1] in EMBED_PART_START_CHAR_STYLES + marker is not None + and marker.startswith("+") + and marker[1] in EMBED_PART_START_CHAR_STYLES + and marker != "fm" ) def _is_note_text(self, marker: Optional[str]) -> bool: return marker == "ft" def _is_embed_part_style(self, marker: Optional[str]) -> bool: - return marker is not None and marker.startswith(EMBED_PART_START_CHAR_STYLES) + return marker is not None and marker.startswith(EMBED_PART_START_CHAR_STYLES) and marker != "fm" def _is_embed_style(self, marker: Optional[str]) -> bool: return marker is not None and marker.strip("*") in EMBED_STYLES diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index cb7cf7d5..b3ebe2be 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -184,10 +184,13 @@ def end_char( attributes: Sequence[UsfmAttribute], closed: bool, ) -> None: - if self._replace_with_new_tokens(state, closed): - self._skip_tokens(state) - else: - self._collect_tokens(state) + + skip_tokens = self._replace_with_new_tokens(state, closed) + if closed: + if skip_tokens: + self._skip_tokens(state) + else: + self._collect_tokens(state) super().end_char(state, marker, attributes, closed) @@ -207,10 +210,12 @@ def _start_embed( def _end_embed( self, state: UsfmParserState, marker: str, attributes: Sequence[UsfmAttribute], closed: bool ) -> None: - if self._replace_with_new_tokens(state, closed): - self._skip_tokens(state) - else: - self._collect_tokens(state) + skip_tokens = self._replace_with_new_tokens(state, closed) + if closed: + if skip_tokens: + self._skip_tokens(state) + else: + self._collect_tokens(state) self._embed_row_texts.clear() self._embed_updated = False diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 4b8d89fb..431e7e41 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -305,12 +305,13 @@ def test_get_usfm_verse_replace_note() -> None: ] usfm = r"""\id MAT - Test \c 1 -\v 1 Chapter \add one\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one. +\v 1 Chapter \add one\add*, verse \f + \fr 2:1: \ft This is a \fq quotation \ft and an \fqa alternative quotation\f*one. """ target = update_usfm(rows, usfm) + # Only the first \ft marker is updated result = r"""\id MAT - Test \c 1 -\v 1 updated text \f + \fr 2:1: \ft This is a new footnote. \f* +\v 1 updated text \f + \fr 2:1: \ft This is a new footnote. \fq quotation \ft and an \fqa alternative quotation\f* """ assess(target, result) @@ -979,6 +980,30 @@ def test_multiple_ft_only_update_first() -> None: assess(target, result) +def test_implicitly_closed_char_style() -> None: + rows = [ + ( + scr_ref("MAT 1:1"), + str("Update text"), + ) + ] + usfm = r"""\id MAT - Test +\c 1 +\v 1 Verse \bd one. +\c 2 +\v 1 Verse one. +""" + + target = update_usfm(rows, usfm) + result = r"""\id MAT - Test +\c 1 +\v 1 Update text +\c 2 +\v 1 Verse one. +""" + assess(target, result) + + def scr_ref(*refs: str) -> List[ScriptureRef]: return [ScriptureRef.parse(ref) for ref in refs]