Skip to content

Commit 7967a9d

Browse files
authored
Handle out-of-order verses (#190)
* Handle out-of-order verses
* Make test case non-trivial
1 parent e631be8 commit 7967a9d

File tree

2 files changed

+52
-5
lines changed

2 files changed

+52
-5
lines changed

machine/corpora/place_markers_usfm_update_block_handler.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,7 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
7272
trg_sent = ""
7373
to_place = []
7474
adj_src_toks = []
75-
placed_elements = [elements.pop(0)] if elements[0].type == UsfmUpdateBlockElementType.OTHER else []
75+
placed_elements = []
7676
embed_elements = []
7777
ignored_elements = []
7878
for element in elements:
@@ -99,6 +99,9 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
9999
to_place.append(element)
100100
adj_src_toks.append(src_tok_idx)
101101

102+
if len(trg_sent.strip()) == 0:
103+
return block
104+
102105
trg_tok_starts = []
103106
prev_len = 0
104107
for tok in trg_toks:

tests/corpora/test_place_markers_usfm_update_block_handler.py

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -201,14 +201,17 @@ def test_headers() -> None:
201201
usfm = r"""\id MAT
202202
\c 1
203203
\s1 Start of chapter header
204+
\p
204205
\v 1 A
205206
\p B
206207
\s1 Mid-verse header
207208
\p C
208-
\s1 End of verse header
209+
\s1 Header between verse text and empty end-of-verse paragraphs
210+
\p
209211
\p
210212
\p
211-
\s1 Header after all paragraphs
213+
\s1 Header after all verse paragraphs
214+
\p
212215
\v 2 A
213216
\s1 Header followed by a reference
214217
\r (reference)
@@ -240,14 +243,17 @@ def test_headers() -> None:
240243
result = r"""\id MAT
241244
\c 1
242245
\s1 Start of chapter header
246+
\p
243247
\v 1 X
244248
\p Y
245249
\s1 Mid-verse header
246250
\p Z
247-
\s1 End of verse header
251+
\s1 Header between verse text and empty end-of-verse paragraphs
248252
\p
249253
\p
250-
\s1 Header after all paragraphs
254+
\p
255+
\s1 Header after all verse paragraphs
256+
\p
251257
\v 2 X
252258
\s1 Header followed by a reference
253259
\r (reference)
@@ -478,6 +484,44 @@ def test_consecutive_substring() -> None:
478484
assess(target, result)
479485

480486

487+
def test_verses_out_of_order() -> None:
    """Run the place-markers handler on USFM whose verses are out of order.

    The input USFM lists verse 2 before verse 1, and verse 1 is followed by a
    paragraph marker. The expected output carries the new text for verse 2
    and leaves verse 1 and its trailing paragraph empty.
    """
    translated_rows = [
        (scr_ref("MAT 1:1"), "new verse 1 new paragraph 2"),
        (scr_ref("MAT 1:2"), "new verse 2"),
    ]
    source_usfm = r"""\id MAT
\c 1
\v 2 verse 2
\v 1 verse 1
\p paragraph 2
"""

    # One alignment record per verse reference, given in canonical order
    # (verse 1 first) even though the USFM presents verse 2 first.
    verse_1_info = PlaceMarkersAlignmentInfo(
        refs=["MAT 1:1"],
        source_tokens=["verse", "1", "paragraph", "2"],
        translation_tokens=["new", "verse", "1", "new", "paragraph", "2"],
        alignment=to_word_alignment_matrix("0-1 1-2 2-4 3-5"),
    )
    verse_2_info = PlaceMarkersAlignmentInfo(
        refs=["MAT 1:2"],
        source_tokens=["verse", "2"],
        translation_tokens=["new", "verse", "2"],
        alignment=to_word_alignment_matrix("0-1 1-2"),
    )
    marker_handler = PlaceMarkersUsfmUpdateBlockHandler([verse_1_info, verse_2_info])

    actual = update_usfm(
        translated_rows,
        source_usfm,
        text_behavior=UpdateUsfmTextBehavior.STRIP_EXISTING,
        update_block_handlers=[marker_handler],
    )
    expected = r"""\id MAT
\c 1
\v 2 new verse 2
\v 1
\p
"""
    assess(actual, expected)
523+
524+
481525
def scr_ref(*refs: str) -> List[ScriptureRef]:
    """Parse each reference string with ``ScriptureRef.parse``.

    Returns the parsed references as a list, in the order the strings were
    passed; calling with no arguments yields an empty list.
    """
    return list(map(ScriptureRef.parse, refs))
483527

0 commit comments

Comments
 (0)