diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 523604c0..314b35b2 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -23,6 +23,7 @@ from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase from .paratext_text_corpus import ParatextTextCorpus +from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler from .scripture_element import ScriptureElement from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType @@ -112,6 +113,8 @@ "ParatextProjectTermsParserBase", "ParatextProjectTextUpdaterBase", "ParatextTextCorpus", + "PlaceMarkersAlignmentInfo", + "PlaceMarkersUsfmUpdateBlockHandler", "parse_usfm", "RtlReferenceOrder", "ScriptureElement", diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py new file mode 100644 index 00000000..7e44e02b --- /dev/null +++ b/machine/corpora/place_markers_usfm_update_block_handler.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +from typing import Iterable, List, TypedDict + +from ..translation.word_alignment_matrix import WordAlignmentMatrix +from .usfm_token import UsfmToken, UsfmTokenType +from .usfm_update_block import UsfmUpdateBlock +from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType +from .usfm_update_block_handler import UsfmUpdateBlockHandler + + +class PlaceMarkersAlignmentInfo(TypedDict): + refs: List[str] + source_tokens: List[str] + translation_tokens: List[str] + alignment: WordAlignmentMatrix + + +class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler): + + def __init__(self, align_info: Iterable[PlaceMarkersAlignmentInfo]) -> None: + self._align_info = {info["refs"][0]: info for 
info in align_info} + + def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock: + ref = str(block.refs[0]) + elements = list(block.elements) + + # Nothing to do if there are no markers to place or no alignment to use + if ( + len(elements) == 0 + or ref not in self._align_info.keys() + or self._align_info[ref]["alignment"].row_count == 0 + or self._align_info[ref]["alignment"].column_count == 0 + or not any( + ( + e.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE] + and not e.marked_for_removal + ) + for e in elements + ) + ): + return block + + # Paragraph markers at the end of the block should stay there + # Section headers should be ignored but re-inserted in the same position relative to other paragraph markers + end_elements = [] + eob_empty_paras = True + header_elements = [] + para_markers_left = 0 + for i, element in reversed(list(enumerate(elements))): + if element.type == UsfmUpdateBlockElementType.PARAGRAPH and not element.marked_for_removal: + if len(element.tokens) > 1: + header_elements.insert(0, (para_markers_left, element)) + elements.pop(i) + else: + para_markers_left += 1 + + if eob_empty_paras: + end_elements.insert(0, element) + elements.pop(i) + elif not ( + element.type == UsfmUpdateBlockElementType.EMBED + or (element.type == UsfmUpdateBlockElementType.TEXT and len(element.tokens[0].to_usfm().strip()) == 0) + ): + eob_empty_paras = False + + src_toks = self._align_info[ref]["source_tokens"] + trg_toks = self._align_info[ref]["translation_tokens"] + src_tok_idx = 0 + + src_sent = "" + trg_sent = "" + to_place = [] + adj_src_toks = [] + placed_elements = [elements.pop(0)] if elements[0].type == UsfmUpdateBlockElementType.OTHER else [] + ignored_elements = [] + for element in elements: + if element.type == UsfmUpdateBlockElementType.TEXT: + if element.marked_for_removal: + text = element.tokens[0].to_usfm() + src_sent += text + + # Track seen tokens + while src_tok_idx < len(src_toks) and 
src_toks[src_tok_idx] in text: + text = text[text.index(src_toks[src_tok_idx]) + len(src_toks[src_tok_idx]) :] + src_tok_idx += 1 + # Handle tokens split across text elements + if len(text.strip()) > 0: + src_tok_idx += 1 + else: + trg_sent += element.tokens[0].to_usfm() + + if element.marked_for_removal or element.type == UsfmUpdateBlockElementType.EMBED: + ignored_elements.append(element) + elif element.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]: + to_place.append(element) + adj_src_toks.append(src_tok_idx) + + trg_tok_starts = [] + for tok in trg_toks: + trg_tok_starts.append(trg_sent.index(tok, trg_tok_starts[-1] + 1 if len(trg_tok_starts) > 0 else 0)) + + # Predict marker placements and get insertion order + to_insert = [] + for element, adj_src_tok in zip(to_place, adj_src_toks): + adj_trg_tok = self._predict_marker_location( + self._align_info[ref]["alignment"], adj_src_tok, src_toks, trg_toks + ) + trg_str_idx = trg_tok_starts[adj_trg_tok] if adj_trg_tok < len(trg_tok_starts) else len(trg_sent) + + to_insert.append((trg_str_idx, element)) + to_insert.sort(key=lambda x: x[0]) + to_insert += [(len(trg_sent), element) for element in end_elements] + + # Construct new text tokens to put between markers + # and reincorporate headers and empty end-of-verse paragraph markers + if to_insert[0][0] > 0: + placed_elements.append( + UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.TEXT, [UsfmToken(UsfmTokenType.TEXT, text=trg_sent[: to_insert[0][0]])] + ) + ) + for j, (insert_idx, element) in enumerate(to_insert): + if element.type == UsfmUpdateBlockElementType.PARAGRAPH: + while len(header_elements) > 0 and header_elements[0][0] == para_markers_left: + placed_elements.append(header_elements.pop(0)[1]) + para_markers_left -= 1 + + placed_elements.append(element) + if insert_idx < len(trg_sent) and (j + 1 == len(to_insert) or insert_idx < to_insert[j + 1][0]): + if j + 1 < len(to_insert): + text_token = 
UsfmToken(UsfmTokenType.TEXT, text=(trg_sent[insert_idx : to_insert[j + 1][0]])) + else: + text_token = UsfmToken(UsfmTokenType.TEXT, text=(trg_sent[insert_idx:])) + placed_elements.append(UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, [text_token])) + while len(header_elements) > 0: + placed_elements.append(header_elements.pop(0)[1]) + + block._elements = placed_elements + ignored_elements + return block + + def _predict_marker_location( + self, + alignment: WordAlignmentMatrix, + adj_src_tok: int, + src_toks: List[str], + trg_toks: List[str], + ) -> int: + # Gets the number of alignment pairs that "cross the line" between + # the src marker position and the potential trg marker position, (src_idx - .5) and (trg_idx - .5) + def num_align_crossings(src_idx: int, trg_idx: int) -> int: + crossings = 0 + for i in range(alignment.row_count): + for j in range(alignment.column_count): + if alignment[i, j] and ((i < src_idx and j >= trg_idx) or (i >= src_idx and j < trg_idx)): + crossings += 1 + return crossings + + # If the token on either side of a potential target location is punctuation, + # use it as the basis for deciding the target marker location + trg_hyp = -1 + punct_hyps = [-1, 0] + for punct_hyp in punct_hyps: + src_hyp = adj_src_tok + punct_hyp + if src_hyp < 0 or src_hyp >= len(src_toks): + continue + # Only accept aligned pairs where both the src and trg token are punctuation + hyp_tok = src_toks[src_hyp] + if len(hyp_tok) > 0 and not any(c.isalpha() for c in hyp_tok) and src_hyp < alignment.row_count: + aligned_trg_toks = list(alignment.get_row_aligned_indices(src_hyp)) + # If aligning to a token that precedes that marker, + # the trg token predicted to be closest to the marker + # is the last token aligned to the src rather than the first + for trg_idx in reversed(aligned_trg_toks) if punct_hyp < 0 else aligned_trg_toks: + trg_tok = trg_toks[trg_idx] + if len(trg_tok) > 0 and not any(c.isalpha() for c in trg_tok): + trg_hyp = trg_idx + break + if 
trg_hyp != -1: + # Since the marker location is represented by the token after the marker, + # adjust the index when aligning to punctuation that precedes the token + return trg_hyp + (1 if punct_hyp == -1 else 0) + + hyps = [0, 1, 2] + best_hyp = -1 + best_num_crossings = 200**2 # mostly meaningless, a big number + checked = set() + for hyp in hyps: + src_hyp = adj_src_tok + hyp + if src_hyp in checked: + continue + trg_hyp = -1 + while trg_hyp == -1 and src_hyp >= 0 and src_hyp < alignment.row_count: + checked.add(src_hyp) + aligned_trg_toks = list(alignment.get_row_aligned_indices(src_hyp)) + if len(aligned_trg_toks) > 0: + # If aligning with a source token that precedes the marker, + # the target token predicted to be closest to the marker is the last aligned token rather than the first + trg_hyp = aligned_trg_toks[-1 if hyp < 0 else 0] + else: # continue the search outwards + src_hyp += -1 if hyp < 0 else 1 + if trg_hyp != -1: + num_crossings = num_align_crossings(adj_src_tok, trg_hyp) + if num_crossings < best_num_crossings: + best_hyp = trg_hyp + best_num_crossings = num_crossings + if num_crossings == 0: + break + + # If no alignments found, insert at the end of the sentence + return best_hyp if best_hyp != -1 else len(trg_toks) diff --git a/machine/jobs/nmt_engine_build_job.py b/machine/jobs/nmt_engine_build_job.py index b7b2afbc..5bb120a8 100644 --- a/machine/jobs/nmt_engine_build_job.py +++ b/machine/jobs/nmt_engine_build_job.py @@ -157,8 +157,8 @@ def _align( check_canceled() for i in range(len(pretranslations)): - pretranslations[i]["source_toks"] = list(src_tokenized[i]) - pretranslations[i]["translation_toks"] = list(trg_tokenized[i]) + pretranslations[i]["source_tokens"] = list(src_tokenized[i]) + pretranslations[i]["translation_tokens"] = list(trg_tokenized[i]) pretranslations[i]["alignment"] = alignments[i] return pretranslations diff --git a/machine/jobs/translation_file_service.py b/machine/jobs/translation_file_service.py index 
54c4ae90..f29e1005 100644 --- a/machine/jobs/translation_file_service.py +++ b/machine/jobs/translation_file_service.py @@ -16,8 +16,8 @@ class PretranslationInfo(TypedDict): textId: str # noqa: N815 refs: List[str] translation: str - source_toks: List[str] - translation_toks: List[str] + source_tokens: List[str] + translation_tokens: List[str] alignment: str @@ -65,8 +65,8 @@ def generator() -> Generator[PretranslationInfo, None, None]: textId=pi["textId"], refs=list(pi["refs"]), translation=pi["translation"], - source_toks=list(pi["source_toks"]), - translation_toks=list(pi["translation_toks"]), + source_tokens=list(pi["source_tokens"]), + translation_tokens=list(pi["translation_tokens"]), alignment=pi["alignment"], ) diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py new file mode 100644 index 00000000..9f41ce63 --- /dev/null +++ b/tests/corpora/test_place_markers_usfm_update_block_handler.py @@ -0,0 +1,467 @@ +from typing import List, Optional, Sequence, Tuple + +from machine.corpora import ( + AlignedWordPair, + PlaceMarkersAlignmentInfo, + PlaceMarkersUsfmUpdateBlockHandler, + ScriptureRef, + UpdateUsfmMarkerBehavior, + UpdateUsfmParserHandler, + UpdateUsfmTextBehavior, + UsfmUpdateBlockHandler, + parse_usfm, +) +from machine.tokenization import LatinWordTokenizer +from machine.translation import WordAlignmentMatrix + +TOKENIZER = LatinWordTokenizer() + + +def test_paragraph_markers() -> None: + source = "This is the first paragraph. This text is in English, and this test is for paragraph markers." + pretranslation = "Este es el primer párrafo. Este texto está en inglés y esta prueba es para marcadores de párrafo." + rows = [(scr_ref("MAT 1:1"), str(pretranslation))] + usfm = r"""\id MAT +\c 1 +\v 1 This is the first paragraph. +\p This text is in English, +\p and this test is for paragraph markers. 
+""" + + align_info = [ + PlaceMarkersAlignmentInfo( + refs=["MAT 1:1"], + source_tokens=[t for t in TOKENIZER.tokenize(source)], + translation_tokens=[t for t in TOKENIZER.tokenize(pretranslation)], + alignment=to_word_alignment_matrix( + "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" + ), + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 Este es el primer párrafo. +\p Este texto está en inglés +\p y esta prueba es para marcadores de párrafo. +""" + assess(target, result) + + +def test_style_markers() -> None: + source = "This is the first sentence. This text is in English, and this test is for style markers." + pretranslation = "Esta es la primera oración. Este texto está en inglés y esta prueba es para marcadores de estilo." + rows = [(scr_ref("MAT 1:1"), str(pretranslation))] + usfm = r"""\id MAT +\c 1 +\v 1 This is the \w first\w* sentence. This text is in \w English\w*, and this test is \w for\w* style markers. +""" + + align_info = [ + PlaceMarkersAlignmentInfo( + refs=["MAT 1:1"], + source_tokens=[t for t in TOKENIZER.tokenize(source)], + translation_tokens=[t for t in TOKENIZER.tokenize(pretranslation)], + alignment=to_word_alignment_matrix( + "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19" + ), + ), + ] + target = update_usfm( + rows, + usfm, + style_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 Esta es la \w primera \w*oración. Este texto está en \w inglés \w*y esta prueba es \w para \w*marcadores de estilo. 
+"""
+    # NOTE: the spacing before/after end markers is incorrect,
+    # but this is an issue with how the USFM is generated from the tokens
+    assess(target, result)
+
+    target = update_usfm(
+        rows,
+        usfm,
+        style_behavior=UpdateUsfmMarkerBehavior.STRIP,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 Esta es la primera oración. Este texto está en inglés y esta prueba es para marcadores de estilo.
+"""
+    assess(target, result)
+
+
+# NOTE: Not currently updating embeds, will need to change test when we do
+def test_embeds() -> None:
+    rows = [
+        (scr_ref("MAT 1:1"), "New verse 1"),
+        (scr_ref("MAT 1:2"), "New verse 2"),
+        (scr_ref("MAT 1:3"), "New verse 3"),
+        (scr_ref("MAT 1:4"), "New verse 4"),
+        (scr_ref("MAT 1:4/1:f"), "New embed text"),
+        (scr_ref("MAT 1:5"), "New verse 5"),
+        (scr_ref("MAT 1:6"), "New verse 6"),
+        (scr_ref("MAT 1:6/1:f"), "New verse 6 embed text"),
+    ]
+    usfm = r"""\id MAT
+\c 1
+\v 1 \f \fr 1.1 \ft Some note \f*Start of sentence embed
+\v 2 Middle of sentence \f \fr 1.2 \ft Some other note \f*embed
+\v 3 End of sentence embed\f \fr 1.3 \ft A third note \f*
+\v 4 Updated embed\f \fr 1.4 \ft A fourth note \f*
+\v 5 Embed with style markers \f \fr 1.5 \ft A \+w stylish\+w* note \f*
+\v 6 Updated embed with style markers \f \fr 1.6 \ft Another \+w stylish\+w* note \f*
+"""
+
+    align_info = []
+    target = update_usfm(
+        rows,
+        usfm,
+        embed_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 New verse 1 \f \fr 1.1 \ft Some note \f*
+\v 2 New verse 2 \f \fr 1.2 \ft Some other note \f*
+\v 3 New verse 3 \f \fr 1.3 \ft A third note \f*
+\v 4 New verse 4 \f \fr 1.4 \ft A fourth note \f*
+\v 5 New verse 5 \f \fr 1.5 \ft A \+w stylish\+w* note \f*
+\v 6 New verse 6 \f \fr 1.6 \ft Another \+w stylish\+w* note \f*
+"""
+    assess(target, result)
+
+    target = update_usfm(
+        rows,
+        usfm,
+        embed_behavior=UpdateUsfmMarkerBehavior.STRIP,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 New verse 1
+\v 2 New verse 2
+\v 3 New verse 3
+\v 4 New verse 4
+\v 5 New verse 5
+\v 6 New verse 6
+"""
+    assess(target, result)
+
+
+def test_trailing_empty_paragraphs() -> None:
+    rows = [(scr_ref("MAT 1:1"), "New verse 1")]
+    usfm = r"""\id MAT
+\c 1
+\v 1 Verse 1
+\p
+\b
+\q1 \f embed \f*
+"""
+
+    align_info = [
+        PlaceMarkersAlignmentInfo(
+            refs=["MAT 1:1"],
+            source_tokens=["Verse", "1"],
+            translation_tokens=["New", "verse", "1"],
+            alignment=to_word_alignment_matrix("0-1 1-2"),
+        ),
+    ]
+    target = update_usfm(
+        rows,
+        usfm,
+        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\v 1 New verse 1
+\p
+\b
+\q1 \f embed \f*
+"""
+    assess(target, result)
+
+
+def test_headers() -> None:
+    rows = [
+        (scr_ref("MAT 1:1"), "X Y Z"),
+        (scr_ref("MAT 1:2"), "X"),
+        (scr_ref("MAT 1:3"), "Y"),
+        (scr_ref("MAT 1:3/1:s1"), "Updated header"),
+    ]
+    usfm = r"""\id MAT
+\c 1
+\s1 Start of chapter header
+\v 1 A
+\p B
+\s1 Mid-verse header
+\p C
+\s1 End of verse header
+\p
+\p
+\s1 Header after all paragraphs
+\v 2 A
+\s1 Header followed by a reference
+\r (reference)
+\p
+\v 3 B
+\s1 Header to be updated
+"""
+
+    align_info = [
+        PlaceMarkersAlignmentInfo(
+            refs=["MAT 1:1"],
+            source_tokens=["A", "B", "C"],
+            translation_tokens=["X", "Y", "Z"],
+            alignment=to_word_alignment_matrix("0-0 1-1 2-2"),
+        ),
+        PlaceMarkersAlignmentInfo(
+            refs=["MAT 1:2"],
+            source_tokens=["A"],
+            translation_tokens=["X"],
+            alignment=to_word_alignment_matrix("0-0"),
+        ),
+    ]
+    target = update_usfm(
+        rows,
+        usfm,
+        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)],
+    )
+    result = r"""\id MAT
+\c 1
+\s1 Start of chapter header
+\v 1 X
+\p Y
+\s1 
Mid-verse header +\p Z +\s1 End of verse header +\p +\p +\s1 Header after all paragraphs +\v 2 X +\s1 Header followed by a reference +\r (reference) +\p +\v 3 Y +\s1 Updated header +""" + assess(target, result) + + +def test_consecutive_markers() -> None: + rows = [(scr_ref("MAT 1:1"), "New verse 1 WORD")] + usfm = r"""\id MAT +\c 1 +\v 1 Old verse 1 +\p \qt \+w word \+w* \qt* +""" + + align_info = [ + PlaceMarkersAlignmentInfo( + refs=["MAT 1:1"], + source_tokens=["Old", "verse", "1", "word"], + translation_tokens=["New", "verse", "1", "WORD"], + alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3"), + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + style_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 New verse 1 +\p \qt \+w WORD \+w*\qt* +""" + assess(target, result) + + +def test_verse_ranges() -> None: + rows = [([ScriptureRef.parse(f"MAT 1:{i}") for i in range(1, 6)], "New verse range text new paragraph 2")] + usfm = r"""\id MAT +\c 1 +\v 1-5 Verse range +\p old paragraph 2 +""" + + align_info = [ + PlaceMarkersAlignmentInfo( + refs=[str(ScriptureRef.parse(f"MAT 1:{i}")) for i in range(1, 6)], + source_tokens=["Verse", "range", "old", "paragraph", "2"], + translation_tokens=["New", "verse", "range", "text", "new", "paragraph", "2"], + alignment=to_word_alignment_matrix("0-1 1-2 2-4 3-5 4-6"), + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + ) + result = r"""\id MAT +\c 1 +\v 1-5 New verse range text +\p new paragraph 2 +""" + assess(target, result) + + +def test_no_update() -> None: + rows = [(scr_ref("MAT 1:1"), "New paragraph 1 New paragraph 2")] + usfm = r"""\id MAT +\c 1 +\v 1 Old paragraph 1 +\p Old paragraph 2 +""" + + # Strip paragraphs + align_info = [ + 
PlaceMarkersAlignmentInfo( + refs=["MAT 1:1"], + source_tokens=["Old", "paragraph", "1", "Old", "paragraph", "2"], + translation_tokens=["New", "paragraph", "1", "New", "paragraph", "2"], + alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3 4-4 5-5"), + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 New paragraph 1 New paragraph 2 +""" + assess(target, result) + + # No alignment + align_info = [ + PlaceMarkersAlignmentInfo( + refs=["MAT 1:1"], + source_tokens=[], + translation_tokens=[], + alignment=to_word_alignment_matrix(""), + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 New paragraph 1 New paragraph 2 +\p +""" + assess(target, result) + + # No text update + rows = [] + align_info = [] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + ) + result = r"""\id MAT +\c 1 +\v 1 Old paragraph 1 +\p Old paragraph 2 +""" + assess(target, result) + + +def test_split_tokens() -> None: + rows = [(scr_ref("MAT 1:1"), "words split words split words split")] + usfm = r"""\id MAT +\c 1 +\v 1 words spl +\p it words spl +\p it words split +""" + + align_info = [ + PlaceMarkersAlignmentInfo( + refs=["MAT 1:1"], + source_tokens=["words", "split", "words", "split", "words", "split"], + translation_tokens=["words", "split", "words", "split", "words", "split"], + alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3 4-4 5-5"), + ), + ] + target = update_usfm( + rows, + usfm, + paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE, + update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler(align_info)], + ) + result = r"""\id MAT +\c 1 
+\v 1 words split +\p words split +\p words split +""" + assess(target, result) + + +def scr_ref(*refs: str) -> List[ScriptureRef]: + return [ScriptureRef.parse(ref) for ref in refs] + + +def to_word_alignment_matrix(alignment_str: str) -> WordAlignmentMatrix: + word_pairs = AlignedWordPair.from_string(alignment_str) + row_count = 0 + column_count = 0 + for pair in word_pairs: + if pair.source_index + 1 > row_count: + row_count = pair.source_index + 1 + if pair.target_index + 1 > column_count: + column_count = pair.target_index + 1 + return WordAlignmentMatrix.from_word_pairs(row_count, column_count, word_pairs) + + +def update_usfm( + rows: Sequence[Tuple[Sequence[ScriptureRef], str]], + source: str, + id_text: Optional[str] = None, + text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW, + paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, + embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, + style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP, + preserve_paragraph_styles: Optional[Sequence[str]] = None, + update_block_handlers: Optional[list[UsfmUpdateBlockHandler]] = None, +) -> Optional[str]: + source = source.strip().replace("\r\n", "\n") + "\r\n" + updater = UpdateUsfmParserHandler( + rows, + id_text, + text_behavior, + paragraph_behavior, + embed_behavior, + style_behavior, + preserve_paragraph_styles, + update_block_handlers, + ) + parse_usfm(source, updater) + return updater.get_usfm() + + +def assess(target: Optional[str], truth: str) -> None: + assert target is not None + for target_line, truth_line in zip(target.split("\n"), truth.split("\n")): + assert target_line.strip() == truth_line.strip() diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py index 227b909b..014fc743 100644 --- a/tests/jobs/test_nmt_engine_build_job.py +++ b/tests/jobs/test_nmt_engine_build_job.py @@ -38,7 +38,7 @@ def test_run(decoy: Decoy) 
-> None: assert len(pretranslations) == 1 assert pretranslations[0]["translation"] == "Please, I have booked a room." if is_eflomal_available(): - assert pretranslations[0]["source_toks"] == [ + assert pretranslations[0]["source_tokens"] == [ "Por", "favor", ",", @@ -48,11 +48,11 @@ def test_run(decoy: Decoy) -> None: "habitación", ".", ] - assert pretranslations[0]["translation_toks"] == ["Please", ",", "I", "have", "booked", "a", "room", "."] + assert pretranslations[0]["translation_tokens"] == ["Please", ",", "I", "have", "booked", "a", "room", "."] assert len(pretranslations[0]["alignment"]) > 0 else: - assert pretranslations[0]["source_toks"] == [] - assert pretranslations[0]["translation_toks"] == [] + assert pretranslations[0]["source_tokens"] == [] + assert pretranslations[0]["translation_tokens"] == [] assert len(pretranslations[0]["alignment"]) == 0 decoy.verify(env.translation_file_service.save_model(Path("model.tar.gz"), "models/save-model.tar.gz"), times=1) @@ -131,8 +131,8 @@ def __init__(self, decoy: Decoy) -> None: textId="text1", refs=["ref1"], translation="Por favor, tengo reservada una habitación.", - source_toks=[], - translation_toks=[], + source_tokens=[], + translation_tokens=[], alignment="", ) ] diff --git a/tests/jobs/test_smt_engine_build_job.py b/tests/jobs/test_smt_engine_build_job.py index 16afcacf..eff4649f 100644 --- a/tests/jobs/test_smt_engine_build_job.py +++ b/tests/jobs/test_smt_engine_build_job.py @@ -137,8 +137,8 @@ def __init__(self, decoy: Decoy) -> None: textId="text1", refs=["ref1"], translation="Por favor, tengo reservada una habitación.", - source_toks=[], - translation_toks=[], + source_tokens=[], + translation_tokens=[], alignment="", ) ]