diff --git a/poetry.lock b/poetry.lock index 896a2185..c2d57a9d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5159,13 +5159,13 @@ type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.12 [[package]] name = "sil-machine" -version = "1.7.2" +version = "1.7.3" description = "A natural language processing library that is focused on providing tools for resource-poor languages." optional = false python-versions = "<3.13,>=3.9" files = [ - {file = "sil_machine-1.7.2-py3-none-any.whl", hash = "sha256:5c6045056068bf02e3764122fd289a85951a41587f4d7ae12e4f08c6e5bff5d0"}, - {file = "sil_machine-1.7.2.tar.gz", hash = "sha256:01fd29b61994bf0a57a51beff3b4352570f24ee5f00288347c66fa1bc8c223b7"}, + {file = "sil_machine-1.7.3-py3-none-any.whl", hash = "sha256:f53e11a38e30ba52c57399466d83765fbb6a4b69bfce1ee13f53107d859b7c29"}, + {file = "sil_machine-1.7.3.tar.gz", hash = "sha256:272f7c89ed49d166d44a68092afb72049a78c2ec0bbdde324734d2c2fcf8cb6e"}, ] [package.dependencies] @@ -6177,4 +6177,4 @@ eflomal = ["eflomal"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.11" -content-hash = "2fd9f08f90aee293b04f627a59b1f8c45e8c309052ee6ffacf8c2e8a8687623e" +content-hash = "63c6e3d81fa851530a7bf2e99edda02868764bb495649f3f8451d23270c82fc5" diff --git a/pyproject.toml b/pyproject.toml index 8a34da5c..dbad03e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,7 +70,7 @@ tqdm = "^4.62.2" sacrebleu = "^2.3.1" ctranslate2 = "^3.5.1" libclang = "14.0.6" -sil-machine = {extras = ["thot"], version = "1.7.2"} +sil-machine = {extras = ["thot"], version = "1.7.3"} datasets = "^2.7.1" torch = {version = "^2.4", source = "torch"} sacremoses = "^0.0.53" diff --git a/silnlp/common/translator.py b/silnlp/common/translator.py index 0eb8411e..3216f7d2 100644 --- a/silnlp/common/translator.py +++ b/silnlp/common/translator.py @@ -31,17 +31,6 @@ nltk.download("punkt") -def insert_draft_remarks(usfm: str, remarks: List[str]) -> str: - lines = usfm.split("\n") - insert_idx = ( - 1 - + (len(lines) > 1 and (lines[1].startswith("\\ide") or lines[1].startswith("\\usfm"))) - + (len(lines) > 2 and (lines[2].startswith("\\ide") or lines[2].startswith("\\usfm"))) - ) - remarks = [f"\\rem {r}" for r in remarks] - return "\n".join(lines[:insert_idx] + remarks + lines[insert_idx:]) - - # A group of multiple translations of a single sentence TranslationGroup = List[str] @@ -224,6 +213,12 @@ def translate_usfm( postprocess_handler.create_update_block_handlers(vrefs, sentences, translated_draft) for config in postprocess_handler.configs: + # Compile draft remarks + draft_src_str = f"project {src_file_text.project}" if src_from_project else f"file {src_file_path.name}" + draft_remark = f"This draft of {vrefs[0].book} was machine translated on {date.today()} from {draft_src_str} using model {experiment_ckpt_str}. It should be reviewed and edited carefully." + postprocess_remark = config.get_postprocess_remark() + remarks = [draft_remark] + ([postprocess_remark] if len(postprocess_remark) > 0 else []) + # Insert translation into the USFM structure of an existing project # If the target project is not the same as the translated file's original project, # no verses outside of the ones translated will be overwritten @@ -239,6 +234,7 @@ def translate_usfm( embed_behavior=config.get_embed_behavior(), style_behavior=config.get_style_behavior(), update_block_handlers=config.update_block_handlers, + remarks=remarks, ) if usfm_out is None: @@ -256,20 +252,11 @@ def translate_usfm( embed_behavior=config.get_embed_behavior(), style_behavior=config.get_style_behavior(), update_block_handlers=config.update_block_handlers, + remarks=remarks, ) parse_usfm(usfm, handler) usfm_out = handler.get_usfm() - # Insert draft remarks - description = f"project {src_file_text.project}" if src_from_project else f"file {src_file_path.name}" - remarks = [ - f"This draft of {vrefs[0].book} was machine translated on {date.today()} from {description} using model {experiment_ckpt_str}. It should be reviewed and edited carefully." - ] - postprocess_remark = config.get_postprocess_remark() - if len(postprocess_remark) > 0: - remarks.append(postprocess_remark) - usfm_out = insert_draft_remarks(usfm_out, remarks) - # Construct output file name write to file trg_draft_file_path = trg_file_path.with_stem(trg_file_path.stem + config.get_postprocess_suffix()) if produce_multiple_translations: diff --git a/silnlp/nmt/postprocess.py b/silnlp/nmt/postprocess.py index 8c4f34cb..77e6ecaa 100644 --- a/silnlp/nmt/postprocess.py +++ b/silnlp/nmt/postprocess.py @@ -30,13 +30,6 @@ LOGGER = logging.getLogger(__package__ + ".postprocess") -# NOTE: to be replaced by new machine.py remark functionality -def insert_draft_remarks(usfm: str, remarks: List[str]) -> str: - lines = usfm.split("\n") - remark_lines = [f"\\rem {r}" for r in remarks] - return "\n".join(lines[:1] + remark_lines + lines[1:]) - - # Takes the path to a USFM file and the relevant info to parse it # and returns the text of all non-embed sentences and their respective references, # along with any remarks (\rem) that were inserted at the beginning of the file @@ -48,7 +41,7 @@ def get_sentences( draft_remarks = [] for sent in UsfmFileText(stylesheet, encoding, book, book_path, include_all_text=True): marker = sent.ref.path[-1].name if len(sent.ref.path) > 0 else "" - if marker == "rem" and len(refs) == 0: # TODO: \ide and \usfm lines could potentially come before the remark(s) + if marker == "rem" and len(refs) == 0: draft_remarks.append(sent.text) continue if ( @@ -154,12 +147,11 @@ def postprocess_draft( embed_behavior=config.get_embed_behavior(), style_behavior=config.get_style_behavior(), update_block_handlers=config.update_block_handlers, + remarks=(draft_remarks + [config.get_postprocess_remark()]), ) parse_usfm(usfm, handler) usfm_out = handler.get_usfm() - usfm_out = insert_draft_remarks(usfm_out, draft_remarks + [config.get_postprocess_remark()]) - if not out_dir: out_dir = draft_path.parent out_path = out_dir / f"{draft_path.stem}{config.get_postprocess_suffix()}{draft_path.suffix}"