Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ tqdm = "^4.62.2"
sacrebleu = "^2.3.1"
ctranslate2 = "^3.5.1"
libclang = "14.0.6"
sil-machine = {extras = ["thot"], version = "1.7.2"}
sil-machine = {extras = ["thot"], version = "1.7.3"}
datasets = "^2.7.1"
torch = {version = "^2.4", source = "torch"}
sacremoses = "^0.0.53"
Expand Down
29 changes: 8 additions & 21 deletions silnlp/common/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,6 @@
nltk.download("punkt")


def insert_draft_remarks(usfm: str, remarks: List[str]) -> str:
    r"""Insert ``\rem`` lines near the top of a USFM document.

    The remarks are placed after the first line and after any ``\ide`` /
    ``\usfm`` lines that immediately follow it, so those header lines stay
    ahead of the remarks.

    Args:
        usfm: The USFM text, with lines separated by ``"\n"``.
        remarks: Remark texts; each one becomes its own ``\rem <text>`` line,
            in the order given.

    Returns:
        The USFM text with the remark lines inserted.
    """
    lines = usfm.split("\n")

    # Skip only *consecutive* header lines directly after the first line.
    # Checking lines[1] and lines[2] independently would place the remarks
    # after an unrelated second line whenever the third line happened to look
    # like a header.
    insert_idx = 1
    while insert_idx < len(lines) and lines[insert_idx].startswith(("\\ide", "\\usfm")):
        insert_idx += 1

    # Use a separate name so the caller-visible parameter is not rebound.
    remark_lines = [f"\\rem {r}" for r in remarks]
    return "\n".join(lines[:insert_idx] + remark_lines + lines[insert_idx:])


# A group of multiple translations of a single sentence
TranslationGroup = List[str]

Expand Down Expand Up @@ -224,6 +213,12 @@ def translate_usfm(
postprocess_handler.create_update_block_handlers(vrefs, sentences, translated_draft)

for config in postprocess_handler.configs:
# Compile draft remarks
draft_src_str = f"project {src_file_text.project}" if src_from_project else f"file {src_file_path.name}"
draft_remark = f"This draft of {vrefs[0].book} was machine translated on {date.today()} from {draft_src_str} using model {experiment_ckpt_str}. It should be reviewed and edited carefully."
postprocess_remark = config.get_postprocess_remark()
remarks = [draft_remark] + ([postprocess_remark] if len(postprocess_remark) > 0 else [])

# Insert translation into the USFM structure of an existing project
# If the target project is not the same as the translated file's original project,
# no verses outside of the ones translated will be overwritten
Expand All @@ -239,6 +234,7 @@ def translate_usfm(
embed_behavior=config.get_embed_behavior(),
style_behavior=config.get_style_behavior(),
update_block_handlers=config.update_block_handlers,
remarks=remarks,
)

if usfm_out is None:
Expand All @@ -256,20 +252,11 @@ def translate_usfm(
embed_behavior=config.get_embed_behavior(),
style_behavior=config.get_style_behavior(),
update_block_handlers=config.update_block_handlers,
remarks=remarks,
)
parse_usfm(usfm, handler)
usfm_out = handler.get_usfm()

# Insert draft remarks
description = f"project {src_file_text.project}" if src_from_project else f"file {src_file_path.name}"
remarks = [
f"This draft of {vrefs[0].book} was machine translated on {date.today()} from {description} using model {experiment_ckpt_str}. It should be reviewed and edited carefully."
]
postprocess_remark = config.get_postprocess_remark()
if len(postprocess_remark) > 0:
remarks.append(postprocess_remark)
usfm_out = insert_draft_remarks(usfm_out, remarks)

# Construct output file name and write to file
trg_draft_file_path = trg_file_path.with_stem(trg_file_path.stem + config.get_postprocess_suffix())
if produce_multiple_translations:
Expand Down
12 changes: 2 additions & 10 deletions silnlp/nmt/postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,6 @@
LOGGER = logging.getLogger(__package__ + ".postprocess")


# NOTE: to be replaced by new machine.py remark functionality
def insert_draft_remarks(usfm: str, remarks: List[str]) -> str:
    r"""Insert ``\rem`` lines directly after the first line of a USFM document.

    Args:
        usfm: The USFM text, with lines separated by ``"\n"``.
        remarks: Remark texts; each non-empty one becomes its own
            ``\rem <text>`` line, in the order given.

    Returns:
        The USFM text with the remark lines inserted after its first line.
    """
    lines = usfm.split("\n")
    # Skip empty remarks: a caller may pass an empty postprocess remark, and
    # that would otherwise produce a bare "\rem " line in the output.
    remark_lines = [f"\\rem {r}" for r in remarks if r]
    return "\n".join(lines[:1] + remark_lines + lines[1:])


# Takes the path to a USFM file and the relevant info to parse it
# and returns the text of all non-embed sentences and their respective references,
# along with any remarks (\rem) that were inserted at the beginning of the file
Expand All @@ -48,7 +41,7 @@ def get_sentences(
draft_remarks = []
for sent in UsfmFileText(stylesheet, encoding, book, book_path, include_all_text=True):
marker = sent.ref.path[-1].name if len(sent.ref.path) > 0 else ""
if marker == "rem" and len(refs) == 0: # TODO: \ide and \usfm lines could potentially come before the remark(s)
if marker == "rem" and len(refs) == 0:
draft_remarks.append(sent.text)
continue
if (
Expand Down Expand Up @@ -154,12 +147,11 @@ def postprocess_draft(
embed_behavior=config.get_embed_behavior(),
style_behavior=config.get_style_behavior(),
update_block_handlers=config.update_block_handlers,
remarks=(draft_remarks + [config.get_postprocess_remark()]),
)
parse_usfm(usfm, handler)
usfm_out = handler.get_usfm()

usfm_out = insert_draft_remarks(usfm_out, draft_remarks + [config.get_postprocess_remark()])

if not out_dir:
out_dir = draft_path.parent
out_path = out_dir / f"{draft_path.stem}{config.get_postprocess_suffix()}{draft_path.suffix}"
Expand Down