Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
node-version: "14"
- name: Lint with pyright
run: |
npm install -g pyright@1.1.386
npm install -g pyright@1.1.400
poetry run pyright
- name: Test with pytest
run: poetry run pytest --cov --cov-report=xml
Expand Down
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
"source.organizeImports": "explicit"
},
},
"files.associations": {
"*.SFM": "usfm",
},
"black-formatter.path": [
"poetry",
"run",
Expand Down
11 changes: 9 additions & 2 deletions machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@
from .usfm_tag import UsfmJustification, UsfmStyleAttribute, UsfmStyleType, UsfmTag, UsfmTextProperties, UsfmTextType
from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType
from .usfm_tokenizer import RtlReferenceOrder, UsfmTokenizer
from .usfm_update_block import UsfmUpdateBlock
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
from .usfm_update_block_handler import UsfmUpdateBlockHandler
from .usx_file_alignment_collection import UsxFileAlignmentCollection
from .usx_file_alignment_corpus import UsxFileAlignmentCorpus
from .usx_file_text import UsxFileText
Expand Down Expand Up @@ -92,8 +95,8 @@
"is_scripture",
"lowercase",
"MemoryAlignmentCollection",
"MemoryText",
"MemoryStreamContainer",
"MemoryText",
"MultiKeyRef",
"nfc_normalize",
"nfd_normalize",
Expand Down Expand Up @@ -126,9 +129,9 @@
"TextRow",
"TextRowFlags",
"unescape_spaces",
"UpdateUsfmTextBehavior",
"UpdateUsfmMarkerBehavior",
"UpdateUsfmParserHandler",
"UpdateUsfmTextBehavior",
"UsfmAttribute",
"UsfmElementType",
"UsfmFileText",
Expand All @@ -148,6 +151,10 @@
"UsfmToken",
"UsfmTokenizer",
"UsfmTokenType",
"UsfmUpdateBlock",
"UsfmUpdateBlockElement",
"UsfmUpdateBlockElementType",
"UsfmUpdateBlockHandler",
"UsxFileAlignmentCollection",
"UsxFileAlignmentCorpus",
"UsxFileText",
Expand Down
8 changes: 5 additions & 3 deletions machine/corpora/paratext_project_terms_parser_base.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import re
from abc import ABC, abstractmethod
from collections import defaultdict
Expand Down Expand Up @@ -45,7 +47,7 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
else:
term_id_to_category_dict = {}

terms_glosses_doc: Optional[ElementTree.ElementTree] = None
terms_glosses_doc: Optional[ElementTree.ElementTree[ElementTree.Element]] = None
resource_name = None
if self._settings.language_code is not None:
resource_name = _SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS.get(self._settings.language_code)
Expand All @@ -57,7 +59,7 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
with open_binary(_SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS_PACKAGE, resource_name) as stream:
terms_glosses_doc = ElementTree.parse(stream)

term_renderings_doc: Optional[ElementTree.ElementTree] = None
term_renderings_doc: Optional[ElementTree.ElementTree[ElementTree.Element]] = None
if self._exists("TermRenderings.xml"):
with self._open("TermRenderings.xml") as stream:
term_renderings_doc = ElementTree.parse(stream)
Expand Down Expand Up @@ -136,7 +138,7 @@ def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
return term_string


def _get_category_per_id(biblical_terms_doc: ElementTree.ElementTree) -> Dict[str, str]:
def _get_category_per_id(biblical_terms_doc: ElementTree.ElementTree[ElementTree.Element]) -> Dict[str, str]:
term_id_to_category_dict: Dict[str, str] = {}

for term in biblical_terms_doc.findall(".//Term"):
Expand Down
7 changes: 5 additions & 2 deletions machine/corpora/paratext_project_text_updater_base.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from abc import ABC, abstractmethod
from typing import BinaryIO, Optional, Sequence, Tuple, Union
from typing import BinaryIO, Iterable, Optional, Sequence, Tuple, Union

from ..utils.typeshed import StrPath
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .scripture_ref import ScriptureRef
from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior
from .usfm_parser import parse_usfm
from .usfm_update_block_handler import UsfmUpdateBlockHandler


class ParatextProjectTextUpdaterBase(ABC):
Expand All @@ -25,7 +26,8 @@ def update_usfm(
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
preserve_paragraph_styles: Optional[Sequence[str]] = None,
preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
) -> Optional[str]:
file_name: str = self._settings.get_book_file_name(book_id)
if not self._exists(file_name):
Expand All @@ -40,6 +42,7 @@ def update_usfm(
embed_behavior,
style_behavior,
preserve_paragraph_styles,
update_block_handlers=update_block_handlers,
)
try:
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
Expand Down
124 changes: 38 additions & 86 deletions machine/corpora/scripture_ref_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,14 @@ class ScriptureTextType(Enum):
NONE = auto()
NONVERSE = auto()
VERSE = auto()
NOTE_TEXT = auto()
EMBED = auto()


EMBED_PART_START_CHAR_STYLES = ("f", "x", "z")
EMBED_STYLES = ("f", "fe", "fig", "fm", "x")
_EMBED_STYLES = {"f", "fe", "x", "fig"}


def _is_embed_style(marker: Optional[str]) -> bool:
return marker is not None and (marker.strip("*") in _EMBED_STYLES or marker.startswith("z"))


class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC):
Expand All @@ -28,18 +31,11 @@ def __init__(self) -> None:
self._cur_elements_stack: List[ScriptureElement] = []
self._cur_text_type_stack: List[ScriptureTextType] = []
self._duplicate_verse: bool = False
self._in_preserved_paragraph: bool = False
self._in_embed: bool = False
self._in_note_text: bool = False
self._in_nested_embed: bool = False

@property
def _current_text_type(self) -> ScriptureTextType:
return ScriptureTextType.NONE if len(self._cur_text_type_stack) == 0 else self._cur_text_type_stack[-1]

def _is_in_note_text(self) -> bool:
return self._in_note_text

def end_usfm(self, state: UsfmParserState) -> None:
self._end_verse_text_wrapper(state)

Expand Down Expand Up @@ -115,32 +111,6 @@ def start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> N
def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None:
self._end_parent_element()

def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None:
self._in_embed = True
self._start_embed_wrapper(state, marker)

def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None:
self._end_note_text_wrapper(state)
self._end_embed(state, marker, None, closed)
self._in_embed = False

def _start_embed_wrapper(self, state: UsfmParserState, marker: str) -> None:
if self._cur_verse_ref.is_default:
self._update_verse_ref(state.verse_ref, marker)

if not self._duplicate_verse:
self._check_convert_verse_para_to_non_verse(state)
self._next_element(marker)

self._start_embed(state, self._create_non_verse_ref())

def _start_embed(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...

def _end_embed(
self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool
) -> None:
pass

def text(self, state: UsfmParserState, text: str) -> None:
# if we hit text in a verse paragraph and we aren't in a verse, then start a non-verse segment
if text.strip():
Expand All @@ -152,29 +122,23 @@ def opt_break(self, state: UsfmParserState) -> None:
def start_char(
self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]]
) -> None:
if self._is_embed_part_style(marker) and self._in_note_text:
self._in_nested_embed = True
# if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment
self._check_convert_verse_para_to_non_verse(state)

if self._is_embed_style(marker):
self._in_embed = True
self._start_embed_wrapper(state, marker)

if self._is_note_text(marker):
self._start_note_text_wrapper(state)
if _is_embed_style(marker):
self._start_embed_text_wrapper(state, marker)

def end_char(
self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool
) -> None:
if self._is_embed_part_style(marker):
if self._in_nested_embed:
self._in_nested_embed = False
elif self._is_note_text(marker):
self._end_note_text_wrapper(state)
if self._is_embed_style(marker):
self._end_embed(state, marker, attributes, closed)
self._in_embed = False
if _is_embed_style(marker):
self._end_embed_text_wrapper(state)

def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None:
self._start_embed_text_wrapper(state, marker)

def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None:
self._end_embed_text_wrapper(state)

def _start_verse_text(self, state: UsfmParserState, scripture_refs: Optional[Sequence[ScriptureRef]]) -> None: ...

Expand All @@ -184,20 +148,9 @@ def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: Scripture

def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...

def _start_note_text_wrapper(self, state: UsfmParserState):
self._in_note_text = True
self._cur_text_type_stack.append(ScriptureTextType.NOTE_TEXT)
self._start_note_text(state)

def _start_note_text(self, state: UsfmParserState) -> None: ...
def _start_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...

def _end_note_text_wrapper(self, state: UsfmParserState):
if self._cur_text_type_stack and self._cur_text_type_stack[-1] == ScriptureTextType.NOTE_TEXT:
self._end_note_text(state, self._create_non_verse_ref())
self._cur_text_type_stack.pop()
self._in_note_text = False

def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...

def _start_verse_text_wrapper(self, state: UsfmParserState) -> None:
self._duplicate_verse = False
Expand Down Expand Up @@ -225,6 +178,25 @@ def _update_verse_ref(self, verse_ref: VerseRef, marker: str) -> None:
self._cur_elements_stack.append(ScriptureElement(0, marker))
self._cur_verse_ref = verse_ref.copy()

def _start_embed_text_wrapper(self, state: UsfmParserState, marker: str) -> None:
if self._cur_verse_ref.is_default:
self._update_verse_ref(state.verse_ref, marker)

if not self._duplicate_verse:
self._check_convert_verse_para_to_non_verse(state)
self._next_element(marker)
self._cur_text_type_stack.append(ScriptureTextType.EMBED)
self._start_embed_text(state, self._create_non_verse_ref())

def _end_embed_text_wrapper(self, state: UsfmParserState) -> None:
if (
not self._duplicate_verse
and self._cur_text_type_stack
and self._cur_text_type_stack[-1] == ScriptureTextType.EMBED
):
self._end_embed_text(state, self._create_non_verse_ref())
self._cur_text_type_stack.pop()

def _next_element(self, marker: str) -> None:
prev_elem: ScriptureElement = self._cur_elements_stack.pop()
self._cur_elements_stack.append(ScriptureElement(prev_elem.position + 1, marker))
Expand All @@ -237,7 +209,7 @@ def _end_parent_element(self) -> None:
self._cur_elements_stack.pop()

def _end_embed_elements(self) -> None:
if self._cur_elements_stack and self._is_embed_style(self._cur_elements_stack[-1].name):
if self._cur_elements_stack and _is_embed_style(self._cur_elements_stack[-1].name):
self._cur_elements_stack.pop()

def _create_verse_refs(self) -> List[ScriptureRef]:
Expand Down Expand Up @@ -266,23 +238,3 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None
):
self._start_parent_element(para_tag.marker)
self._start_non_verse_text_wrapper(state)

def _is_in_embed(self, marker: Optional[str]) -> bool:
return self._in_embed or self._is_embed_style(marker)

def _is_in_nested_embed(self, marker: Optional[str]) -> bool:
return self._in_nested_embed or (
marker is not None
and marker.startswith("+")
and marker[1] in EMBED_PART_START_CHAR_STYLES
and marker != "fm"
)

def _is_note_text(self, marker: Optional[str]) -> bool:
return marker == "ft"

def _is_embed_part_style(self, marker: Optional[str]) -> bool:
return marker is not None and marker.startswith(EMBED_PART_START_CHAR_STYLES) and marker != "fm"

def _is_embed_style(self, marker: Optional[str]) -> bool:
return marker is not None and marker.strip("*") in EMBED_STYLES
Loading