Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
b0f223a
Some tests pass
johnml1135 Apr 2, 2025
8f0be69
Added more test framework
johnml1135 Apr 3, 2025
5f4f9bc
Basic implementation and tests for quote convention detection
Apr 8, 2025
e39e841
Initial working version of quotation denormalization
Apr 11, 2025
32fb53b
I want to process the data in segments that correspond to individual …
johnml1135 Apr 3, 2025
072bcb7
Updates for reviewer comments
johnml1135 Apr 10, 2025
d7d804f
Respond to reviewer comments
johnml1135 Apr 11, 2025
390baa0
Additional denormalization tests (not all passing)
Apr 14, 2025
3fe808b
Rebase + additional denormalization tests
Apr 15, 2025
dd01cbc
Improved handling for NLLB-produced quote errors
Apr 25, 2025
a43147e
Unit tests for basic quotation mark resolver
May 8, 2025
c17b79b
Unit tests for UsfmStructureExtractor
May 9, 2025
46ed639
Unit tests for several quotation mark analysis classes
Jun 6, 2025
51ee352
Fix a bug related to verse markers before quotation marks
Jun 6, 2025
f826ab2
Refactoring to allow arbitrary quote convention changes + more unit t…
Jun 20, 2025
e7c279c
Remaining unit tests
Jun 27, 2025
8d9c2f6
Change update_usfm_parser_handler.py to match main branch
Jun 28, 2025
cb31252
Fix linting for test_quote_convention_detector.py
Jun 28, 2025
a9da4a8
Address reviewer comments + refactor
Jul 1, 2025
247fa4c
Correct TextSegment equality function
Jul 2, 2025
5463f29
Damien's requested code-review changes
Jul 9, 2025
59946ce
Eli's requested code-review changes
Jul 10, 2025
7375fe9
One code review change that was left out of the previous commit
Jul 10, 2025
5195ee3
Fixes for Eli's hopefully final code review comments
Jul 17, 2025
fcc9e3b
Fix typing issue in tests for QuotationMarkUpdateFirstPass
Jul 17, 2025
7392827
Bug fix for multi-character quotation marks
Jul 22, 2025
ee01a7f
Use is_at_start_of_segment for consistency
Jul 22, 2025
29be81e
Include analysis details in QuoteConventionAnalysis
Jul 22, 2025
f93a195
Better guesses for ambiguous quotation marks + quote continuer edge c…
Jul 25, 2025
c17c54f
Code review changes requested for tests
Jul 30, 2025
f291d4d
Fix fallback denormalization logic + line endings in a test
Jul 31, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .dbl_bundle_text_corpus import DblBundleTextCorpus
from .dictionary_alignment_corpus import DictionaryAlignmentCorpus
from .dictionary_text_corpus import DictionaryTextCorpus
from .fallback_quotation_mark_resolver import FallbackQuotationMarkResolver
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
from .file_paratext_project_text_updater import FileParatextProjectTextUpdater
from .flatten import flatten
Expand All @@ -24,6 +25,13 @@
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
from .paratext_text_corpus import ParatextTextCorpus
from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
from .quotation_mark_denormalization_first_pass import QuotationMarkDenormalizationFirstPass
from .quotation_mark_denormalization_usfm_update_block_handler import QuotationMarkDenormalizationUsfmUpdateBlockHandler
from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass
from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings
from .quotation_mark_update_settings import QuotationMarkUpdateSettings
from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy
from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler
from .scripture_element import ScriptureElement
from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
Expand Down Expand Up @@ -86,6 +94,7 @@
"AlignmentCollection",
"AlignmentCorpus",
"AlignmentRow",
"FallbackQuotationMarkResolver",
"batch",
"Corpus",
"create_versification_ref_corpus",
Expand Down Expand Up @@ -121,6 +130,13 @@
"PlaceMarkersAlignmentInfo",
"PlaceMarkersUsfmUpdateBlockHandler",
"parse_usfm",
"QuoteConventionChangingUsfmUpdateBlockHandler",
"QuotationMarkUpdateResolutionSettings",
"QuotationMarkUpdateStrategy",
"QuotationMarkUpdateFirstPass",
"QuotationMarkDenormalizationFirstPass",
"QuotationMarkDenormalizationUsfmUpdateBlockHandler",
"QuotationMarkUpdateSettings",
"RtlReferenceOrder",
"ScriptureElement",
"ScriptureRef",
Expand Down
133 changes: 133 additions & 0 deletions machine/corpora/fallback_quotation_mark_resolver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
from typing import Generator, Optional, Set

from .punctuation_analysis.quotation_mark_direction import QuotationMarkDirection
from .punctuation_analysis.quotation_mark_metadata import QuotationMarkMetadata
from .punctuation_analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue
from .punctuation_analysis.quotation_mark_resolution_settings import QuotationMarkResolutionSettings
from .punctuation_analysis.quotation_mark_resolver import QuotationMarkResolver
from .punctuation_analysis.quotation_mark_string_match import QuotationMarkStringMatch


class FallbackQuotationMarkResolver(QuotationMarkResolver):
    """Resolve quotation marks one match at a time from local context only.

    Each match is classified as opening or closing using the validity checks
    supplied by the resolution settings together with the whitespace and
    punctuation surrounding the match. When neither classification applies,
    the direction is guessed from the direction of the most recently resolved
    mark. A resolved mark is always assigned the smallest depth that the
    settings report as possible for its direction.

    Problems encountered while resolving are accumulated and exposed through
    ``get_issues``.
    """

    def __init__(self, settings: QuotationMarkResolutionSettings):
        """Create a resolver that consults *settings* for validity and depths."""
        self._settings: QuotationMarkResolutionSettings = settings
        self._last_quotation_mark: Optional[QuotationMarkMetadata] = None
        self._issues: Set[QuotationMarkResolutionIssue] = set()

    def reset(self) -> None:
        """Forget the last resolved mark and discard all recorded issues."""
        self._issues = set()
        self._last_quotation_mark = None

    def resolve_quotation_marks(
        self, quotation_mark_matches: list[QuotationMarkStringMatch]
    ) -> Generator[QuotationMarkMetadata, None, None]:
        """Lazily yield the resolved metadata for each match, in input order.

        Matches that cannot be resolved yield nothing.
        """
        for candidate in quotation_mark_matches:
            yield from self._resolve_quotation_mark(candidate)

    def _resolve_quotation_mark(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
    ) -> Generator[QuotationMarkMetadata, None, None]:
        """Resolve a single match, yielding at most one resolved mark.

        Records UNEXPECTED_QUOTATION_MARK when a match classified as opening
        or closing has no possible depth, and AMBIGUOUS_QUOTATION_MARK
        whenever the direction had to be guessed.
        """
        if self._is_opening_quotation_mark(quotation_mark_match):
            resolve = self._resolve_opening_mark
            direction_was_guessed = False
        elif self._is_closing_quotation_mark(quotation_mark_match):
            resolve = self._resolve_closing_mark
            direction_was_guessed = False
        else:
            # Make a reasonable guess about the direction of the quotation mark:
            # open a new quote unless the previous mark was itself an opener.
            previous = self._last_quotation_mark
            if previous is None or previous.direction is QuotationMarkDirection.CLOSING:
                resolve = self._resolve_opening_mark
            else:
                resolve = self._resolve_closing_mark
            direction_was_guessed = True

        resolved: Optional[QuotationMarkMetadata] = resolve(quotation_mark_match)
        if resolved is not None:
            yield resolved
        elif not direction_was_guessed:
            self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK)

        # Recorded after the yield so the issue only appears once the consumer
        # has advanced past the guessed mark.
        if direction_was_guessed:
            self._issues.add(QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK)

    def _is_opening_quotation_mark(
        self,
        match: QuotationMarkStringMatch,
    ) -> bool:
        """Return whether *match* should be treated as an opening mark.

        A mark valid only as an opener is always opening. A mark valid in both
        directions is opening only when its leading context suggests the start
        of a quote and nothing trailing it suggests the end of one.
        """
        valid_as_opening = self._settings.is_valid_opening_quotation_mark(match)
        if not (valid_as_opening and self._settings.is_valid_closing_quotation_mark(match)):
            return valid_as_opening

        starts_a_quote = (
            match.is_at_start_of_segment()
            or match.has_leading_whitespace()
            or self._does_most_recent_opening_mark_immediately_precede(match)
            or match.has_quote_introducer_in_leading_substring()
        )
        if not starts_a_quote:
            return False
        return not (match.has_trailing_whitespace() or match.has_trailing_punctuation())

    def _does_most_recent_opening_mark_immediately_precede(
        self,
        match: QuotationMarkStringMatch,
    ) -> bool:
        """Return whether the last resolved mark is an opener ending where *match* starts.

        Both marks must belong to the same text segment.
        """
        previous = self._last_quotation_mark
        if previous is not None and previous.direction is QuotationMarkDirection.OPENING:
            in_same_segment = previous.text_segment == match.text_segment
            return in_same_segment and previous.end_index == match.start_index
        return False

    def _is_closing_quotation_mark(
        self,
        match: QuotationMarkStringMatch,
    ) -> bool:
        """Return whether *match* should be treated as a closing mark.

        A mark valid only as a closer is always closing. A mark valid in both
        directions is closing only when its trailing context suggests the end
        of a quote and it has no leading whitespace.
        """
        valid_as_opening = self._settings.is_valid_opening_quotation_mark(match)
        valid_as_closing = self._settings.is_valid_closing_quotation_mark(match)
        if not (valid_as_opening and valid_as_closing):
            return valid_as_closing

        ends_a_quote = (
            match.has_trailing_whitespace() or match.has_trailing_punctuation() or match.is_at_end_of_segment()
        )
        if not ends_a_quote:
            return False
        return not match.has_leading_whitespace()

    def _resolve_in_direction(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
        direction: QuotationMarkDirection,
    ) -> Optional[QuotationMarkMetadata]:
        """Resolve *quotation_mark_match* in *direction* at the smallest possible depth.

        Returns None, leaving the last resolved mark untouched, when the
        settings report no possible depth for the mark in that direction.
        """
        candidate_depths: Set[int] = self._settings.get_possible_depths(
            quotation_mark_match.quotation_mark, direction
        )
        if not candidate_depths:
            return None

        resolved = quotation_mark_match.resolve(min(candidate_depths), direction)
        self._last_quotation_mark = resolved
        return resolved

    def _resolve_opening_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> Optional[QuotationMarkMetadata]:
        """Resolve the match as an opening mark, or return None if no depth is possible."""
        return self._resolve_in_direction(quotation_mark_match, QuotationMarkDirection.OPENING)

    def _resolve_closing_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> Optional[QuotationMarkMetadata]:
        """Resolve the match as a closing mark, or return None if no depth is possible."""
        return self._resolve_in_direction(quotation_mark_match, QuotationMarkDirection.CLOSING)

    def get_issues(self) -> Set[QuotationMarkResolutionIssue]:
        """Return the set of issues recorded since construction or the last reset."""
        return self._issues
68 changes: 68 additions & 0 deletions machine/corpora/punctuation_analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from .chapter import Chapter
from .depth_based_quotation_mark_resolver import (
DepthBasedQuotationMarkResolver,
QuotationMarkCategorizer,
QuotationMarkResolverState,
QuoteContinuerState,
QuoteContinuerStyle,
)
from .preliminary_quotation_mark_analyzer import (
ApostropheProportionStatistics,
PreliminaryApostropheAnalyzer,
PreliminaryQuotationMarkAnalyzer,
QuotationMarkGrouper,
QuotationMarkSequences,
QuotationMarkWordPositions,
)
from .quotation_mark_direction import QuotationMarkDirection
from .quotation_mark_finder import QuotationMarkFinder
from .quotation_mark_metadata import QuotationMarkMetadata
from .quotation_mark_resolution_issue import QuotationMarkResolutionIssue
from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings
from .quotation_mark_resolver import QuotationMarkResolver
from .quotation_mark_string_match import QuotationMarkStringMatch
from .quotation_mark_tabulator import QuotationMarkCounts, QuotationMarkTabulator
from .quote_convention import QuoteConvention, SingleLevelQuoteConvention
from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
from .quote_convention_set import QuoteConventionSet
from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS
from .text_segment import TextSegment
from .usfm_marker_type import UsfmMarkerType
from .usfm_structure_extractor import UsfmStructureExtractor
from .verse import Verse

# Names exported by `from <this package> import *`.
# Every entry corresponds to a name imported at the top of this module.
__all__ = [
"ApostropheProportionStatistics",
"Chapter",
"DepthBasedQuotationMarkResolver",
"PreliminaryApostropheAnalyzer",
"PreliminaryQuotationMarkAnalyzer",
"SingleLevelQuoteConvention",
"QuoteContinuerState",
"QuoteContinuerStyle",
"QuotationMarkCategorizer",
"QuotationMarkCounts",
"QuotationMarkDirection",
"QuotationMarkGrouper",
"QuotationMarkMetadata",
"QuotationMarkResolverState",
"QuotationMarkSequences",
"QuotationMarkStringMatch",
"QuotationMarkWordPositions",
"QuoteConvention",
"QuoteConventionAnalysis",
"QuoteConventionDetectionResolutionSettings",
"QuotationMarkFinder",
"QuotationMarkResolutionIssue",
"QuotationMarkResolutionSettings",
"QuotationMarkResolver",
"QuotationMarkTabulator",
"QuoteConventionDetector",
"QuoteConventionSet",
"STANDARD_QUOTE_CONVENTIONS",
"TextSegment",
"UsfmMarkerType",
"UsfmStructureExtractor",
"Verse",
]
8 changes: 8 additions & 0 deletions machine/corpora/punctuation_analysis/chapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from dataclasses import dataclass

from .verse import Verse


@dataclass(frozen=True)
class Chapter:
    """Group of verses treated as one unit.

    The dataclass is frozen, so the ``verses`` attribute cannot be rebound
    after construction; the list object it refers to is still an ordinary
    mutable list.
    """

    # Verses held by this chapter, in the order supplied by the caller.
    verses: list[Verse]
Loading