diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index 45eb628b..f48b6990 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -7,6 +7,7 @@ from .dbl_bundle_text_corpus import DblBundleTextCorpus from .dictionary_alignment_corpus import DictionaryAlignmentCorpus from .dictionary_text_corpus import DictionaryTextCorpus +from .fallback_quotation_mark_resolver import FallbackQuotationMarkResolver from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser from .file_paratext_project_text_updater import FileParatextProjectTextUpdater from .flatten import flatten @@ -24,6 +25,13 @@ from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase from .paratext_text_corpus import ParatextTextCorpus from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler +from .quotation_mark_denormalization_first_pass import QuotationMarkDenormalizationFirstPass +from .quotation_mark_denormalization_usfm_update_block_handler import QuotationMarkDenormalizationUsfmUpdateBlockHandler +from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass +from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings +from .quotation_mark_update_settings import QuotationMarkUpdateSettings +from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy +from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler from .scripture_element import ScriptureElement from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType @@ -86,6 +94,7 @@ "AlignmentCollection", "AlignmentCorpus", "AlignmentRow", + "FallbackQuotationMarkResolver", "batch", "Corpus", "create_versification_ref_corpus", @@ -121,6 +130,13 @@ "PlaceMarkersAlignmentInfo", 
"PlaceMarkersUsfmUpdateBlockHandler", "parse_usfm", + "QuoteConventionChangingUsfmUpdateBlockHandler", + "QuotationMarkUpdateResolutionSettings", + "QuotationMarkUpdateStrategy", + "QuotationMarkUpdateFirstPass", + "QuotationMarkDenormalizationFirstPass", + "QuotationMarkDenormalizationUsfmUpdateBlockHandler", + "QuotationMarkUpdateSettings", "RtlReferenceOrder", "ScriptureElement", "ScriptureRef", diff --git a/machine/corpora/fallback_quotation_mark_resolver.py b/machine/corpora/fallback_quotation_mark_resolver.py new file mode 100644 index 00000000..0221cbbb --- /dev/null +++ b/machine/corpora/fallback_quotation_mark_resolver.py @@ -0,0 +1,133 @@ +from typing import Generator, Optional, Set + +from .punctuation_analysis.quotation_mark_direction import QuotationMarkDirection +from .punctuation_analysis.quotation_mark_metadata import QuotationMarkMetadata +from .punctuation_analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue +from .punctuation_analysis.quotation_mark_resolution_settings import QuotationMarkResolutionSettings +from .punctuation_analysis.quotation_mark_resolver import QuotationMarkResolver +from .punctuation_analysis.quotation_mark_string_match import QuotationMarkStringMatch + + +class FallbackQuotationMarkResolver(QuotationMarkResolver): + + def __init__(self, settings: QuotationMarkResolutionSettings): + self._settings: QuotationMarkResolutionSettings = settings + self._last_quotation_mark: Optional[QuotationMarkMetadata] = None + self._issues: Set[QuotationMarkResolutionIssue] = set() + + def reset(self) -> None: + self._last_quotation_mark = None + self._issues = set() + + def resolve_quotation_marks( + self, quotation_mark_matches: list[QuotationMarkStringMatch] + ) -> Generator[QuotationMarkMetadata, None, None]: + for quotation_mark_match in quotation_mark_matches: + yield from self._resolve_quotation_mark(quotation_mark_match) + + def _resolve_quotation_mark( + self, + quotation_mark_match: QuotationMarkStringMatch, 
class FallbackQuotationMarkResolver(QuotationMarkResolver):
    """Resolve quotation marks one at a time, without tracking nesting.

    Each match is classified as opening or closing from the settings and its
    surrounding text. When neither classification applies, the direction is
    guessed from the previously resolved mark and an ambiguity issue is
    recorded.
    """

    def __init__(self, settings: QuotationMarkResolutionSettings):
        self._settings: QuotationMarkResolutionSettings = settings
        self._last_quotation_mark: Optional[QuotationMarkMetadata] = None
        self._issues: Set[QuotationMarkResolutionIssue] = set()

    def reset(self) -> None:
        """Forget the last resolved mark and all recorded issues."""
        self._issues = set()
        self._last_quotation_mark = None

    def resolve_quotation_marks(
        self, quotation_mark_matches: list[QuotationMarkStringMatch]
    ) -> Generator[QuotationMarkMetadata, None, None]:
        """Yield metadata for every match that could be resolved, in order."""
        for candidate in quotation_mark_matches:
            yield from self._resolve_quotation_mark(candidate)

    def _resolve_quotation_mark(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
    ) -> Generator[QuotationMarkMetadata, None, None]:
        """Yield at most one resolved mark for the match and record any issue."""
        if self._is_opening_quotation_mark(quotation_mark_match):
            resolved = self._resolve_opening_mark(quotation_mark_match)
            if resolved is None:
                self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK)
            else:
                yield resolved
            return

        if self._is_closing_quotation_mark(quotation_mark_match):
            resolved = self._resolve_closing_mark(quotation_mark_match)
            if resolved is None:
                self._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK)
            else:
                yield resolved
            return

        # Make a reasonable guess about the direction of the quotation mark
        previous = self._last_quotation_mark
        if previous is None or previous.direction is QuotationMarkDirection.CLOSING:
            resolved = self._resolve_opening_mark(quotation_mark_match)
        else:
            resolved = self._resolve_closing_mark(quotation_mark_match)
        if resolved is not None:
            yield resolved

        # A guessed direction is always reported as ambiguous.
        self._issues.add(QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK)

    def _is_opening_quotation_mark(
        self,
        match: QuotationMarkStringMatch,
    ) -> bool:
        """Return True when the match should be treated as an opening mark."""
        if not self._settings.is_valid_opening_quotation_mark(match):
            return False
        if not self._settings.is_valid_closing_quotation_mark(match):
            return True

        # The mark is valid in both directions: use the surrounding text as a clue.
        has_opening_context = (
            match.is_at_start_of_segment()
            or match.has_leading_whitespace()
            or self._does_most_recent_opening_mark_immediately_precede(match)
            or match.has_quote_introducer_in_leading_substring()
        )
        return has_opening_context and not (match.has_trailing_whitespace() or match.has_trailing_punctuation())

    def _does_most_recent_opening_mark_immediately_precede(
        self,
        match: QuotationMarkStringMatch,
    ) -> bool:
        """Return True when the last resolved mark is an opening mark ending where this match starts."""
        previous = self._last_quotation_mark
        if previous is None:
            return False
        if previous.direction is not QuotationMarkDirection.OPENING:
            return False

        return previous.text_segment == match.text_segment and previous.end_index == match.start_index

    def _is_closing_quotation_mark(
        self,
        match: QuotationMarkStringMatch,
    ) -> bool:
        """Return True when the match should be treated as a closing mark."""
        can_open = self._settings.is_valid_opening_quotation_mark(match)
        can_close = self._settings.is_valid_closing_quotation_mark(match)
        if not can_close:
            return False
        if not can_open:
            return True

        # The mark is valid in both directions: use the surrounding text as a clue.
        has_closing_context = (
            match.has_trailing_whitespace() or match.has_trailing_punctuation() or match.is_at_end_of_segment()
        )
        return has_closing_context and not match.has_leading_whitespace()

    def _resolve_mark(
        self, quotation_mark_match: QuotationMarkStringMatch, direction: QuotationMarkDirection
    ) -> Optional[QuotationMarkMetadata]:
        """Resolve the match at the shallowest depth the settings allow for the direction.

        Returns None, leaving the last resolved mark untouched, when the
        settings report no possible depth.
        """
        possible_depths: Set[int] = self._settings.get_possible_depths(quotation_mark_match.quotation_mark, direction)
        if len(possible_depths) == 0:
            return None

        resolved = quotation_mark_match.resolve(min(possible_depths), direction)
        self._last_quotation_mark = resolved
        return resolved

    def _resolve_opening_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> Optional[QuotationMarkMetadata]:
        return self._resolve_mark(quotation_mark_match, QuotationMarkDirection.OPENING)

    def _resolve_closing_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> Optional[QuotationMarkMetadata]:
        return self._resolve_mark(quotation_mark_match, QuotationMarkDirection.CLOSING)

    def get_issues(self) -> Set[QuotationMarkResolutionIssue]:
        """Return the set of issues recorded since the last reset."""
        return self._issues
b/machine/corpora/punctuation_analysis/__init__.py @@ -0,0 +1,68 @@ +from .chapter import Chapter +from .depth_based_quotation_mark_resolver import ( + DepthBasedQuotationMarkResolver, + QuotationMarkCategorizer, + QuotationMarkResolverState, + QuoteContinuerState, + QuoteContinuerStyle, +) +from .preliminary_quotation_mark_analyzer import ( + ApostropheProportionStatistics, + PreliminaryApostropheAnalyzer, + PreliminaryQuotationMarkAnalyzer, + QuotationMarkGrouper, + QuotationMarkSequences, + QuotationMarkWordPositions, +) +from .quotation_mark_direction import QuotationMarkDirection +from .quotation_mark_finder import QuotationMarkFinder +from .quotation_mark_metadata import QuotationMarkMetadata +from .quotation_mark_resolution_issue import QuotationMarkResolutionIssue +from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings +from .quotation_mark_resolver import QuotationMarkResolver +from .quotation_mark_string_match import QuotationMarkStringMatch +from .quotation_mark_tabulator import QuotationMarkCounts, QuotationMarkTabulator +from .quote_convention import QuoteConvention, SingleLevelQuoteConvention +from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings +from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector +from .quote_convention_set import QuoteConventionSet +from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS +from .text_segment import TextSegment +from .usfm_marker_type import UsfmMarkerType +from .usfm_structure_extractor import UsfmStructureExtractor +from .verse import Verse + +__all__ = [ + "ApostropheProportionStatistics", + "Chapter", + "DepthBasedQuotationMarkResolver", + "PreliminaryApostropheAnalyzer", + "PreliminaryQuotationMarkAnalyzer", + "SingleLevelQuoteConvention", + "QuoteContinuerState", + "QuoteContinuerStyle", + "QuotationMarkCategorizer", + "QuotationMarkCounts", + "QuotationMarkDirection", + 
"QuotationMarkGrouper", + "QuotationMarkMetadata", + "QuotationMarkResolverState", + "QuotationMarkSequences", + "QuotationMarkStringMatch", + "QuotationMarkWordPositions", + "QuoteConvention", + "QuoteConventionAnalysis", + "QuoteConventionDetectionResolutionSettings", + "QuotationMarkFinder", + "QuotationMarkResolutionIssue", + "QuotationMarkResolutionSettings", + "QuotationMarkResolver", + "QuotationMarkTabulator", + "QuoteConventionDetector", + "QuoteConventionSet", + "STANDARD_QUOTE_CONVENTIONS", + "TextSegment", + "UsfmMarkerType", + "UsfmStructureExtractor", + "Verse", +] diff --git a/machine/corpora/punctuation_analysis/chapter.py b/machine/corpora/punctuation_analysis/chapter.py new file mode 100644 index 00000000..342a91de --- /dev/null +++ b/machine/corpora/punctuation_analysis/chapter.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass + +from .verse import Verse + + +@dataclass(frozen=True) +class Chapter: + verses: list[Verse] diff --git a/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py b/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py new file mode 100644 index 00000000..4643701d --- /dev/null +++ b/machine/corpora/punctuation_analysis/depth_based_quotation_mark_resolver.py @@ -0,0 +1,430 @@ +from enum import Enum, auto +from typing import Generator, Optional, Set + +import regex + +from .quotation_mark_direction import QuotationMarkDirection +from .quotation_mark_metadata import QuotationMarkMetadata +from .quotation_mark_resolution_issue import QuotationMarkResolutionIssue +from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings +from .quotation_mark_resolver import QuotationMarkResolver +from .quotation_mark_string_match import QuotationMarkStringMatch +from .usfm_marker_type import UsfmMarkerType + + +class QuotationMarkResolverState: + + def __init__(self): + self.reset() + + def reset(self) -> None: + self._quotation_stack: list[QuotationMarkMetadata] = [] + + 
class QuotationMarkResolverState:
    """Stack of the opening quotation marks that are currently unclosed."""

    def __init__(self):
        self.reset()

    def reset(self) -> None:
        """Discard every open quotation mark."""
        self._quotation_stack: list[QuotationMarkMetadata] = []

    @property
    def current_depth(self) -> int:
        """Number of quotation marks currently open."""
        return len(self._quotation_stack)

    def has_open_quotation_mark(self) -> bool:
        return self.current_depth > 0

    def are_more_than_n_quotes_open(self, n: int) -> bool:
        return self.current_depth > n

    def add_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> QuotationMarkMetadata:
        """Resolve the match as an opening mark one level deeper and push it on the stack."""
        quotation_mark = quotation_mark_match.resolve(self.current_depth + 1, QuotationMarkDirection.OPENING)
        self._quotation_stack.append(quotation_mark)
        return quotation_mark

    def add_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> QuotationMarkMetadata:
        """Resolve the match as a closing mark at the current depth and pop the stack.

        Callers are expected to check has_open_quotation_mark() first; popping
        an empty stack raises IndexError.
        """
        quotation_mark = quotation_mark_match.resolve(self.current_depth, QuotationMarkDirection.CLOSING)
        self._quotation_stack.pop()
        return quotation_mark

    def get_opening_quotation_mark_at_depth(self, depth: int) -> str:
        """Return the opening mark at the 1-based ``depth``.

        Raises:
            RuntimeError: if ``depth`` is smaller than 1 or larger than the
                current depth.
        """
        # depth < 1 would otherwise become a negative index and silently
        # return a mark from the wrong end of the stack.
        if depth < 1 or depth > self.current_depth:
            raise RuntimeError(
                f"Opening quotation mark at depth {depth} was requested from a quotation stack "
                + f"with depth {self.current_depth}."
            )
        return self._quotation_stack[depth - 1].quotation_mark

    def get_deepest_opening_quotation_mark(self) -> str:
        """Return the most recently opened mark.

        Raises:
            RuntimeError: if no quotation mark is open.
        """
        if not self.has_open_quotation_mark():
            raise RuntimeError("The deepest opening quotation mark was requested from an empty quotation stack.")
        return self._quotation_stack[-1].quotation_mark
class QuoteContinuerState:
    """Tracks the quote continuers seen so far and the style they follow."""

    def __init__(self):
        self.reset()

    def reset(self) -> None:
        """Clear the observed continuers and return to the undetermined style."""
        self._continuer_style = QuoteContinuerStyle.UNDETERMINED
        self._quote_continuer_mark_stack: list[QuotationMarkMetadata] = []

    @property
    def current_depth(self) -> int:
        """Number of continuers currently on the stack."""
        return len(self._quote_continuer_mark_stack)

    def continuer_has_been_observed(self) -> bool:
        return self.current_depth > 0

    @property
    def continuer_style(self) -> QuoteContinuerStyle:
        return self._continuer_style

    def add_quote_continuer(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
        quotation_mark_resolver_state: QuotationMarkResolverState,
        quote_continuer_style: QuoteContinuerStyle,
    ) -> QuotationMarkMetadata:
        """Record the match as the next continuer and return its resolved metadata.

        The continuer is resolved as an opening mark one level deeper than the
        continuers already recorded. Once as many continuers have been seen as
        there are open quotation marks, the stack is cleared.
        """
        next_depth = self.current_depth + 1
        continuer = quotation_mark_match.resolve(next_depth, QuotationMarkDirection.OPENING)
        self._quote_continuer_mark_stack.append(continuer)
        self._continuer_style = quote_continuer_style

        all_open_quotes_continued = self.current_depth == quotation_mark_resolver_state.current_depth
        if all_open_quotes_continued:
            self._quote_continuer_mark_stack.clear()
        return continuer
class QuotationMarkCategorizer:
    """Decides what role a quotation mark match plays.

    The decisions combine the resolution settings with the resolver state and
    quote-continuer state objects passed to the constructor; this class only
    reads those objects.
    """

    _APOSTROPHE_PATTERN = regex.compile(r"[\'\u2019\u2018]", regex.U)
    # Compiled once here instead of on every is_apostrophe() call.
    _FINAL_S_PATTERN = regex.compile(r"s")

    def __init__(
        self,
        quotation_mark_resolution_settings: QuotationMarkResolutionSettings,
        quotation_mark_resolver_state: QuotationMarkResolverState,
        quote_continuer_state: QuoteContinuerState,
    ):
        self._settings = quotation_mark_resolution_settings
        self._quotation_mark_resolver_state = quotation_mark_resolver_state
        self._quote_continuer_state = quote_continuer_state

    def is_english_quote_continuer(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
        previous_match: Optional[QuotationMarkStringMatch],
        next_match: Optional[QuotationMarkStringMatch],
    ) -> bool:
        """Return True when the match repeats the opening mark of a still-open quote.

        ``previous_match`` is accepted for interface symmetry but not consulted.
        """
        if self._quote_continuer_state.continuer_style == QuoteContinuerStyle.SPANISH:
            return False
        if not self._meets_quote_continuer_prerequisites(quotation_mark_match):
            return False

        if (
            quotation_mark_match.quotation_mark
            != self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth(
                self._quote_continuer_state.current_depth + 1
            )
        ):
            return False

        if not self._quote_continuer_state.continuer_has_been_observed():
            # The first continuer must be the very first character of its segment.
            if quotation_mark_match.start_index > 0:
                return False

            # Check the next quotation mark match, since quote continuers must appear consecutively
            if self._quotation_mark_resolver_state.are_more_than_n_quotes_open(1):
                if next_match is None or next_match.start_index != quotation_mark_match.end_index:
                    return False

        return True

    def is_spanish_quote_continuer(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
        previous_match: Optional[QuotationMarkStringMatch],
        next_match: Optional[QuotationMarkStringMatch],
    ) -> bool:
        """Return True when the match is a closing-style mark continuing a still-open quote.

        ``previous_match`` is accepted for interface symmetry but not consulted.
        """
        if self._quote_continuer_state.continuer_style == QuoteContinuerStyle.ENGLISH:
            return False
        if not self._meets_quote_continuer_prerequisites(quotation_mark_match):
            return False

        if not self._settings.are_marks_a_valid_pair(
            self._quotation_mark_resolver_state.get_opening_quotation_mark_at_depth(
                self._quote_continuer_state.current_depth + 1
            ),
            quotation_mark_match.quotation_mark,
        ):
            return False

        if not self._quote_continuer_state.continuer_has_been_observed():
            # The first continuer must be the very first character of its segment.
            if quotation_mark_match.start_index > 0:
                return False

            # This has only been observed with guillemets so far
            if quotation_mark_match.quotation_mark != "»":
                return False

            # Check the next quotation mark match, since quote continuers must appear consecutively
            if self._quotation_mark_resolver_state.are_more_than_n_quotes_open(1):
                if next_match is None or next_match.start_index != quotation_mark_match.end_index:
                    return False

        return True

    def _meets_quote_continuer_prerequisites(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
    ) -> bool:
        """Return True when a continuer is possible at all for this match."""
        # Every open quote already has its continuer.
        if self._quote_continuer_state.current_depth >= self._quotation_mark_resolver_state.current_depth:
            return False

        if (
            self._settings.should_rely_on_paragraph_markers
            and not quotation_mark_match.text_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH)
        ):
            return False
        if not self._quotation_mark_resolver_state.has_open_quotation_mark():
            return False

        return True

    def is_opening_quotation_mark(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
    ) -> bool:
        """Return True when the match should be treated as an opening mark."""
        if not self._settings.is_valid_opening_quotation_mark(quotation_mark_match):
            return False

        # If the quote convention is ambiguous, use whitespace as a clue
        if self._settings.is_valid_closing_quotation_mark(quotation_mark_match):
            return (
                quotation_mark_match.has_leading_whitespace()
                or self._most_recent_opening_mark_immediately_precedes(quotation_mark_match)
                or quotation_mark_match.has_quote_introducer_in_leading_substring()
            ) and not (
                quotation_mark_match.has_trailing_whitespace() or quotation_mark_match.has_trailing_punctuation()
            )
        return True

    def is_closing_quotation_mark(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
    ) -> bool:
        """Return True when the match should be treated as a closing mark."""
        if not self._settings.is_valid_closing_quotation_mark(quotation_mark_match):
            return False

        # If the quote convention is ambiguous, use whitespace as a clue
        if self._settings.is_valid_opening_quotation_mark(quotation_mark_match):
            return (
                quotation_mark_match.has_trailing_whitespace()
                or quotation_mark_match.has_trailing_punctuation()
                or quotation_mark_match.is_at_end_of_segment()
                or quotation_mark_match.next_character_matches(self._settings.closing_quotation_mark_regex)
            ) and not quotation_mark_match.has_leading_whitespace()
        return True

    def is_malformed_opening_quotation_mark(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
    ) -> bool:
        """Return True for a valid opening mark whose spacing is irregular but still reads as opening."""
        if not self._settings.is_valid_opening_quotation_mark(quotation_mark_match):
            return False

        if quotation_mark_match.has_quote_introducer_in_leading_substring():
            return True

        if (
            quotation_mark_match.has_leading_whitespace()
            and quotation_mark_match.has_trailing_whitespace()
            and not self._quotation_mark_resolver_state.has_open_quotation_mark()
        ):
            return True

        return False

    def is_malformed_closing_quotation_mark(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
    ) -> bool:
        """Return True for a valid closing mark with irregular spacing that pairs with the deepest open mark."""
        if not self._settings.is_valid_closing_quotation_mark(quotation_mark_match):
            return False

        return (
            (
                quotation_mark_match.is_at_end_of_segment()
                or not quotation_mark_match.has_trailing_whitespace()
                or (quotation_mark_match.has_leading_whitespace() and quotation_mark_match.has_trailing_whitespace())
            )
            and self._quotation_mark_resolver_state.has_open_quotation_mark()
            and self._settings.are_marks_a_valid_pair(
                self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(),
                quotation_mark_match.quotation_mark,
            )
        )

    def is_unpaired_closing_quotation_mark(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
    ) -> bool:
        """Return True for a closing-looking mark that appears while no quote is open."""
        if not self._settings.is_valid_closing_quotation_mark(quotation_mark_match):
            return False

        if self._quotation_mark_resolver_state.has_open_quotation_mark():
            return False

        return not quotation_mark_match.has_leading_whitespace() and (
            quotation_mark_match.is_at_end_of_segment() or quotation_mark_match.has_trailing_whitespace()
        )

    def _most_recent_opening_mark_immediately_precedes(self, match: QuotationMarkStringMatch) -> bool:
        """Return True when the character before the match is the deepest open mark."""
        if not self._quotation_mark_resolver_state.has_open_quotation_mark():
            return False

        return self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark() == match.previous_character

    def is_apostrophe(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
        next_match: Optional[QuotationMarkStringMatch],
    ) -> bool:
        """Return True when the match is more likely an apostrophe than a quotation mark."""
        if not quotation_mark_match.quotation_mark_matches(self._APOSTROPHE_PATTERN):
            return False

        # Latin letters on both sides of punctuation mark
        if (
            quotation_mark_match.previous_character is not None
            and quotation_mark_match.has_leading_latin_letter()
            and quotation_mark_match.next_character is not None
            and quotation_mark_match.has_trailing_latin_letter()
        ):
            return True

        # Potential final s possessive (e.g. Moses')
        if quotation_mark_match.previous_character_matches(self._FINAL_S_PATTERN) and (
            quotation_mark_match.has_trailing_whitespace() or quotation_mark_match.has_trailing_punctuation()
        ):
            # Check whether it could be a closing quotation mark
            if not self._quotation_mark_resolver_state.has_open_quotation_mark():
                return True
            if not self._settings.are_marks_a_valid_pair(
                self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(),
                quotation_mark_match.quotation_mark,
            ):
                return True
            if next_match is not None and self._settings.are_marks_a_valid_pair(
                self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(),
                next_match.quotation_mark,
            ):
                return True

        # For languages that use apostrophes at the start and end of words
        if (
            not self._quotation_mark_resolver_state.has_open_quotation_mark()
            and quotation_mark_match.quotation_mark == "'"
            or self._quotation_mark_resolver_state.has_open_quotation_mark()
            and not self._settings.are_marks_a_valid_pair(
                self._quotation_mark_resolver_state.get_deepest_opening_quotation_mark(),
                quotation_mark_match.quotation_mark,
            )
        ):
            return True

        return False
class DepthBasedQuotationMarkResolver(QuotationMarkResolver):
    """Resolve quotation marks while tracking how deeply quotes are nested."""

    def __init__(self, settings: QuotationMarkResolutionSettings):
        self._settings = settings
        self._quotation_mark_resolver_state = QuotationMarkResolverState()
        self._quote_continuer_state = QuoteContinuerState()
        self._quotation_mark_categorizer = QuotationMarkCategorizer(
            self._settings, self._quotation_mark_resolver_state, self._quote_continuer_state
        )
        self._issues: Set[QuotationMarkResolutionIssue] = set()

    def reset(self) -> None:
        """Clear the nesting state, the continuer state and the recorded issues."""
        self._quotation_mark_resolver_state.reset()
        self._quote_continuer_state.reset()
        self._issues = set()

    def resolve_quotation_marks(
        self, quotation_mark_matches: list[QuotationMarkStringMatch]
    ) -> Generator[QuotationMarkMetadata, None, None]:
        """Yield resolved marks in order; report an unpaired mark if a quote is still open at the end."""
        last_position = len(quotation_mark_matches) - 1
        for position, current in enumerate(quotation_mark_matches):
            preceding = quotation_mark_matches[position - 1] if position != 0 else None
            following = quotation_mark_matches[position + 1] if position != last_position else None
            yield from self._resolve_quotation_mark(current, preceding, following)

        # Only reached once the generator has been fully consumed.
        if self._quotation_mark_resolver_state.has_open_quotation_mark():
            self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK)

    def _resolve_quotation_mark(
        self,
        quotation_mark_match: QuotationMarkStringMatch,
        previous_mark: Optional[QuotationMarkStringMatch],
        next_mark: Optional[QuotationMarkStringMatch],
    ) -> Generator[QuotationMarkMetadata, None, None]:
        """Classify one match and yield its metadata, or record an issue instead.

        The checks are ordered; the first category that applies wins.
        """
        categorizer = self._quotation_mark_categorizer

        if categorizer.is_opening_quotation_mark(quotation_mark_match):
            if categorizer.is_english_quote_continuer(quotation_mark_match, previous_mark, next_mark):
                yield self._process_quote_continuer(quotation_mark_match, QuoteContinuerStyle.ENGLISH)
            elif self._is_depth_too_great():
                self._issues.add(QuotationMarkResolutionIssue.TOO_DEEP_NESTING)
            else:
                yield self._process_opening_mark(quotation_mark_match)
        elif categorizer.is_apostrophe(quotation_mark_match, next_mark):
            # Apostrophes are not quotation marks: nothing to yield or report.
            pass
        elif categorizer.is_closing_quotation_mark(quotation_mark_match):
            if categorizer.is_spanish_quote_continuer(quotation_mark_match, previous_mark, next_mark):
                yield self._process_quote_continuer(quotation_mark_match, QuoteContinuerStyle.SPANISH)
            elif self._quotation_mark_resolver_state.has_open_quotation_mark():
                yield self._process_closing_mark(quotation_mark_match)
            else:
                self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK)
        elif categorizer.is_malformed_closing_quotation_mark(quotation_mark_match):
            yield self._process_closing_mark(quotation_mark_match)
        elif categorizer.is_malformed_opening_quotation_mark(quotation_mark_match):
            yield self._process_opening_mark(quotation_mark_match)
        elif categorizer.is_unpaired_closing_quotation_mark(quotation_mark_match):
            self._issues.add(QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK)
        else:
            self._issues.add(QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK)

    def _process_quote_continuer(
        self, quotation_mark_match: QuotationMarkStringMatch, continuer_style: QuoteContinuerStyle
    ) -> QuotationMarkMetadata:
        return self._quote_continuer_state.add_quote_continuer(
            quotation_mark_match, self._quotation_mark_resolver_state, continuer_style
        )

    def _is_depth_too_great(self) -> bool:
        return self._quotation_mark_resolver_state.are_more_than_n_quotes_open(3)

    def _process_opening_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> QuotationMarkMetadata:
        """Push the match as an opening mark, flagging it if it does not fit the new depth."""
        state = self._quotation_mark_resolver_state
        fits_convention = self._settings.metadata_matches_quotation_mark(
            quotation_mark_match.quotation_mark,
            state.current_depth + 1,
            QuotationMarkDirection.OPENING,
        )
        if not fits_convention:
            self._issues.add(QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK)
        return state.add_opening_quotation_mark(quotation_mark_match)

    def _process_closing_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> QuotationMarkMetadata:
        """Pop the match as a closing mark, flagging it if it does not fit the current depth."""
        state = self._quotation_mark_resolver_state
        fits_convention = self._settings.metadata_matches_quotation_mark(
            quotation_mark_match.quotation_mark,
            state.current_depth,
            QuotationMarkDirection.CLOSING,
        )
        if not fits_convention:
            self._issues.add(QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK)
        return state.add_closing_quotation_mark(quotation_mark_match)

    def get_issues(self) -> Set[QuotationMarkResolutionIssue]:
        """Return the set of issues recorded since the last reset."""
        return self._issues
class ApostropheProportionStatistics:
    """Running counts of characters and apostrophes seen in the text."""

    def __init__(self):
        self.reset()

    def reset(self) -> None:
        """Set both counters back to zero."""
        self._num_apostrophes = 0
        self._num_characters = 0

    def count_characters(self, text_segment: TextSegment) -> None:
        """Add the segment's length to the character total."""
        self._num_characters = self._num_characters + text_segment.length

    def add_apostrophe(self) -> None:
        self._num_apostrophes = self._num_apostrophes + 1

    def is_apostrophe_proportion_greater_than(self, threshold: float) -> bool:
        """Return True when apostrophes per character strictly exceed ``threshold``.

        Always False when no characters have been counted.
        """
        if self._num_characters != 0:
            proportion = self._num_apostrophes / self._num_characters
            return proportion > threshold
        return False
class QuotationMarkWordPositions:
    """Counts, per mark, how often it occurs word-initially, mid-word and word-finally."""

    _MAXIMUM_PROPORTION_FOR_RARITY = 0.1
    _MAXIMUM_PROPORTION_DIFFERENCE_THRESHOLD = 0.3

    def __init__(self):
        self.reset()

    def reset(self) -> None:
        """Drop all counts."""
        self._word_initial_occurrences: Counter[str] = Counter()
        self._mid_word_occurrences: Counter[str] = Counter()
        self._word_final_occurrences: Counter[str] = Counter()
        self._total_occurrences: Counter[str] = Counter()

    def count_word_initial_apostrophe(self, quotation_mark: str) -> None:
        self._word_initial_occurrences[quotation_mark] += 1
        self._total_occurrences[quotation_mark] += 1

    def count_mid_word_apostrophe(self, quotation_mark: str) -> None:
        self._mid_word_occurrences[quotation_mark] += 1
        self._total_occurrences[quotation_mark] += 1

    def count_word_final_apostrophe(self, quotation_mark: str) -> None:
        self._word_final_occurrences[quotation_mark] += 1
        self._total_occurrences[quotation_mark] += 1

    def _get_total_occurrences(self, quotation_mark: str) -> int:
        """Sum the three positional counts for the mark."""
        positional_counters = (
            self._word_initial_occurrences,
            self._mid_word_occurrences,
            self._word_final_occurrences,
        )
        return sum(counter[quotation_mark] for counter in positional_counters)

    def is_mark_rarely_initial(self, quotation_mark: str) -> bool:
        """Return True when under 10% of the mark's occurrences are word-initial."""
        total = self._total_occurrences[quotation_mark]
        if total <= 0:
            return False
        return self._word_initial_occurrences[quotation_mark] / total < self._MAXIMUM_PROPORTION_FOR_RARITY

    def is_mark_rarely_final(self, quotation_mark: str) -> bool:
        """Return True when under 10% of the mark's occurrences are word-final."""
        total = self._total_occurrences[quotation_mark]
        if total <= 0:
            return False
        return self._word_final_occurrences[quotation_mark] / total < self._MAXIMUM_PROPORTION_FOR_RARITY

    def are_initial_and_final_rates_similar(self, quotation_mark: str) -> bool:
        """Return True when the initial and final counts differ by less than 30% of the total."""
        total = self._total_occurrences[quotation_mark]
        if total <= 0:
            return False
        difference = abs(
            self._word_initial_occurrences[quotation_mark] - self._word_final_occurrences[quotation_mark]
        )
        return difference / total < self._MAXIMUM_PROPORTION_DIFFERENCE_THRESHOLD

    def is_mark_commonly_mid_word(self, quotation_mark: str) -> bool:
        """Return True when more than 30% of the mark's occurrences are mid-word."""
        total = self._total_occurrences[quotation_mark]
        if total <= 0:
            return False
        # Reuses the difference threshold as the "common" cut-off.
        return self._mid_word_occurrences[quotation_mark] / total > self._MAXIMUM_PROPORTION_DIFFERENCE_THRESHOLD
class QuotationMarkSequences:
    """Counts how often each mark appears as the earlier or the later mark of a pair."""

    _SOLE_OCCURRENCE_MINIMUM_COUNT = 5
    _MUCH_MORE_COMMON_MINIMUM_RATIO = 10
    _MAXIMUM_PROPORTION_DIFFERENCE_THRESHOLD = 0.2

    def __init__(self):
        self.reset()

    def reset(self) -> None:
        """Drop all counts."""
        self._earlier_quotation_mark_counts: Counter[str] = Counter()
        self._later_quotation_mark_counts: Counter[str] = Counter()

    def count_earlier_quotation_mark(self, quotation_mark: str) -> None:
        self._earlier_quotation_mark_counts[quotation_mark] += 1

    def count_later_quotation_mark(self, quotation_mark: str) -> None:
        self._later_quotation_mark_counts[quotation_mark] += 1

    def is_mark_much_more_common_earlier(self, quotation_mark: str) -> bool:
        """Return True when the mark is seen overwhelmingly in the earlier position."""
        early = self._earlier_quotation_mark_counts[quotation_mark]
        late = self._later_quotation_mark_counts[quotation_mark]
        # NOTE(review): with late == 0 the ratio test alone already passes for
        # any early > 0, so the sole-occurrence minimum never decides the result.
        only_seen_early = late == 0 and early > self._SOLE_OCCURRENCE_MINIMUM_COUNT
        dominates_early = early > late * self._MUCH_MORE_COMMON_MINIMUM_RATIO
        return only_seen_early or dominates_early

    def is_mark_much_more_common_later(self, quotation_mark: str) -> bool:
        """Return True when the mark is seen overwhelmingly in the later position."""
        early = self._earlier_quotation_mark_counts[quotation_mark]
        late = self._later_quotation_mark_counts[quotation_mark]
        only_seen_late = early == 0 and late > self._SOLE_OCCURRENCE_MINIMUM_COUNT
        dominates_late = late > early * self._MUCH_MORE_COMMON_MINIMUM_RATIO
        return only_seen_late or dominates_late

    def are_early_and_late_mark_rates_similar(self, quotation_mark: str) -> bool:
        """Return True when early and late counts differ by less than 20% of their sum.

        Always False when the mark was never counted in the earlier position.
        """
        early = self._earlier_quotation_mark_counts[quotation_mark]
        late = self._later_quotation_mark_counts[quotation_mark]
        if early <= 0:
            return False
        return abs(late - early) / (early + late) < self._MAXIMUM_PROPORTION_DIFFERENCE_THRESHOLD
self._earlier_quotation_mark_counts[quotation_mark] + num_late_occurrences: int = self._later_quotation_mark_counts[quotation_mark] + return ( + num_early_occurrences > 0 + and abs(num_late_occurrences - num_early_occurrences) / (num_early_occurrences + num_late_occurrences) + < self._MAXIMUM_PROPORTION_DIFFERENCE_THRESHOLD + ) + + +class QuotationMarkGrouper: + def __init__(self, quotation_marks: List[QuotationMarkStringMatch], quote_conventions: QuoteConventionSet): + self._quote_conventions = quote_conventions + self._group_quotation_marks(quotation_marks) + + def _group_quotation_marks(self, quotation_marks: List[QuotationMarkStringMatch]) -> None: + self._grouped_quotation_marks: Dict[str, List[QuotationMarkStringMatch]] = defaultdict(list) + for quotation_mark_match in quotation_marks: + self._grouped_quotation_marks[quotation_mark_match.quotation_mark].append(quotation_mark_match) + + def get_quotation_mark_pairs(self) -> Generator[Tuple[str, str], None, None]: + for mark1, matches1 in self._grouped_quotation_marks.items(): + # Handle cases of identical opening/closing marks + if ( + len(matches1) == 2 + and self._quote_conventions.is_quotation_mark_direction_ambiguous(mark1) + and not self.has_distinct_paired_quotation_mark(mark1) + ): + yield (mark1, mark1) + continue + + # Skip verses where quotation mark pairs are ambiguous + if len(matches1) > 1: + continue + + # Find matching closing marks + for mark2, matches2 in self._grouped_quotation_marks.items(): + if ( + len(matches2) == 1 + and self._quote_conventions.marks_are_a_valid_pair(mark1, mark2) + and matches1[0].precedes(matches2[0]) + ): + yield (mark1, mark2) + + def has_distinct_paired_quotation_mark(self, quotation_mark: str) -> bool: + return any( + [ + mark != quotation_mark and mark in self._grouped_quotation_marks + for mark in self._quote_conventions.get_possible_paired_quotation_marks(quotation_mark) + ] + ) + + +class PreliminaryApostropheAnalyzer: + _APOSTROPHE_PATTERN = 
regex.compile(r"[\'\u2019]", regex.U) + _MAXIMUM_APOSTROPHE_PROPORTION = 0.02 + + def __init__(self): + self._apostrophe_proportion_statistics = ApostropheProportionStatistics() + self._word_position_statistics = QuotationMarkWordPositions() + self.reset() + + def reset(self) -> None: + self._apostrophe_proportion_statistics.reset() + self._word_position_statistics.reset() + + def process_quotation_marks( + self, text_segments: List[TextSegment], quotation_marks: List[QuotationMarkStringMatch] + ) -> None: + for text_segment in text_segments: + self._apostrophe_proportion_statistics.count_characters(text_segment) + for quotation_mark_match in quotation_marks: + self._process_quotation_mark(quotation_mark_match) + + def _process_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> None: + if quotation_mark_match.quotation_mark_matches(self._APOSTROPHE_PATTERN): + self._count_apostrophe(quotation_mark_match) + + def _count_apostrophe(self, apostrophe_match: QuotationMarkStringMatch) -> None: + apostrophe: str = apostrophe_match.quotation_mark + self._apostrophe_proportion_statistics.add_apostrophe() + if self._is_match_word_initial(apostrophe_match): + self._word_position_statistics.count_word_initial_apostrophe(apostrophe) + elif self._is_match_mid_word(apostrophe_match): + self._word_position_statistics.count_mid_word_apostrophe(apostrophe) + elif self._is_match_word_final(apostrophe_match): + self._word_position_statistics.count_word_final_apostrophe(apostrophe) + + def _is_match_word_initial(self, apostrophe_match: QuotationMarkStringMatch) -> bool: + if apostrophe_match.has_trailing_whitespace(): + return False + if not apostrophe_match.is_at_start_of_segment() and not apostrophe_match.has_leading_whitespace(): + return False + return True + + def _is_match_mid_word(self, apostrophe_match: QuotationMarkStringMatch) -> bool: + if apostrophe_match.has_trailing_whitespace(): + return False + if apostrophe_match.has_leading_whitespace(): + return 
False + return True + + def _is_match_word_final(self, apostrophe_match: QuotationMarkStringMatch) -> bool: + if not apostrophe_match.is_at_end_of_segment() and not apostrophe_match.has_trailing_whitespace(): + return False + if apostrophe_match.has_leading_whitespace(): + return False + return True + + def is_apostrophe_only(self, mark: str) -> bool: + if not self._APOSTROPHE_PATTERN.search(mark): + return False + + if self._word_position_statistics.is_mark_rarely_initial( + mark + ) or self._word_position_statistics.is_mark_rarely_final(mark): + return True + + if self._word_position_statistics.are_initial_and_final_rates_similar( + mark + ) and self._word_position_statistics.is_mark_commonly_mid_word(mark): + return True + + if self._apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than( + self._MAXIMUM_APOSTROPHE_PROPORTION + ): + return True + + return False + + +class PreliminaryQuotationMarkAnalyzer: + + def __init__(self, quote_conventions: QuoteConventionSet): + self._quote_conventions = quote_conventions + self._apostrophe_analyzer = PreliminaryApostropheAnalyzer() + self._quotation_mark_sequences = QuotationMarkSequences() + self.reset() + + def reset(self) -> None: + self._apostrophe_analyzer.reset() + self._quotation_mark_sequences.reset() + + def narrow_down_possible_quote_conventions(self, chapters: List[Chapter]) -> QuoteConventionSet: + for chapter in chapters: + self._analyze_quotation_marks_for_chapter(chapter) + return self._select_compatible_quote_conventions() + + def _analyze_quotation_marks_for_chapter(self, chapter: Chapter) -> None: + for verse in chapter.verses: + self._analyze_quotation_marks_for_verse(verse) + + def _analyze_quotation_marks_for_verse(self, verse: Verse) -> None: + quotation_marks: List[QuotationMarkStringMatch] = QuotationMarkFinder( + self._quote_conventions + ).find_all_potential_quotation_marks_in_verse(verse) + self._analyze_quotation_mark_sequence(quotation_marks) + 
self._apostrophe_analyzer.process_quotation_marks(verse.text_segments, quotation_marks) + + def _analyze_quotation_mark_sequence(self, quotation_marks: List[QuotationMarkStringMatch]) -> None: + quotation_mark_grouper: QuotationMarkGrouper = QuotationMarkGrouper(quotation_marks, self._quote_conventions) + for earlier_mark, later_mark in quotation_mark_grouper.get_quotation_mark_pairs(): + self._quotation_mark_sequences.count_earlier_quotation_mark(earlier_mark) + self._quotation_mark_sequences.count_later_quotation_mark(later_mark) + + def _select_compatible_quote_conventions(self) -> QuoteConventionSet: + opening_quotation_marks = self._find_opening_quotation_marks() + closing_quotation_marks = self._find_closing_quotation_marks() + + return self._quote_conventions.filter_to_compatible_quote_conventions( + opening_quotation_marks, closing_quotation_marks + ) + + def _find_opening_quotation_marks(self) -> List[str]: + return [ + quotation_mark + for quotation_mark in self._quote_conventions.get_possible_opening_marks() + if self._is_opening_quotation_mark(quotation_mark) + ] + + def _is_opening_quotation_mark(self, quotation_mark: str) -> bool: + if self._apostrophe_analyzer.is_apostrophe_only(quotation_mark): + return False + + if self._quotation_mark_sequences.is_mark_much_more_common_earlier(quotation_mark): + return True + if self._quotation_mark_sequences.are_early_and_late_mark_rates_similar( + quotation_mark + ) and self._quote_conventions.is_quotation_mark_direction_ambiguous(quotation_mark): + return True + return False + + def _find_closing_quotation_marks(self) -> List[str]: + return [ + quotation_mark + for quotation_mark in self._quote_conventions.get_possible_closing_marks() + if self._is_closing_quotation_mark(quotation_mark) + ] + + def _is_closing_quotation_mark(self, quotation_mark: str) -> bool: + if self._apostrophe_analyzer.is_apostrophe_only(quotation_mark): + return False + + if 
self._quotation_mark_sequences.is_mark_much_more_common_later(quotation_mark): + return True + + if self._quotation_mark_sequences.are_early_and_late_mark_rates_similar( + quotation_mark + ) and self._quote_conventions.is_quotation_mark_direction_ambiguous(quotation_mark): + return True + return False diff --git a/machine/corpora/punctuation_analysis/quotation_mark_direction.py b/machine/corpora/punctuation_analysis/quotation_mark_direction.py new file mode 100644 index 00000000..87734ae3 --- /dev/null +++ b/machine/corpora/punctuation_analysis/quotation_mark_direction.py @@ -0,0 +1,6 @@ +from enum import Enum, auto + + +class QuotationMarkDirection(Enum): + OPENING = auto() + CLOSING = auto() diff --git a/machine/corpora/punctuation_analysis/quotation_mark_finder.py b/machine/corpora/punctuation_analysis/quotation_mark_finder.py new file mode 100644 index 00000000..73c95368 --- /dev/null +++ b/machine/corpora/punctuation_analysis/quotation_mark_finder.py @@ -0,0 +1,46 @@ +from typing import List + +import regex + +from .chapter import Chapter +from .quotation_mark_string_match import QuotationMarkStringMatch +from .quote_convention_set import QuoteConventionSet +from .text_segment import TextSegment +from .verse import Verse + + +class QuotationMarkFinder: + _QUOTATION_MARK_PATTERN = regex.compile(r"(\p{Quotation_Mark}|<<|>>|<|>)", regex.U) + + def __init__(self, quote_conventions: QuoteConventionSet): + self._quote_conventions = quote_conventions + + def find_all_potential_quotation_marks_in_chapter(self, chapter: Chapter) -> List[QuotationMarkStringMatch]: + quotation_matches: List[QuotationMarkStringMatch] = [] + for verse in chapter.verses: + quotation_matches.extend(self.find_all_potential_quotation_marks_in_verse(verse)) + return quotation_matches + + def find_all_potential_quotation_marks_in_verse(self, verse: Verse) -> List[QuotationMarkStringMatch]: + return self.find_all_potential_quotation_marks_in_text_segments(verse.text_segments) + + def 
find_all_potential_quotation_marks_in_text_segments( + self, text_segments: List[TextSegment] + ) -> list[QuotationMarkStringMatch]: + quotation_matches: List[QuotationMarkStringMatch] = [] + for text_segment in text_segments: + quotation_matches.extend(self.find_all_potential_quotation_marks_in_text_segment(text_segment)) + return quotation_matches + + def find_all_potential_quotation_marks_in_text_segment( + self, text_segment: TextSegment + ) -> List[QuotationMarkStringMatch]: + quotation_matches: List[QuotationMarkStringMatch] = [] + for quotation_mark_match in self._QUOTATION_MARK_PATTERN.finditer(text_segment.text): + if self._quote_conventions.is_valid_opening_quotation_mark( + quotation_mark_match.group() + ) or self._quote_conventions.is_valid_closing_quotation_mark(quotation_mark_match.group()): + quotation_matches.append( + QuotationMarkStringMatch(text_segment, quotation_mark_match.start(), quotation_mark_match.end()) + ) + return quotation_matches diff --git a/machine/corpora/punctuation_analysis/quotation_mark_metadata.py b/machine/corpora/punctuation_analysis/quotation_mark_metadata.py new file mode 100644 index 00000000..efc9bc29 --- /dev/null +++ b/machine/corpora/punctuation_analysis/quotation_mark_metadata.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass + +from .quotation_mark_direction import QuotationMarkDirection +from .quote_convention import QuoteConvention +from .text_segment import TextSegment + + +@dataclass +class QuotationMarkMetadata: + + quotation_mark: str + depth: int + direction: QuotationMarkDirection + text_segment: TextSegment + start_index: int + end_index: int + + @property + def length(self) -> int: + return self.end_index - self.start_index + + def shift_indices(self, shift_amount: int) -> None: + self.start_index += shift_amount + self.end_index += shift_amount + + def update_quotation_mark(self, quote_convention: QuoteConvention) -> None: + updated_quotation_mark = 
quote_convention.get_expected_quotation_mark(self.depth, self.direction) + if updated_quotation_mark == self.quotation_mark: + return + + self.text_segment.replace_substring( + self.start_index, + self.end_index, + updated_quotation_mark, + ) + + if len(updated_quotation_mark) != len(self.quotation_mark): + self.end_index += len(updated_quotation_mark) - len(self.quotation_mark) diff --git a/machine/corpora/punctuation_analysis/quotation_mark_resolution_issue.py b/machine/corpora/punctuation_analysis/quotation_mark_resolution_issue.py new file mode 100644 index 00000000..4022722c --- /dev/null +++ b/machine/corpora/punctuation_analysis/quotation_mark_resolution_issue.py @@ -0,0 +1,9 @@ +from enum import Enum, auto + + +class QuotationMarkResolutionIssue(Enum): + UNPAIRED_QUOTATION_MARK = auto() + TOO_DEEP_NESTING = auto() + INCOMPATIBLE_QUOTATION_MARK = auto() + AMBIGUOUS_QUOTATION_MARK = auto() + UNEXPECTED_QUOTATION_MARK = auto() diff --git a/machine/corpora/punctuation_analysis/quotation_mark_resolution_settings.py b/machine/corpora/punctuation_analysis/quotation_mark_resolution_settings.py new file mode 100644 index 00000000..f636ae6d --- /dev/null +++ b/machine/corpora/punctuation_analysis/quotation_mark_resolution_settings.py @@ -0,0 +1,39 @@ +from abc import ABC, abstractmethod +from typing import Set + +import regex + +from .quotation_mark_direction import QuotationMarkDirection +from .quotation_mark_string_match import QuotationMarkStringMatch + + +class QuotationMarkResolutionSettings(ABC): + + @abstractmethod + def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: ... + + @abstractmethod + def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: ... + + @property + @abstractmethod + def opening_quotation_mark_regex(self) -> regex.Pattern: ... + + @property + @abstractmethod + def closing_quotation_mark_regex(self) -> regex.Pattern: ... 
+ + @abstractmethod + def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: ... + + @property + @abstractmethod + def should_rely_on_paragraph_markers(self) -> bool: ... + + @abstractmethod + def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: ... + + @abstractmethod + def metadata_matches_quotation_mark( + self, quotation_mark: str, depth: int, direction: QuotationMarkDirection + ) -> bool: ... diff --git a/machine/corpora/punctuation_analysis/quotation_mark_resolver.py b/machine/corpora/punctuation_analysis/quotation_mark_resolver.py new file mode 100644 index 00000000..3e9097f5 --- /dev/null +++ b/machine/corpora/punctuation_analysis/quotation_mark_resolver.py @@ -0,0 +1,20 @@ +from abc import ABC, abstractmethod +from typing import Generator, List, Set + +from .quotation_mark_metadata import QuotationMarkMetadata +from .quotation_mark_resolution_issue import QuotationMarkResolutionIssue +from .quotation_mark_string_match import QuotationMarkStringMatch + + +class QuotationMarkResolver(ABC): + + @abstractmethod + def resolve_quotation_marks( + self, quotation_mark_matches: List[QuotationMarkStringMatch] + ) -> Generator[QuotationMarkMetadata, None, None]: ... + + @abstractmethod + def reset(self) -> None: ... + + @abstractmethod + def get_issues(self) -> Set[QuotationMarkResolutionIssue]: ... 
diff --git a/machine/corpora/punctuation_analysis/quotation_mark_string_match.py b/machine/corpora/punctuation_analysis/quotation_mark_string_match.py new file mode 100644 index 00000000..573e37c7 --- /dev/null +++ b/machine/corpora/punctuation_analysis/quotation_mark_string_match.py @@ -0,0 +1,150 @@ +from re import Pattern +from typing import Optional + +import regex + +from .quotation_mark_direction import QuotationMarkDirection +from .quotation_mark_metadata import QuotationMarkMetadata +from .quote_convention_set import QuoteConventionSet +from .text_segment import TextSegment +from .usfm_marker_type import UsfmMarkerType + + +class QuotationMarkStringMatch: + + # Extra stuff in the regex to handle Western Cham + _LETTER_PATTERN: Pattern = regex.compile(r"[\p{L}\U0001E200-\U0001E28F]", regex.U) + _LATIN_LETTER_PATTERN: Pattern = regex.compile(r"^\p{script=Latin}$", regex.U) + _WHITESPACE_PATTERN: Pattern = regex.compile(r"[\s~]", regex.U) + _PUNCTUATION_PATTERN: Pattern = regex.compile(r"[\.,;\?!\)\]\-—۔،؛]", regex.U) + _QUOTE_INTRODUCER_PATTERN: Pattern = regex.compile(r"[:,]\s*$", regex.U) + + def __init__(self, text_segment: TextSegment, start_index: int, end_index: int): + self._text_segment = text_segment + self._start_index = start_index + self._end_index = end_index + + def __eq__(self, value): + if not isinstance(value, QuotationMarkStringMatch): + return False + return ( + self._text_segment == value._text_segment + and self._start_index == value._start_index + and self._end_index == value._end_index + ) + + @property + def quotation_mark(self) -> str: + return self._text_segment.text[self._start_index : self._end_index] + + def is_valid_opening_quotation_mark(self, quote_conventions: QuoteConventionSet) -> bool: + return quote_conventions.is_valid_opening_quotation_mark(self.quotation_mark) + + def is_valid_closing_quotation_mark(self, quote_conventions: QuoteConventionSet) -> bool: + return 
quote_conventions.is_valid_closing_quotation_mark(self.quotation_mark) + + def quotation_mark_matches(self, regex_pattern: regex.Pattern) -> bool: + return regex_pattern.search(self.quotation_mark) is not None + + def next_character_matches(self, regex_pattern: regex.Pattern) -> bool: + return self.next_character is not None and regex_pattern.search(self.next_character) is not None + + def previous_character_matches(self, regex_pattern: regex.Pattern) -> bool: + return self.previous_character is not None and regex_pattern.search(self.previous_character) is not None + + @property + def previous_character(self) -> Optional[str]: + if self.is_at_start_of_segment(): + previous_segment = self._text_segment.previous_segment + if previous_segment is not None and not self._text_segment.marker_is_in_preceding_context( + UsfmMarkerType.PARAGRAPH + ): + return previous_segment.text[-1] + return None + return self._text_segment.text[self._start_index - 1] + + @property + def next_character(self) -> Optional[str]: + if self.is_at_end_of_segment(): + next_segment = self._text_segment.next_segment + if next_segment is not None and not next_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH): + return next_segment.text[0] + return None + return self._text_segment.text[self._end_index] + + def leading_substring_matches(self, regex_pattern: regex.Pattern) -> bool: + return regex_pattern.search(self._text_segment.substring_before(self._start_index)) is not None + + def trailing_substring_matches(self, regex_pattern: regex.Pattern) -> bool: + return regex_pattern.search(self._text_segment.substring_after(self._end_index)) is not None + + # This assumes that the two matches occur in the same verse + def precedes(self, other: "QuotationMarkStringMatch") -> bool: + return self._text_segment.index_in_verse < other._text_segment.index_in_verse or ( + self._text_segment.index_in_verse == other._text_segment.index_in_verse + and self._start_index < other._start_index + ) + + 
@property + def text_segment(self) -> TextSegment: + return self._text_segment + + @property + def start_index(self) -> int: + return self._start_index + + @property + def end_index(self) -> int: + return self._end_index + + # Not used, but a useful method for debugging + @property + def context(self) -> str: + return self._text_segment.text[ + max(self._start_index - 10, 0) : min(self._end_index + 10, len(self._text_segment.text)) + ] + + def resolve(self, depth: int, direction: QuotationMarkDirection) -> QuotationMarkMetadata: + return QuotationMarkMetadata( + self.quotation_mark, depth, direction, self._text_segment, self._start_index, self._end_index + ) + + def is_at_start_of_segment(self) -> bool: + return self._start_index == 0 + + def is_at_end_of_segment(self) -> bool: + return self._end_index == self._text_segment.length + + def has_leading_whitespace(self) -> bool: + if self.previous_character is None: + return ( + self._text_segment.marker_is_in_preceding_context(UsfmMarkerType.PARAGRAPH) + or self._text_segment.marker_is_in_preceding_context(UsfmMarkerType.EMBED) + or self._text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE) + ) + + return self.previous_character_matches(self._WHITESPACE_PATTERN) + + def has_trailing_whitespace(self) -> bool: + return self.next_character_matches(self._WHITESPACE_PATTERN) + + def has_leading_punctuation(self) -> bool: + return self.previous_character_matches(self._PUNCTUATION_PATTERN) + + def has_trailing_punctuation(self) -> bool: + return self.next_character_matches(self._PUNCTUATION_PATTERN) + + def has_letter_in_leading_substring(self) -> bool: + return self.leading_substring_matches(self._LETTER_PATTERN) + + def has_letter_in_trailing_substring(self) -> bool: + return self.trailing_substring_matches(self._LETTER_PATTERN) + + def has_leading_latin_letter(self) -> bool: + return self.previous_character_matches(self._LATIN_LETTER_PATTERN) + + def has_trailing_latin_letter(self) -> bool: + return 
self.next_character_matches(self._LATIN_LETTER_PATTERN) + + def has_quote_introducer_in_leading_substring(self) -> bool: + return self.leading_substring_matches(self._QUOTE_INTRODUCER_PATTERN) diff --git a/machine/corpora/punctuation_analysis/quotation_mark_tabulator.py b/machine/corpora/punctuation_analysis/quotation_mark_tabulator.py new file mode 100644 index 00000000..c76ff540 --- /dev/null +++ b/machine/corpora/punctuation_analysis/quotation_mark_tabulator.py @@ -0,0 +1,100 @@ +from collections import Counter, defaultdict +from typing import List + +from .quotation_mark_direction import QuotationMarkDirection +from .quotation_mark_metadata import QuotationMarkMetadata +from .quote_convention import QuoteConvention + + +class QuotationMarkCounts: + def __init__(self): + self._quotation_mark_counter: Counter[str] = Counter() + self._total_count = 0 + + def count_quotation_mark(self, quotation_mark: str) -> None: + self._quotation_mark_counter.update([quotation_mark]) + self._total_count += 1 + + def find_best_quotation_mark_proportion(self) -> tuple[str, int, int]: + return self._quotation_mark_counter.most_common(1)[0] + (self._total_count,) + + def calculate_num_differences(self, expected_quotation_mark: str) -> int: + return self._total_count - self._quotation_mark_counter[expected_quotation_mark] + + def get_observed_count(self) -> int: + return self._total_count + + +class QuotationMarkTabulator: + + def __init__(self): + self._quotation_counts_by_depth_and_direction: dict[tuple[int, QuotationMarkDirection], QuotationMarkCounts] = ( + defaultdict(QuotationMarkCounts) + ) + + def tabulate(self, quotation_marks: list[QuotationMarkMetadata]) -> None: + for quotation_mark in quotation_marks: + self._count_quotation_mark(quotation_mark) + + def _count_quotation_mark(self, quotation_mark: QuotationMarkMetadata) -> None: + key = (quotation_mark.depth, quotation_mark.direction) + 
self._quotation_counts_by_depth_and_direction[key].count_quotation_mark(quotation_mark.quotation_mark) + + def _depth_and_direction_observed(self, depth: int, direction: QuotationMarkDirection) -> bool: + return (depth, direction) in self._quotation_counts_by_depth_and_direction + + def _find_most_common_quotation_mark_with_depth_and_direction( + self, depth: int, direction: QuotationMarkDirection + ) -> tuple[str, int, int]: + return self._quotation_counts_by_depth_and_direction[(depth, direction)].find_best_quotation_mark_proportion() + + def calculate_similarity(self, quote_convention: QuoteConvention) -> float: + weighted_difference = 0 + total_weight = 0 + for depth, direction in self._quotation_counts_by_depth_and_direction: + expected_quotation_mark: str = quote_convention.get_expected_quotation_mark(depth, direction) + + # Give higher weight to shallower depths, since deeper marks are more likely to be mistakes + weighted_difference += self._quotation_counts_by_depth_and_direction[ + (depth, direction) + ].calculate_num_differences(expected_quotation_mark) * 2 ** (-depth) + total_weight += self._quotation_counts_by_depth_and_direction[ + (depth, direction) + ].get_observed_count() * 2 ** (-depth) + + if total_weight == 0: + return 0 + return 1 - (weighted_difference / total_weight) + + def get_summary_message(self) -> str: + message_lines: List[str] = [] + for depth in range(1, 5): + if self._depth_and_direction_observed( + depth, QuotationMarkDirection.OPENING + ) and self._depth_and_direction_observed(depth, QuotationMarkDirection.CLOSING): + (opening_quotation_mark, observed_opening_count, total_opening_count) = ( + self._find_most_common_quotation_mark_with_depth_and_direction( + depth, QuotationMarkDirection.OPENING + ) + ) + (closing_quotation_mark, observed_closing_count, total_closing_count) = ( + self._find_most_common_quotation_mark_with_depth_and_direction( + depth, QuotationMarkDirection.CLOSING + ) + ) + message_lines.append( + ( + "The most 
common level %i quotation marks are " + + "%s (%i of %i opening marks) and %s (%i of %i closing marks)" + ) + % ( + depth, + opening_quotation_mark, + observed_opening_count, + total_opening_count, + closing_quotation_mark, + observed_closing_count, + total_closing_count, + ) + ) + return "\n".join(message_lines) diff --git a/machine/corpora/punctuation_analysis/quote_convention.py b/machine/corpora/punctuation_analysis/quote_convention.py new file mode 100644 index 00000000..386cd559 --- /dev/null +++ b/machine/corpora/punctuation_analysis/quote_convention.py @@ -0,0 +1,153 @@ +from dataclasses import dataclass +from typing import Dict, Set + +from .quotation_mark_direction import QuotationMarkDirection + +_QUOTATION_MARK_NORMALIZATION_MAP: Dict[str, str] = { + "\u00ab": '"', + "\u00bb": '"', + "\u2018": "'", + "\u2019": "'", + "\u201a": "'", + "\u201c": '"', + "\u201d": '"', + "\u201e": '"', + "\u300a": '"', + "\u300b": '"', + "\u300c": '"', + "\u300d": '"', +} + + +@dataclass(frozen=True) +class SingleLevelQuoteConvention: + opening_quotation_mark: str + closing_quotation_mark: str + + def normalize(self) -> "SingleLevelQuoteConvention": + normalized_opening_quotation_mark = ( + _QUOTATION_MARK_NORMALIZATION_MAP[self.opening_quotation_mark] + if self.opening_quotation_mark in _QUOTATION_MARK_NORMALIZATION_MAP + else self.opening_quotation_mark + ) + normalized_closing_quotation_mark = ( + _QUOTATION_MARK_NORMALIZATION_MAP[self.closing_quotation_mark] + if self.closing_quotation_mark in _QUOTATION_MARK_NORMALIZATION_MAP + else self.closing_quotation_mark + ) + return SingleLevelQuoteConvention(normalized_opening_quotation_mark, normalized_closing_quotation_mark) + + +class QuoteConvention: + def __init__(self, name: str, level_conventions: list[SingleLevelQuoteConvention]): + self._name = name + self.level_conventions = level_conventions + + def __eq__(self, value): + if not isinstance(value, QuoteConvention): + return False + if self._name != value._name: + 
return False + if len(self.level_conventions) != len(value.level_conventions): + return False + for level_convention, other_level_convention in zip(self.level_conventions, value.level_conventions): + if level_convention.opening_quotation_mark != other_level_convention.opening_quotation_mark: + return False + if level_convention.closing_quotation_mark != other_level_convention.closing_quotation_mark: + return False + return True + + @property + def name(self) -> str: + return self._name + + @property + def num_levels(self) -> int: + return len(self.level_conventions) + + def get_opening_quotation_mark_at_depth(self, depth: int) -> str: + return self.level_conventions[depth - 1].opening_quotation_mark + + def get_closing_quotation_mark_at_depth(self, depth: int) -> str: + return self.level_conventions[depth - 1].closing_quotation_mark + + def get_expected_quotation_mark(self, depth: int, direction: QuotationMarkDirection) -> str: + if depth > self.num_levels or depth < 1: + return "" + return ( + self.get_opening_quotation_mark_at_depth(depth) + if direction is QuotationMarkDirection.OPENING + else self.get_closing_quotation_mark_at_depth(depth) + ) + + def _includes_opening_quotation_mark(self, opening_quotation_mark: str) -> bool: + for level_convention in self.level_conventions: + if level_convention.opening_quotation_mark == opening_quotation_mark: + return True + return False + + def _includes_closing_quotation_mark(self, closing_quotation_mark: str) -> bool: + for level_convention in self.level_conventions: + if level_convention.closing_quotation_mark == closing_quotation_mark: + return True + return False + + def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: + depths: Set[int] = set() + for depth, level_convention in enumerate(self.level_conventions, start=1): + if ( + direction is QuotationMarkDirection.OPENING + and level_convention.opening_quotation_mark == quotation_mark + ): + depths.add(depth) + elif ( + 
direction is QuotationMarkDirection.CLOSING + and level_convention.closing_quotation_mark == quotation_mark + ): + depths.add(depth) + return depths + + def is_compatible_with_observed_quotation_marks( + self, opening_quotation_marks: list[str], closing_quotation_marks: list[str] + ) -> bool: + for opening_quotation_mark in opening_quotation_marks: + if not self._includes_opening_quotation_mark(opening_quotation_mark): + return False + for closing_quotation_mark in closing_quotation_marks: + if not self._includes_closing_quotation_mark(closing_quotation_mark): + return False + + # We require the first-level quotation marks to have been observed + if ( + self.get_opening_quotation_mark_at_depth(1) not in opening_quotation_marks + or self.get_closing_quotation_mark_at_depth(1) not in closing_quotation_marks + ): + return False + return True + + def normalize(self) -> "QuoteConvention": + return QuoteConvention( + self.name + "_normalized", [level_convention.normalize() for level_convention in self.level_conventions] + ) + + def __str__(self) -> str: + summary = self.name + "\n" + for depth, level_convention in enumerate(self.level_conventions): + ordinal_name = self._get_ordinal_name(depth + 1) + summary += "%s%s-level quote%s\n" % ( + level_convention.opening_quotation_mark, + ordinal_name, + level_convention.closing_quotation_mark, + ) + return summary + + def _get_ordinal_name(self, depth) -> str: + if depth == 1: + return "First" + if depth == 2: + return "Second" + if depth == 3: + return "Third" + if depth == 4: + return "Fourth" + return str(depth) + "th" diff --git a/machine/corpora/punctuation_analysis/quote_convention_detection_resolution_settings.py b/machine/corpora/punctuation_analysis/quote_convention_detection_resolution_settings.py new file mode 100644 index 00000000..be43806c --- /dev/null +++ b/machine/corpora/punctuation_analysis/quote_convention_detection_resolution_settings.py @@ -0,0 +1,43 @@ +from typing import Set + +import regex + +from 
.quotation_mark_direction import QuotationMarkDirection +from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings +from .quotation_mark_string_match import QuotationMarkStringMatch +from .quote_convention_set import QuoteConventionSet + + +class QuoteConventionDetectionResolutionSettings(QuotationMarkResolutionSettings): + + def __init__(self, quote_conventions: QuoteConventionSet): + self._quote_conventions = quote_conventions + + def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: + return quotation_mark_match.is_valid_opening_quotation_mark(self._quote_conventions) + + def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool: + return quotation_mark_match.is_valid_closing_quotation_mark(self._quote_conventions) + + @property + def opening_quotation_mark_regex(self) -> regex.Pattern: + return self._quote_conventions.opening_quotation_mark_regex + + @property + def closing_quotation_mark_regex(self) -> regex.Pattern: + return self._quote_conventions.closing_quotation_mark_regex + + def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: + return self._quote_conventions.marks_are_a_valid_pair(opening_mark, closing_mark) + + @property + def should_rely_on_paragraph_markers(self): + return True + + def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: + return self._quote_conventions.get_possible_depths(quotation_mark, direction) + + def metadata_matches_quotation_mark( + self, quotation_mark: str, depth: int, direction: QuotationMarkDirection + ) -> bool: + return self._quote_conventions.metadata_matches_quotation_mark(quotation_mark, depth, direction) diff --git a/machine/corpora/punctuation_analysis/quote_convention_detector.py b/machine/corpora/punctuation_analysis/quote_convention_detector.py new file mode 100644 index 00000000..5a8d098d --- /dev/null +++ 
b/machine/corpora/punctuation_analysis/quote_convention_detector.py @@ -0,0 +1,65 @@ +from dataclasses import dataclass +from typing import List, Optional + +from .chapter import Chapter +from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver +from .preliminary_quotation_mark_analyzer import PreliminaryQuotationMarkAnalyzer +from .quotation_mark_finder import QuotationMarkFinder +from .quotation_mark_metadata import QuotationMarkMetadata +from .quotation_mark_string_match import QuotationMarkStringMatch +from .quotation_mark_tabulator import QuotationMarkTabulator +from .quote_convention import QuoteConvention +from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings +from .quote_convention_set import QuoteConventionSet +from .standard_quote_conventions import STANDARD_QUOTE_CONVENTIONS +from .usfm_structure_extractor import UsfmStructureExtractor + + +@dataclass(frozen=True) +class QuoteConventionAnalysis: + best_quote_convention: QuoteConvention + best_quote_convention_score: float + analysis_summary: str + + +class QuoteConventionDetector(UsfmStructureExtractor): + + def __init__(self): + super().__init__() + self._quotation_mark_tabulator = QuotationMarkTabulator() + + def _count_quotation_marks_in_chapters(self, chapters: list[Chapter]) -> None: + possible_quote_conventions: QuoteConventionSet = PreliminaryQuotationMarkAnalyzer( + STANDARD_QUOTE_CONVENTIONS + ).narrow_down_possible_quote_conventions(chapters) + + for chapter in chapters: + self._count_quotation_marks_in_chapter(chapter, possible_quote_conventions) + + def _count_quotation_marks_in_chapter( + self, chapter: Chapter, possible_quote_conventions: QuoteConventionSet + ) -> None: + quotation_mark_matches: List[QuotationMarkStringMatch] = QuotationMarkFinder( + possible_quote_conventions + ).find_all_potential_quotation_marks_in_chapter(chapter) + + resolved_quotation_marks: List[QuotationMarkMetadata] = list( + 
DepthBasedQuotationMarkResolver( + QuoteConventionDetectionResolutionSettings(possible_quote_conventions) + ).resolve_quotation_marks(quotation_mark_matches) + ) + + self._quotation_mark_tabulator.tabulate(resolved_quotation_marks) + + def detect_quote_convention(self) -> Optional[QuoteConventionAnalysis]: + self._count_quotation_marks_in_chapters(self.get_chapters()) + + (best_quote_convention, score) = STANDARD_QUOTE_CONVENTIONS.find_most_similar_convention( + self._quotation_mark_tabulator + ) + + if score > 0 and best_quote_convention is not None: + return QuoteConventionAnalysis( + best_quote_convention, score, self._quotation_mark_tabulator.get_summary_message() + ) + return None diff --git a/machine/corpora/punctuation_analysis/quote_convention_set.py b/machine/corpora/punctuation_analysis/quote_convention_set.py new file mode 100644 index 00000000..bef15639 --- /dev/null +++ b/machine/corpora/punctuation_analysis/quote_convention_set.py @@ -0,0 +1,151 @@ +from collections import defaultdict +from re import Pattern +from typing import Dict, List, Optional, Set, Tuple + +import regex + +from .quotation_mark_direction import QuotationMarkDirection +from .quotation_mark_tabulator import QuotationMarkTabulator +from .quote_convention import QuoteConvention + + +class QuoteConventionSet: + def __init__(self, conventions: List[QuoteConvention]): + self._conventions = conventions + self._create_quotation_mark_regexes() + self._create_quotation_mark_pair_map() + + def __eq__(self, other: object) -> bool: + if not isinstance(other, QuoteConventionSet): + return False + return self._conventions == other._conventions + + def _create_quotation_mark_regexes(self) -> None: + self._opening_quotation_mark_regex = regex.compile(r"") + self._closing_quotation_mark_regex = regex.compile(r"") + self._all_quotation_mark_regex = regex.compile(r"") + + opening_quotation_marks: Set[str] = set() + closing_quotation_marks: Set[str] = set() + + for convention in self._conventions: + 
for depth in range(1, convention.num_levels + 1): + opening_quotation_mark = convention.get_opening_quotation_mark_at_depth(depth) + closing_quotation_mark = convention.get_closing_quotation_mark_at_depth(depth) + opening_quotation_marks.add(opening_quotation_mark) + closing_quotation_marks.add(closing_quotation_mark) + + all_quotation_marks = opening_quotation_marks.union(closing_quotation_marks) + + if len(all_quotation_marks) > 0: + self._opening_quotation_mark_regex: Pattern = regex.compile( + r"[" + "".join(sorted(list(opening_quotation_marks))) + "]" + ) + self._closing_quotation_mark_regex: Pattern = regex.compile( + r"[" + "".join(sorted(list(closing_quotation_marks))) + "]" + ) + self._all_quotation_mark_regex: Pattern = regex.compile( + r"[" + "".join(sorted(list(all_quotation_marks))) + "]" + ) + + def _create_quotation_mark_pair_map(self) -> None: + self.closing_marks_by_opening_mark: Dict[str, set[str]] = defaultdict(set) + self.opening_marks_by_closing_mark: Dict[str, set[str]] = defaultdict(set) + for convention in self._conventions: + for depth in range(1, convention.num_levels + 1): + opening_quotation_mark = convention.get_opening_quotation_mark_at_depth(depth) + closing_quotation_mark = convention.get_closing_quotation_mark_at_depth(depth) + self.closing_marks_by_opening_mark[opening_quotation_mark].add(closing_quotation_mark) + self.opening_marks_by_closing_mark[closing_quotation_mark].add(opening_quotation_mark) + + @property + def opening_quotation_mark_regex(self) -> Pattern: + return self._opening_quotation_mark_regex + + @property + def closing_quotation_mark_regex(self) -> Pattern: + return self._closing_quotation_mark_regex + + @property + def quotation_mark_regex(self) -> Pattern: + return self._all_quotation_mark_regex + + def get_quote_convention_by_name(self, name: str) -> Optional[QuoteConvention]: + for convention in self._conventions: + if convention.name == name: + return convention + return None + + def 
get_all_quote_convention_names(self) -> List[str]: + return sorted([qc._name for qc in self._conventions]) + + def get_possible_opening_marks(self) -> list[str]: + return sorted(list(self.closing_marks_by_opening_mark.keys())) + + def get_possible_closing_marks(self) -> list[str]: + return sorted(list(self.opening_marks_by_closing_mark.keys())) + + def is_valid_opening_quotation_mark(self, quotation_mark: str) -> bool: + return quotation_mark in self.closing_marks_by_opening_mark + + def is_valid_closing_quotation_mark(self, quotation_mark: str) -> bool: + return quotation_mark in self.opening_marks_by_closing_mark + + def marks_are_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool: + return (opening_mark in self.closing_marks_by_opening_mark) and ( + closing_mark in self.closing_marks_by_opening_mark[opening_mark] + ) + + def is_quotation_mark_direction_ambiguous(self, quotation_mark: str) -> bool: + return ( + quotation_mark in self.closing_marks_by_opening_mark + and quotation_mark in self.closing_marks_by_opening_mark[quotation_mark] + ) + + def get_possible_paired_quotation_marks(self, quotation_mark: str) -> Set[str]: + paired_quotation_marks: Set[str] = set() + if quotation_mark in self.closing_marks_by_opening_mark: + paired_quotation_marks.update(self.closing_marks_by_opening_mark[quotation_mark]) + if quotation_mark in self.opening_marks_by_closing_mark: + paired_quotation_marks.update(self.opening_marks_by_closing_mark[quotation_mark]) + return paired_quotation_marks + + def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]: + depths: Set[int] = set() + for convention in self._conventions: + depths.update(convention.get_possible_depths(quotation_mark, direction)) + return depths + + def metadata_matches_quotation_mark( + self, quotation_mark: str, depth: int, direction: QuotationMarkDirection + ) -> bool: + for convention in self._conventions: + if convention.get_expected_quotation_mark(depth, 
direction) == quotation_mark: + return True + return False + + def filter_to_compatible_quote_conventions( + self, opening_quotation_marks: list[str], closing_quotation_marks: list[str] + ) -> "QuoteConventionSet": + return QuoteConventionSet( + [ + convention + for convention in self._conventions + if convention.is_compatible_with_observed_quotation_marks( + opening_quotation_marks, closing_quotation_marks + ) + ] + ) + + def find_most_similar_convention( + self, tabulated_quotation_marks: QuotationMarkTabulator + ) -> Tuple[Optional[QuoteConvention], float]: + best_similarity: float = float("-inf") + best_quote_convention: Optional[QuoteConvention] = None + for quote_convention in self._conventions: + similarity = tabulated_quotation_marks.calculate_similarity(quote_convention) + if similarity > best_similarity: + best_similarity = similarity + best_quote_convention = quote_convention + + return (best_quote_convention, best_similarity) diff --git a/machine/corpora/punctuation_analysis/standard_quote_conventions.py b/machine/corpora/punctuation_analysis/standard_quote_conventions.py new file mode 100644 index 00000000..b1292e15 --- /dev/null +++ b/machine/corpora/punctuation_analysis/standard_quote_conventions.py @@ -0,0 +1,193 @@ +from .quote_convention import QuoteConvention, SingleLevelQuoteConvention +from .quote_convention_set import QuoteConventionSet + +STANDARD_QUOTE_CONVENTIONS: QuoteConventionSet = QuoteConventionSet( + [ + QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ), + QuoteConvention( + "typewriter_english", + [ + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ), + QuoteConvention( + "british_english", + [ + 
SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + ], + ), + QuoteConvention( + "british_typewriter_english", + [ + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + ], + ), + QuoteConvention( + "hybrid_typewriter_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + ], + ), + QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ), + QuoteConvention( + "typewriter_french", + [ + SingleLevelQuoteConvention("<<", ">>"), + SingleLevelQuoteConvention("<", ">"), + SingleLevelQuoteConvention("<<", ">>"), + SingleLevelQuoteConvention("<", ">"), + ], + ), + QuoteConvention( + "french_variant", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ), + QuoteConvention( + "western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ), + QuoteConvention( + "british_inspired_western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + ], + ), + QuoteConvention( + "typewriter_western_european", + [ + SingleLevelQuoteConvention("<<", ">>"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ), + QuoteConvention( + 
"typewriter_western_european_variant", + [ + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("<", ">"), + SingleLevelQuoteConvention("'", "'"), + ], + ), + QuoteConvention( + "hybrid_typewriter_western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ), + QuoteConvention( + "hybrid_british_typewriter_western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + ], + ), + QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ), + QuoteConvention( + "central_european_guillemets", + [ + SingleLevelQuoteConvention("\u00bb", "\u00ab"), + SingleLevelQuoteConvention("\u203a", "\u2039"), + SingleLevelQuoteConvention("\u00bb", "\u00ab"), + SingleLevelQuoteConvention("\u203a", "\u2039"), + ], + ), + QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ), + QuoteConvention( + "standard_finnish", + [ + SingleLevelQuoteConvention("\u00bb", "\u00bb"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ), + QuoteConvention( + "eastern_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201d"), + SingleLevelQuoteConvention("\u201a", "\u2019"), + SingleLevelQuoteConvention("\u201e", "\u201d"), + SingleLevelQuoteConvention("\u201a", "\u2019"), + ], + ), + QuoteConvention( + "standard_russian", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ), + QuoteConvention( + 
"standard_arabic", + [ + SingleLevelQuoteConvention("\u201d", "\u201c"), + SingleLevelQuoteConvention("\u2019", "\u2018"), + SingleLevelQuoteConvention("\u201d", "\u201c"), + SingleLevelQuoteConvention("\u2019", "\u2018"), + ], + ), + QuoteConvention( + "non-standard_arabic", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2019", "\u2018"), + ], + ), + ] +) diff --git a/machine/corpora/punctuation_analysis/text_segment.py b/machine/corpora/punctuation_analysis/text_segment.py new file mode 100644 index 00000000..cae3e387 --- /dev/null +++ b/machine/corpora/punctuation_analysis/text_segment.py @@ -0,0 +1,83 @@ +from typing import Optional, Set + +from ..usfm_token import UsfmToken +from .usfm_marker_type import UsfmMarkerType + + +class TextSegment: + def __init__(self): + self._text = "" + self._immediate_preceding_marker: UsfmMarkerType = UsfmMarkerType.NO_MARKER + self._markers_in_preceding_context: Set[UsfmMarkerType] = set() + self.previous_segment: Optional[TextSegment] = None + self.next_segment: Optional[TextSegment] = None + self.index_in_verse: int = 0 + self.num_segments_in_verse: int = 0 + self._usfm_token: Optional[UsfmToken] = None + + def __eq__(self, value): + if not isinstance(value, TextSegment): + return False + if self._text != value._text: + return False + if self.index_in_verse != value.index_in_verse: + return False + if self.num_segments_in_verse != value.num_segments_in_verse: + return False + if self._usfm_token != value._usfm_token: + return False + if self._immediate_preceding_marker != value._immediate_preceding_marker: + return False + return True + + @property + def text(self) -> str: + return self._text + + @property + def length(self) -> int: + return len(self._text) + + def substring_before(self, index: int) -> str: + return self._text[:index] + + def substring_after(self, index: int) -> str: + return self._text[index:] + + def marker_is_in_preceding_context(self, marker: UsfmMarkerType) -> 
bool: + return marker in self._markers_in_preceding_context + + def is_first_segment_in_verse(self) -> bool: + return self.index_in_verse == 0 + + def is_last_segment_in_verse(self) -> bool: + return self.index_in_verse == self.num_segments_in_verse - 1 + + def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None: + self._text = self.substring_before(start_index) + replacement + self.substring_after(end_index) + if self._usfm_token is not None: + self._usfm_token.text = self._text + + class Builder: + def __init__(self): + self._text_segment = TextSegment() + + def set_previous_segment(self, previous_segment: "TextSegment") -> "TextSegment.Builder": + self._text_segment.previous_segment = previous_segment + return self + + def add_preceding_marker(self, marker: UsfmMarkerType) -> "TextSegment.Builder": + self._text_segment._immediate_preceding_marker = marker + self._text_segment._markers_in_preceding_context.add(marker) + return self + + def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder": + self._text_segment._usfm_token = token + return self + + def set_text(self, text: str) -> "TextSegment.Builder": + self._text_segment._text = text + return self + + def build(self) -> "TextSegment": + return self._text_segment diff --git a/machine/corpora/punctuation_analysis/usfm_marker_type.py b/machine/corpora/punctuation_analysis/usfm_marker_type.py new file mode 100644 index 00000000..ea4349e6 --- /dev/null +++ b/machine/corpora/punctuation_analysis/usfm_marker_type.py @@ -0,0 +1,11 @@ +from enum import Enum, auto + + +class UsfmMarkerType(Enum): + PARAGRAPH = auto() + CHARACTER = auto() + VERSE = auto() + CHAPTER = auto() + EMBED = auto() + OTHER = auto() + NO_MARKER = auto() diff --git a/machine/corpora/punctuation_analysis/usfm_structure_extractor.py b/machine/corpora/punctuation_analysis/usfm_structure_extractor.py new file mode 100644 index 00000000..02b22ce6 --- /dev/null +++ 
b/machine/corpora/punctuation_analysis/usfm_structure_extractor.py @@ -0,0 +1,100 @@ +from typing import Optional, Sequence + +from ..usfm_parser_handler import UsfmParserHandler +from ..usfm_parser_state import UsfmParserState +from ..usfm_token import UsfmAttribute +from .chapter import Chapter +from .text_segment import TextSegment +from .usfm_marker_type import UsfmMarkerType +from .verse import Verse + + +class UsfmStructureExtractor(UsfmParserHandler): + def __init__(self): + self._text_segments: list[TextSegment] = [] + self._next_text_segment_builder: TextSegment.Builder = TextSegment.Builder() + + def chapter( + self, + state: UsfmParserState, + number: str, + marker: str, + alt_number: Optional[str], + pub_number: Optional[str], + ) -> None: + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHAPTER) + + def start_para( + self, + state: UsfmParserState, + marker: str, + unknown: bool, + attributes: Optional[Sequence[UsfmAttribute]], + ) -> None: + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.PARAGRAPH) + + def start_char( + self, + state: UsfmParserState, + marker_without_plus: str, + unknown: bool, + attributes: Optional[Sequence[UsfmAttribute]], + ) -> None: + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHARACTER) + + def end_char( + self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool + ) -> None: + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHARACTER) + + def verse( + self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str] + ) -> None: + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.VERSE) + + def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None: + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EMBED) + + def end_table(self, state: UsfmParserState) -> None: + 
self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EMBED) + + def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None: + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EMBED) + + def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None: + self._next_text_segment_builder.add_preceding_marker(UsfmMarkerType.EMBED) + + def text(self, state: UsfmParserState, text: str) -> None: + if not state.is_verse_text: + return + if len(text) > 0: + self._next_text_segment_builder.set_text(text) + text_segment: TextSegment = self._next_text_segment_builder.build() + # Don't look past verse boundaries, to enable identical functionality in the + # online one-verse-at-a-time (QuotationMarkDenormalizationScriptureUpdateBlockHandler) + # and offline whole-book-at-once settings (QuoteConventionDetector) + if len(self._text_segments) > 0 and not text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE): + self._text_segments[-1].next_segment = text_segment + text_segment.previous_segment = self._text_segments[-1] + self._text_segments.append(text_segment) + self._next_text_segment_builder = TextSegment.Builder() + + def get_chapters(self) -> list[Chapter]: + chapters: list[Chapter] = [] + current_chapter_verses: list[Verse] = [] + current_verse_segments: list[TextSegment] = [] + for text_segment in self._text_segments: + if text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE): + if len(current_verse_segments) > 0: + current_chapter_verses.append(Verse(current_verse_segments)) + current_verse_segments = [] + if text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHAPTER): + if len(current_chapter_verses) > 0: + chapters.append(Chapter(current_chapter_verses)) + current_chapter_verses = [] + current_verse_segments.append(text_segment) + if len(current_verse_segments) > 0: + current_chapter_verses.append(Verse(current_verse_segments)) + if len(current_chapter_verses) > 0: 
+ chapters.append(Chapter(current_chapter_verses)) + return chapters diff --git a/machine/corpora/punctuation_analysis/verse.py b/machine/corpora/punctuation_analysis/verse.py new file mode 100644 index 00000000..9c871421 --- /dev/null +++ b/machine/corpora/punctuation_analysis/verse.py @@ -0,0 +1,16 @@ +from .text_segment import TextSegment + + +class Verse: + def __init__(self, text_segments: list[TextSegment]): + self._text_segments = text_segments + self._index_text_segments() + + def _index_text_segments(self) -> None: + for index, text_segment in enumerate(self._text_segments): + text_segment.index_in_verse = index + text_segment.num_segments_in_verse = len(self._text_segments) + + @property + def text_segments(self) -> list[TextSegment]: + return self._text_segments diff --git a/machine/corpora/quotation_mark_denormalization_first_pass.py b/machine/corpora/quotation_mark_denormalization_first_pass.py new file mode 100644 index 00000000..4460d876 --- /dev/null +++ b/machine/corpora/quotation_mark_denormalization_first_pass.py @@ -0,0 +1,9 @@ +from .punctuation_analysis.quote_convention import QuoteConvention +from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass + + +# This is a convenience class so that users don't have to know to normalize the source quote convention +class QuotationMarkDenormalizationFirstPass(QuotationMarkUpdateFirstPass): + + def __init__(self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention): + super().__init__(source_quote_convention.normalize(), target_quote_convention) diff --git a/machine/corpora/quotation_mark_denormalization_usfm_update_block_handler.py b/machine/corpora/quotation_mark_denormalization_usfm_update_block_handler.py new file mode 100644 index 00000000..baf75718 --- /dev/null +++ b/machine/corpora/quotation_mark_denormalization_usfm_update_block_handler.py @@ -0,0 +1,15 @@ +from .punctuation_analysis.quote_convention import QuoteConvention +from 
.quotation_mark_update_settings import QuotationMarkUpdateSettings +from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler + + +# This is a convenience class so that users don't have to know to normalize the source quote convention +class QuotationMarkDenormalizationUsfmUpdateBlockHandler(QuoteConventionChangingUsfmUpdateBlockHandler): + + def __init__( + self, + source_quote_convention: QuoteConvention, + target_quote_convention: QuoteConvention, + settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(), + ): + super().__init__(source_quote_convention.normalize(), target_quote_convention, settings) diff --git a/machine/corpora/quotation_mark_update_first_pass.py b/machine/corpora/quotation_mark_update_first_pass.py new file mode 100644 index 00000000..b42af15a --- /dev/null +++ b/machine/corpora/quotation_mark_update_first_pass.py @@ -0,0 +1,93 @@ +from typing import Dict, List, Set + +from .punctuation_analysis.chapter import Chapter +from .punctuation_analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver +from .punctuation_analysis.quotation_mark_finder import QuotationMarkFinder +from .punctuation_analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue +from .punctuation_analysis.quotation_mark_resolver import QuotationMarkResolver +from .punctuation_analysis.quotation_mark_string_match import QuotationMarkStringMatch +from .punctuation_analysis.quote_convention import QuoteConvention +from .punctuation_analysis.quote_convention_set import QuoteConventionSet +from .punctuation_analysis.usfm_structure_extractor import UsfmStructureExtractor +from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings +from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy + + +# Determines the best strategy to take for each chapter +class QuotationMarkUpdateFirstPass(UsfmStructureExtractor): + + def 
__init__(self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention): + super().__init__() + self._source_quote_convention: QuoteConvention = source_quote_convention + self._target_quote_convention: QuoteConvention = target_quote_convention + self._quotation_mark_finder: QuotationMarkFinder = QuotationMarkFinder( + QuoteConventionSet([source_quote_convention]) + ) + self._quotation_mark_resolver: QuotationMarkResolver = DepthBasedQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(source_quote_convention) + ) + self._will_fallback_mode_work: bool = self._check_whether_fallback_mode_will_work( + source_quote_convention, target_quote_convention + ) + + def _check_whether_fallback_mode_will_work( + self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention + ) -> bool: + opening_target_marks_by_source_marks: Dict[str, str] = {} + closing_target_marks_by_source_marks: Dict[str, str] = {} + for depth in range(1, min(source_quote_convention.num_levels, target_quote_convention.num_levels) + 1): + source_opening_quotation_mark = source_quote_convention.get_opening_quotation_mark_at_depth(depth) + target_opening_quotation_mark = target_quote_convention.get_opening_quotation_mark_at_depth(depth) + if ( + source_opening_quotation_mark in opening_target_marks_by_source_marks + and opening_target_marks_by_source_marks[source_opening_quotation_mark] != target_opening_quotation_mark + ): + return False + opening_target_marks_by_source_marks[source_opening_quotation_mark] = target_opening_quotation_mark + + source_closing_quotation_mark = source_quote_convention.get_closing_quotation_mark_at_depth(depth) + target_closing_quotation_mark = target_quote_convention.get_closing_quotation_mark_at_depth(depth) + if ( + source_closing_quotation_mark in closing_target_marks_by_source_marks + and closing_target_marks_by_source_marks[source_closing_quotation_mark] != target_closing_quotation_mark + ): + return False + 
closing_target_marks_by_source_marks[source_closing_quotation_mark] = target_closing_quotation_mark + + return True + + def find_best_chapter_strategies(self) -> List[QuotationMarkUpdateStrategy]: + best_actions_by_chapter: List[QuotationMarkUpdateStrategy] = [] + + for chapter in self.get_chapters(): + best_actions_by_chapter.append(self._find_best_strategy_for_chapter(chapter)) + + return best_actions_by_chapter + + def _find_best_strategy_for_chapter(self, chapter: Chapter) -> QuotationMarkUpdateStrategy: + quotation_mark_matches: List[QuotationMarkStringMatch] = ( + self._quotation_mark_finder.find_all_potential_quotation_marks_in_chapter(chapter) + ) + + self._quotation_mark_resolver.reset() + + # Use list() to force evaluation of the generator + list(self._quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches)) + + return self._choose_best_strategy_based_on_observed_issues(self._quotation_mark_resolver.get_issues()) + + def _choose_best_strategy_based_on_observed_issues( + self, issues: Set[QuotationMarkResolutionIssue] + ) -> QuotationMarkUpdateStrategy: + if QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK in issues: + return QuotationMarkUpdateStrategy.SKIP + + if ( + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK in issues + or QuotationMarkResolutionIssue.TOO_DEEP_NESTING in issues + ): + if self._will_fallback_mode_work: + return QuotationMarkUpdateStrategy.APPLY_FALLBACK + return QuotationMarkUpdateStrategy.SKIP + + return QuotationMarkUpdateStrategy.APPLY_FULL diff --git a/machine/corpora/quotation_mark_update_resolution_settings.py b/machine/corpora/quotation_mark_update_resolution_settings.py new file mode 100644 index 00000000..cd37d43d --- /dev/null +++ b/machine/corpora/quotation_mark_update_resolution_settings.py @@ -0,0 +1,44 @@ +from typing import Set + +import regex + +from .punctuation_analysis.quotation_mark_direction import QuotationMarkDirection +from .punctuation_analysis.quotation_mark_resolution_settings 
class QuotationMarkUpdateResolutionSettings(QuotationMarkResolutionSettings):
    """Resolution settings driven by one known source quote convention.

    Every query is answered either by the source convention itself or by a
    one-element ``QuoteConventionSet`` wrapping it.
    """

    def __init__(self, source_quote_convention: QuoteConvention):
        self._source_quote_convention = source_quote_convention
        # Several lookups are only offered by QuoteConventionSet, so keep a
        # singleton set around the source convention.
        self._quote_convention_singleton_set = QuoteConventionSet([source_quote_convention])

    @property
    def opening_quotation_mark_regex(self) -> regex.Pattern:
        """Pattern for opening marks, as provided by the singleton set."""
        convention_set = self._quote_convention_singleton_set
        return convention_set.opening_quotation_mark_regex

    @property
    def closing_quotation_mark_regex(self) -> regex.Pattern:
        """Pattern for closing marks, as provided by the singleton set."""
        convention_set = self._quote_convention_singleton_set
        return convention_set.closing_quotation_mark_regex

    @property
    def should_rely_on_paragraph_markers(self):
        """Always ``False`` for these settings."""
        return False

    def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool:
        """Ask the match whether it is a valid opening mark for the source convention."""
        convention_set = self._quote_convention_singleton_set
        return quotation_mark_match.is_valid_opening_quotation_mark(convention_set)

    def is_valid_closing_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool:
        """Ask the match whether it is a valid closing mark for the source convention."""
        convention_set = self._quote_convention_singleton_set
        return quotation_mark_match.is_valid_closing_quotation_mark(convention_set)

    def are_marks_a_valid_pair(self, opening_mark: str, closing_mark: str) -> bool:
        """Delegate the opening/closing pair check to the singleton set."""
        convention_set = self._quote_convention_singleton_set
        return convention_set.marks_are_a_valid_pair(opening_mark, closing_mark)

    def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]:
        """Return the depths the source convention reports for this mark and direction."""
        source_convention = self._source_quote_convention
        return source_convention.get_possible_depths(quotation_mark, direction)

    def metadata_matches_quotation_mark(
        self, quotation_mark: str, depth: int, direction: QuotationMarkDirection
    ) -> bool:
        """Check that the source convention expects exactly this mark at ``depth``/``direction``."""
        expected_mark = self._source_quote_convention.get_expected_quotation_mark(depth, direction)
        return expected_mark == quotation_mark
from enum import Enum, auto
from typing import Optional


class QuotationMarkUpdateStrategy(Enum):
    """How quotation marks in a chapter should be updated."""

    APPLY_FULL = auto()
    APPLY_FALLBACK = auto()
    SKIP = auto()


class QuotationMarkUpdateSettings:
    """Per-chapter choice of ``QuotationMarkUpdateStrategy``.

    Chapters are numbered from 1; ``chapter_strategies[0]`` belongs to
    chapter 1. Any chapter without an entry uses the default strategy.
    """

    def __init__(
        self,
        default_chapter_strategy: QuotationMarkUpdateStrategy = QuotationMarkUpdateStrategy.APPLY_FULL,
        chapter_strategies: Optional[list[QuotationMarkUpdateStrategy]] = None,
    ):
        """Store the default strategy and the optional per-chapter overrides.

        Args:
            default_chapter_strategy: Strategy for chapters that have no
                entry in ``chapter_strategies``.
            chapter_strategies: Strategies indexed by ``chapter_number - 1``.
                ``None`` (the default) means "no overrides".
        """
        self._default_chapter_strategy = default_chapter_strategy
        # A literal [] default would be one list object shared by every
        # instance; build a fresh list per instance instead.
        self._chapter_strategies = chapter_strategies if chapter_strategies is not None else []

    def get_action_for_chapter(self, chapter_number: int) -> QuotationMarkUpdateStrategy:
        """Return the strategy for a 1-based chapter number.

        Numbers outside ``1..len(chapter_strategies)`` get the default
        strategy. The lower bound matters: without it chapter 0 would
        index ``[-1]`` and silently pick up the last chapter's strategy.
        """
        if 1 <= chapter_number <= len(self._chapter_strategies):
            return self._chapter_strategies[chapter_number - 1]
        return self._default_chapter_strategy
class QuoteConventionChangingUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):
    """Update block handler that converts quotation marks between two conventions.

    For each block the handler tracks the current chapter and verse, looks
    up the chapter's ``QuotationMarkUpdateStrategy`` and then either leaves
    the block alone (``SKIP``), resolves marks with the fallback resolver
    (``APPLY_FALLBACK``) or resolves them with the depth-based resolvers
    (any other strategy). Resolved marks are updated to the target convention.
    """

    def __init__(
        self,
        source_quote_convention: QuoteConvention,
        target_quote_convention: QuoteConvention,
        settings: Optional[QuotationMarkUpdateSettings] = None,
    ):
        """Set up finders, resolvers and chapter/verse tracking.

        Args:
            source_quote_convention: Convention the incoming text uses.
            target_quote_convention: Convention to rewrite the marks into.
            settings: Per-chapter strategies. When omitted, a fresh
                ``QuotationMarkUpdateSettings()`` is created for this handler
                rather than sharing one instance built at definition time.
        """
        super().__init__()
        self._source_quote_convention: QuoteConvention = source_quote_convention
        self._target_quote_convention: QuoteConvention = target_quote_convention
        self._settings: QuotationMarkUpdateSettings = (
            settings if settings is not None else QuotationMarkUpdateSettings()
        )

        self._quotation_mark_finder: QuotationMarkFinder = QuotationMarkFinder(
            QuoteConventionSet([self._source_quote_convention])
        )
        self._next_scripture_text_segment_builder: TextSegment.Builder = TextSegment.Builder()

        resolution_settings = QuotationMarkUpdateResolutionSettings(self._source_quote_convention)

        # Each embed represents a separate context for quotation marks
        # (i.e. you can't open a quote in one context and close it in another)
        # so we need to keep track of the verse and embed contexts separately.
        self._verse_text_quotation_mark_resolver: DepthBasedQuotationMarkResolver = DepthBasedQuotationMarkResolver(
            resolution_settings
        )
        self._embed_quotation_mark_resolver: DepthBasedQuotationMarkResolver = DepthBasedQuotationMarkResolver(
            resolution_settings
        )
        self._simple_quotation_mark_resolver: FallbackQuotationMarkResolver = FallbackQuotationMarkResolver(
            resolution_settings
        )
        self._current_strategy = QuotationMarkUpdateStrategy.APPLY_FULL
        self._current_chapter_number: int = 0
        self._current_verse_number: int = 0

    def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
        """Update the block's quotation marks according to the current chapter's strategy.

        Chapter and verse tracking runs first, so the strategy used is the
        one for the chapter this block belongs to. The same block object is
        returned in every case.
        """
        self._check_for_chapter_change(block)
        self._check_for_verse_change(block)
        if self._current_strategy is QuotationMarkUpdateStrategy.SKIP:
            return block
        if self._current_strategy is QuotationMarkUpdateStrategy.APPLY_FALLBACK:
            return self._apply_fallback_updating(block)
        return self._apply_standard_updating(block)

    def _apply_fallback_updating(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
        """Process every element with the fallback resolver."""
        for element in block.elements:
            self._process_scripture_element(element, self._simple_quotation_mark_resolver)
        return block

    def _apply_standard_updating(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
        """Process elements with the depth-based resolvers.

        Embed elements use their own resolver, reset before each embed, so
        that quotes cannot span an embed boundary. All other elements share
        the verse-text resolver.
        """
        for element in block.elements:
            if element.type == UsfmUpdateBlockElementType.EMBED:
                self._embed_quotation_mark_resolver.reset()
                self._process_scripture_element(element, self._embed_quotation_mark_resolver)
            else:
                self._process_scripture_element(element, self._verse_text_quotation_mark_resolver)

        return block

    def _process_scripture_element(
        self, element: UsfmUpdateBlockElement, quotation_mark_resolver: QuotationMarkResolver
    ) -> None:
        """Find, resolve and update the quotation marks of one element."""
        text_segments: List[TextSegment] = self._create_text_segments(element)
        quotation_mark_matches: List[QuotationMarkStringMatch] = (
            self._quotation_mark_finder.find_all_potential_quotation_marks_in_text_segments(text_segments)
        )
        # The resolver yields lazily; materialise the results before updating.
        resolved_quotation_mark_matches: List[QuotationMarkMetadata] = list(
            quotation_mark_resolver.resolve_quotation_marks(quotation_mark_matches)
        )
        self._update_quotation_marks(resolved_quotation_mark_matches)

    def _create_text_segments(self, element: UsfmUpdateBlockElement) -> List[TextSegment]:
        """Build linked text segments from the element's tokens.

        Verse, paragraph, character and note tokens only register a
        preceding marker on the pending segment builder; text tokens
        produce a segment. Other token types are ignored.
        """
        text_segments: List[TextSegment] = []
        for token in element.get_tokens():
            if token.type == UsfmTokenType.VERSE:
                self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VERSE)
            elif token.type == UsfmTokenType.PARAGRAPH:
                self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.PARAGRAPH)
            elif token.type == UsfmTokenType.CHARACTER:
                self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHARACTER)
            elif token.type == UsfmTokenType.NOTE:
                self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.EMBED)
            elif token.type == UsfmTokenType.TEXT:
                text_segment: Optional[TextSegment] = self._create_text_segment(token)
                if text_segment is not None:
                    text_segments.append(text_segment)
        return self._set_previous_and_next_for_segments(text_segments)

    def _create_text_segment(self, token: UsfmToken) -> Optional[TextSegment]:
        """Build a segment for a text token, or return ``None`` if the token has no text.

        The pending builder is replaced with a fresh one in both cases, so
        markers gathered so far are not carried over to later tokens.
        """
        self._next_scripture_text_segment_builder.set_usfm_token(token)
        text_segment_to_return: Optional[TextSegment] = None
        if token.text is not None:
            self._next_scripture_text_segment_builder.set_text(token.text)
            text_segment_to_return = self._next_scripture_text_segment_builder.build()
        self._next_scripture_text_segment_builder = TextSegment.Builder()
        return text_segment_to_return

    def _set_previous_and_next_for_segments(self, text_segments: List[TextSegment]) -> List[TextSegment]:
        """Link each segment to its neighbours and return the same list."""
        for earlier_segment, later_segment in zip(text_segments, text_segments[1:]):
            later_segment.previous_segment = earlier_segment
            earlier_segment.next_segment = later_segment
        return text_segments

    def _update_quotation_marks(self, resolved_quotation_mark_matches: List[QuotationMarkMetadata]) -> None:
        """Rewrite each resolved mark to the target convention.

        When a replacement changes a mark's length, the indices of the
        marks that follow it in the list are shifted by the difference.
        """
        for quotation_mark_index, resolved_quotation_mark_match in enumerate(resolved_quotation_mark_matches):
            previous_length: int = resolved_quotation_mark_match.length
            resolved_quotation_mark_match.update_quotation_mark(self._target_quote_convention)
            updated_length: int = resolved_quotation_mark_match.length

            if previous_length != updated_length:
                # NOTE(review): the shift is applied to every later mark in the
                # list, whichever text segment it sits in - confirm that
                # shift_indices accounts for marks in other segments.
                self._shift_quotation_mark_metadata_indices(
                    resolved_quotation_mark_matches[quotation_mark_index + 1 :], updated_length - previous_length
                )

    def _shift_quotation_mark_metadata_indices(
        self, quotation_mark_metadata_list: List[QuotationMarkMetadata], shift_amount: int
    ) -> None:
        """Shift the indices of every given mark by ``shift_amount``."""
        for quotation_mark_metadata in quotation_mark_metadata_list:
            quotation_mark_metadata.shift_indices(shift_amount)

    def _check_for_chapter_change(self, block: UsfmUpdateBlock) -> None:
        """Start a new chapter for each ref whose chapter differs from the current one."""
        for scripture_ref in block.refs:
            if scripture_ref.chapter_num != self._current_chapter_number:
                self._start_new_chapter(scripture_ref.chapter_num)

    def _start_new_chapter(self, new_chapter_number: int) -> None:
        """Switch to the new chapter's strategy and reset verse-text resolution state.

        Only the verse-text resolver is reset here. The pending segment
        builder is replaced and primed with a CHAPTER marker.
        """
        self._current_chapter_number = new_chapter_number
        self._current_strategy = self._settings.get_action_for_chapter(new_chapter_number)
        self._verse_text_quotation_mark_resolver.reset()
        self._next_scripture_text_segment_builder = TextSegment.Builder()
        self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.CHAPTER)

    def _check_for_verse_change(self, block: UsfmUpdateBlock) -> None:
        """Start a new verse for each ref in the current chapter with a different verse number."""
        for scripture_ref in block.refs:
            if (
                scripture_ref.chapter_num == self._current_chapter_number
                and scripture_ref.verse_num != self._current_verse_number
            ):
                self._start_new_verse(scripture_ref.verse_num)

    def _start_new_verse(self, new_verse_number: int) -> None:
        """Record the verse number and add a VERSE marker to the pending segment builder."""
        self._current_verse_number = new_verse_number
        self._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.VERSE)
from typing import Optional

# Marker prefixes tested by is_embed_part_style.
EMBED_PART_START_CHAR_STYLES = ("f", "x", "z")
# Markers (with any "*" at either end removed) accepted by is_embed_style.
EMBED_STYLES = ("f", "fe", "fig", "fm", "x")


def is_note_text(marker: Optional[str]) -> bool:
    """Return True only for the ``ft`` marker."""
    return marker == "ft"


def is_embed_part_style(marker: Optional[str]) -> bool:
    """Return True when ``marker`` starts with one of EMBED_PART_START_CHAR_STYLES."""
    if marker is None:
        return False
    return marker.startswith(EMBED_PART_START_CHAR_STYLES)


def is_embed_style(marker: Optional[str]) -> bool:
    """Return True when ``marker``, stripped of ``*`` at both ends, is in EMBED_STYLES."""
    if marker is None:
        return False
    bare_marker = marker.strip("*")
    return bare_marker in EMBED_STYLES
# QuotationMarkResolverState tests
def test_current_depth_quotation_mark_resolver_state() -> None:
    """current_depth goes 0 -> 1 -> 2 -> 1 -> 0 as two quotes open and close."""
    resolver_state = QuotationMarkResolverState()
    assert resolver_state.current_depth == 0

    steps = [
        (resolver_state.add_opening_quotation_mark, "\u201c", 1),
        (resolver_state.add_opening_quotation_mark, "\u2018", 2),
        (resolver_state.add_closing_quotation_mark, "\u2019", 1),
        (resolver_state.add_closing_quotation_mark, "\u201d", 0),
    ]
    for record_mark, mark, expected_depth in steps:
        record_mark(QuotationMarkStringMatch(TextSegment.Builder().set_text(mark).build(), 0, 1))
        assert resolver_state.current_depth == expected_depth


def test_has_open_quotation_mark() -> None:
    """has_open_quotation_mark stays true until the outermost quote is closed."""
    resolver_state = QuotationMarkResolverState()
    assert not resolver_state.has_open_quotation_mark()

    steps = [
        (resolver_state.add_opening_quotation_mark, "\u201c", True),
        (resolver_state.add_opening_quotation_mark, "\u2018", True),
        (resolver_state.add_closing_quotation_mark, "\u2019", True),
        (resolver_state.add_closing_quotation_mark, "\u201d", False),
    ]
    for record_mark, mark, expect_open in steps:
        record_mark(QuotationMarkStringMatch(TextSegment.Builder().set_text(mark).build(), 0, 1))
        assert bool(resolver_state.has_open_quotation_mark()) is expect_open
def test_get_opening_quotation_mark_at_depth() -> None:
    """Opening marks are retrievable per depth; asking for an unopened depth raises."""

    def single_mark(mark: str) -> QuotationMarkStringMatch:
        return QuotationMarkStringMatch(TextSegment.Builder().set_text(mark).build(), 0, 1)

    resolver_state = QuotationMarkResolverState()
    with raises(Exception):
        resolver_state.get_opening_quotation_mark_at_depth(1)

    # One quote open: depth 1 is known, depth 2 is not.
    resolver_state.add_opening_quotation_mark(single_mark("\u201c"))
    assert resolver_state.get_opening_quotation_mark_at_depth(1) == "\u201c"
    with raises(Exception):
        resolver_state.get_opening_quotation_mark_at_depth(2)

    # Two quotes open: both depths are known.
    resolver_state.add_opening_quotation_mark(single_mark("\u2018"))
    assert resolver_state.get_opening_quotation_mark_at_depth(1) == "\u201c"
    assert resolver_state.get_opening_quotation_mark_at_depth(2) == "\u2018"

    # Closing the inner quote forgets depth 2 again.
    resolver_state.add_closing_quotation_mark(single_mark("\u2019"))
    assert resolver_state.get_opening_quotation_mark_at_depth(1) == "\u201c"
    with raises(Exception):
        resolver_state.get_opening_quotation_mark_at_depth(2)

    # Closing the outer quote leaves nothing to look up.
    resolver_state.add_closing_quotation_mark(single_mark("\u201d"))
    with raises(Exception):
        resolver_state.get_opening_quotation_mark_at_depth(1)


def test_get_deepest_opening_mark() -> None:
    """The deepest opening mark follows the innermost open quote; raises when none is open."""

    def single_mark(mark: str) -> QuotationMarkStringMatch:
        return QuotationMarkStringMatch(TextSegment.Builder().set_text(mark).build(), 0, 1)

    resolver_state = QuotationMarkResolverState()
    with raises(Exception):
        resolver_state.get_deepest_opening_quotation_mark()

    steps = [
        (resolver_state.add_opening_quotation_mark, "\u201c", "\u201c"),
        (resolver_state.add_opening_quotation_mark, "\u2018", "\u2018"),
        (resolver_state.add_closing_quotation_mark, "\u2019", "\u201c"),
    ]
    for record_mark, mark, expected_deepest in steps:
        record_mark(single_mark(mark))
        assert resolver_state.get_deepest_opening_quotation_mark() == expected_deepest

    resolver_state.add_closing_quotation_mark(single_mark("\u201d"))
    with raises(Exception):
        resolver_state.get_deepest_opening_quotation_mark()
def test_has_continuer_been_observed() -> None:
    """A continuer counts as observed until one has been added for every open quote."""
    resolver_state = QuotationMarkResolverState()
    for opening_mark in ("\u201c", "\u2018", "\u201c"):
        resolver_state.add_opening_quotation_mark(
            QuotationMarkStringMatch(TextSegment.Builder().set_text(opening_mark).build(), 0, 1)
        )

    continuer_state = QuoteContinuerState()
    assert not continuer_state.continuer_has_been_observed()

    # With three quotes open, the third continuer makes the state report
    # "not observed" again.
    for continuer_mark, expect_observed in (("\u201c", True), ("\u2018", True), ("\u201c", False)):
        continuer_state.add_quote_continuer(
            QuotationMarkStringMatch(TextSegment.Builder().set_text(continuer_mark).build(), 0, 1),
            resolver_state,
            QuoteContinuerStyle.ENGLISH,
        )
        assert bool(continuer_state.continuer_has_been_observed()) is expect_observed


def test_get_continuer_style() -> None:
    """continuer_style starts UNDETERMINED and then reflects the most recently added continuer."""
    resolver_state = QuotationMarkResolverState()
    for opening_mark in ("\u201c", "\u2018", "\u201c"):
        resolver_state.add_opening_quotation_mark(
            QuotationMarkStringMatch(TextSegment.Builder().set_text(opening_mark).build(), 0, 1)
        )

    continuer_state = QuoteContinuerState()
    assert continuer_state.continuer_style is QuoteContinuerStyle.UNDETERMINED

    additions = (
        ("\u201c", QuoteContinuerStyle.ENGLISH),
        ("\u2018", QuoteContinuerStyle.SPANISH),
        ("\u201c", QuoteContinuerStyle.ENGLISH),
    )
    for continuer_mark, style in additions:
        continuer_state.add_quote_continuer(
            QuotationMarkStringMatch(TextSegment.Builder().set_text(continuer_mark).build(), 0, 1),
            resolver_state,
            style,
        )
        assert continuer_state.continuer_style is style
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), + quotation_mark_resolver_state, + QuoteContinuerStyle.ENGLISH, + ) == QuotationMarkMetadata( + "\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text("\u201c").build(), 0, 1 + ) + + assert quotation_continuer_state.add_quote_continuer( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1), + quotation_mark_resolver_state, + QuoteContinuerStyle.SPANISH, + ) == QuotationMarkMetadata( + "\u2018", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text("\u2018").build(), 0, 1 + ) + assert quotation_continuer_state.continuer_style == QuoteContinuerStyle.SPANISH + + assert quotation_continuer_state.add_quote_continuer( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), + quotation_mark_resolver_state, + QuoteContinuerStyle.ENGLISH, + ) == QuotationMarkMetadata( + "\u201c", 3, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text("\u201c").build(), 0, 1 + ) + + +# QuotationMarkCategorizer tests + + +def test_is_english_quotation_continuer() -> None: + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert standard_english_quote_convention is not None + + english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + quotation_mark_resolver_state = QuotationMarkResolverState() + quotation_continuer_state = QuoteContinuerState() + + quotation_mark_categorizer = QuotationMarkCategorizer( + english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + + # Should always be false if the continuer style is Spanish + quotation_continuer_state._continuer_style = QuoteContinuerStyle.ENGLISH 
+ assert quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + None, + ) + + quotation_continuer_state._continuer_style = QuoteContinuerStyle.SPANISH + assert not quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + None, + ) + quotation_continuer_state._continuer_style = QuoteContinuerStyle.ENGLISH + + # Should be false if there's no preceding paragraph marker (and the settings say to rely on markers) + assert not quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201ctest").build(), + 0, + 1, + ), + None, + None, + ) + + assert quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + None, + ) + + quotation_mark_categorizer_for_denormalization = QuotationMarkCategorizer( + QuotationMarkUpdateResolutionSettings(standard_english_quote_convention), + quotation_mark_resolver_state, + quotation_continuer_state, + ) + assert quotation_mark_categorizer_for_denormalization.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201ctest").build(), + 0, + 1, + ), + None, + None, + ) + + # Should be false if there are no open quotation marks + empty_quotation_mark_resolver_state = QuotationMarkResolverState() + empty_quotation_mark_categorizer = QuotationMarkCategorizer( + english_resolver_settings, empty_quotation_mark_resolver_state, quotation_continuer_state + ) + assert not empty_quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + 
TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + None, + ) + + # Should be false if the starting index of the quotation mark is greater than 0 + assert not quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text(" \u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 1, + 2, + ), + None, + None, + ) + + # Should be false if the mark does not match the already opened mark + assert not quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u2018test").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + None, + ) + + # If there are multiple open quotes, the next quote continuer must follow immediately + # after the current one + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + None, + ) + assert quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201c\u2018test").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201c\u2018test").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 1, + 2, + ), + ) + assert quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201c\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201c\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), 
+ 1, + 2, + ), + ) + + # When there are multiple open quotes, the continuer must match the deepest observed mark + quotation_continuer_state.add_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201c\u2018test").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + quotation_mark_resolver_state, + QuoteContinuerStyle.ENGLISH, + ) + + assert not quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201c\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 1, + 2, + ), + None, + None, + ) + + assert quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201c\u2018test").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 1, + 2, + ), + None, + None, + ) + + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201c").build(), + 0, + 1, + ) + ) + + assert quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder() + .set_text("\u201c\u2018\u201ctest") + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build(), + 1, + 2, + ), + None, + None, + ) + + quotation_continuer_state.add_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder() + .set_text("\u201c\u2018\u201ctest") + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build(), + 1, + 2, + ), + quotation_mark_resolver_state, + QuoteContinuerStyle.ENGLISH, + ) + assert not quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder() + .set_text("\u201c\u2018\u2018test") + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build(), + 2, + 3, + ), + None, + None, + ) + assert quotation_mark_categorizer.is_english_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder() + .set_text("\u201c\u2018\u201ctest") + 
.add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build(), + 2, + 3, + ), + None, + None, + ) + + +def test_is_spanish_quotation_continuer() -> None: + western_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("western_european") + assert western_european_quote_convention is not None + + spanish_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([western_european_quote_convention]) + ) + quotation_mark_resolver_state = QuotationMarkResolverState() + quotation_continuer_state = QuoteContinuerState() + + quotation_mark_categorizer = QuotationMarkCategorizer( + spanish_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00ab").build(), 0, 1) + ) + + # Should be true if the continuer style is Spanish, but always false if the continuer style is English + quotation_continuer_state._continuer_style = QuoteContinuerStyle.SPANISH + assert quotation_mark_categorizer.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + None, + ) + + quotation_continuer_state._continuer_style = QuoteContinuerStyle.ENGLISH + assert not quotation_mark_categorizer.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + None, + ) + quotation_continuer_state._continuer_style = QuoteContinuerStyle.SPANISH + + # Should be false if there's no preceding paragraph marker (and the settings say to rely on markers) + assert not quotation_mark_categorizer.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u00bbtest").build(), + 0, + 1, + ), + None, + None, + ) + + assert quotation_mark_categorizer.is_spanish_quote_continuer(
QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + None, + ) + + quotation_mark_categorizer_for_denormalization = QuotationMarkCategorizer( + QuotationMarkUpdateResolutionSettings(western_european_quote_convention), + quotation_mark_resolver_state, + quotation_continuer_state, + ) + assert quotation_mark_categorizer_for_denormalization.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u00bbtest").build(), + 0, + 1, + ), + None, + None, + ) + + # Should be false if there are no open quotation marks + empty_quotation_mark_resolver_state = QuotationMarkResolverState() + empty_quotation_mark_categorizer = QuotationMarkCategorizer( + spanish_resolver_settings, empty_quotation_mark_resolver_state, quotation_continuer_state + ) + assert not empty_quotation_mark_categorizer.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + None, + ) + + # Should be false if the starting index of the quotation mark is greater than 0 + assert not quotation_mark_categorizer.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text(" \u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 1, + 2, + ), + None, + None, + ) + + # Should be false if the mark does not match the already opened mark + assert not quotation_mark_categorizer.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201dtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + None, + ) + + # If there are multiple open quotes, the next quote continuer must follow immediately + # after the current one + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) 
+ ) + assert not quotation_mark_categorizer.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + None, + ) + assert quotation_mark_categorizer.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u00bb\u201dtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u00bb\u201dtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 1, + 2, + ), + ) + assert quotation_mark_categorizer.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u00bb\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + None, + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u00bb\u00bbtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 1, + 2, + ), + ) + + # When there are multiple open quotes, the continuer must match the deepest observed mark + quotation_continuer_state.add_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u00bb\u201dtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + quotation_mark_resolver_state, + QuoteContinuerStyle.SPANISH, + ) + + assert not quotation_mark_categorizer.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u00bb\u201ctest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 1, + 2, + ), + None, + None, + ) + + assert quotation_mark_categorizer.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u00bb\u201dtest").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 1, + 2, + ), + None, + None, + ) + + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u2018").build(), + 0, + 1, + ) + 
) + + assert quotation_mark_categorizer.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder() + .set_text("\u00bb\u201d\u2019test") + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build(), + 1, + 2, + ), + None, + None, + ) + + quotation_continuer_state.add_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder() + .set_text("\u00bb\u201d\u2019test") + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build(), + 1, + 2, + ), + quotation_mark_resolver_state, + QuoteContinuerStyle.SPANISH, + ) + assert not quotation_mark_categorizer.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder() + .set_text("\u00bb\u201d\u201dtest") + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build(), + 2, + 3, + ), + None, + None, + ) + assert quotation_mark_categorizer.is_spanish_quote_continuer( + QuotationMarkStringMatch( + TextSegment.Builder() + .set_text("\u00bb\u201d\u2019test") + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build(), + 2, + 3, + ), + None, + None, + ) + + +def test_is_opening_quote() -> None: + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") + assert central_european_quote_convention is not None + central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([central_european_quote_convention]) + ) + quotation_mark_resolver_state = QuotationMarkResolverState() + quotation_continuer_state = QuoteContinuerState() + central_european_quotation_mark_categorizer = QuotationMarkCategorizer( + central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + british_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") + assert british_english_quote_convention is not None + british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([british_english_quote_convention]) + 
) + british_english_quotation_mark_categorizer = QuotationMarkCategorizer( + british_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + standard_swedish_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") + assert standard_swedish_quote_convention is not None + standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_swedish_quote_convention]) + ) + standard_swedish_quotation_mark_categorizer = QuotationMarkCategorizer( + standard_swedish_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + three_conventions_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet( + [central_european_quote_convention, british_english_quote_convention, standard_swedish_quote_convention] + ) + ) + three_conventions_quotation_mark_categorizer = QuotationMarkCategorizer( + three_conventions_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + # It should only accept valid opening marks under the quote convention + assert central_european_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e").build(), 1, 2) + ) + assert central_european_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) + ) + assert not 
central_european_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(' "').build(), 1, 2) + ) + + assert not british_english_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a").build(), 1, 2) + ) + assert british_english_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c").build(), 1, 2) + ) + assert british_english_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(' "').build(), 1, 2) + ) + + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018").build(), 1, 2) + ) + assert standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) + ) + assert standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(' "').build(), 1, 2) + ) + + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e").build(), 1, 2) + ) + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a").build(), 1, 2) + ) + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c").build(), 1, 2) + ) + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018").build(), 1, 2) + ) + assert 
three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) + ) + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(' "').build(), 1, 2) + ) + + # Leading whitespace is not necessary for unambiguous opening quotes + assert central_european_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("text\u201e").build(), 4, 5) + ) + assert central_european_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("text\u201a").build(), 4, 5) + ) + assert british_english_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("text\u201c").build(), 4, 5) + ) + assert british_english_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("text\u2018").build(), 4, 5) + ) + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("text\u201e").build(), 4, 5) + ) + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("text\u201a").build(), 4, 5) + ) + + # An ambiguous quotation mark (opening/closing) is recognized as opening if + # it has a quote introducer beforehand + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(",\u201d").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(":\u2019").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(",\u201c").build(), 1, 2) + ) + + # An ambiguous quotation mark (opening/closing) is recognized as opening if + # preceded by another opening mark + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2019").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201c").build(), 1, 2) + ) + + # An ambiguous quotation mark (opening/closing) is not recognized as opening if + # it has trailing whitespace or punctuation + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d.").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(",\u201d ").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2019 ").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2019?").build(), 1, 2) + ) + + +def test_is_closing_quote() -> None: + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") + assert central_european_quote_convention is not None + central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([central_european_quote_convention]) + ) + quotation_mark_resolver_state = QuotationMarkResolverState() + quotation_continuer_state = QuoteContinuerState() + central_european_quotation_mark_categorizer = QuotationMarkCategorizer( + central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + british_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") + assert british_english_quote_convention is not None + british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + 
QuoteConventionSet([british_english_quote_convention]) + ) + british_english_quotation_mark_categorizer = QuotationMarkCategorizer( + british_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + standard_swedish_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") + assert standard_swedish_quote_convention is not None + standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_swedish_quote_convention]) + ) + standard_swedish_quotation_mark_categorizer = QuotationMarkCategorizer( + standard_swedish_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + standard_french_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_french") + assert standard_french_quote_convention is not None + standard_french_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_french_quote_convention]) + ) + standard_french_quotation_mark_categorizer = QuotationMarkCategorizer( + standard_french_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + three_conventions_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet( + [central_european_quote_convention, british_english_quote_convention, standard_swedish_quote_convention] + ) + ) + three_conventions_quotation_mark_categorizer = QuotationMarkCategorizer( + three_conventions_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + # It should only accept valid closing marks under the quote convention + assert central_european_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) + ) + assert central_european_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018 ").build(), 
0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201e ").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201a ").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 ").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb ").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('" ').build(), 0, 1) + ) + + assert not british_english_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018 ").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 ").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb ").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_closing_quotation_mark( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text('" ').build(), 0, 1) + ) + + assert not standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018 ").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 ").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb ").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('" ').build(), 0, 1) + ) + + assert three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018 ").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 ").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb ").build(), 0, 1) + ) + assert not 
three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('" ').build(), 0, 1) + ) + + # Trailing whitespace is not necessary for unambiguous closing quotes + assert standard_french_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bbtext").build(), 0, 1) + ) + assert standard_french_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u203atext").build(), 0, 1) + ) + + # An ambiguous quotation mark (opening/closing) is recognized as closing if + # followed by whitespace, punctuation or the end of the segment + assert not standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201dtext").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019text").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019?").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019\u201d").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201ctext").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c?").build(), 0, 1) + ) + + # An ambiguous quotation mark (opening/closing) is not recognized as closing if + # it has leading whitespace + assert not standard_swedish_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\t\u201c?").build(), 1, 2) + ) + + +def test_is_malformed_opening_quote() -> None: + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") + assert central_european_quote_convention is not None + central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([central_european_quote_convention]) + ) + quotation_mark_resolver_state = QuotationMarkResolverState() + quotation_continuer_state = QuoteContinuerState() + central_european_quotation_mark_categorizer = QuotationMarkCategorizer( + central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + british_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") + assert british_english_quote_convention is not None + british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([british_english_quote_convention]) + ) + british_english_quotation_mark_categorizer = QuotationMarkCategorizer( + british_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + standard_swedish_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") + assert standard_swedish_quote_convention is not None + standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_swedish_quote_convention]) + )
+ standard_swedish_quotation_mark_categorizer = QuotationMarkCategorizer( + standard_swedish_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + three_conventions_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet( + [central_european_quote_convention, british_english_quote_convention, standard_swedish_quote_convention] + ) + ) + three_conventions_quotation_mark_categorizer = QuotationMarkCategorizer( + three_conventions_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + # It should only accept valid opening marks under the quote convention + assert central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e ").build(), 1, 2) + ) + assert central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a ").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018 ").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab ").build(), 1, 2) + ) + assert not 
central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(' " ').build(), 1, 2) + ) + + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e ").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a ").build(), 1, 2) + ) + assert british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) + ) + assert british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018 ").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab ").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(' " ').build(), 1, 2) + ) + + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e ").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a ").build(), 1, 2) + ) + assert not 
standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018 ").build(), 1, 2) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab ").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(' " ').build(), 1, 2) + ) + + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201e ").build(), 1, 2) + ) + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201a ").build(), 1, 2) + ) + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) + ) + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018 ").build(), 1, 2) + ) + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + assert 
three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u00ab ").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(' " ').build(), 1, 2) + ) + + # Should return true if there is a leading quote introducer + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(",\u201d ").build(), 1, 2) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019 ").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(":\u2019 ").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c ").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(",\u201c ").build(), 1, 2) + ) + + # Should return false unless the mark has leading and trailing whitespace + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert not 
standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + + # Should return false if there is already an open quotation mark on the stack + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201c ").build(), 1, 2) + ) + + +def test_is_malformed_closing_quote() -> None: + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") + assert central_european_quote_convention is not None + central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( + 
QuoteConventionSet([central_european_quote_convention]) + ) + quotation_mark_resolver_state = QuotationMarkResolverState() + quotation_continuer_state = QuoteContinuerState() + central_european_quotation_mark_categorizer = QuotationMarkCategorizer( + central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + british_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") + assert british_english_quote_convention is not None + british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([british_english_quote_convention]) + ) + british_english_quotation_mark_categorizer = QuotationMarkCategorizer( + british_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + standard_swedish_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") + assert standard_swedish_quote_convention is not None + standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_swedish_quote_convention]) + ) + standard_swedish_quotation_mark_categorizer = QuotationMarkCategorizer( + standard_swedish_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + three_conventions_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet( + [central_european_quote_convention, british_english_quote_convention, standard_swedish_quote_convention] + ) + ) + three_conventions_quotation_mark_categorizer = QuotationMarkCategorizer( + three_conventions_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + # It should only accept valid closing marks under the quote convention + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201e").build(), 0, 1) + ) + assert 
central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201e").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201a").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( 
+ QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + 
assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + # Returns true if it's at the end of the segment + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + + # Returns true if it does not have trailing whitespace + assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d-").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201dtext").build(), 0, 1) + ) + + # Returns true if it has trailing and leading whitespace + assert standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d ").build(), 1, 2) + ) + + # Requires there to be an open quotation mark on the stack + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + + # Requires the quotation mark on the stack to be a valid pair with the + # observed quotation mark + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_malformed_closing_quotation_mark( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + + +def test_is_unpaired_closing_quote() -> None: + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") + assert central_european_quote_convention is not None + central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([central_european_quote_convention]) + ) + quotation_mark_resolver_state = QuotationMarkResolverState() + quotation_continuer_state = QuoteContinuerState() + central_european_quotation_mark_categorizer = QuotationMarkCategorizer( + central_european_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + british_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("british_english") + assert british_english_quote_convention is not None + british_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([british_english_quote_convention]) + ) + british_english_quotation_mark_categorizer = QuotationMarkCategorizer( + british_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + standard_swedish_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") + assert standard_swedish_quote_convention is not None + standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_swedish_quote_convention]) + ) + standard_swedish_quotation_mark_categorizer = QuotationMarkCategorizer( + standard_swedish_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + three_conventions_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet( + [central_european_quote_convention, british_english_quote_convention, standard_swedish_quote_convention] + ) + ) + three_conventions_quotation_mark_categorizer = 
QuotationMarkCategorizer( + three_conventions_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + # It should only accept valid closing marks under the quote convention + assert central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201e").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201a").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert 
british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + assert three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert 
three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u00bb").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + + # There must not be an opening quotation mark on the stack + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not central_european_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not 
standard_swedish_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not three_conventions_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + + # There must not be leading whitespace + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u201d").build(), 1, 2) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\t\u2019").build(), 1, 2) + ) + + # The quotation mark must be either at the end of the segment + # or have trailing whitespace + assert british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + ) + assert british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d ").build(), 0, 1) + ) + assert not british_english_quotation_mark_categorizer.is_unpaired_closing_quotation_mark( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d?").build(), 0, 1) + ) + + +def test_is_apostrophe() -> None: + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + quotation_mark_resolver_state = QuotationMarkResolverState() + quotation_continuer_state = QuoteContinuerState() + standard_english_quotation_mark_categorizer = QuotationMarkCategorizer( + standard_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + typewriter_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") + assert typewriter_english_quote_convention is not None + typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([typewriter_english_quote_convention]) + ) + typewriter_english_quotation_mark_categorizer = QuotationMarkCategorizer( + typewriter_english_resolver_settings, quotation_mark_resolver_state, quotation_continuer_state + ) + + # The quotation mark must make for a plausible apostrophe + assert typewriter_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a'b").build(), 1, 2), None + ) + assert typewriter_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u2019b").build(), 1, 2), None + ) + assert typewriter_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u2018b").build(), 1, 2), None + ) + assert not typewriter_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u201cb").build(), 1, 2), None + ) + assert not 
typewriter_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text('a"b').build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a'b").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u2019b").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u2018b").build(), 1, 2), None + ) + assert not standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u201cb").build(), 1, 2), None + ) + assert not standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text('a"b').build(), 1, 2), None + ) + + # Returns true if the mark has Latin letters on both sides + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u2019Ƅ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("ǡ\u2019b").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("ᴀ\u2019B").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("𝼀\u2019Ꝙ").build(), 1, 2), None + ) + assert not standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u2019ℵ").build(), 1, 2), None + ) + 
assert typewriter_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("a\u2019ℵ").build(), 1, 2), None + ) + + # Recognizes s possessives (e.g. Moses') + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2019").build(), 0, 1) + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("s\u2019 ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("Moses\u2019 ").build(), 5, 6), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("s\u2019?").build(), 1, 2), None + ) + assert not standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("s\u20195").build(), 1, 2), None + ) + + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("s\u2019 ").build(), 1, 2), None + ) + + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018").build(), 0, 1) + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("s\u2019 ").build(), 1, 2), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word\u2019").build(), 4, 5), + ) + assert not standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("s\u2019 
").build(), 1, 2), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word\u201d").build(), 4, 5), + ) + + # the straight quote should always be an apostrophe if it's not a valid quotation mark + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("5'ℵ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" ' ").build(), 1, 2), None + ) + + # the straight quote should be an apostrophe if there's nothing on the quotation mark stack + quotation_mark_resolver_state.add_closing_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1) + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("5'ℵ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" ' ").build(), 1, 2), None + ) + + # any matching mark should be an apostrophe if it doesn't pair with the + # deepest opening quotation mark on the stack + # (opening/closing quotation marks will have been detected before calling this) + quotation_mark_resolver_state.add_opening_quotation_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("5'ℵ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" ' ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("5\u2018ℵ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + 
QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2018 ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text("5\u2019ℵ").build(), 1, 2), None + ) + assert standard_english_quotation_mark_categorizer.is_apostrophe( + QuotationMarkStringMatch(TextSegment.Builder().set_text(" \u2019 ").build(), 1, 2), None + ) + + +# DepthBasedQuotationMarkResolver tests +def test_depth_based_quotation_mark_resolver_reset() -> None: + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201cThis is a quote").build(), 0, 1)] + ) + ) + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK + } + + standard_english_quotation_mark_resolver.reset() + assert standard_english_quotation_mark_resolver.get_issues() == set() + + list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [QuotationMarkStringMatch(TextSegment.Builder().set_text("This is a quote\u2019").build(), 15, 16)] + ) + ) + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK + } + + +def test_basic_quotation_mark_recognition() -> None: + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert standard_english_quote_convention is not None + standard_english_resolver_settings = 
QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment = TextSegment.Builder().set_text("\u201cThis is a \u2018quote\u2019\u201d").build() + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 11, 12), + QuotationMarkStringMatch(text_segment, 17, 18), + QuotationMarkStringMatch(text_segment, 18, 19), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment, 11, 12), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, text_segment, 17, 18), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment, 18, 19), + ] + assert standard_english_quotation_mark_resolver.get_issues() == set() + + +def test_resolution_only_of_passed_matches() -> None: + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment = TextSegment.Builder().set_text("\u201cThis is a \u2018quote\u2019\u201d").build() + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + ] + assert standard_english_quotation_mark_resolver.get_issues() == { + 
QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK + } + + text_segment = TextSegment.Builder().set_text("\u201cThis is a \u2018quote\u2019\u201d").build() + assert ( + list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 17, 18), + ] + ) + ) + == [] + ) + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK + } + + +def test_resolution_across_segments() -> None: + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment1 = TextSegment.Builder().set_text("\u201cThis is a ").build() + text_segment2 = TextSegment.Builder().set_text("\u2018quote\u2019\u201d").build() + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment1, 0, 1), + QuotationMarkStringMatch(text_segment2, 0, 1), + QuotationMarkStringMatch(text_segment2, 6, 7), + QuotationMarkStringMatch(text_segment2, 7, 8), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment1, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment2, 0, 1), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, text_segment2, 6, 7), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment2, 7, 8), + ] + assert standard_english_quotation_mark_resolver.get_issues() == set() + + +def test_resolution_with_apostrophes() -> None: + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") 
+ assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment = ( + TextSegment.Builder() + .set_text("\u201cThis\u2019 is a \u2018quote\u2019\u201d") + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build() + ) + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 5, 6), + QuotationMarkStringMatch(text_segment, 12, 13), + QuotationMarkStringMatch(text_segment, 18, 19), + QuotationMarkStringMatch(text_segment, 19, 20), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment, 12, 13), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, text_segment, 18, 19), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment, 19, 20), + ] + assert standard_english_quotation_mark_resolver.get_issues() == set() + + typewriter_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") + assert typewriter_english_quote_convention is not None + typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([typewriter_english_quote_convention]) + ) + typewriter_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(typewriter_english_resolver_settings) + + text_segment = ( + TextSegment.Builder().set_text("\"This' is a 'quote'\"").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build() + ) + assert list( + typewriter_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + 
QuotationMarkStringMatch(text_segment, 5, 6), + QuotationMarkStringMatch(text_segment, 12, 13), + QuotationMarkStringMatch(text_segment, 18, 19), + QuotationMarkStringMatch(text_segment, 19, 20), + ] + ) + ) == [ + QuotationMarkMetadata('"', 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("'", 2, QuotationMarkDirection.OPENING, text_segment, 12, 13), + QuotationMarkMetadata("'", 2, QuotationMarkDirection.CLOSING, text_segment, 18, 19), + QuotationMarkMetadata('"', 1, QuotationMarkDirection.CLOSING, text_segment, 19, 20), + ] + assert standard_english_quotation_mark_resolver.get_issues() == set() + + +def test_english_quote_continuers() -> None: + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment1 = TextSegment.Builder().set_text("\u201cThis is a \u2018quote").build() + text_segment2 = ( + TextSegment.Builder() + .set_text("\u201c\u2018This is the rest\u2019 of it\u201d") + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build() + ) + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment1, 0, 1), + QuotationMarkStringMatch(text_segment1, 11, 12), + QuotationMarkStringMatch(text_segment2, 0, 1), + QuotationMarkStringMatch(text_segment2, 1, 2), + QuotationMarkStringMatch(text_segment2, 18, 19), + QuotationMarkStringMatch(text_segment2, 25, 26), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment1, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment1, 11, 12), + QuotationMarkMetadata("\u201c", 1, 
QuotationMarkDirection.OPENING, text_segment2, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment2, 1, 2), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, text_segment2, 18, 19), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment2, 25, 26), + ] + assert standard_english_quotation_mark_resolver.get_issues() == set() + + +def test_spanish_quote_continuers() -> None: + western_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("western_european") + assert western_european_quote_convention is not None + western_european_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([western_european_quote_convention]) + ) + western_european_quotation_mark_resolver = DepthBasedQuotationMarkResolver(western_european_resolver_settings) + + text_segment1 = TextSegment.Builder().set_text("\u00abThis is a \u201cquote").build() + text_segment2 = ( + TextSegment.Builder() + .set_text("\u00bb\u201dThis is the rest\u201d of it\u00bb") + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build() + ) + assert list( + western_european_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment1, 0, 1), + QuotationMarkStringMatch(text_segment1, 11, 12), + QuotationMarkStringMatch(text_segment2, 0, 1), + QuotationMarkStringMatch(text_segment2, 1, 2), + QuotationMarkStringMatch(text_segment2, 18, 19), + QuotationMarkStringMatch(text_segment2, 25, 26), + ] + ) + ) == [ + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, text_segment1, 0, 1), + QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.OPENING, text_segment1, 11, 12), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.OPENING, text_segment2, 0, 1), + QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.OPENING, text_segment2, 1, 2), + QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.CLOSING, text_segment2, 
18, 19), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.CLOSING, text_segment2, 25, 26), + ] + assert western_european_quotation_mark_resolver.get_issues() == set() + + +def test_malformed_quotation_marks() -> None: + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment1 = TextSegment.Builder().set_text("\u201c This is a,\u2018 quote").build() + text_segment2 = ( + TextSegment.Builder() + .set_text("This is the rest \u2019 of it \u201d") + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build() + ) + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment1, 0, 1), + QuotationMarkStringMatch(text_segment1, 12, 13), + QuotationMarkStringMatch(text_segment2, 17, 18), + QuotationMarkStringMatch(text_segment2, 25, 26), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment1, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment1, 12, 13), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, text_segment2, 17, 18), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment2, 25, 26), + ] + assert standard_english_quotation_mark_resolver.get_issues() == set() + + +def test_unpaired_quotation_mark_issue() -> None: + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + 
QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment = TextSegment.Builder().set_text("\u201cThis is a \u2018quote\u2019").build() + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 11, 12), + QuotationMarkStringMatch(text_segment, 17, 18), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment, 11, 12), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, text_segment, 17, 18), + ] + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK + } + + text_segment = TextSegment.Builder().set_text("another quote\u201d").build() + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 13, 14), + ] + ) + ) == [ + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment, 13, 14), + ] + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK + } + + +def test_too_deep_nesting_issue() -> None: + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment = ( + TextSegment.Builder().set_text("\u201cThis \u2018is \u201ca \u2018quote \u201cnested too deeply").build() + ) + assert 
list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 6, 7), + QuotationMarkStringMatch(text_segment, 10, 11), + QuotationMarkStringMatch(text_segment, 13, 14), + QuotationMarkStringMatch(text_segment, 20, 21), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, text_segment, 6, 7), + QuotationMarkMetadata("\u201c", 3, QuotationMarkDirection.OPENING, text_segment, 10, 11), + QuotationMarkMetadata("\u2018", 4, QuotationMarkDirection.OPENING, text_segment, 13, 14), + ] + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.TOO_DEEP_NESTING, + QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK, + } + + +def test_incompatible_quotation_mark_issue() -> None: + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert standard_english_quote_convention is not None + standard_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_english_quote_convention]) + ) + standard_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_english_resolver_settings) + + text_segment = TextSegment.Builder().set_text("\u201cThis is a \u201cquote\u201d\u201d").build() + assert list( + standard_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 11, 12), + QuotationMarkStringMatch(text_segment, 17, 18), + QuotationMarkStringMatch(text_segment, 18, 19), + ] + ) + ) == [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.OPENING, text_segment, 11, 12), + QuotationMarkMetadata("\u201d", 2, 
QuotationMarkDirection.CLOSING, text_segment, 17, 18), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment, 18, 19), + ] + assert standard_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK + } + + +def test_ambiguous_quotation_mark_issue() -> None: + typewriter_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") + assert typewriter_english_quote_convention is not None + typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([typewriter_english_quote_convention]) + ) + typewriter_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(typewriter_english_resolver_settings) + + text_segment = TextSegment.Builder().set_text('This"is an ambiguous quotation mark').build() + assert ( + list( + typewriter_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 4, 5), + ] + ) + ) + == [] + ) + assert typewriter_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK + } + + typewriter_english_quotation_mark_resolver.reset() + text_segment = TextSegment.Builder().set_text("\u201cThis is an ambiguous quotation mark").build() + assert ( + list( + typewriter_english_quotation_mark_resolver.resolve_quotation_marks( + [QuotationMarkStringMatch(text_segment, 0, 1)] + ) + ) + == [] + ) + assert typewriter_english_quotation_mark_resolver.get_issues() == { + QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK + } + + +def test_typewriter_english_quotation_mark_recognition() -> None: + typewriter_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_english") + assert typewriter_english_quote_convention is not None + typewriter_english_resolver_settings = QuoteConventionDetectionResolutionSettings( + 
QuoteConventionSet([typewriter_english_quote_convention]) + ) + typewriter_english_quotation_mark_resolver = DepthBasedQuotationMarkResolver(typewriter_english_resolver_settings) + + text_segment = ( + TextSegment.Builder().set_text("\"This is a 'quote'\"").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build() + ) + assert list( + typewriter_english_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 11, 12), + QuotationMarkStringMatch(text_segment, 17, 18), + QuotationMarkStringMatch(text_segment, 18, 19), + ] + ) + ) == [ + QuotationMarkMetadata('"', 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("'", 2, QuotationMarkDirection.OPENING, text_segment, 11, 12), + QuotationMarkMetadata("'", 2, QuotationMarkDirection.CLOSING, text_segment, 17, 18), + QuotationMarkMetadata('"', 1, QuotationMarkDirection.CLOSING, text_segment, 18, 19), + ] + assert typewriter_english_quotation_mark_resolver.get_issues() == set() + + +def test_typewriter_french_mark_recognition() -> None: + typewriter_french_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_french") + assert typewriter_french_quote_convention is not None + typewriter_french_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([typewriter_french_quote_convention]) + ) + typewriter_french_quotation_mark_resolver = DepthBasedQuotationMarkResolver(typewriter_french_resolver_settings) + + text_segment = TextSegment.Builder().set_text("<>>").build() + assert list( + typewriter_french_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 2), + QuotationMarkStringMatch(text_segment, 12, 13), + QuotationMarkStringMatch(text_segment, 18, 19), + QuotationMarkStringMatch(text_segment, 19, 21), + ] + ) + ) == [ + QuotationMarkMetadata("<<", 1, QuotationMarkDirection.OPENING, text_segment, 0, 2), + 
QuotationMarkMetadata("<", 2, QuotationMarkDirection.OPENING, text_segment, 12, 13), + QuotationMarkMetadata(">", 2, QuotationMarkDirection.CLOSING, text_segment, 18, 19), + QuotationMarkMetadata(">>", 1, QuotationMarkDirection.CLOSING, text_segment, 19, 21), + ] + assert typewriter_french_quotation_mark_resolver.get_issues() == set() + + +def test_central_european_quotation_mark_recognition() -> None: + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") + assert central_european_quote_convention is not None + central_european_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([central_european_quote_convention]) + ) + central_european_quotation_mark_resolver = DepthBasedQuotationMarkResolver(central_european_resolver_settings) + + text_segment = ( + TextSegment.Builder() + .set_text("\u201eThis is a \u201aquote\u2018\u201c") + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build() + ) + assert list( + central_european_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 11, 12), + QuotationMarkStringMatch(text_segment, 17, 18), + QuotationMarkStringMatch(text_segment, 18, 19), + ] + ) + ) == [ + QuotationMarkMetadata("\u201e", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u201a", 2, QuotationMarkDirection.OPENING, text_segment, 11, 12), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.CLOSING, text_segment, 17, 18), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.CLOSING, text_segment, 18, 19), + ] + assert central_european_quotation_mark_resolver.get_issues() == set() + + +def test_standard_swedish_quotation_mark_recognition() -> None: + standard_swedish_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") + assert standard_swedish_quote_convention is not None + 
standard_swedish_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet([standard_swedish_quote_convention]) + ) + standard_swedish_quotation_mark_resolver = DepthBasedQuotationMarkResolver(standard_swedish_resolver_settings) + + text_segment = ( + TextSegment.Builder() + .set_text("\u201dThis is a \u2019quote\u2019\u201d") + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build() + ) + assert list( + standard_swedish_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 11, 12), + QuotationMarkStringMatch(text_segment, 17, 18), + QuotationMarkStringMatch(text_segment, 18, 19), + ] + ) + ) == [ + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.OPENING, text_segment, 11, 12), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, text_segment, 17, 18), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, text_segment, 18, 19), + ] + assert standard_swedish_quotation_mark_resolver.get_issues() == set() + + +def test_multiple_conventions_quotation_mark_recognition() -> None: + typewriter_french_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("typewriter_french") + assert typewriter_french_quote_convention is not None + + central_european_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("central_european") + assert central_european_quote_convention is not None + + standard_swedish_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_swedish") + assert standard_swedish_quote_convention is not None + multiple_conventions_resolver_settings = QuoteConventionDetectionResolutionSettings( + QuoteConventionSet( + [typewriter_french_quote_convention, central_european_quote_convention, standard_swedish_quote_convention] + ) + ) + 
multiple_conventions_quotation_mark_resolver = DepthBasedQuotationMarkResolver( + multiple_conventions_resolver_settings + ) + + text_segment = ( + TextSegment.Builder() + .set_text("\u201eThis is a \u2019quote>\u201c") + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build() + ) + assert list( + multiple_conventions_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(text_segment, 0, 1), + QuotationMarkStringMatch(text_segment, 11, 12), + QuotationMarkStringMatch(text_segment, 17, 18), + QuotationMarkStringMatch(text_segment, 18, 19), + ] + ) + ) == [ + QuotationMarkMetadata("\u201e", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.OPENING, text_segment, 11, 12), + QuotationMarkMetadata(">", 2, QuotationMarkDirection.CLOSING, text_segment, 17, 18), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.CLOSING, text_segment, 18, 19), + ] + assert multiple_conventions_quotation_mark_resolver.get_issues() == set() diff --git a/tests/corpora/punctuation_analysis/test_preliminary_quotation_mark_analyzer.py b/tests/corpora/punctuation_analysis/test_preliminary_quotation_mark_analyzer.py new file mode 100644 index 00000000..4607fafa --- /dev/null +++ b/tests/corpora/punctuation_analysis/test_preliminary_quotation_mark_analyzer.py @@ -0,0 +1,988 @@ +from machine.corpora.punctuation_analysis import ( + ApostropheProportionStatistics, + Chapter, + PreliminaryApostropheAnalyzer, + PreliminaryQuotationMarkAnalyzer, + QuotationMarkGrouper, + QuotationMarkSequences, + QuotationMarkStringMatch, + QuotationMarkWordPositions, + QuoteConvention, + QuoteConventionSet, + SingleLevelQuoteConvention, + TextSegment, + Verse, +) + + +# ApostropheProportionStatistics tests +def test_apostrophe_proportion_statistics_reset() -> None: + apostrophe_proportion_statistics = ApostropheProportionStatistics() + 
apostrophe_proportion_statistics.count_characters(TextSegment.Builder().set_text("'").build()) + apostrophe_proportion_statistics.add_apostrophe() + assert apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.5) + + apostrophe_proportion_statistics.reset() + assert not apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.5) + + +def test_is_apostrophe_proportion_greater_than() -> None: + apostrophe_proportion_statistics = ApostropheProportionStatistics() + assert not apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.0) + + # invalid case where no characters have been counted + apostrophe_proportion_statistics.add_apostrophe() + assert not apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.0) + + apostrophe_proportion_statistics.count_characters(TextSegment.Builder().set_text("a").build()) + assert apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.99) + + apostrophe_proportion_statistics.add_apostrophe() + apostrophe_proportion_statistics.count_characters(TextSegment.Builder().set_text("bcd").build()) + assert apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.4) + assert not apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.5) + + apostrophe_proportion_statistics.count_characters(TextSegment.Builder().set_text("ef").build()) + assert apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.3) + assert not apostrophe_proportion_statistics.is_apostrophe_proportion_greater_than(0.4) + + +# QuotationMarkWordPosition tests +def test_is_mark_rarely_initial() -> None: + quotation_mark_word_positions = QuotationMarkWordPositions() + assert not quotation_mark_word_positions.is_mark_rarely_initial("\u201d") + + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + assert quotation_mark_word_positions.is_mark_rarely_initial("\u201d") + + 
quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + assert not quotation_mark_word_positions.is_mark_rarely_initial("\u201d") + + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + assert quotation_mark_word_positions.is_mark_rarely_initial("\u201d") + + quotation_mark_word_positions.count_word_final_apostrophe("\u201c") + assert quotation_mark_word_positions.is_mark_rarely_initial("\u201d") + + quotation_mark_word_positions.count_word_final_apostrophe("\u201c") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201c") + assert quotation_mark_word_positions.is_mark_rarely_initial("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + assert not quotation_mark_word_positions.is_mark_rarely_initial("\u201d") + + +def test_is_mark_rarely_final() -> None: + quotation_mark_word_positions = QuotationMarkWordPositions() + assert not quotation_mark_word_positions.is_mark_rarely_final("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + assert quotation_mark_word_positions.is_mark_rarely_final("\u201d") + + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + assert not quotation_mark_word_positions.is_mark_rarely_final("\u201d") + + 
quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + assert quotation_mark_word_positions.is_mark_rarely_final("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201c") + assert quotation_mark_word_positions.is_mark_rarely_final("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201c") + quotation_mark_word_positions.count_word_final_apostrophe("\u201c") + assert quotation_mark_word_positions.is_mark_rarely_final("\u201d") + + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + assert not quotation_mark_word_positions.is_mark_rarely_final("\u201d") + + +def test_are_initial_and_final_rates_similar() -> None: + quotation_mark_word_positions = QuotationMarkWordPositions() + assert not quotation_mark_word_positions.are_initial_and_final_rates_similar("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + assert quotation_mark_word_positions.are_initial_and_final_rates_similar("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + assert not quotation_mark_word_positions.are_initial_and_final_rates_similar("\u201d") + + 
quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + assert quotation_mark_word_positions.are_initial_and_final_rates_similar("\u201d") + + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + assert not quotation_mark_word_positions.are_initial_and_final_rates_similar("\u201d") + + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + assert quotation_mark_word_positions.are_initial_and_final_rates_similar("\u201d") + + +def test_is_mark_commonly_mid_word() -> None: + quotation_mark_word_positions = QuotationMarkWordPositions() + assert not quotation_mark_word_positions.is_mark_commonly_mid_word("'") + + quotation_mark_word_positions.count_mid_word_apostrophe("'") + assert quotation_mark_word_positions.is_mark_commonly_mid_word("'") + + quotation_mark_word_positions.count_word_initial_apostrophe("'") + quotation_mark_word_positions.count_word_final_apostrophe("'") + quotation_mark_word_positions.count_word_initial_apostrophe("'") + quotation_mark_word_positions.count_word_final_apostrophe("'") + assert not quotation_mark_word_positions.is_mark_commonly_mid_word("'") + + quotation_mark_word_positions.count_mid_word_apostrophe("'") + assert quotation_mark_word_positions.is_mark_commonly_mid_word("'") + + +def test_quotation_mark_word_positions_reset() -> None: + quotation_mark_word_positions = QuotationMarkWordPositions() + 
quotation_mark_word_positions.count_word_initial_apostrophe("\u201d") + quotation_mark_word_positions.count_word_final_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + quotation_mark_word_positions.count_mid_word_apostrophe("\u201d") + + assert quotation_mark_word_positions.is_mark_commonly_mid_word("\u201d") + + quotation_mark_word_positions.reset() + + assert not quotation_mark_word_positions.is_mark_commonly_mid_word("\u201d") + + +# QuotationMarkSequence tests +def test_is_mark_much_more_common_earlier() -> None: + quotation_mark_sequences = QuotationMarkSequences() + assert not quotation_mark_sequences.is_mark_much_more_common_earlier('"') + + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + assert quotation_mark_sequences.is_mark_much_more_common_earlier('"') + + quotation_mark_sequences.count_later_quotation_mark('"') + assert not quotation_mark_sequences.is_mark_much_more_common_earlier('"') + + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + assert quotation_mark_sequences.is_mark_much_more_common_earlier('"') + + quotation_mark_sequences.count_later_quotation_mark('"') + assert not quotation_mark_sequences.is_mark_much_more_common_earlier('"') + + +def test_is_mark_much_more_common_later() -> None: + quotation_mark_sequences = QuotationMarkSequences() + assert not quotation_mark_sequences.is_mark_much_more_common_later('"') + + 
quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + assert quotation_mark_sequences.is_mark_much_more_common_later('"') + + quotation_mark_sequences.count_earlier_quotation_mark('"') + assert not quotation_mark_sequences.is_mark_much_more_common_later('"') + + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + assert quotation_mark_sequences.is_mark_much_more_common_later('"') + + quotation_mark_sequences.count_earlier_quotation_mark('"') + assert not quotation_mark_sequences.is_mark_much_more_common_later('"') + + +def test_is_mark_common_early_and_late() -> None: + quotation_mark_sequences = QuotationMarkSequences() + assert not quotation_mark_sequences.are_early_and_late_mark_rates_similar('"') + + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + assert quotation_mark_sequences.are_early_and_late_mark_rates_similar('"') + + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_earlier_quotation_mark('"') + 
quotation_mark_sequences.count_later_quotation_mark('"') + assert quotation_mark_sequences.are_early_and_late_mark_rates_similar('"') + + quotation_mark_sequences.count_later_quotation_mark('"') + assert quotation_mark_sequences.are_early_and_late_mark_rates_similar('"') + + quotation_mark_sequences.count_later_quotation_mark('"') + quotation_mark_sequences.count_later_quotation_mark('"') + assert not quotation_mark_sequences.are_early_and_late_mark_rates_similar('"') + + +# QuotationMarkGrouper tests +def test_get_quotation_mark_pairs() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + typewriter_english_quote_convention: QuoteConvention = QuoteConvention( + "typewriter_english", + [ + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ) + + quotation_mark_grouper = QuotationMarkGrouper([], QuoteConventionSet([standard_english_quote_convention])) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [] + + # no paired quotation mark + quotation_mark_grouper = QuotationMarkGrouper( + [QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1)], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [] + + # basic quotation mark pair + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert 
list(quotation_mark_grouper.get_quotation_mark_pairs()) == [("\u201c", "\u201d")] + + # out-of-order quotation mark pair + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d\u201c").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d\u201c").build(), 1, 2), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [] + + # multiple unpaired quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2019").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2019").build(), 1, 2), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [] + + # paired and unpaired quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2018\u201d").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2018\u201d").build(), 1, 2), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u2018\u201d").build(), 2, 3), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [("\u201c", "\u201d")] + + # ambiguous unpaired quotation mark + quotation_mark_grouper = QuotationMarkGrouper( + [QuotationMarkStringMatch(TextSegment.Builder().set_text('"').build(), 0, 1)], + QuoteConventionSet([typewriter_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [] + + # paired ambiguous quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text('""').build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text('""').build(), 
1, 2), + ], + QuoteConventionSet([typewriter_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [('"', '"')] + + # multiple paired quotation marks (should be skipped because we don't know how to pair them) + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u201c\u201d").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u201c\u201d").build(), 1, 2), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u201c\u201d").build(), 2, 3), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u201c\u201d").build(), 3, 4), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [] + + # multiple different paired quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u2018\u2019").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u2018\u2019").build(), 1, 2), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u2018\u2019").build(), 2, 3), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d\u2018\u2019").build(), 3, 4), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [("\u201c", "\u201d"), ("\u2018", "\u2019")] + + # second-level paired quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018\u2019").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018\u2019").build(), 1, 2), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [("\u2018", "\u2019")] + + # quotation 
marks that don't match the convention set + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2), + ], + QuoteConventionSet([typewriter_english_quote_convention]), + ) + assert list(quotation_mark_grouper.get_quotation_mark_pairs()) == [] + + +def test_has_distinct_paired_quotation_marks() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + typewriter_english_quote_convention: QuoteConvention = QuoteConvention( + "typewriter_english", + [ + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ) + + quotation_mark_grouper = QuotationMarkGrouper( + [], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert not quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201c") + assert not quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201d") + assert not quotation_mark_grouper.has_distinct_paired_quotation_mark("") + + # basic paired quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201c") + assert quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201d") + + # second-level paired quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + 
QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018\u2019").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u2018\u2019").build(), 1, 2), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert quotation_mark_grouper.has_distinct_paired_quotation_mark("\u2018") + assert quotation_mark_grouper.has_distinct_paired_quotation_mark("\u2019") + + # only one half of the pair observed + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1), + ], + QuoteConventionSet([standard_english_quote_convention]), + ) + assert not quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201c") + assert quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201d") + + # quotation marks that don't match the convention set + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2), + ], + QuoteConventionSet([typewriter_english_quote_convention]), + ) + assert not quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201c") + assert not quotation_mark_grouper.has_distinct_paired_quotation_mark("\u201d") + + # ambiguous quotation marks + quotation_mark_grouper = QuotationMarkGrouper( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text('""').build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text('""').build(), 1, 2), + ], + QuoteConventionSet([typewriter_english_quote_convention]), + ) + assert not quotation_mark_grouper.has_distinct_paired_quotation_mark('"') + + +# PreliminaryApostropheAnalyzer tests +def test_that_the_mark_must_be_an_apostrophe() -> None: + preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text 
segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("alternative mid\u2019word apostrophe").build(), 15, 16 + ), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid\u2018word quotation mark").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid\u201cword quotation mark").build(), 3, 4), + ], + ) + assert preliminary_apostrophe_analyzer.is_apostrophe_only("'") + assert preliminary_apostrophe_analyzer.is_apostrophe_only("\u2019") + assert not preliminary_apostrophe_analyzer.is_apostrophe_only("\u2018") + assert not preliminary_apostrophe_analyzer.is_apostrophe_only("\u201c") + assert not preliminary_apostrophe_analyzer.is_apostrophe_only("\u201d") + + +def test_that_a_rarely_initial_or_final_mark_is_an_apostrophe() -> None: + negative_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + negative_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + ], + ) + assert not negative_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + positive_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + positive_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + 
TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + TextSegment.Builder() + .set_text( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .build(), + TextSegment.Builder() + .set_text( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .build(), + TextSegment.Builder() + .set_text("Technically Unicode has a separate character for the glottal stop, but it is rarely used") + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + ], + ) + assert positive_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + +def 
test_that_a_mark_with_similar_final_and_initial_rates_is_an_apostrophe() -> None: + negative_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + negative_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + TextSegment.Builder() + .set_text("We need a ton of text here to keep the proportion low, since we have 8 apostrophes in this test") + .build(), + TextSegment.Builder() + .set_text( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .build(), + TextSegment.Builder() + .set_text( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + ], + ) + assert not negative_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + negative_preliminary_apostrophe_analyzer2 = PreliminaryApostropheAnalyzer() + 
negative_preliminary_apostrophe_analyzer2.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + TextSegment.Builder() + .set_text("We need a ton of text here to keep the proportion low, since we have 8 apostrophes in this test") + .build(), + TextSegment.Builder() + .set_text( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .build(), + TextSegment.Builder() + .set_text( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + ], + ) + assert not negative_preliminary_apostrophe_analyzer2.is_apostrophe_only("'") + + positive_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + positive_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + 
TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + TextSegment.Builder() + .set_text("We need a ton of text here to keep the proportion low, since we have 8 apostrophes in this test") + .build(), + TextSegment.Builder() + .set_text( + "The proportion must be kept below 0.02, because quotation marks should occur relatively infrequently" + ) + .build(), + TextSegment.Builder() + .set_text( + "Apostrophes, on the other hand, can be much more common, especially in non-English languages where they " + + "can indicate a glottal stop" + ) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + ], + ) + assert positive_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + +def test_that_a_commonly_mid_word_mark_is_an_apostrophe() -> None: + negative_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + negative_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, 
instead of a quotation mark" + ) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + ], + ) + assert not negative_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + positive_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + positive_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text("mid'word apostrophe").build(), 3, 4), + ], + ) + assert positive_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + +def test_that_a_frequently_occurring_character_is_an_apostrophe() -> None: + negative_preliminary_apostrophe_analyzer = PreliminaryApostropheAnalyzer() + negative_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Long text segment to help keep the proportion of apostrophes low").build(), + TextSegment.Builder() + .set_text( + "If a mark appears very frequently in the text, it is likely an apostrophe, instead of a quotation mark" + ) + .build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + ], + ) + assert not negative_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + positive_preliminary_apostrophe_analyzer = 
PreliminaryApostropheAnalyzer() + positive_preliminary_apostrophe_analyzer.process_quotation_marks( + [ + TextSegment.Builder().set_text("Very short text").build(), + ], + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("'word initial apostrophe").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("word' final apostrophe").build(), 4, 5), + ], + ) + assert positive_preliminary_apostrophe_analyzer.is_apostrophe_only("'") + + +# PreliminaryQuotationMarkAnalyzer tests +def test_that_quotation_mark_sequence_is_used_to_determine_opening_and_closing_quotes() -> None: + standard_english_quote_convention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + typewriter_english_quote_convention = QuoteConvention( + "typewriter_english", + [ + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ) + standard_french_quote_convention = QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ) + + western_european_quote_convention = QuoteConvention( + "western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + standard_swedish_quote_convention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + 
preliminary_quotation_analyzer = PreliminaryQuotationMarkAnalyzer( + QuoteConventionSet( + [ + standard_english_quote_convention, + typewriter_english_quote_convention, + standard_french_quote_convention, + western_european_quote_convention, + standard_swedish_quote_convention, + ] + ) + ) + + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("initial text \u201c quoted English text \u201d final text") + .build() + ] + ) + ] + ) + ] + ) == QuoteConventionSet([standard_english_quote_convention]) + + preliminary_quotation_analyzer.reset() + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("initial text \u201d quoted Swedish text \u201d final text") + .build(), + ] + ) + ] + ) + ] + ) == QuoteConventionSet([standard_swedish_quote_convention]) + + preliminary_quotation_analyzer.reset() + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("initial text \u00ab quoted French/Western European text \u00bb final text") + .build(), + ] + ) + ] + ) + ] + ) == QuoteConventionSet([standard_french_quote_convention, western_european_quote_convention]) + + preliminary_quotation_analyzer.reset() + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text('initial text " quoted typewriter English text " final text') + .build(), + ] + ) + ] + ) + ] + ) == QuoteConventionSet([typewriter_english_quote_convention]) + + preliminary_quotation_analyzer.reset() + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("initial text \u201c quoted English text \u201d final text") + .build(), + TextSegment.Builder().set_text("second 
level \u2018 English quotes \u2019").build(), + ] + ) + ] + ) + ] + ) == QuoteConventionSet([standard_english_quote_convention]) + + preliminary_quotation_analyzer.reset() + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text('initial text " quoted typewriter English text " final text') + .build(), + TextSegment.Builder().set_text("second level 'typewriter quotes'").build(), + ] + ) + ] + ) + ] + ) == QuoteConventionSet([typewriter_english_quote_convention]) + + preliminary_quotation_analyzer.reset() + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("initial text \u201c quoted English text \u201d final text") + .build(), + TextSegment.Builder() + .set_text("the quotes \u201d in this segment \u201c are backwards") + .build(), + ] + ) + ] + ) + ] + ) == QuoteConventionSet([]) + + preliminary_quotation_analyzer.reset() + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("first-level quotes \u2018 must be observed \u2019 to retain a quote convention") + .build(), + ] + ) + ] + ) + ] + ) == QuoteConventionSet([]) + + +def test_that_apostrophes_not_considered_as_quotation_marks() -> None: + standard_english_quote_convention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + typewriter_english_quote_convention = QuoteConvention( + "typewriter_english", + [ + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ) + + preliminary_quotation_analyzer = 
PreliminaryQuotationMarkAnalyzer( + QuoteConventionSet( + [ + standard_english_quote_convention, + typewriter_english_quote_convention, + ] + ) + ) + + assert preliminary_quotation_analyzer.narrow_down_possible_quote_conventions( + [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("ini'tial 'text \u201c quo'ted English text' \u201d fi'nal text") + .build() + ] + ) + ] + ) + ] + ) == QuoteConventionSet([standard_english_quote_convention]) diff --git a/tests/corpora/punctuation_analysis/test_quotation_mark_finder.py b/tests/corpora/punctuation_analysis/test_quotation_mark_finder.py new file mode 100644 index 00000000..d0f66b6b --- /dev/null +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_finder.py @@ -0,0 +1,280 @@ +from machine.corpora.punctuation_analysis import ( + STANDARD_QUOTE_CONVENTIONS, + QuotationMarkFinder, + QuotationMarkStringMatch, + QuoteConventionSet, + TextSegment, +) + + +def test_that_all_possible_quotation_marks_are_identified() -> None: + quotation_mark_finder = QuotationMarkFinder(STANDARD_QUOTE_CONVENTIONS) + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder().set_text("\u201cSample Text\u201d").build() + ) == [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201cSample Text\u201d").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201cSample Text\u201d").build(), 12, 13), + ] + + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder().set_text("\"Sample Text'").build() + ) == [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("\"Sample Text'").build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text("\"Sample Text'").build(), 12, 13), + ] + + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder().set_text("All \u201cthe \u2019English quotation\u2018 marks\u201d").build() + ) == [ + 
QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u201cthe \u2019English quotation\u2018 marks\u201d").build(), 4, 5 + ), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u201cthe \u2019English quotation\u2018 marks\u201d").build(), 9, 10 + ), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u201cthe \u2019English quotation\u2018 marks\u201d").build(), 27, 28 + ), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u201cthe \u2019English quotation\u2018 marks\u201d").build(), 34, 35 + ), + ] + + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder().set_text("All \u00abthe \u2039French quotation\u203a marks\u00bb").build() + ) == [ + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u00abthe \u2039French quotation\u203a marks\u00bb").build(), 4, 5 + ), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u00abthe \u2039French quotation\u203a marks\u00bb").build(), 9, 10 + ), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u00abthe \u2039French quotation\u203a marks\u00bb").build(), 26, 27 + ), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \u00abthe \u2039French quotation\u203a marks\u00bb").build(), 33, 34 + ), + ] + + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder().set_text("All \"the 'typewriter quotation marks").build() + ) == [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("All \"the 'typewriter quotation marks").build(), 4, 5), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("All \"the 'typewriter quotation marks").build(), 9, 10 + ), + ] + + assert quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder() + .set_text("This has \u201equotes from \u00bbdifferent conventions < None: + standard_english_quote_convention = 
STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert standard_english_quote_convention is not None + + english_quotation_mark_finder = QuotationMarkFinder(QuoteConventionSet([standard_english_quote_convention])) + assert ( + english_quotation_mark_finder.find_all_potential_quotation_marks_in_text_segment( + TextSegment.Builder() + .set_text("This has \u201equotes from \u00bbdifferent conventions < None: + quotation_mark_metadata = QuotationMarkMetadata( + quotation_mark='"', + depth=1, + direction=QuotationMarkDirection.OPENING, + text_segment=TextSegment.Builder().set_text('He said to the woman, "Has God really said,').build(), + start_index=22, + end_index=23, + ) + quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("standard_english")) + assert quotation_mark_metadata.text_segment._text == "He said to the woman, “Has God really said," + + quotation_mark_metadata = QuotationMarkMetadata( + quotation_mark='"', + depth=1, + direction=QuotationMarkDirection.OPENING, + text_segment=TextSegment.Builder().set_text('He said to the woman, "Has God really said,').build(), + start_index=22, + end_index=23, + ) + quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) + assert quotation_mark_metadata.text_segment._text == "He said to the woman, «Has God really said," + + quotation_mark_metadata = QuotationMarkMetadata( + quotation_mark='"', + depth=1, + direction=QuotationMarkDirection.OPENING, + text_segment=TextSegment.Builder().set_text('He said to the woman, "Has God really said,').build(), + start_index=23, + end_index=24, + ) + quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("western_european")) + assert quotation_mark_metadata.text_segment._text == 'He said to the woman, "«as God really said,' + + +def test_update_quotation_mark_with_multi_character_quotation_marks() -> None: + quotation_mark_metadata = QuotationMarkMetadata( + 
quotation_mark='"', + depth=1, + direction=QuotationMarkDirection.OPENING, + text_segment=TextSegment.Builder().set_text('He said to the woman, "Has God really said,').build(), + start_index=22, + end_index=23, + ) + quotation_mark_metadata.update_quotation_mark(get_quote_convention_by_name("typewriter_french")) + assert quotation_mark_metadata.text_segment._text == "He said to the woman, < QuoteConvention: + quote_convention: Union[QuoteConvention, None] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(name) + assert quote_convention is not None + return quote_convention diff --git a/tests/corpora/punctuation_analysis/test_quotation_mark_resolver.py b/tests/corpora/punctuation_analysis/test_quotation_mark_resolver.py new file mode 100644 index 00000000..a91e77e4 --- /dev/null +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_resolver.py @@ -0,0 +1,50 @@ +from typing import List + +from machine.corpora.punctuation_analysis import ( + STANDARD_QUOTE_CONVENTIONS, + DepthBasedQuotationMarkResolver, + QuotationMarkResolver, + QuotationMarkStringMatch, + QuoteConventionDetectionResolutionSettings, + TextSegment, + UsfmMarkerType, +) + + +def test_reset() -> None: + quotation_mark_resolver: QuotationMarkResolver = DepthBasedQuotationMarkResolver( + QuoteConventionDetectionResolutionSettings(STANDARD_QUOTE_CONVENTIONS) + ) + + assert quotation_mark_resolver._quotation_mark_resolver_state._quotation_stack == [] + assert quotation_mark_resolver._quote_continuer_state._quote_continuer_mark_stack == [] + assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth == 0 + assert quotation_mark_resolver._quote_continuer_state.current_depth == 0 + + quotation_mark_resolver.reset() + + assert quotation_mark_resolver._quotation_mark_resolver_state._quotation_stack == [] + assert quotation_mark_resolver._quote_continuer_state._quote_continuer_mark_stack == [] + assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth == 0 + 
assert quotation_mark_resolver._quote_continuer_state.current_depth == 0 + + quotation_mark_string_matches: List[QuotationMarkStringMatch] = [ + QuotationMarkStringMatch(TextSegment.Builder().set_text("Opening “quote").build(), 8, 9), + QuotationMarkStringMatch(TextSegment.Builder().set_text("Another opening ‘quote").build(), 16, 17), + QuotationMarkStringMatch( + TextSegment.Builder().set_text("“‘quote continuer").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ), + ] + + list(quotation_mark_resolver.resolve_quotation_marks(quotation_mark_string_matches)) + assert len(quotation_mark_resolver._quotation_mark_resolver_state._quotation_stack) > 0 + assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth > 0 + + quotation_mark_resolver.reset() + + assert quotation_mark_resolver._quotation_mark_resolver_state._quotation_stack == [] + assert quotation_mark_resolver._quote_continuer_state._quote_continuer_mark_stack == [] + assert quotation_mark_resolver._quotation_mark_resolver_state.current_depth == 0 + assert quotation_mark_resolver._quote_continuer_state.current_depth == 0 diff --git a/tests/corpora/punctuation_analysis/test_quotation_mark_string_match.py b/tests/corpora/punctuation_analysis/test_quotation_mark_string_match.py new file mode 100644 index 00000000..82744ec8 --- /dev/null +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_string_match.py @@ -0,0 +1,451 @@ +import regex + +from machine.corpora.punctuation_analysis import ( + QuotationMarkDirection, + QuotationMarkMetadata, + QuotationMarkStringMatch, + QuoteConvention, + QuoteConventionSet, + SingleLevelQuoteConvention, + TextSegment, + UsfmMarkerType, +) + + +def test_get_quotation_mark() -> None: + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("quick brown fox").build(), 6, 7 + ) + assert quotation_mark_string_match.quotation_mark == "b" + + quotation_mark_string_match = QuotationMarkStringMatch( + 
TextSegment.Builder().set_text("quick brown fox").build(), 6, 10 + ) + assert quotation_mark_string_match.quotation_mark == "brow" + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("q").build(), 0, 1) + assert quotation_mark_string_match.quotation_mark == "q" + + +def test_is_valid_opening_quotation_mark() -> None: + standard_english_quote_convention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + assert quotation_mark_string_match.is_valid_opening_quotation_mark(standard_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + assert not quotation_mark_string_match.is_valid_opening_quotation_mark(standard_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d\u201c").build(), 1, 2) + assert quotation_mark_string_match.is_valid_opening_quotation_mark(standard_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d\u201c").build(), 0, 2) + assert not quotation_mark_string_match.is_valid_opening_quotation_mark(standard_english_quote_convention_set) + + +def test_is_valid_closing_quotation_mark() -> None: + standard_english_quote_convention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], 
+ ) + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d").build(), 0, 1) + assert quotation_mark_string_match.is_valid_closing_quotation_mark(standard_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c").build(), 0, 1) + assert not quotation_mark_string_match.is_valid_closing_quotation_mark(standard_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d\u201c").build(), 0, 1) + assert quotation_mark_string_match.is_valid_closing_quotation_mark(standard_english_quote_convention_set) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201d\u201c").build(), 0, 2) + assert not quotation_mark_string_match.is_valid_closing_quotation_mark(standard_english_quote_convention_set) + + +def test_does_quotation_mark_match() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert quotation_mark_string_match.quotation_mark_matches(regex.compile(r"^s$")) + assert not quotation_mark_string_match.quotation_mark_matches(regex.compile(r"a")) + assert not quotation_mark_string_match.quotation_mark_matches(regex.compile(r"sa")) + + +def test_does_next_character_match() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert not quotation_mark_string_match.next_character_matches(regex.compile(r"^s$")) + assert quotation_mark_string_match.next_character_matches(regex.compile(r"a")) + assert not quotation_mark_string_match.next_character_matches(regex.compile(r"sa")) + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert 
not quotation_mark_string_match.next_character_matches(regex.compile(r".*")) + + +def test_does_previous_character_match() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) + assert quotation_mark_string_match.previous_character_matches(regex.compile(r"^s$")) + assert not quotation_mark_string_match.previous_character_matches(regex.compile(r"a")) + assert not quotation_mark_string_match.previous_character_matches(regex.compile(r"sa")) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert not quotation_mark_string_match.previous_character_matches(regex.compile(r".*")) + + +def test_get_previous_character() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) + assert quotation_mark_string_match.previous_character == "s" + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert quotation_mark_string_match.previous_character == "x" + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert quotation_mark_string_match.previous_character is None + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2) + assert quotation_mark_string_match.previous_character == "“" + + +def test_get_next_character() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) + assert quotation_mark_string_match.next_character == "m" + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert quotation_mark_string_match.next_character == "a" + + quotation_mark_string_match = QuotationMarkStringMatch( + 
TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert quotation_mark_string_match.next_character is None + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 0, 1) + assert quotation_mark_string_match.next_character == "”" + + +def test_does_leading_substring_match() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 5, 6) + assert quotation_mark_string_match.leading_substring_matches(regex.compile(r"^sampl$")) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert not quotation_mark_string_match.leading_substring_matches(regex.compile(r".+")) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 1, 2) + assert quotation_mark_string_match.leading_substring_matches(regex.compile(r"\u201c")) + + +def test_does_trailing_substring_match() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 5, 6) + assert quotation_mark_string_match.trailing_substring_matches(regex.compile(r"^ text$")) + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert not quotation_mark_string_match.trailing_substring_matches(regex.compile(r".+")) + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("\u201c\u201d").build(), 0, 1) + assert quotation_mark_string_match.trailing_substring_matches(regex.compile(r"\u201d")) + + +def test_get_context() -> None: + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("this is a bunch' of sample text").build(), 15, 16 + ) + assert quotation_mark_string_match.context == "is a bunch' of sample" + + quotation_mark_string_match = QuotationMarkStringMatch( + 
TextSegment.Builder().set_text("this is a bunch' of sample text").build(), 5, 6 + ) + assert quotation_mark_string_match.context == "this is a bunch'" + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("this is a bunch' of sample text").build(), 25, 26 + ) + assert quotation_mark_string_match.context == "' of sample text" + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("short").build(), 3, 4) + assert quotation_mark_string_match.context == "short" + + +def test_resolve() -> None: + text_segment = TextSegment.Builder().set_text("'").build() + quotation_mark_string_match = QuotationMarkStringMatch(text_segment, 0, 1) + assert quotation_mark_string_match.resolve(2, QuotationMarkDirection.OPENING) == QuotationMarkMetadata( + "'", 2, QuotationMarkDirection.OPENING, text_segment, 0, 1 + ) + assert quotation_mark_string_match.resolve(1, QuotationMarkDirection.OPENING) == QuotationMarkMetadata( + "'", 1, QuotationMarkDirection.OPENING, text_segment, 0, 1 + ) + assert quotation_mark_string_match.resolve(1, QuotationMarkDirection.CLOSING) == QuotationMarkMetadata( + "'", 1, QuotationMarkDirection.CLOSING, text_segment, 0, 1 + ) + + +def test_is_at_start_of_segment() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert quotation_mark_string_match.is_at_start_of_segment() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) + assert not quotation_mark_string_match.is_at_start_of_segment() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201csample text").build(), 0, 1 + ) + assert quotation_mark_string_match.is_at_start_of_segment() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 15, 16 + ) + assert not 
quotation_mark_string_match.is_at_start_of_segment() + + +def test_is_at_end_of_segment() -> None: + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert quotation_mark_string_match.is_at_end_of_segment() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert not quotation_mark_string_match.is_at_end_of_segment() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201csample text\u201d").build(), 12, 13 + ) + assert quotation_mark_string_match.is_at_end_of_segment() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 15, 16 + ) + assert not quotation_mark_string_match.is_at_end_of_segment() + + +def test_has_leading_whitespace() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 7, 8) + assert quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample\ttext").build(), 7, 8) + assert quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert not quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 0, + 1, + ) + assert quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.EMBED).build(), 0, 1 + ) + assert quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + 
TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.VERSE).build(), 0, 1 + ) + assert quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.CHAPTER).build(), 0, 1 + ) + assert not quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.CHARACTER).build(), 0, 1 + ) + assert not quotation_mark_string_match.has_leading_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201csample text").add_preceding_marker(UsfmMarkerType.VERSE).build(), + 0, + 1, + ) + assert quotation_mark_string_match.has_leading_whitespace() + + +def test_has_trailing_whitespace() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 5, 6) + assert quotation_mark_string_match.has_trailing_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample\ttext").build(), 5, 6) + assert quotation_mark_string_match.has_trailing_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert not quotation_mark_string_match.has_trailing_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.PARAGRAPH).build(), + 10, + 11, + ) + assert not quotation_mark_string_match.has_trailing_whitespace() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.EMBED).build(), 10, 11 + ) + assert not quotation_mark_string_match.has_trailing_whitespace() + + 
quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").add_preceding_marker(UsfmMarkerType.VERSE).build(), 10, 11 + ) + assert not quotation_mark_string_match.has_trailing_whitespace() + + +def test_has_leading_punctuation() -> None: + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample)\u201d text").build(), 7, 8 + ) + assert quotation_mark_string_match.has_leading_punctuation() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample) \u201d text").build(), 8, 9 + ) + assert not quotation_mark_string_match.has_leading_punctuation() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample,\u201d text").build(), 7, 8 + ) + assert quotation_mark_string_match.has_leading_punctuation() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample.\u201d text").build(), 7, 8 + ) + assert quotation_mark_string_match.has_leading_punctuation() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("\u201csample text").build(), 0, 1 + ) + assert not quotation_mark_string_match.has_leading_punctuation() + + +def test_has_trailing_punctuation() -> None: + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample \u201c-text").build(), 7, 8 + ) + assert quotation_mark_string_match.has_trailing_punctuation() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample \u201c text").build(), 7, 8 + ) + assert not quotation_mark_string_match.has_trailing_punctuation() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text\u201d").build(), 11, 12 + ) + assert not quotation_mark_string_match.has_trailing_punctuation() + + quotation_mark_string_match = QuotationMarkStringMatch( + 
TextSegment.Builder().set_text("sample', text\u201d").build(), 6, 7 + ) + assert quotation_mark_string_match.has_trailing_punctuation() + + +def test_has_letter_in_leading_substring() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) + assert quotation_mark_string_match.has_letter_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("ꮪample text").build(), 1, 2) + assert quotation_mark_string_match.has_letter_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert not quotation_mark_string_match.has_letter_in_leading_substring() + + +def test_has_letter_in_trailing_substring() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 9, 10) + assert quotation_mark_string_match.has_letter_in_trailing_substring() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample tex𑢼").build(), 9, 10) + assert quotation_mark_string_match.has_letter_in_trailing_substring() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert not quotation_mark_string_match.has_letter_in_trailing_substring() + + +def test_has_leading_latin_letter() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 1, 2) + assert quotation_mark_string_match.has_leading_latin_letter() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("5ample text").build(), 1, 2) + assert not quotation_mark_string_match.has_leading_latin_letter() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("Sample text").build(), 1, 2) + assert 
quotation_mark_string_match.has_leading_latin_letter() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 0, 1) + assert not quotation_mark_string_match.has_leading_latin_letter() + + +def test_has_trailing_latin_letter() -> None: + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample text").build(), 9, 10) + assert quotation_mark_string_match.has_trailing_latin_letter() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample texT").build(), 9, 10 + ) + assert quotation_mark_string_match.has_trailing_latin_letter() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample text").build(), 10, 11 + ) + assert not quotation_mark_string_match.has_trailing_latin_letter() + + +def test_has_quote_introducer_in_leading_substring() -> None: + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample, \u201ctext").build(), 8, 9 + ) + assert quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample,\u201ctext").build(), 7, 8 + ) + assert quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample: \u201ctext").build(), 8, 9 + ) + assert quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample:\u201ctext").build(), 7, 8 + ) + assert quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample, \u201ctext").build(), 9, 10 + ) + assert 
quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample,, \u201ctext").build(), 9, 10 + ) + assert quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch( + TextSegment.Builder().set_text("sample, a \u201ctext").build(), 10, 11 + ) + assert not quotation_mark_string_match.has_quote_introducer_in_leading_substring() + + quotation_mark_string_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("sample, text").build(), 8, 9) + assert quotation_mark_string_match.has_quote_introducer_in_leading_substring() diff --git a/tests/corpora/punctuation_analysis/test_quotation_mark_tabulator.py b/tests/corpora/punctuation_analysis/test_quotation_mark_tabulator.py new file mode 100644 index 00000000..de9a787d --- /dev/null +++ b/tests/corpora/punctuation_analysis/test_quotation_mark_tabulator.py @@ -0,0 +1,139 @@ +# QuotationMarkCounts tests +from pytest import approx + +from machine.corpora.punctuation_analysis import ( + QuotationMarkCounts, + QuotationMarkDirection, + QuotationMarkMetadata, + QuotationMarkTabulator, + QuoteConvention, + SingleLevelQuoteConvention, + TextSegment, +) + + +def test_get_observed_count() -> None: + counts = QuotationMarkCounts() + assert counts.get_observed_count() == 0 + + counts.count_quotation_mark('"') + assert counts.get_observed_count() == 1 + + counts.count_quotation_mark('"') + assert counts.get_observed_count() == 2 + + counts.count_quotation_mark("'") + assert counts.get_observed_count() == 3 + + +def test_get_best_proportion() -> None: + counts = QuotationMarkCounts() + counts.count_quotation_mark('"') + counts.count_quotation_mark('"') + counts.count_quotation_mark("'") + + best_str, best_count, total_count = counts.find_best_quotation_mark_proportion() + assert best_str == '"' + assert best_count == 2 + assert total_count == 3 + + 
counts.count_quotation_mark("'") + counts.count_quotation_mark("'") + + best_str, best_count, total_count = counts.find_best_quotation_mark_proportion() + assert best_str == "'" + assert best_count == 3 + assert total_count == 5 + + +def test_calculate_num_differences() -> None: + counts = QuotationMarkCounts() + counts.count_quotation_mark('"') + counts.count_quotation_mark('"') + counts.count_quotation_mark("'") + + assert counts.calculate_num_differences('"') == 1 + assert counts.calculate_num_differences("'") == 2 + assert counts.calculate_num_differences("\u201c") == 3 + + counts.count_quotation_mark("'") + assert counts.calculate_num_differences('"') == 2 + assert counts.calculate_num_differences("'") == 2 + assert counts.calculate_num_differences("\u201c") == 4 + + +# QuotationMarkTabulator tests +def test_calculate_similarity() -> None: + single_level_quotation_mark_tabulator = QuotationMarkTabulator() + single_level_quotation_mark_tabulator.tabulate( + [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 0, 1), + ] + ) + + assert ( + single_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention("", [SingleLevelQuoteConvention("\u201c", "\u201d")]) + ) + == 1.0 + ) + assert ( + single_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention("", [SingleLevelQuoteConvention("\u201d", "\u201c")]) + ) + == 0.0 + ) + assert ( + single_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention("", [SingleLevelQuoteConvention("\u201c", '"')]) + ) + == 0.5 + ) + assert ( + single_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention( + "", [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u00ab", "\u00bb")] + ) + ) + == 1.0 + ) + + empty_quotation_mark_tabulator = QuotationMarkTabulator() + assert ( + 
empty_quotation_mark_tabulator.calculate_similarity( + QuoteConvention("", [SingleLevelQuoteConvention("\u201c", "\u201d")]) + ) + == 0.0 + ) + + two_level_quotation_mark_tabulator = QuotationMarkTabulator() + two_level_quotation_mark_tabulator.tabulate( + [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 2), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 0, 2), + ] + ) + assert two_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention("", [SingleLevelQuoteConvention("\u201c", "\u201d")]) + ) == approx(0.66666666666667, rel=1e-9) + assert ( + two_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention( + "", [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019")] + ) + ) + == 1.0 + ) + assert two_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention( + "", [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u00ab", "\u00bb")] + ) + ) == approx(0.66666666666667, rel=1e-9) + assert two_level_quotation_mark_tabulator.calculate_similarity( + QuoteConvention( + "", [SingleLevelQuoteConvention("\u2018", "\u2019"), SingleLevelQuoteConvention("\u2018", "\u2019")] + ) + ) == approx(0.33333333333333, rel=1e-9) diff --git a/tests/corpora/punctuation_analysis/test_quote_convention.py b/tests/corpora/punctuation_analysis/test_quote_convention.py new file mode 100644 index 00000000..12c5deb5 --- /dev/null +++ b/tests/corpora/punctuation_analysis/test_quote_convention.py @@ -0,0 +1,383 @@ +from machine.corpora.punctuation_analysis import QuotationMarkDirection +from machine.corpora.punctuation_analysis.quote_convention import QuoteConvention, 
SingleLevelQuoteConvention + + +def test_single_level_quote_convention_normalize() -> None: + english_level1_quote_convention = SingleLevelQuoteConvention("\u201c", "\u201d") + normalized_english_level1_quote_convention = english_level1_quote_convention.normalize() + assert normalized_english_level1_quote_convention.opening_quotation_mark == '"' + assert normalized_english_level1_quote_convention.closing_quotation_mark == '"' + + english_level2_quote_convention = SingleLevelQuoteConvention("\u2018", "\u2019") + normalized_english_level2_quote_convention = english_level2_quote_convention.normalize() + assert normalized_english_level2_quote_convention.opening_quotation_mark == "'" + assert normalized_english_level2_quote_convention.closing_quotation_mark == "'" + + already_normalized_english_level1_quote_convention = SingleLevelQuoteConvention('"', '"') + doubly_normalized_english_level1_quote_convention = already_normalized_english_level1_quote_convention.normalize() + assert doubly_normalized_english_level1_quote_convention.opening_quotation_mark == '"' + assert doubly_normalized_english_level1_quote_convention.closing_quotation_mark == '"' + + already_normalized_english_level2_quote_convention = SingleLevelQuoteConvention("'", "'") + doubly_normalized_english_level2_quote_convention = already_normalized_english_level2_quote_convention.normalize() + assert doubly_normalized_english_level2_quote_convention.opening_quotation_mark == "'" + assert doubly_normalized_english_level2_quote_convention.closing_quotation_mark == "'" + + french_level1_quote_convention = SingleLevelQuoteConvention("\u00ab", "\u00bb") + normalized_french_level1_quote_convention = french_level1_quote_convention.normalize() + assert normalized_french_level1_quote_convention.opening_quotation_mark == '"' + assert normalized_french_level1_quote_convention.closing_quotation_mark == '"' + + french_level2_quote_convention = SingleLevelQuoteConvention("\u2039", "\u203a") + 
normalized_french_level2_quote_convention = french_level2_quote_convention.normalize() + assert normalized_french_level2_quote_convention.opening_quotation_mark == "\u2039" + assert normalized_french_level2_quote_convention.closing_quotation_mark == "\u203a" + + typewriter_french_level1_quote_convention = SingleLevelQuoteConvention("<<", ">>") + normalized_typewriter_french_level1_quote_convention = typewriter_french_level1_quote_convention.normalize() + assert normalized_typewriter_french_level1_quote_convention.opening_quotation_mark == "<<" + assert normalized_typewriter_french_level1_quote_convention.closing_quotation_mark == ">>" + + typewriter_french_level2_quote_convention = SingleLevelQuoteConvention("<", ">") + normalized_typewriter_french_level2_quote_convention = typewriter_french_level2_quote_convention.normalize() + assert normalized_typewriter_french_level2_quote_convention.opening_quotation_mark == "<" + assert normalized_typewriter_french_level2_quote_convention.closing_quotation_mark == ">" + + central_european_level1_quote_convention = SingleLevelQuoteConvention("\u201e", "\u201c") + normalized_central_european_level1_quote_convention = central_european_level1_quote_convention.normalize() + assert normalized_central_european_level1_quote_convention.opening_quotation_mark == '"' + assert normalized_central_european_level1_quote_convention.closing_quotation_mark == '"' + + central_european_level2_quote_convention = SingleLevelQuoteConvention("\u201a", "\u2018") + normalized_central_european_level2_quote_convention = central_european_level2_quote_convention.normalize() + assert normalized_central_european_level2_quote_convention.opening_quotation_mark == "'" + assert normalized_central_european_level2_quote_convention.closing_quotation_mark == "'" + + central_european_guillemets_quote_convention = SingleLevelQuoteConvention("\u00bb", "\u00ab") + normalized_central_european_guillemets_quote_convention = 
central_european_guillemets_quote_convention.normalize() + assert normalized_central_european_guillemets_quote_convention.opening_quotation_mark == '"' + assert normalized_central_european_guillemets_quote_convention.closing_quotation_mark == '"' + + swedish_level1_quote_convention = SingleLevelQuoteConvention("\u201d", "\u201d") + normalized_swedish_level1_quote_convention = swedish_level1_quote_convention.normalize() + assert normalized_swedish_level1_quote_convention.opening_quotation_mark == '"' + assert normalized_swedish_level1_quote_convention.closing_quotation_mark == '"' + + swedish_level2_quote_convention = SingleLevelQuoteConvention("\u2019", "\u2019") + normalized_swedish_level2_quote_convention = swedish_level2_quote_convention.normalize() + assert normalized_swedish_level2_quote_convention.opening_quotation_mark == "'" + assert normalized_swedish_level2_quote_convention.closing_quotation_mark == "'" + + finnish_level1_quote_convention = SingleLevelQuoteConvention("\u00bb", "\u00bb") + normalized_finnish_level1_quote_convention = finnish_level1_quote_convention.normalize() + assert normalized_finnish_level1_quote_convention.opening_quotation_mark == '"' + assert normalized_finnish_level1_quote_convention.closing_quotation_mark == '"' + + arabic_level1_quote_convention = SingleLevelQuoteConvention("\u201d", "\u201c") + normalized_arabic_level1_quote_convention = arabic_level1_quote_convention.normalize() + assert normalized_arabic_level1_quote_convention.opening_quotation_mark == '"' + assert normalized_arabic_level1_quote_convention.closing_quotation_mark == '"' + + arabic_level2_quote_convention = SingleLevelQuoteConvention("\u2019", "\u2018") + normalized_arabic_level2_quote_convention = arabic_level2_quote_convention.normalize() + assert normalized_arabic_level2_quote_convention.opening_quotation_mark == "'" + assert normalized_arabic_level2_quote_convention.closing_quotation_mark == "'" + + +def test_get_num_levels() -> None: + 
empty_quote_convention = QuoteConvention("empty-quote-convention", []) + assert empty_quote_convention.num_levels == 0 + + one_level_quote_convention = QuoteConvention( + "one_level_quote_convention", + [SingleLevelQuoteConvention("\u201c", "\u201d")], + ) + assert one_level_quote_convention.num_levels == 1 + + two_level_quote_convention = QuoteConvention( + "two_level_quote_convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + assert two_level_quote_convention.num_levels == 2 + + three_level_quote_convention = QuoteConvention( + "three_level_quote_convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201D", "\u201D"), + ], + ) + assert three_level_quote_convention.num_levels == 3 + + +def test_get_opening_quote_at_level() -> None: + quote_convention = QuoteConvention( + "test_quote_convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ], + ) + assert quote_convention.get_opening_quotation_mark_at_depth(1) == "\u201c" + assert quote_convention.get_opening_quotation_mark_at_depth(2) == "\u2018" + assert quote_convention.get_opening_quotation_mark_at_depth(3) == "\u00ab" + + +def test_get_closing_quote_at_level() -> None: + quote_convention = QuoteConvention( + "test_quote_convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ], + ) + assert quote_convention.get_closing_quotation_mark_at_depth(1) == "\u201d" + assert quote_convention.get_closing_quotation_mark_at_depth(2) == "\u2019" + assert quote_convention.get_closing_quotation_mark_at_depth(3) == "\u00bb" + + +def test_get_expected_quotation_mark() -> None: + quote_convention = QuoteConvention( + "test_quote_convention", 
+ [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ], + ) + assert quote_convention.get_expected_quotation_mark(1, QuotationMarkDirection.OPENING) == "\u201c" + assert quote_convention.get_expected_quotation_mark(1, QuotationMarkDirection.CLOSING) == "\u201d" + assert quote_convention.get_expected_quotation_mark(2, QuotationMarkDirection.OPENING) == "\u2018" + assert quote_convention.get_expected_quotation_mark(2, QuotationMarkDirection.CLOSING) == "\u2019" + assert quote_convention.get_expected_quotation_mark(3, QuotationMarkDirection.OPENING) == "\u00ab" + assert quote_convention.get_expected_quotation_mark(3, QuotationMarkDirection.CLOSING) == "\u00bb" + assert quote_convention.get_expected_quotation_mark(4, QuotationMarkDirection.OPENING) == "" + assert quote_convention.get_expected_quotation_mark(4, QuotationMarkDirection.CLOSING) == "" + assert quote_convention.get_expected_quotation_mark(0, QuotationMarkDirection.OPENING) == "" + assert quote_convention.get_expected_quotation_mark(0, QuotationMarkDirection.CLOSING) == "" + + +def test_includes_opening_quotation_mark() -> None: + empty_quote_convention = QuoteConvention("empty quote convention", []) + assert not empty_quote_convention._includes_opening_quotation_mark("\u201c") + + positive_quote_convention1 = QuoteConvention( + "positive_quote_convention_1", [SingleLevelQuoteConvention("\u201c", "\u201d")] + ) + assert positive_quote_convention1._includes_opening_quotation_mark("\u201c") + + negative_quote_convention1 = QuoteConvention( + "negative_quote_convention_1", [SingleLevelQuoteConvention("\u2018", "\u2019")] + ) + assert not negative_quote_convention1._includes_opening_quotation_mark("\u201c") + + negative_quote_convention2 = QuoteConvention( + "negative_quote_convention_2", [SingleLevelQuoteConvention("\u201d", "\u201c")] + ) + assert not 
negative_quote_convention2._includes_opening_quotation_mark("\u201c") + + positive_quote_convention2 = QuoteConvention( + "positive_quote_convention_2", + [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019")], + ) + assert positive_quote_convention2._includes_opening_quotation_mark("\u201c") + + positive_quote_convention3 = QuoteConvention( + "positive_quote_convention_3", + [SingleLevelQuoteConvention("\u2018", "\u2019"), SingleLevelQuoteConvention("\u201c", "\u201d")], + ) + assert positive_quote_convention3._includes_opening_quotation_mark("\u201c") + + negative_quote_convention3 = QuoteConvention( + "negative quote convention 3", + [ + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ], + ) + assert not negative_quote_convention3._includes_opening_quotation_mark("\u201c") + + +def test_includes_closing_quotation_mark() -> None: + empty_quote_convention = QuoteConvention("empty quote convention", []) + assert not empty_quote_convention._includes_closing_quotation_mark("\u201d") + + positive_quote_convention1 = QuoteConvention( + "positive_quote_convention_1", [SingleLevelQuoteConvention("\u201c", "\u201d")] + ) + assert positive_quote_convention1._includes_closing_quotation_mark("\u201d") + + negative_quote_convention1 = QuoteConvention( + "negative_quote_convention_1", [SingleLevelQuoteConvention("\u2018", "\u2019")] + ) + assert not negative_quote_convention1._includes_closing_quotation_mark("\u201d") + + negative_quote_convention2 = QuoteConvention( + "negative_quote_convention_2", [SingleLevelQuoteConvention("\u201d", "\u201c")] + ) + assert not negative_quote_convention2._includes_closing_quotation_mark("\u201d") + + positive_quote_convention2 = QuoteConvention( + "positive_quote_convention_2", + [SingleLevelQuoteConvention("\u201c", "\u201d"), SingleLevelQuoteConvention("\u2018", "\u2019")], + ) + assert 
positive_quote_convention2._includes_closing_quotation_mark("\u201d") + + positive_quote_convention3 = QuoteConvention( + "positive_quote_convention_3", + [SingleLevelQuoteConvention("\u2018", "\u2019"), SingleLevelQuoteConvention("\u201c", "\u201d")], + ) + assert positive_quote_convention3._includes_closing_quotation_mark("\u201d") + + negative_quote_convention3 = QuoteConvention( + "negative_quote_convention_3", + [ + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ], + ) + assert not negative_quote_convention3._includes_closing_quotation_mark("\u201d") + + +def test_get_possible_depths() -> None: + quote_convention = QuoteConvention( + "test_quote_convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + assert quote_convention.get_possible_depths("\u201c", QuotationMarkDirection.OPENING) == {1, 3} + assert quote_convention.get_possible_depths("\u201c", QuotationMarkDirection.CLOSING) == set() + assert quote_convention.get_possible_depths("\u2018", QuotationMarkDirection.OPENING) == {2, 4} + assert quote_convention.get_possible_depths("\u2018", QuotationMarkDirection.CLOSING) == set() + assert quote_convention.get_possible_depths("\u201d", QuotationMarkDirection.OPENING) == set() + assert quote_convention.get_possible_depths("\u201d", QuotationMarkDirection.CLOSING) == {1, 3} + assert quote_convention.get_possible_depths("\u2019", QuotationMarkDirection.OPENING) == set() + assert quote_convention.get_possible_depths("\u2019", QuotationMarkDirection.CLOSING) == {2, 4} + assert quote_convention.get_possible_depths("\u00ab", QuotationMarkDirection.OPENING) == set() + assert quote_convention.get_possible_depths("\u00ab", QuotationMarkDirection.CLOSING) == set() + + +def 
test_is_compatible_with_observed_quotation_marks() -> None: + quote_convention = QuoteConvention( + "test_quote_convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + ], + ) + assert quote_convention.is_compatible_with_observed_quotation_marks(["\u201c", "\u2018"], ["\u201d", "\u2019"]) + assert quote_convention.is_compatible_with_observed_quotation_marks(["\u201c", "\u00ab"], ["\u201d", "\u00bb"]) + assert quote_convention.is_compatible_with_observed_quotation_marks(["\u201c"], ["\u201d", "\u2019"]) + assert quote_convention.is_compatible_with_observed_quotation_marks(["\u201c"], ["\u201d"]) + assert quote_convention.is_compatible_with_observed_quotation_marks(["\u201c", "\u00ab"], ["\u201d", "\u2019"]) + + assert not quote_convention.is_compatible_with_observed_quotation_marks(["\u201d", "\u2019"], ["\u201c"]) + + assert not quote_convention.is_compatible_with_observed_quotation_marks(["\u201c", "\u201e"], ["\u201d"]) + + assert not quote_convention.is_compatible_with_observed_quotation_marks(["\u201c", "\u2018"], ["\u201d", "\u201f"]) + + # must have observed the first-level quotes + assert not quote_convention.is_compatible_with_observed_quotation_marks(["\u2018"], ["\u201d"]) + assert not quote_convention.is_compatible_with_observed_quotation_marks(["\u201c", "\u2018"], ["\u00ab"]) + + +def test_normalize() -> None: + empty_quote_convention = QuoteConvention("empty-quote-convention", []) + normalized_empty_quote_convention = empty_quote_convention.normalize() + assert normalized_empty_quote_convention.name == "empty-quote-convention_normalized" + assert normalized_empty_quote_convention.num_levels == 0 + + standard_english_quote_convention = QuoteConvention( + "standard_english_quote_convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), 
+ SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + normalized_standard_english_quote_convention = standard_english_quote_convention.normalize() + assert normalized_standard_english_quote_convention.name == "standard_english_quote_convention_normalized" + assert normalized_standard_english_quote_convention.num_levels == 4 + assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_depth(1) == '"' + assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_depth(1) == '"' + assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_depth(2) == "'" + assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_depth(2) == "'" + assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_depth(3) == '"' + assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_depth(3) == '"' + assert normalized_standard_english_quote_convention.get_opening_quotation_mark_at_depth(4) == "'" + assert normalized_standard_english_quote_convention.get_closing_quotation_mark_at_depth(4) == "'" + + western_european_quote_convention = QuoteConvention( + "test_quote_convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + normalized_western_european_quote_convention = western_european_quote_convention.normalize() + assert normalized_western_european_quote_convention.name == "test_quote_convention_normalized" + assert normalized_western_european_quote_convention.num_levels == 3 + assert normalized_western_european_quote_convention.get_opening_quotation_mark_at_depth(1) == '"' + assert normalized_western_european_quote_convention.get_closing_quotation_mark_at_depth(1) == '"' + assert normalized_western_european_quote_convention.get_opening_quotation_mark_at_depth(2) == '"' + assert 
normalized_western_european_quote_convention.get_closing_quotation_mark_at_depth(2) == '"' + assert normalized_western_european_quote_convention.get_opening_quotation_mark_at_depth(3) == "'" + assert normalized_western_european_quote_convention.get_closing_quotation_mark_at_depth(3) == "'" + + hybrid_british_typewriter_english_quote_convention = QuoteConvention( + "hybrid_british_typewriter_english_quote_convention", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + ], + ) + + normalized_hybrid_british_typewriter_english_quote_convention = ( + hybrid_british_typewriter_english_quote_convention.normalize() + ) + assert ( + normalized_hybrid_british_typewriter_english_quote_convention.name + == "hybrid_british_typewriter_english_quote_convention_normalized" + ) + assert normalized_hybrid_british_typewriter_english_quote_convention.num_levels == 3 + assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quotation_mark_at_depth(1) == '"' + assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quotation_mark_at_depth(1) == '"' + assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quotation_mark_at_depth(2) == "'" + assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quotation_mark_at_depth(2) == "'" + assert normalized_hybrid_british_typewriter_english_quote_convention.get_opening_quotation_mark_at_depth(3) == '"' + assert normalized_hybrid_british_typewriter_english_quote_convention.get_closing_quotation_mark_at_depth(3) == '"' + + +def test_print_summary() -> None: + quote_convention = QuoteConvention( + "test_quote_convention", + [ + SingleLevelQuoteConvention("\u201c", "\u201D"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201D", "\u201D"), + ], + ) + expected_summary_message = ( + "test_quote_convention\n" + + "\u201CFirst-level 
quote\u201D\n" + + "\u2018Second-level quote\u2019\n" + + "\u201DThird-level quote\u201D\n" + ) + assert str(quote_convention) == expected_summary_message diff --git a/tests/corpora/punctuation_analysis/test_quote_convention_detector.py b/tests/corpora/punctuation_analysis/test_quote_convention_detector.py new file mode 100644 index 00000000..190abb18 --- /dev/null +++ b/tests/corpora/punctuation_analysis/test_quote_convention_detector.py @@ -0,0 +1,305 @@ +from typing import Union + +from machine.corpora import parse_usfm +from machine.corpora.punctuation_analysis import QuoteConventionAnalysis, QuoteConventionDetector + +# Text comes from the World English Bible, which is in the public domain. + + +def test_standard_english() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "standard_english" + + +def test_typewriter_english() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, \"Has God really said, + 'You shall not eat of any tree of the garden'?\" + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "typewriter_english" + + +def test_british_english() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, ‘Has God really said, + “You shall not eat of any tree of the garden”?’ + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "british_english" + + +def test_british_typewriter_english() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, 'Has God really said, + \"You shall not eat of any tree of the garden\"?' + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "british_typewriter_english" + + +def test_hybrid_typewriter_english() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + 'You shall not eat of any tree of the garden'?” + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "hybrid_typewriter_english" + + +def test_standard_french() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + ‹You shall not eat of any tree of the garden›?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "standard_french" + + +def test_typewriter_french() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, <<Has God really said, + <You shall not eat of any tree of the garden>?>> + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "typewriter_french" + + +# french_variant requires a 3rd-level of quotes to differentiate from standard_french + + +def test_western_european() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + “You shall not eat of any tree of the garden”?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "western_european" + + +def test_british_inspired_western_european() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + ‘You shall not eat of any tree of the garden’?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "british_inspired_western_european" + + +def test_typewriter_western_european() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, <<Has God really said, + "You shall not eat of any tree of the garden"?>> + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "typewriter_western_european" + + +def test_typewriter_western_european_variant() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + <You shall not eat of any tree of the garden>?"
+ """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "typewriter_western_european_variant" + + +def test_hybrid_typewriter_western_european() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + "You shall not eat of any tree of the garden"?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "hybrid_typewriter_western_european" + + +def test_hybrid_british_typewriter_western_european() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + 'You shall not eat of any tree of the garden'?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "hybrid_british_typewriter_western_european" + + +def test_central_european() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, „Has God really said, + ‚You shall not eat of any tree of the garden‘?“ + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "central_european" + + +def test_central_european_guillemets() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, »Has God really said, + ›You shall not eat of any tree of the garden‹?« + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "central_european_guillemets" + + +def test_standard_swedish() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ”Has God really said, + ’You shall not eat of any tree of the garden’?” + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "standard_swedish" + + +def test_standard_finnish() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, »Has God really said, + ’You shall not eat of any tree of the garden’?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "standard_finnish" + + +def test_eastern_european() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, „Has God really said, + ‚You shall not eat of any tree of the garden’?” + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "eastern_european" + + +def test_standard_russian() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, «Has God really said, + „You shall not eat of any tree of the garden“?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "standard_russian" + + +def test_standard_arabic() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ”Has God really said, + ’You shall not eat of any tree of the garden‘?“ + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "standard_arabic" + + +def test_non_standard_arabic() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + ’You shall not eat of any tree of the garden‘?» + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "non-standard_arabic" + + +def test_mismatched_quotation_marks() -> None: + usfm = """ + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + \\v 2 The woman said to the serpent, + “We may eat fruit from the trees of the garden, + \\v 3 but not the fruit of the tree which is in the middle of the garden. + God has said, ‘You shall not eat of it. 
You shall not touch it, lest you die.’ + """ + analysis = detect_quote_convention(usfm) + assert analysis is not None + assert analysis.best_quote_convention.name == "standard_english" + + +def detect_quote_convention(usfm: str) -> Union[QuoteConventionAnalysis, None]: + quote_convention_detector = QuoteConventionDetector() + parse_usfm(usfm, quote_convention_detector) + return quote_convention_detector.detect_quote_convention() diff --git a/tests/corpora/punctuation_analysis/test_quote_convention_set.py b/tests/corpora/punctuation_analysis/test_quote_convention_set.py new file mode 100644 index 00000000..58ee0269 --- /dev/null +++ b/tests/corpora/punctuation_analysis/test_quote_convention_set.py @@ -0,0 +1,1326 @@ +from pytest import approx + +from machine.corpora.punctuation_analysis import ( + QuotationMarkDirection, + QuotationMarkMetadata, + QuotationMarkTabulator, + QuoteConvention, + QuoteConventionSet, + SingleLevelQuoteConvention, + TextSegment, +) + + +def test_quote_regexes() -> None: + empty_quote_convention_set = QuoteConventionSet([]) + assert empty_quote_convention_set._opening_quotation_mark_regex.pattern == r"" + assert empty_quote_convention_set._closing_quotation_mark_regex.pattern == r"" + assert empty_quote_convention_set._all_quotation_mark_regex.pattern == r"" + + quote_convention_set_with_empty_conventions = QuoteConventionSet( + [QuoteConvention("empty convention 1", []), QuoteConvention("empty convention 2", [])] + ) + assert quote_convention_set_with_empty_conventions._opening_quotation_mark_regex.pattern == r"" + assert quote_convention_set_with_empty_conventions._closing_quotation_mark_regex.pattern == r"" + assert quote_convention_set_with_empty_conventions._all_quotation_mark_regex.pattern == r"" + + standard_english_quote_convention_set = QuoteConventionSet( + [ + QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + 
SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + ] + ) + assert standard_english_quote_convention_set._opening_quotation_mark_regex.pattern == r"[‘“]" + assert standard_english_quote_convention_set._closing_quotation_mark_regex.pattern == r"[’”]" + assert standard_english_quote_convention_set._all_quotation_mark_regex.pattern == r"[‘’“”]" + + western_european_quote_convention_set = QuoteConventionSet( + [ + QuoteConvention( + "western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ), + ] + ) + assert western_european_quote_convention_set._opening_quotation_mark_regex.pattern == r"[«‘“]" + assert western_european_quote_convention_set._closing_quotation_mark_regex.pattern == r"[»’”]" + assert western_european_quote_convention_set._all_quotation_mark_regex.pattern == r"[«»‘’“”]" + + multiple_quote_convention_set = QuoteConventionSet( + [ + QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ), + QuoteConvention( + "typewriter_french", + [ + SingleLevelQuoteConvention("<<", ">>"), + SingleLevelQuoteConvention("<", ">"), + SingleLevelQuoteConvention("<<", ">>"), + SingleLevelQuoteConvention("<", ">"), + ], + ), + QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ), + ] + ) + assert multiple_quote_convention_set._opening_quotation_mark_regex.pattern == r"[<<<«‘“‹]" + assert multiple_quote_convention_set._closing_quotation_mark_regex.pattern == r"[>>>»’”›]" + assert 
multiple_quote_convention_set._all_quotation_mark_regex.pattern == r"[<<<>>>«»‘’“”‹›]" + + +def test_quotation_mark_pair_map() -> None: + empty_quote_convention_set = QuoteConventionSet([]) + assert empty_quote_convention_set.closing_marks_by_opening_mark == {} + assert empty_quote_convention_set.opening_marks_by_closing_mark == {} + + quote_convention_set_with_empty_conventions = QuoteConventionSet( + [QuoteConvention("empty convention 1", []), QuoteConvention("empty convention 2", [])] + ) + assert quote_convention_set_with_empty_conventions.closing_marks_by_opening_mark == {} + assert quote_convention_set_with_empty_conventions.opening_marks_by_closing_mark == {} + + standard_english_quote_convention_set = QuoteConventionSet( + [ + QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + ] + ) + assert standard_english_quote_convention_set.closing_marks_by_opening_mark == {"‘": {"’"}, "“": {"”"}} + assert standard_english_quote_convention_set.opening_marks_by_closing_mark == {"’": {"‘"}, "”": {"“"}} + + western_european_quote_convention_set = QuoteConventionSet( + [ + QuoteConvention( + "western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ), + ] + ) + assert western_european_quote_convention_set.closing_marks_by_opening_mark == {"‘": {"’"}, "“": {"”"}, "«": {"»"}} + assert western_european_quote_convention_set.opening_marks_by_closing_mark == {"’": {"‘"}, "”": {"“"}, "»": {"«"}} + + multiple_quote_convention_set = QuoteConventionSet( + [ + QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + 
SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ), + QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ), + QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ), + ] + ) + assert multiple_quote_convention_set.closing_marks_by_opening_mark == { + "‘": {"’"}, + "“": {"”"}, + "„": {"“"}, + "‚": {"‘"}, + "”": {"”"}, + "’": {"’"}, + } + assert multiple_quote_convention_set.opening_marks_by_closing_mark == { + "’": {"‘", "’"}, + "”": {"“", "”"}, + "“": {"„"}, + "‘": {"‚"}, + } + + +def test_get_quote_convention_by_name() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + multiple_quote_convention_set = QuoteConventionSet( + [standard_english_quote_convention, central_european_quote_convention, 
standard_swedish_quote_convention] + ) + + assert ( + multiple_quote_convention_set.get_quote_convention_by_name("standard_english") + == standard_english_quote_convention + ) + assert ( + multiple_quote_convention_set.get_quote_convention_by_name("central_european") + == central_european_quote_convention + ) + assert ( + multiple_quote_convention_set.get_quote_convention_by_name("standard_swedish") + == standard_swedish_quote_convention + ) + assert multiple_quote_convention_set.get_quote_convention_by_name("undefined convention") is None + + +def test_get_all_quote_convention_names() -> None: + assert QuoteConventionSet([]).get_all_quote_convention_names() == [] + assert QuoteConventionSet([QuoteConvention("conv", [])]).get_all_quote_convention_names() == ["conv"] + assert QuoteConventionSet( + [QuoteConvention("conv1", []), QuoteConvention("conv2", [])] + ).get_all_quote_convention_names() == ["conv1", "conv2"] + assert QuoteConventionSet( + [QuoteConvention("conv2", []), QuoteConvention("conv1", [])] + ).get_all_quote_convention_names() == ["conv1", "conv2"] + + +def test_get_possible_opening_marks() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + 
SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.get_possible_opening_marks() == ["‘", "“"] + + central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) + assert central_european_quote_convention_set.get_possible_opening_marks() == ["‚", "„"] + + standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) + assert standard_swedish_quote_convention_set.get_possible_opening_marks() == ["’", "”"] + + multiple_quote_convention_set = QuoteConventionSet( + [standard_english_quote_convention, central_european_quote_convention, standard_swedish_quote_convention] + ) + assert multiple_quote_convention_set.get_possible_opening_marks() == ["‘", "’", "‚", "“", "”", "„"] + + +def test_get_possible_closing_marks() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert 
standard_english_quote_convention_set.get_possible_closing_marks() == ["’", "”"] + + central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) + assert central_european_quote_convention_set.get_possible_closing_marks() == ["‘", "“"] + + standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) + assert standard_swedish_quote_convention_set.get_possible_closing_marks() == ["’", "”"] + + multiple_quote_convention_set = QuoteConventionSet( + [standard_english_quote_convention, central_european_quote_convention, standard_swedish_quote_convention] + ) + assert multiple_quote_convention_set.get_possible_closing_marks() == ["‘", "’", "“", "”"] + + +def test_is_opening_quotation_mark() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + standard_french_quote_convention: QuoteConvention = QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], 
+ ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.is_valid_opening_quotation_mark("‘") + assert standard_english_quote_convention_set.is_valid_opening_quotation_mark("“") + assert not standard_english_quote_convention_set.is_valid_opening_quotation_mark("”") + assert not standard_english_quote_convention_set.is_valid_opening_quotation_mark("’") + assert not standard_english_quote_convention_set.is_valid_opening_quotation_mark("") + assert not standard_english_quote_convention_set.is_valid_opening_quotation_mark("‘“") + + central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) + assert central_european_quote_convention_set.is_valid_opening_quotation_mark("‚") + assert central_european_quote_convention_set.is_valid_opening_quotation_mark("„") + assert not central_european_quote_convention_set.is_valid_opening_quotation_mark("‘") + assert not central_european_quote_convention_set.is_valid_opening_quotation_mark("“") + + standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) + assert standard_swedish_quote_convention_set.is_valid_opening_quotation_mark("’") + assert standard_swedish_quote_convention_set.is_valid_opening_quotation_mark("”") + + standard_french_quote_convention_set = QuoteConventionSet([standard_french_quote_convention]) + assert standard_french_quote_convention_set.is_valid_opening_quotation_mark("«") + assert standard_french_quote_convention_set.is_valid_opening_quotation_mark("‹") + assert not standard_french_quote_convention_set.is_valid_opening_quotation_mark("»") + assert not standard_french_quote_convention_set.is_valid_opening_quotation_mark("›") + + multiple_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + central_european_quote_convention, + standard_swedish_quote_convention, + standard_french_quote_convention, + ] + ) + assert 
multiple_quote_convention_set.get_possible_opening_marks() == ["«", "‘", "’", "‚", "“", "”", "„", "‹"] + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("‘") + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("’") + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("‚") + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("“") + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("”") + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("„") + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("«") + assert multiple_quote_convention_set.is_valid_opening_quotation_mark("‹") + assert not multiple_quote_convention_set.is_valid_opening_quotation_mark("»") + assert not multiple_quote_convention_set.is_valid_opening_quotation_mark("›") + + +def test_is_closing_quotation_mark() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + standard_french_quote_convention: QuoteConvention = QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + 
SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.is_valid_closing_quotation_mark("”") + assert standard_english_quote_convention_set.is_valid_closing_quotation_mark("’") + assert not standard_english_quote_convention_set.is_valid_closing_quotation_mark("‘") + assert not standard_english_quote_convention_set.is_valid_closing_quotation_mark("“") + assert not standard_english_quote_convention_set.is_valid_closing_quotation_mark("") + assert not standard_english_quote_convention_set.is_valid_closing_quotation_mark("”’") + + central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) + assert central_european_quote_convention_set.is_valid_closing_quotation_mark("‘") + assert central_european_quote_convention_set.is_valid_closing_quotation_mark("“") + assert not central_european_quote_convention_set.is_valid_closing_quotation_mark("„") + assert not central_european_quote_convention_set.is_valid_closing_quotation_mark("‚") + + standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) + assert standard_swedish_quote_convention_set.is_valid_closing_quotation_mark("’") + assert standard_swedish_quote_convention_set.is_valid_closing_quotation_mark("”") + + standard_french_quote_convention_set = QuoteConventionSet([standard_french_quote_convention]) + assert standard_french_quote_convention_set.is_valid_closing_quotation_mark("»") + assert standard_french_quote_convention_set.is_valid_closing_quotation_mark("›") + assert not standard_french_quote_convention_set.is_valid_closing_quotation_mark("«") + assert not standard_french_quote_convention_set.is_valid_closing_quotation_mark("‹") + + multiple_quote_convention_set = QuoteConventionSet( + [ + 
standard_english_quote_convention, + central_european_quote_convention, + standard_swedish_quote_convention, + standard_french_quote_convention, + ] + ) + assert multiple_quote_convention_set.get_possible_closing_marks() == ["»", "‘", "’", "“", "”", "›"] + assert multiple_quote_convention_set.is_valid_closing_quotation_mark("‘") + assert multiple_quote_convention_set.is_valid_closing_quotation_mark("’") + assert multiple_quote_convention_set.is_valid_closing_quotation_mark("“") + assert multiple_quote_convention_set.is_valid_closing_quotation_mark("”") + assert multiple_quote_convention_set.is_valid_closing_quotation_mark("»") + assert multiple_quote_convention_set.is_valid_closing_quotation_mark("›") + assert not multiple_quote_convention_set.is_valid_closing_quotation_mark("«") + assert not multiple_quote_convention_set.is_valid_closing_quotation_mark("‹") + + +def test_are_marks_a_valid_pair() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + standard_french_quote_convention: QuoteConvention = QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + 
SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.marks_are_a_valid_pair("“", "”") + assert not standard_english_quote_convention_set.marks_are_a_valid_pair("”", "“") + assert standard_english_quote_convention_set.marks_are_a_valid_pair("‘", "’") + assert not standard_english_quote_convention_set.marks_are_a_valid_pair("’", "‘") + assert not standard_english_quote_convention_set.marks_are_a_valid_pair("‘", "”") + assert not standard_english_quote_convention_set.marks_are_a_valid_pair("‘", "”") + assert not standard_english_quote_convention_set.marks_are_a_valid_pair("‘", "") + assert not standard_english_quote_convention_set.marks_are_a_valid_pair("", "") + + central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) + assert central_european_quote_convention_set.marks_are_a_valid_pair("„", "“") + assert central_european_quote_convention_set.marks_are_a_valid_pair("‚", "‘") + assert not central_european_quote_convention_set.marks_are_a_valid_pair("“", "„") + assert not central_european_quote_convention_set.marks_are_a_valid_pair("’", "‚") + assert not central_european_quote_convention_set.marks_are_a_valid_pair("‚", "“") + assert not central_european_quote_convention_set.marks_are_a_valid_pair("‚", "’") + + standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) + assert standard_swedish_quote_convention_set.marks_are_a_valid_pair("”", "”") + assert standard_swedish_quote_convention_set.marks_are_a_valid_pair("’", "’") + assert not standard_swedish_quote_convention_set.marks_are_a_valid_pair("”", "’") + assert not standard_swedish_quote_convention_set.marks_are_a_valid_pair("’", "”") + + standard_french_quote_convention_set = 
QuoteConventionSet([standard_french_quote_convention]) + assert standard_french_quote_convention_set.marks_are_a_valid_pair("«", "»") + assert standard_french_quote_convention_set.marks_are_a_valid_pair("‹", "›") + assert not standard_french_quote_convention_set.marks_are_a_valid_pair("«", "›") + assert not standard_french_quote_convention_set.marks_are_a_valid_pair("‹", "»") + + multiple_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + central_european_quote_convention, + standard_swedish_quote_convention, + standard_french_quote_convention, + ] + ) + assert multiple_quote_convention_set.marks_are_a_valid_pair("“", "”") + assert multiple_quote_convention_set.marks_are_a_valid_pair("‘", "’") + assert multiple_quote_convention_set.marks_are_a_valid_pair("„", "“") + assert multiple_quote_convention_set.marks_are_a_valid_pair("‚", "‘") + assert multiple_quote_convention_set.marks_are_a_valid_pair("”", "”") + assert multiple_quote_convention_set.marks_are_a_valid_pair("’", "’") + assert multiple_quote_convention_set.marks_are_a_valid_pair("«", "»") + assert multiple_quote_convention_set.marks_are_a_valid_pair("‹", "›") + assert not multiple_quote_convention_set.marks_are_a_valid_pair("‹", "»") + assert not multiple_quote_convention_set.marks_are_a_valid_pair("‹", "”") + assert not multiple_quote_convention_set.marks_are_a_valid_pair("„", "”") + assert not multiple_quote_convention_set.marks_are_a_valid_pair("’", "‘") + + +def test_is_quotation_mark_direction_ambiguous() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + typewriter_english_quote_convention: QuoteConvention = QuoteConvention( + "typewriter_english", + [ + SingleLevelQuoteConvention('"', '"'), + 
SingleLevelQuoteConvention("'", "'"), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + eastern_european_quote_convention = QuoteConvention( + "eastern_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201d"), + SingleLevelQuoteConvention("\u201a", "\u2019"), + SingleLevelQuoteConvention("\u201e", "\u201d"), + SingleLevelQuoteConvention("\u201a", "\u2019"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert not standard_english_quote_convention_set.is_quotation_mark_direction_ambiguous("“") + assert not standard_english_quote_convention_set.is_quotation_mark_direction_ambiguous("”") + assert not standard_english_quote_convention_set.is_quotation_mark_direction_ambiguous("‘") + assert not standard_english_quote_convention_set.is_quotation_mark_direction_ambiguous("’") + assert not standard_english_quote_convention_set.is_quotation_mark_direction_ambiguous('"') + + typewriter_english_quote_convention_set = QuoteConventionSet([typewriter_english_quote_convention]) + assert typewriter_english_quote_convention_set.is_quotation_mark_direction_ambiguous('"') + assert typewriter_english_quote_convention_set.is_quotation_mark_direction_ambiguous("'") + assert not typewriter_english_quote_convention_set.is_quotation_mark_direction_ambiguous("‘") + assert 
not typewriter_english_quote_convention_set.is_quotation_mark_direction_ambiguous("’") + assert not typewriter_english_quote_convention_set.is_quotation_mark_direction_ambiguous("«") + + central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) + assert not central_european_quote_convention_set.is_quotation_mark_direction_ambiguous("“") + assert not central_european_quote_convention_set.is_quotation_mark_direction_ambiguous("„") + assert not central_european_quote_convention_set.is_quotation_mark_direction_ambiguous("‘") + assert not central_european_quote_convention_set.is_quotation_mark_direction_ambiguous("‚") + + standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) + assert standard_swedish_quote_convention_set.is_quotation_mark_direction_ambiguous("”") + assert standard_swedish_quote_convention_set.is_quotation_mark_direction_ambiguous("’") + + eastern_european_quote_convention_set = QuoteConventionSet([eastern_european_quote_convention]) + assert not eastern_european_quote_convention_set.is_quotation_mark_direction_ambiguous("”") + assert not eastern_european_quote_convention_set.is_quotation_mark_direction_ambiguous("„") + assert not eastern_european_quote_convention_set.is_quotation_mark_direction_ambiguous("’") + assert not eastern_european_quote_convention_set.is_quotation_mark_direction_ambiguous("‚") + + multiple_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + typewriter_english_quote_convention, + central_european_quote_convention, + standard_swedish_quote_convention, + eastern_european_quote_convention, + ] + ) + assert multiple_quote_convention_set.is_quotation_mark_direction_ambiguous('"') + assert multiple_quote_convention_set.is_quotation_mark_direction_ambiguous("'") + assert multiple_quote_convention_set.is_quotation_mark_direction_ambiguous("”") + assert multiple_quote_convention_set.is_quotation_mark_direction_ambiguous("’") + 
assert not multiple_quote_convention_set.is_quotation_mark_direction_ambiguous("„") + assert not multiple_quote_convention_set.is_quotation_mark_direction_ambiguous("‚") + + # these are unambiguous because they are never the opening and closing in the same convention + assert not multiple_quote_convention_set.is_quotation_mark_direction_ambiguous("“") + assert not multiple_quote_convention_set.is_quotation_mark_direction_ambiguous("‘") + + +def test_get_possible_paired_quotation_marks() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + central_european_quote_convention: QuoteConvention = QuoteConvention( + "central_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + SingleLevelQuoteConvention("\u201e", "\u201c"), + SingleLevelQuoteConvention("\u201a", "\u2018"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + eastern_european_quote_convention = QuoteConvention( + "eastern_european", + [ + SingleLevelQuoteConvention("\u201e", "\u201d"), + SingleLevelQuoteConvention("\u201a", "\u2019"), + SingleLevelQuoteConvention("\u201e", "\u201d"), + SingleLevelQuoteConvention("\u201a", "\u2019"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.get_possible_paired_quotation_marks("“") == {"”"} + assert 
standard_english_quote_convention_set.get_possible_paired_quotation_marks("”") == {"“"} + assert standard_english_quote_convention_set.get_possible_paired_quotation_marks("‘") == {"’"} + assert standard_english_quote_convention_set.get_possible_paired_quotation_marks("’") == {"‘"} + + central_european_quote_convention_set = QuoteConventionSet([central_european_quote_convention]) + assert central_european_quote_convention_set.get_possible_paired_quotation_marks("„") == {"“"} + assert central_european_quote_convention_set.get_possible_paired_quotation_marks("“") == {"„"} + assert central_european_quote_convention_set.get_possible_paired_quotation_marks("‚") == {"‘"} + assert central_european_quote_convention_set.get_possible_paired_quotation_marks("‘") == {"‚"} + + standard_swedish_quote_convention_set = QuoteConventionSet([standard_swedish_quote_convention]) + assert standard_swedish_quote_convention_set.get_possible_paired_quotation_marks("”") == {"”"} + assert standard_swedish_quote_convention_set.get_possible_paired_quotation_marks("’") == {"’"} + + eastern_european_quote_convention_set = QuoteConventionSet([eastern_european_quote_convention]) + assert eastern_european_quote_convention_set.get_possible_paired_quotation_marks("„") == {"”"} + assert eastern_european_quote_convention_set.get_possible_paired_quotation_marks("”") == {"„"} + assert eastern_european_quote_convention_set.get_possible_paired_quotation_marks("‚") == {"’"} + assert eastern_european_quote_convention_set.get_possible_paired_quotation_marks("’") == {"‚"} + + multiple_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + central_european_quote_convention, + standard_swedish_quote_convention, + eastern_european_quote_convention, + ] + ) + assert multiple_quote_convention_set.get_possible_paired_quotation_marks("“") == {"”", "„"} + assert multiple_quote_convention_set.get_possible_paired_quotation_marks("”") == {"“", "”", "„"} + assert 
multiple_quote_convention_set.get_possible_paired_quotation_marks("‘") == {"’", "‚"} + assert multiple_quote_convention_set.get_possible_paired_quotation_marks("’") == {"‘", "’", "‚"} + assert multiple_quote_convention_set.get_possible_paired_quotation_marks("„") == {"“", "”"} + assert multiple_quote_convention_set.get_possible_paired_quotation_marks("‚") == {"‘", "’"} + + +def test_get_possible_depths() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + british_english_quote_convention: QuoteConvention = QuoteConvention( + "british_english", + [ + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + ], + ) + + normalized_western_european_quote_convention = QuoteConvention( + "western_european_normalized", + [ + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention('"', '"'), + SingleLevelQuoteConvention("'", "'"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.OPENING) == {1, 3} + assert standard_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.CLOSING) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.CLOSING) == {1, 3} + assert standard_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.OPENING) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.OPENING) == {2, 4} + assert 
standard_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.CLOSING) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.CLOSING) == {2, 4} + assert standard_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.OPENING) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.OPENING) == set() + assert standard_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.CLOSING) == set() + assert standard_english_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.OPENING) == set() + assert standard_english_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.CLOSING) == set() + + british_english_quote_convention_set = QuoteConventionSet([british_english_quote_convention]) + assert british_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.OPENING) == {1, 3} + assert british_english_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.CLOSING) == set() + assert british_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.CLOSING) == {1, 3} + assert british_english_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.OPENING) == set() + assert british_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.OPENING) == {2, 4} + assert british_english_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.CLOSING) == set() + assert british_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.CLOSING) == {2, 4} + assert british_english_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.OPENING) == set() + assert british_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.OPENING) == set() + assert 
british_english_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.CLOSING) == set() + assert british_english_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.OPENING) == set() + assert british_english_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.CLOSING) == set() + + normalized_western_european_quote_convention_set = QuoteConventionSet( + [normalized_western_european_quote_convention] + ) + assert normalized_western_european_quote_convention_set.get_possible_depths( + '"', QuotationMarkDirection.OPENING + ) == {1, 2} + assert normalized_western_european_quote_convention_set.get_possible_depths( + '"', QuotationMarkDirection.CLOSING + ) == {1, 2} + assert normalized_western_european_quote_convention_set.get_possible_depths( + "'", QuotationMarkDirection.OPENING + ) == {3} + assert normalized_western_european_quote_convention_set.get_possible_depths( + "'", QuotationMarkDirection.CLOSING + ) == {3} + assert ( + normalized_western_european_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.OPENING) + == set() + ) + assert ( + normalized_western_european_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.CLOSING) + == set() + ) + + multiple_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + british_english_quote_convention, + normalized_western_european_quote_convention, + ] + ) + assert multiple_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.OPENING) == {1, 2, 3, 4} + assert multiple_quote_convention_set.get_possible_depths("\u201c", QuotationMarkDirection.CLOSING) == set() + assert multiple_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.CLOSING) == {1, 2, 3, 4} + assert multiple_quote_convention_set.get_possible_depths("\u201d", QuotationMarkDirection.OPENING) == set() + assert multiple_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.OPENING) 
== {1, 2, 3, 4} + assert multiple_quote_convention_set.get_possible_depths("\u2018", QuotationMarkDirection.CLOSING) == set() + assert multiple_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.CLOSING) == {1, 2, 3, 4} + assert multiple_quote_convention_set.get_possible_depths("\u2019", QuotationMarkDirection.OPENING) == set() + assert multiple_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.OPENING) == set() + assert multiple_quote_convention_set.get_possible_depths("\u201e", QuotationMarkDirection.CLOSING) == set() + assert multiple_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.OPENING) == {1, 2} + assert multiple_quote_convention_set.get_possible_depths('"', QuotationMarkDirection.CLOSING) == {1, 2} + assert multiple_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.OPENING) == {3} + assert multiple_quote_convention_set.get_possible_depths("'", QuotationMarkDirection.CLOSING) == {3} + + +def test_does_metadata_match_quotation_mark() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 1, QuotationMarkDirection.OPENING + ) + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 3, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 2, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 4, QuotationMarkDirection.OPENING + ) + assert not 
standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 1, QuotationMarkDirection.CLOSING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 2, QuotationMarkDirection.CLOSING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 3, QuotationMarkDirection.CLOSING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201c", 4, QuotationMarkDirection.CLOSING + ) + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 1, QuotationMarkDirection.CLOSING + ) + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 3, QuotationMarkDirection.CLOSING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 2, QuotationMarkDirection.CLOSING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 4, QuotationMarkDirection.CLOSING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 1, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 2, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 3, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201d", 4, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 1, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 3, QuotationMarkDirection.OPENING + ) + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 2, QuotationMarkDirection.OPENING + ) + assert 
standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 4, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 1, QuotationMarkDirection.CLOSING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 2, QuotationMarkDirection.CLOSING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 3, QuotationMarkDirection.CLOSING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2018", 4, QuotationMarkDirection.CLOSING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 1, QuotationMarkDirection.CLOSING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 3, QuotationMarkDirection.CLOSING + ) + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 2, QuotationMarkDirection.CLOSING + ) + assert standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 4, QuotationMarkDirection.CLOSING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 1, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 2, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 3, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u2019", 4, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 1, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 1, QuotationMarkDirection.CLOSING + ) + assert 
not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 2, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 2, QuotationMarkDirection.CLOSING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 3, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 3, QuotationMarkDirection.CLOSING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 4, QuotationMarkDirection.OPENING + ) + assert not standard_english_quote_convention_set.metadata_matches_quotation_mark( + "\u201e", 4, QuotationMarkDirection.CLOSING + ) + + +def test_filter_to_compatible_quote_conventions() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + standard_french_quote_convention: QuoteConvention = QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ) + + western_european_quote_convention: QuoteConvention = QuoteConvention( + "western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + standard_swedish_quote_convention: QuoteConvention = QuoteConvention( + "standard_swedish", + [ + SingleLevelQuoteConvention("\u201d", "\u201d"), + SingleLevelQuoteConvention("\u2019", "\u2019"), + SingleLevelQuoteConvention("\u201d", "\u201d"), + 
SingleLevelQuoteConvention("\u2019", "\u2019"), + ], + ) + + standard_english_quote_convention_set = QuoteConventionSet([standard_english_quote_convention]) + assert standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c"], ["\u201d"] + ).get_all_quote_convention_names() == ["standard_english"] + assert standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c", "\u2018"], ["\u201d", "\u2019"] + ).get_all_quote_convention_names() == ["standard_english"] + assert standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c", "\u2018"], ["\u201d"] + ).get_all_quote_convention_names() == ["standard_english"] + assert standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c"], ["\u201d", "\u2019"] + ).get_all_quote_convention_names() == ["standard_english"] + assert ( + standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u2018"], ["\u201d"] + ).get_all_quote_convention_names() + == [] + ) + assert ( + standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c"], ["\u2019"] + ).get_all_quote_convention_names() + == [] + ) + assert ( + standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201d"], ["\u201c"] + ).get_all_quote_convention_names() + == [] + ) + assert ( + standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c", "\u201d"], ["\u201d"] + ).get_all_quote_convention_names() + == [] + ) + assert ( + standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c", "\u201e"], ["\u201d"] + ).get_all_quote_convention_names() + == [] + ) + assert ( + standard_english_quote_convention_set.filter_to_compatible_quote_conventions( + [], [] + ).get_all_quote_convention_names() + == [] + ) + + multiple_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + 
standard_french_quote_convention, + western_european_quote_convention, + standard_swedish_quote_convention, + ] + ) + assert multiple_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c"], ["\u201d"] + ).get_all_quote_convention_names() == ["standard_english"] + assert multiple_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c", "\u2018"], ["\u201d", "\u2019"] + ).get_all_quote_convention_names() == ["standard_english"] + assert multiple_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201d"], ["\u201d"] + ).get_all_quote_convention_names() == ["standard_swedish"] + assert ( + multiple_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u201c"], ["\u201c"] + ).get_all_quote_convention_names() + == [] + ) + assert multiple_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u00ab"], ["\u00bb"] + ).get_all_quote_convention_names() == ["standard_french", "western_european"] + assert multiple_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u00ab", "\u2039"], ["\u00bb"] + ).get_all_quote_convention_names() == ["standard_french"] + assert multiple_quote_convention_set.filter_to_compatible_quote_conventions( + ["\u00ab"], ["\u00bb", "\u201d"] + ).get_all_quote_convention_names() == ["western_european"] + assert ( + multiple_quote_convention_set.filter_to_compatible_quote_conventions([], []).get_all_quote_convention_names() + == [] + ) + + +def test_find_most_similar_convention() -> None: + standard_english_quote_convention: QuoteConvention = QuoteConvention( + "standard_english", + [ + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + standard_french_quote_convention: QuoteConvention = QuoteConvention( + "standard_french", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + 
SingleLevelQuoteConvention("\u2039", "\u203a"), + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u2039", "\u203a"), + ], + ) + + western_european_quote_convention: QuoteConvention = QuoteConvention( + "western_european", + [ + SingleLevelQuoteConvention("\u00ab", "\u00bb"), + SingleLevelQuoteConvention("\u201c", "\u201d"), + SingleLevelQuoteConvention("\u2018", "\u2019"), + ], + ) + + all_three_quote_convention_set = QuoteConventionSet( + [ + standard_english_quote_convention, + standard_french_quote_convention, + western_european_quote_convention, + ] + ) + two_french_quote_convention_set = QuoteConventionSet( + [western_european_quote_convention, standard_french_quote_convention] + ) + + multiple_english_quotes_tabulator = QuotationMarkTabulator() + multiple_english_quotes_tabulator.tabulate( + [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 42, 43), + ] + ) + assert all_three_quote_convention_set.find_most_similar_convention(multiple_english_quotes_tabulator) == ( + standard_english_quote_convention, + 1.0, + ) + + multiple_western_european_quotes_tabulator = QuotationMarkTabulator() + multiple_western_european_quotes_tabulator.tabulate( + [ + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 5, 
6), + QuotationMarkMetadata("\u201d", 2, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 42, 43), + ] + ) + assert all_three_quote_convention_set.find_most_similar_convention(multiple_western_european_quotes_tabulator) == ( + western_european_quote_convention, + 1.0, + ) + + multiple_french_quotes_tabulator = QuotationMarkTabulator() + multiple_french_quotes_tabulator.tabulate( + [ + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2039", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u203a", 2, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 42, 43), + ] + ) + assert all_three_quote_convention_set.find_most_similar_convention(multiple_french_quotes_tabulator) == ( + standard_french_quote_convention, + 1.0, + ) + assert two_french_quote_convention_set.find_most_similar_convention(multiple_french_quotes_tabulator) == ( + standard_french_quote_convention, + 1.0, + ) + + noisy_multiple_english_quotes_tabulator = QuotationMarkTabulator() + noisy_multiple_english_quotes_tabulator.tabulate( + [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u201c", 2, QuotationMarkDirection.OPENING, 
TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u201d", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 42, 43), + ] + ) + assert all_three_quote_convention_set.find_most_similar_convention(noisy_multiple_english_quotes_tabulator) == ( + standard_english_quote_convention, + approx(0.9, rel=1e-9), + ) + assert two_french_quote_convention_set.find_most_similar_convention(noisy_multiple_english_quotes_tabulator) == ( + western_european_quote_convention, + approx(0.1, rel=1e-9), + ) + + noisy_multiple_french_quotes_tabulator = QuotationMarkTabulator() + noisy_multiple_french_quotes_tabulator.tabulate( + [ + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2039", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u203a", 2, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u2039", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u2019", 2, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 14, 15), + QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 28, 29), + QuotationMarkMetadata("\u00bb", 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().build(), 42, 43), + ] + ) + assert all_three_quote_convention_set.find_most_similar_convention(noisy_multiple_french_quotes_tabulator) == ( + standard_french_quote_convention, + 
approx(0.916666666666, rel=1e-9), + ) + + too_deep_english_quotes_tabulator = QuotationMarkTabulator() + too_deep_english_quotes_tabulator.tabulate( + [ + QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1), + QuotationMarkMetadata("\u2018", 2, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 5, 6), + QuotationMarkMetadata("\u201c", 3, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 13, 14), + QuotationMarkMetadata("\u2018", 4, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 15, 16), + QuotationMarkMetadata("\u201c", 5, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 17, 18), + ] + ) + assert all_three_quote_convention_set.find_most_similar_convention(too_deep_english_quotes_tabulator) == ( + standard_english_quote_convention, + approx(0.967741935483871, rel=1e-9), + ) + + # in case of ties, the earlier convention in the list should be returned + unknown_quote_tabulator = QuotationMarkTabulator() + unknown_quote_tabulator.tabulate( + [QuotationMarkMetadata("\u201a", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1)] + ) + assert all_three_quote_convention_set.find_most_similar_convention(unknown_quote_tabulator) == ( + standard_english_quote_convention, + 0.0, + ) + + single_french_opening_quote_tabulator = QuotationMarkTabulator() + single_french_opening_quote_tabulator.tabulate( + [QuotationMarkMetadata("\u00ab", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1)] + ) + assert all_three_quote_convention_set.find_most_similar_convention(single_french_opening_quote_tabulator) == ( + standard_french_quote_convention, + 1.0, + ) + assert two_french_quote_convention_set.find_most_similar_convention(single_french_opening_quote_tabulator) == ( + western_european_quote_convention, + 1.0, + ) + + # Default values should be returned when the QuoteConventionSet is empty + single_english_opening_quote_tabulator = 
QuotationMarkTabulator() + single_english_opening_quote_tabulator.tabulate( + [QuotationMarkMetadata("\u201c", 1, QuotationMarkDirection.OPENING, TextSegment.Builder().build(), 0, 1)] + ) + empty_quote_convention_set = QuoteConventionSet([]) + assert empty_quote_convention_set.find_most_similar_convention(single_english_opening_quote_tabulator) == ( + None, + float("-inf"), + ) diff --git a/tests/corpora/punctuation_analysis/test_text_segment.py b/tests/corpora/punctuation_analysis/test_text_segment.py new file mode 100644 index 00000000..bb8f529d --- /dev/null +++ b/tests/corpora/punctuation_analysis/test_text_segment.py @@ -0,0 +1,270 @@ +from machine.corpora import UsfmToken, UsfmTokenType +from machine.corpora.punctuation_analysis import TextSegment, UsfmMarkerType + + +def test_builder_initialization() -> None: + builder = TextSegment.Builder() + + assert builder._text_segment._text == "" + assert builder._text_segment.previous_segment is None + assert builder._text_segment.next_segment is None + assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER + assert builder._text_segment._markers_in_preceding_context == set() + assert builder._text_segment.index_in_verse == 0 + assert builder._text_segment.num_segments_in_verse == 0 + assert builder._text_segment._usfm_token is None + + +def test_builder_set_text() -> None: + builder = TextSegment.Builder() + text = "Example text" + builder.set_text(text) + + assert builder._text_segment._text == text + + +def test_builder_set_previous_segment() -> None: + builder = TextSegment.Builder() + previous_segment = TextSegment.Builder().set_text("previous segment text").build() + builder.set_previous_segment(previous_segment) + + assert builder._text_segment.previous_segment == previous_segment + assert builder._text_segment.next_segment is None + assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.NO_MARKER + assert builder._text_segment._markers_in_preceding_context == 
set() + assert builder._text_segment.index_in_verse == 0 + assert builder._text_segment.num_segments_in_verse == 0 + + +def test_builder_add_preceding_marker() -> None: + builder = TextSegment.Builder() + builder.add_preceding_marker(UsfmMarkerType.CHAPTER) + + assert builder._text_segment._immediate_preceding_marker is UsfmMarkerType.CHAPTER + assert builder._text_segment._markers_in_preceding_context == {UsfmMarkerType.CHAPTER} + assert builder._text_segment.previous_segment is None + assert builder._text_segment.next_segment is None + + builder.add_preceding_marker(UsfmMarkerType.VERSE) + assert builder._text_segment._immediate_preceding_marker == UsfmMarkerType.VERSE + assert builder._text_segment._markers_in_preceding_context == { + UsfmMarkerType.CHAPTER, + UsfmMarkerType.VERSE, + } + assert builder._text_segment.previous_segment is None + assert builder._text_segment.next_segment is None + + +def test_builder_set_usfm_token() -> None: + builder = TextSegment.Builder() + builder.set_usfm_token(UsfmToken(type=UsfmTokenType.TEXT, text="USFM token text")) + + assert builder._text_segment._usfm_token is not None + assert builder._text_segment._usfm_token.type == UsfmTokenType.TEXT + assert builder._text_segment._usfm_token.text == "USFM token text" + assert builder._text_segment._text == "" + assert builder._text_segment.previous_segment is None + assert builder._text_segment.next_segment is None + + +def test_equals() -> None: + basic_segment = TextSegment.Builder().set_text("text1").build() + same_text_segment = TextSegment.Builder().set_text("text1").build() + different_text_segment = TextSegment.Builder().set_text("different text").build() + + assert basic_segment == basic_segment + assert basic_segment != UsfmToken(type=UsfmTokenType.TEXT, text="text1") + assert basic_segment == same_text_segment + assert basic_segment != different_text_segment + + segment_with_index = TextSegment.Builder().set_text("text1").build() + segment_with_index.index_in_verse = 1 + 
segment_with_same_index = TextSegment.Builder().set_text("text1").build() + segment_with_same_index.index_in_verse = 1 + segment_with_different_index = TextSegment.Builder().set_text("text1").build() + segment_with_different_index.index_in_verse = 2 + + assert segment_with_index == segment_with_same_index + assert segment_with_index != segment_with_different_index + assert segment_with_index != basic_segment + + segment_with_preceding_marker = ( + TextSegment.Builder().set_text("text1").add_preceding_marker(UsfmMarkerType.VERSE).build() + ) + segment_with_same_preceding_marker = ( + TextSegment.Builder().set_text("text1").add_preceding_marker(UsfmMarkerType.VERSE).build() + ) + segment_with_different_preceding_marker = ( + TextSegment.Builder().set_text("text1").add_preceding_marker(UsfmMarkerType.CHAPTER).build() + ) + segment_with_multiple_preceding_markers = ( + TextSegment.Builder() + .set_text("text1") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .build() + ) + + usfm_token = UsfmToken(type=UsfmTokenType.TEXT, text="USFM token text") + segment_with_usfm_token = TextSegment.Builder().set_text("text1").set_usfm_token(usfm_token).build() + segment_with_same_usfm_token = TextSegment.Builder().set_text("text1").set_usfm_token(usfm_token).build() + segment_with_different_usfm_token = ( + TextSegment.Builder() + .set_text("text1") + .set_usfm_token(UsfmToken(type=UsfmTokenType.TEXT, text="Different USFM token text")) + .build() + ) + + assert segment_with_usfm_token == segment_with_same_usfm_token + assert segment_with_usfm_token != segment_with_different_usfm_token + assert basic_segment != segment_with_usfm_token + + # attributes that are not used in equality checks + segment_with_num_verses = TextSegment.Builder().set_text("text1").build() + segment_with_num_verses.num_segments_in_verse = 3 + segment_with_same_num_verses = TextSegment.Builder().set_text("text1").build() + 
segment_with_same_num_verses.num_segments_in_verse = 3 + segment_with_different_num_verses = TextSegment.Builder().set_text("text1").build() + segment_with_different_num_verses.num_segments_in_verse = 4 + + assert segment_with_num_verses == segment_with_same_num_verses + assert segment_with_num_verses != segment_with_different_num_verses + assert segment_with_num_verses != basic_segment + + assert segment_with_preceding_marker == segment_with_same_preceding_marker + assert segment_with_preceding_marker != segment_with_different_preceding_marker + assert segment_with_preceding_marker == segment_with_multiple_preceding_markers + assert segment_with_preceding_marker != basic_segment + + segment_with_previous_segment = TextSegment.Builder().set_text("text1").build() + segment_with_previous_segment.previous_segment = segment_with_num_verses + + segment_with_next_segment = TextSegment.Builder().set_text("text1").build() + segment_with_next_segment.next_segment = segment_with_num_verses + + assert basic_segment == segment_with_previous_segment + assert basic_segment == segment_with_next_segment + + +def test_get_text() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + assert text_segment.text == "example text" + + text_segment = TextSegment.Builder().set_text("new example text").build() + assert text_segment.text == "new example text" + + +def test_length() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + assert text_segment.length == len("example text") + + text_segment = TextSegment.Builder().set_text("new example text").build() + assert text_segment.length == len("new example text") + + +def test_substring_before() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + assert text_segment.substring_before(7) == "example" + assert text_segment.substring_before(8) == "example " + assert text_segment.substring_before(0) == "" + assert text_segment.substring_before(12) == 
"example text" + + +def test_substring_after() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + assert text_segment.substring_after(7) == " text" + assert text_segment.substring_after(8) == "text" + assert text_segment.substring_after(0) == "example text" + assert text_segment.substring_after(12) == "" + assert text_segment.substring_after(11) == "t" + + +def test_is_marker_in_preceding_context() -> None: + no_preceding_marker_segment = TextSegment.Builder().set_text("example text").build() + assert no_preceding_marker_segment.marker_is_in_preceding_context(UsfmMarkerType.CHAPTER) is False + assert no_preceding_marker_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE) is False + assert no_preceding_marker_segment.marker_is_in_preceding_context(UsfmMarkerType.CHARACTER) is False + + one_preceding_marker_text_segment = ( + TextSegment.Builder().set_text("example text").add_preceding_marker(UsfmMarkerType.CHARACTER).build() + ) + + assert one_preceding_marker_text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHARACTER) is True + assert one_preceding_marker_text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE) is False + assert one_preceding_marker_text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHAPTER) is False + + two_preceding_markers_text_segment = ( + TextSegment.Builder() + .set_text("example text") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .build() + ) + assert two_preceding_markers_text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHAPTER) is True + assert two_preceding_markers_text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE) is True + assert two_preceding_markers_text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHARACTER) is False + + three_preceding_markers_text_segment = ( + TextSegment.Builder() + .set_text("example text") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + 
.add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.CHARACTER) + .build() + ) + assert three_preceding_markers_text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHAPTER) is True + assert three_preceding_markers_text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE) is True + assert three_preceding_markers_text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHARACTER) is True + + +def test_is_first_segment_in_verse() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + text_segment.index_in_verse = 0 + assert text_segment.is_first_segment_in_verse() is True + + text_segment.index_in_verse = 1 + assert text_segment.is_first_segment_in_verse() is False + + +def test_is_last_segment_in_verse() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + text_segment.index_in_verse = 0 + text_segment.num_segments_in_verse = 1 + assert text_segment.is_last_segment_in_verse() is True + + text_segment.index_in_verse = 0 + text_segment.num_segments_in_verse = 2 + assert text_segment.is_last_segment_in_verse() is False + + text_segment.index_in_verse = 1 + assert text_segment.is_last_segment_in_verse() is True + + +def test_replace_substring() -> None: + text_segment = TextSegment.Builder().set_text("example text").build() + text_segment.replace_substring(0, 7, "sample") + assert text_segment.text == "sample text" + + text_segment.replace_substring(7, 11, "text") + assert text_segment.text == "sample text" + + text_segment.replace_substring(0, 7, "") + assert text_segment.text == "text" + + text_segment.replace_substring(0, 4, "new'") + assert text_segment.text == "new'" + + text_segment.replace_substring(3, 4, "\u2019") + assert text_segment.text == "new\u2019" + + text_segment.replace_substring(0, 0, "prefix ") + assert text_segment.text == "prefix new\u2019" + + text_segment.replace_substring(0, 0, "") + assert text_segment.text == "prefix new\u2019" + + 
text_segment.replace_substring(11, 11, " suffix") + assert text_segment.text == "prefix new\u2019 suffix" + + text_segment.replace_substring(6, 6, "-") + assert text_segment.text == "prefix- new\u2019 suffix" diff --git a/tests/corpora/punctuation_analysis/test_usfm_structure_extractor.py b/tests/corpora/punctuation_analysis/test_usfm_structure_extractor.py new file mode 100644 index 00000000..26cad441 --- /dev/null +++ b/tests/corpora/punctuation_analysis/test_usfm_structure_extractor.py @@ -0,0 +1,426 @@ +from typing import List + +from machine.corpora import UsfmParser +from machine.corpora.punctuation_analysis import Chapter, TextSegment, UsfmMarkerType, UsfmStructureExtractor, Verse + +verse_text_parser_state = usfm_parser = UsfmParser("").state +verse_text_parser_state.verse_ref.verse_num = 1 + + +def test_chapter_and_verse_markers(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None + + +def test_start_paragraph_marker(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.start_para(verse_text_parser_state, "p", False, None) + 
usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.PARAGRAPH) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None + + +def test_start_character_marker(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.start_char(verse_text_parser_state, "k", False, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.CHARACTER) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None + + +def test_end_character_marker(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.end_char(verse_text_parser_state, "k", None, False) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + 
Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.CHARACTER) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None + + +def test_end_note_marker(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.end_note(verse_text_parser_state, "f", False) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.EMBED) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None + + +def test_end_table_marker(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.end_note(verse_text_parser_state, "tr", False) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + 
.add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.EMBED) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None + + +def test_ref_marker(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.end_note(verse_text_parser_state, "x", False) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.EMBED) + .build() + ] + ) + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None + + +def test_sidebar_marker(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.end_note(verse_text_parser_state, "esb", False) + usfm_structure_extractor.text(verse_text_parser_state, "test") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.EMBED) + .build() + ] + ) + ] + ) + ] + + 
actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None + + +def test_multiple_verses(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + usfm_structure_extractor.verse(verse_text_parser_state, "2", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test2") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .build() + ] + ), + Verse( + [ + TextSegment.Builder() + .set_text("test2") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .build() + ] + ), + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None + assert actual_chapters[0].verses[1]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[1]._text_segments[0].next_segment is None + + +def test_multiple_chapters(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + usfm_structure_extractor.chapter(verse_text_parser_state, "2", "c", None, None) + 
usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test2") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .build() + ] + ), + ] + ), + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test2") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .build() + ] + ), + ] + ), + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert actual_chapters[0].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[0].verses[0]._text_segments[0].next_segment is None + assert actual_chapters[1].verses[0]._text_segments[0].previous_segment is None + assert actual_chapters[1].verses[0]._text_segments[0].next_segment is None + + +def test_character_marker_in_text(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + usfm_structure_extractor.start_char(verse_text_parser_state, "k", False, None) + usfm_structure_extractor.text(verse_text_parser_state, "test2") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .build(), + TextSegment.Builder() + .set_text("test2") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.CHARACTER) + .build(), + ] + ), + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + 
assert_chapter_equal(expected_chapters, actual_chapters) + assert ( + actual_chapters[0].verses[0]._text_segments[1].previous_segment + == expected_chapters[0].verses[0]._text_segments[0] + ) + assert ( + actual_chapters[0].verses[0]._text_segments[0].next_segment == expected_chapters[0].verses[0]._text_segments[1] + ) + + +def test_empty_text(): + usfm_structure_extractor = UsfmStructureExtractor() + usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) + usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) + usfm_structure_extractor.text(verse_text_parser_state, "test") + usfm_structure_extractor.start_char(verse_text_parser_state, "k", False, None) + usfm_structure_extractor.text(verse_text_parser_state, "") + usfm_structure_extractor.end_char(verse_text_parser_state, "k", None, False) + usfm_structure_extractor.text(verse_text_parser_state, "test2") + + expected_chapters = [ + Chapter( + [ + Verse( + [ + TextSegment.Builder() + .set_text("test") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .build(), + TextSegment.Builder() + .set_text("test2") + .add_preceding_marker(UsfmMarkerType.CHAPTER) + .add_preceding_marker(UsfmMarkerType.VERSE) + .add_preceding_marker(UsfmMarkerType.CHARACTER) + .build(), + ] + ), + ] + ) + ] + + actual_chapters = usfm_structure_extractor.get_chapters() + assert_chapter_equal(expected_chapters, actual_chapters) + assert ( + actual_chapters[0].verses[0]._text_segments[1].previous_segment + == expected_chapters[0].verses[0]._text_segments[0] + ) + assert ( + actual_chapters[0].verses[0]._text_segments[0].next_segment == expected_chapters[0].verses[0]._text_segments[1] + ) + + +def assert_chapter_equal(expected_chapters: List[Chapter], actual_chapters: List[Chapter]): + assert len(expected_chapters) == len(actual_chapters) + for expected_chapter, actual_chapter in zip(expected_chapters, actual_chapters): + assert 
len(expected_chapter.verses) == len(actual_chapter.verses) + for expected_verse, actual_verse in zip(expected_chapter.verses, actual_chapter.verses): + assert len(expected_verse._text_segments) == len(actual_verse._text_segments) + for expected_segment, actual_segment in zip(expected_verse._text_segments, actual_verse._text_segments): + assert expected_segment == actual_segment diff --git a/tests/corpora/punctuation_analysis/test_verse.py b/tests/corpora/punctuation_analysis/test_verse.py new file mode 100644 index 00000000..6212e2b3 --- /dev/null +++ b/tests/corpora/punctuation_analysis/test_verse.py @@ -0,0 +1,42 @@ +from machine.corpora.punctuation_analysis import TextSegment, Verse + + +def test_initialize_verse() -> None: + text_segments = [ + TextSegment.Builder().set_text("Segment 1").build(), + TextSegment.Builder().set_text("Segment 2").build(), + TextSegment.Builder().set_text("Segment 3").build(), + ] + + verse = Verse(text_segments) + + assert len(verse.text_segments) == 3 + assert verse.text_segments == text_segments + + +def test_segment_indices() -> None: + text_segments = [ + TextSegment.Builder().set_text("Segment 1").build(), + TextSegment.Builder().set_text("Segment 1").build(), + TextSegment.Builder().set_text("Segment 1").build(), + ] + + verse = Verse(text_segments) + + assert verse.text_segments[0].index_in_verse == 0 + assert verse.text_segments[1].index_in_verse == 1 + assert verse.text_segments[2].index_in_verse == 2 + + +def test_num_segments_in_verse() -> None: + text_segments = [ + TextSegment.Builder().set_text("Segment 1").build(), + TextSegment.Builder().set_text("Segment 2").build(), + TextSegment.Builder().set_text("Segment 3").build(), + ] + + verse = Verse(text_segments) + + assert verse.text_segments[0].num_segments_in_verse == 3 + assert verse.text_segments[1].num_segments_in_verse == 3 + assert verse.text_segments[2].num_segments_in_verse == 3 diff --git a/tests/corpora/test_fallback_quotation_mark_resolver.py 
b/tests/corpora/test_fallback_quotation_mark_resolver.py new file mode 100644 index 00000000..9a889c6c --- /dev/null +++ b/tests/corpora/test_fallback_quotation_mark_resolver.py @@ -0,0 +1,351 @@ +from machine.corpora import FallbackQuotationMarkResolver, QuotationMarkUpdateResolutionSettings +from machine.corpora.punctuation_analysis import ( + STANDARD_QUOTE_CONVENTIONS, + QuotationMarkDirection, + QuotationMarkMetadata, + QuotationMarkResolutionIssue, + QuotationMarkStringMatch, + QuoteConventionDetectionResolutionSettings, + QuoteConventionSet, + TextSegment, +) + + +def test_reset(): + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert english_quote_convention is not None + + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention) + ) + + basic_quotation_mark_resolver._last_quotation_mark = QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('"\'test text"').build(), 0, 1 + ) + basic_quotation_mark_resolver._issues.add(QuotationMarkResolutionIssue.UNEXPECTED_QUOTATION_MARK) + + basic_quotation_mark_resolver.reset() + assert basic_quotation_mark_resolver._last_quotation_mark is None + assert len(basic_quotation_mark_resolver._issues) == 0 + + +def test_simple_quotation_mark_resolution_with_no_previous_mark(): + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert english_quote_convention is not None + + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize()) + ) + + actual_resolved_quotation_marks = list( + basic_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text('test " text').build(), 5, 6), + ] + ) + ) + expected_resolved_quotation_marks = [ + QuotationMarkMetadata( + '"', 1, 
QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('test " text').build(), 5, 6 + ), + ] + + assert_resolved_quotation_marks_equal( + actual_resolved_quotation_marks, + expected_resolved_quotation_marks, + ) + + +def test_simple_quotation_mark_resolution_with_previous_opening_mark(): + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert english_quote_convention is not None + + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize()) + ) + + actual_resolved_quotation_marks = list( + basic_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text('"test " text').build(), 0, 1), + QuotationMarkStringMatch(TextSegment.Builder().set_text('"test " text').build(), 6, 7), + ] + ) + ) + expected_resolved_quotation_marks = [ + QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('"test " text').build(), 0, 1 + ), + QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().set_text('"test " text').build(), 6, 7 + ), + ] + + assert_resolved_quotation_marks_equal( + actual_resolved_quotation_marks, + expected_resolved_quotation_marks, + ) + + +def test_simple_quotation_mark_resolution_with_previous_closing_mark(): + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert english_quote_convention is not None + + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize()) + ) + + actual_resolved_quotation_marks = list( + basic_quotation_mark_resolver.resolve_quotation_marks( + [ + QuotationMarkStringMatch(TextSegment.Builder().set_text('test" " text').build(), 4, 5), + QuotationMarkStringMatch(TextSegment.Builder().set_text('test" " text').build(), 6, 7), + ] + ) + ) + 
expected_resolved_quotation_marks = [ + QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().set_text('test" " text').build(), 4, 5 + ), + QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('test" " text').build(), 6, 7 + ), + ] + + assert_resolved_quotation_marks_equal( + actual_resolved_quotation_marks, + expected_resolved_quotation_marks, + ) + + +def test_is_opening_quote(): + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert english_quote_convention is not None + + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize()) + ) + + # valid opening quote at start of segment + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 0, 1) + assert basic_quotation_mark_resolver._is_opening_quotation_mark(quote_match) is True + + # opening quote with leading whitespace + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('test "text"').build(), 5, 6) + assert basic_quotation_mark_resolver._is_opening_quotation_mark(quote_match) is True + + # opening quote with quote introducer + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('test:"text"').build(), 5, 6) + assert basic_quotation_mark_resolver._is_opening_quotation_mark(quote_match) is True + + # QuotationMarkStringMatch indices don't indicate a quotation mark + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('test "text"').build(), 0, 1) + assert basic_quotation_mark_resolver._is_opening_quotation_mark(quote_match) is False + + # the quotation mark is not valid under the current quote convention + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('').build(), 10, 11) + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is False + + # no trailing 
whitespace after quotation mark + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('"test"text').build(), 5, 6) + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is False + + # opening quote at the start of the segment + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 0, 1) + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is False + + # opening quote with leading whitespace + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text('test "text"').build(), 5, 6) + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is False + + +def test_is_closing_quote_with_unambiguous_quote_convention(): + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert english_quote_convention is not None + + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuoteConventionDetectionResolutionSettings(QuoteConventionSet([english_quote_convention])) + ) + + # unambiguous closing quote at end of segment + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("“test text”").build(), 10, 11) + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is True + + # unambiguous closing quote with trailing whitespace + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("“test” text").build(), 5, 6) + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is True + + # unambiguous closing quote without the "correct" context + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("“test”text").build(), 5, 6) + assert basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is True + + # unambiguous opening quote + quote_match = QuotationMarkStringMatch(TextSegment.Builder().set_text("test “text”").build(), 5, 6) + assert 
basic_quotation_mark_resolver._is_closing_quotation_mark(quote_match) is False + + +def test_resolve_opening_quote(): + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert english_quote_convention is not None + + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize()) + ) + + expected_resolved_quotation_mark = QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.OPENING, TextSegment.Builder().set_text('"test text"').build(), 0, 1 + ) + actual_resolved_quotation_mark = basic_quotation_mark_resolver._resolve_opening_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 0, 1) + ) + assert actual_resolved_quotation_mark == expected_resolved_quotation_mark + assert basic_quotation_mark_resolver._last_quotation_mark == actual_resolved_quotation_mark + + +def test_resolve_closing_quote(): + english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert english_quote_convention is not None + + basic_quotation_mark_resolver = FallbackQuotationMarkResolver( + QuotationMarkUpdateResolutionSettings(english_quote_convention.normalize()) + ) + + expected_resolved_quotation_mark = QuotationMarkMetadata( + '"', 1, QuotationMarkDirection.CLOSING, TextSegment.Builder().set_text('"test text"').build(), 10, 11 + ) + actual_resolved_quotation_mark = basic_quotation_mark_resolver._resolve_closing_mark( + QuotationMarkStringMatch(TextSegment.Builder().set_text('"test text"').build(), 10, 11) + ) + assert actual_resolved_quotation_mark == expected_resolved_quotation_mark + + +def assert_resolved_quotation_marks_equal( + actual_resolved_quotation_marks: list[QuotationMarkMetadata], + expected_resolved_quotation_marks: list[QuotationMarkMetadata], +) -> None: + assert len(actual_resolved_quotation_marks) == len(expected_resolved_quotation_marks) + for actual_mark, 
expected_mark in zip(actual_resolved_quotation_marks, expected_resolved_quotation_marks): + assert actual_mark == expected_mark diff --git a/tests/corpora/test_quotation_denormalization.py b/tests/corpora/test_quotation_denormalization.py new file mode 100644 index 00000000..06327f5b --- /dev/null +++ b/tests/corpora/test_quotation_denormalization.py @@ -0,0 +1,55 @@ +from testutils.corpora_test_helpers import ignore_line_endings + +from machine.corpora import ( + QuotationMarkDenormalizationFirstPass, + QuotationMarkDenormalizationUsfmUpdateBlockHandler, + QuotationMarkUpdateSettings, + UpdateUsfmParserHandler, + parse_usfm, +) +from machine.corpora.punctuation_analysis import STANDARD_QUOTE_CONVENTIONS + + +def test_full_quotation_denormalization_pipeline() -> None: + normalized_usfm = """ + \\id GEN + \\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + 'You shall not eat of any tree of the garden'?" + \\v 2 The woman said to the serpent, + "We may eat fruit from the trees of the garden, + \\v 3 but not the fruit of the tree which is in the middle of the garden. + God has said, 'You shall not eat of it. You shall not touch it, lest you die.'" + """ + + expected_denormalized_usfm = """\\id GEN +\\c 1 +\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?” +\\v 2 The woman said to the serpent, “We may eat fruit from the trees of the garden, +\\v 3 but not the fruit of the tree which is in the middle of the garden. God has said, ‘You shall not eat of it. 
You shall not touch it, lest you die.’” +""" # noqa: E501 + + standard_english_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name("standard_english") + assert standard_english_quote_convention is not None + + quotation_mark_denormalization_first_pass = QuotationMarkDenormalizationFirstPass( + standard_english_quote_convention, standard_english_quote_convention + ) + + parse_usfm(normalized_usfm, quotation_mark_denormalization_first_pass) + best_chapter_strategies = quotation_mark_denormalization_first_pass.find_best_chapter_strategies() + + quotation_mark_denormalizer = QuotationMarkDenormalizationUsfmUpdateBlockHandler( + standard_english_quote_convention, + standard_english_quote_convention, + QuotationMarkUpdateSettings(chapter_strategies=best_chapter_strategies), + ) + + updater = UpdateUsfmParserHandler(update_block_handlers=[quotation_mark_denormalizer]) + parse_usfm(normalized_usfm, updater) + + actual_denormalized_usfm = updater.get_usfm() + + ignore_line_endings(actual_denormalized_usfm, expected_denormalized_usfm) diff --git a/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py b/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py new file mode 100644 index 00000000..e6a20b1d --- /dev/null +++ b/tests/corpora/test_quotation_denormalization_usfm_block_update_handler.py @@ -0,0 +1,417 @@ +from typing import Union + +from machine.corpora import ( + QuotationMarkDenormalizationUsfmUpdateBlockHandler, + QuotationMarkUpdateSettings, + QuotationMarkUpdateStrategy, + UpdateUsfmParserHandler, + parse_usfm, +) +from machine.corpora.punctuation_analysis import STANDARD_QUOTE_CONVENTIONS, QuoteConvention + +simple_normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + 'You shall not eat of any tree of the garden'?" 
+ """ + + +def test_simple_english_quote_denormalization() -> None: + normalized_usfm = simple_normalized_usfm + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_english", "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_british_english_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, 'Has God really said, + "You shall not eat of any tree of the garden"?' + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?’" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "british_english", "british_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +# no denormalization should be needed for this example +def test_simple_typewriter_english_quote_denormalization() -> None: + normalized_usfm = simple_normalized_usfm + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
def test_simple_hybrid_typewriter_english_quote_denormalization() -> None:
    """Denormalize the shared sample into the hybrid typewriter English convention.

    Only some of the marks are expected to change: the expected text has curly
    double quotes but keeps the straight single quotes of the input.
    """
    expected_verse = (
        "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to "
        "the woman, “Has God really said, 'You shall not eat of any tree of the garden'?”"
    )

    actual_usfm = denormalize_quotation_marks(
        simple_normalized_usfm,
        source_quote_convention_name="standard_english",
        target_quote_convention_name="hybrid_typewriter_english",
    )

    assert_usfm_equal(actual_usfm, "\\c 1\n" + expected_verse)
def test_simple_typewriter_western_european_quote_denormalization() -> None:
    """Check that typewriter Western European input passes through denormalization unchanged.

    Source and target are both ``typewriter_western_european``, so the expected
    verse text is the input verse text joined onto one line.
    """
    # NOTE(review): the quoted passage in both literals was reduced to "<>" (the text
    # between the angle brackets had been stripped). It is restored here as
    # <<...>> around a straight-double-quoted inner quote, which matches the
    # single-quoted expected literal -- confirm against the convention definition.
    normalized_usfm = """\\c 1
    \\v 1 Now the serpent was more subtle than any animal
    of the field which Yahweh God had made.
    He said to the woman, <<Has God really said,
    "You shall not eat of any tree of the garden"?>>
    """
    expected_usfm = (
        "\\c 1\n"
        + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to "
        + 'the woman, <<Has God really said, "You shall not eat of any tree of the garden"?>>'
    )

    observed_usfm = denormalize_quotation_marks(
        normalized_usfm, "typewriter_western_european", "typewriter_western_european"
    )
    assert_usfm_equal(observed_usfm, expected_usfm)
+ """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + 'the woman, "Has God really said, ?"' + ) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, "typewriter_western_european_variant", "typewriter_western_european_variant" + ) + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_hybrid_typewriter_western_european_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + "You shall not eat of any tree of the garden"?" + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + 'the woman, «Has God really said, "You shall not eat of any tree of the garden"?»' + ) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, "hybrid_typewriter_western_european", "hybrid_typewriter_western_european" + ) + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_central_european_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + "You shall not eat of any tree of the garden"?" + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, „Has God really said, ‚You shall not eat of any tree of the garden‘?“" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "central_european", "central_european") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_central_european_guillemets_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + ›You shall not eat of any tree of the garden‹?" + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, »Has God really said, ›You shall not eat of any tree of the garden‹?«" + ) + + observed_usfm = denormalize_quotation_marks( + normalized_usfm, "central_european_guillemets", "central_european_guillemets" + ) + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_swedish_quote_denormalization() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + 'You shall not eat of any tree of the garden'?" + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, ”Has God really said, ’You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = denormalize_quotation_marks(normalized_usfm, "standard_swedish", "standard_swedish") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_simple_finnish_quote_denormalization() -> None: + normalized_usfm = simple_normalized_usfm + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
def test_simple_eastern_european_quote_denormalization() -> None:
    """Denormalize the shared sample from standard English into the Eastern European convention."""
    chapter_marker = "\\c 1\n"
    verse_opening = (
        "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to "
    )
    quoted_speech = "the woman, „Has God really said, ‚You shall not eat of any tree of the garden’?”"

    result = denormalize_quotation_marks(simple_normalized_usfm, "standard_english", "eastern_european")

    assert_usfm_equal(result, chapter_marker + verse_opening + quoted_speech)
def test_fallback_quotation_denormalization_same_as_full() -> None:
    """Run the shared sample with APPLY_FALLBACK as the default chapter strategy.

    The expected text uses curly double quotes outside and curly single quotes inside.
    """
    fallback_settings = QuotationMarkUpdateSettings(
        default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK
    )
    expected = "".join(
        [
            "\\c 1\n",
            "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. "
            "He said to ",
            "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”",
        ]
    )

    observed = denormalize_quotation_marks(
        simple_normalized_usfm, "standard_english", "standard_english", fallback_settings
    )

    assert_usfm_equal(observed, expected)
def denormalize_quotation_marks(
    normalized_usfm: str,
    source_quote_convention_name: str,
    target_quote_convention_name: str,
    quotation_denormalization_settings: Union[QuotationMarkUpdateSettings, None] = None,
) -> str:
    """Parse ``normalized_usfm`` through a denormalization handler and return the updated USFM.

    Args:
        normalized_usfm: USFM text to run through the update handler.
        source_quote_convention_name: Name used to look up the source quote convention.
        target_quote_convention_name: Name used to look up the target quote convention.
        quotation_denormalization_settings: Settings passed to the handler. When omitted,
            a new ``QuotationMarkUpdateSettings()`` is created for this call.

    Returns:
        The USFM text produced by ``UpdateUsfmParserHandler.get_usfm()``.
    """
    # Build the default per call: a default written as `QuotationMarkUpdateSettings()` in the
    # signature is evaluated once at definition time and would be shared by every test.
    if quotation_denormalization_settings is None:
        quotation_denormalization_settings = QuotationMarkUpdateSettings()

    quotation_denormalizer: QuotationMarkDenormalizationUsfmUpdateBlockHandler = (
        create_quotation_denormalization_usfm_update_block_handler(
            source_quote_convention_name, target_quote_convention_name, quotation_denormalization_settings
        )
    )

    updater = UpdateUsfmParserHandler(update_block_handlers=[quotation_denormalizer])
    parse_usfm(normalized_usfm, updater)

    return updater.get_usfm()
def assert_usfm_equal(observed_usfm: str, expected_usfm: str) -> None:
    """Assert that two USFM strings have the same lines, ignoring per-line surrounding whitespace.

    Each string is split on ``"\\n"`` and every line is stripped before comparison, so
    indentation and ``"\\r"`` line-ending differences are ignored. Trailing whitespace and
    trailing blank lines at the end of either string are ignored as well.

    Args:
        observed_usfm: USFM text produced by the code under test.
        expected_usfm: USFM text the test expects.

    Raises:
        AssertionError: If the number of lines differs or any pair of lines differs.
    """
    observed_lines = [line.strip() for line in observed_usfm.rstrip().split("\n")]
    expected_lines = [line.strip() for line in expected_usfm.rstrip().split("\n")]

    # zip() stops at the shorter input, so compare the line counts explicitly;
    # otherwise missing or extra lines would go unnoticed.
    assert len(observed_lines) == len(expected_lines), (
        f"line count differs: observed {len(observed_lines)}, expected {len(expected_lines)}"
    )

    for line_number, (observed_line, expected_line) in enumerate(zip(observed_lines, expected_lines), start=1):
        assert observed_line == expected_line, (
            f"line {line_number} differs: observed {observed_line!r}, expected {expected_line!r}"
        )
get_quote_convention_by_name("standard_english"), + get_quote_convention_by_name("standard_english"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_french"), + get_quote_convention_by_name("british_english"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("typewriter_western_european"), + get_quote_convention_by_name("standard_russian"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("typewriter_western_european_variant"), + get_quote_convention_by_name("standard_arabic"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("central_european"), + get_quote_convention_by_name("british_typewriter_english"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_swedish"), + get_quote_convention_by_name("typewriter_french"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_finnish"), + get_quote_convention_by_name("british_inspired_western_european"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("eastern_european"), + get_quote_convention_by_name("central_european"), + ) + is True + ) + + # Cases where we expect fallback mode to fail + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_english"), + get_quote_convention_by_name("western_european"), + ) + is False + ) + + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("typewriter_french"), + get_quote_convention_by_name("western_european"), + ) + is False + ) + + assert ( + 
first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_french"), + get_quote_convention_by_name("french_variant"), + ) + is False + ) + + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("central_european"), + get_quote_convention_by_name("typewriter_western_european"), + ) + is False + ) + + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("eastern_european"), + get_quote_convention_by_name("standard_russian"), + ) + is False + ) + + +def test_check_whether_fallback_mode_will_work_with_normalized_conventions() -> None: + + first_pass_analyzer = QuotationMarkUpdateFirstPass(QuoteConvention("", []), QuoteConvention("", [])) + + # Cases where we expect fallback mode to work + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_english").normalize(), + get_quote_convention_by_name("standard_english"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_french").normalize(), + get_quote_convention_by_name("british_english"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("typewriter_western_european").normalize(), + get_quote_convention_by_name("standard_russian"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("typewriter_western_european_variant").normalize(), + get_quote_convention_by_name("standard_arabic"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("central_european").normalize(), + get_quote_convention_by_name("british_typewriter_english"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + 
get_quote_convention_by_name("standard_swedish").normalize(), + get_quote_convention_by_name("typewriter_french"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_finnish").normalize(), + get_quote_convention_by_name("british_inspired_western_european"), + ) + is True + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("eastern_european").normalize(), + get_quote_convention_by_name("central_european"), + ) + is True + ) + + # Cases where we expect fallback mode to fail + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("western_european").normalize(), + get_quote_convention_by_name("standard_english"), + ) + is False + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("french_variant").normalize(), + get_quote_convention_by_name("hybrid_typewriter_english"), + ) + is False + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("british_inspired_western_european").normalize(), + get_quote_convention_by_name("standard_russian"), + ) + is False + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("typewriter_english").normalize(), + get_quote_convention_by_name("western_european"), + ) + is False + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("central_european_guillemets").normalize(), + get_quote_convention_by_name("french_variant"), + ) + is False + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + get_quote_convention_by_name("standard_arabic").normalize(), + get_quote_convention_by_name("hybrid_typewriter_english"), + ) + is False + ) + assert ( + first_pass_analyzer._check_whether_fallback_mode_will_work( + 
def test_choose_best_action_for_chapter() -> None:
    """Check the strategy chosen for a single-verse chapter across several quotation-mark problems.

    Each case is (description, verse text, source convention name, target convention
    name, expected strategy). The verse text is passed to ``run_first_pass_on_chapter``
    as a one-element list.
    """
    cases = [
        (
            "verse text with no issues",
            "Now the serpent was more subtle than any animal "
            + "of the field which Yahweh God had made. "
            + "He said to the woman, “Has God really said, "
            + "‘You shall not eat of any tree of the garden’?”",
            "standard_english",
            "standard_english",
            QuotationMarkUpdateStrategy.APPLY_FULL,
        ),
        (
            "verse text with unpaired opening quotation mark",
            "Now the serpent was more subtle than any animal "
            + "of the field which Yahweh God had made. "
            + "He said to the woman, “Has God really said, "
            + "‘You shall not eat of any tree of the garden’?",
            "standard_english",
            "standard_english",
            QuotationMarkUpdateStrategy.APPLY_FALLBACK,
        ),
        (
            "verse text with unpaired closing quotation mark",
            "Now the serpent was more subtle than any animal "
            + "of the field which Yahweh God had made. "
            + "He said to the woman, Has God really said, "
            + "You shall not eat of any tree of the garden?”",
            "standard_english",
            "standard_english",
            QuotationMarkUpdateStrategy.APPLY_FALLBACK,
        ),
        (
            "verse text with too deeply nested quotation marks",
            "“Now the serpent was more “subtle than any animal "
            + "of the “field which “Yahweh God had made. "
            + "He said to the woman, “Has God really said, "
            + "“You shall not eat of any tree of the garden?",
            "standard_english",
            "standard_english",
            QuotationMarkUpdateStrategy.APPLY_FALLBACK,
        ),
        # This case used to be listed twice verbatim; one copy is enough.
        (
            "verse text with an ambiguous quotation mark",
            "Now the serpent was more subtle than any animal "
            + "of the field which Yahweh God had made. "
            + 'He said to the woman"Has God really said, '
            + "You shall not eat of any tree of the garden?",
            "typewriter_english",
            "standard_english",
            QuotationMarkUpdateStrategy.SKIP,
        ),
        (
            "verse text with too deeply nested ambiguous quotation marks",
            '"Now the serpent was more "subtle than any animal '
            + 'of the "field which "Yahweh God had made. '
            + 'He said to the woman, "Has God really said, '
            + '"You shall not eat of any tree of the garden?',
            "typewriter_english",
            "standard_english",
            QuotationMarkUpdateStrategy.SKIP,
        ),
    ]

    for description, verse_text, source_name, target_name, expected_action in cases:
        actual_action = run_first_pass_on_chapter([verse_text], source_name, target_name)
        # Name the failing case, since all cases share one assert statement.
        assert actual_action == expected_action, description
def test_choose_best_action_based_on_observed_issues_with_basic_fallback() -> None:
    """Check the strategy chosen per issue set when ``_will_fallback_mode_work`` is True.

    In the expectations below, every set containing AMBIGUOUS_QUOTATION_MARK maps to
    SKIP, the other non-empty sets map to APPLY_FALLBACK, and the empty set maps to
    APPLY_FULL.
    """
    analyzer = QuotationMarkUpdateFirstPass(QuoteConvention("", []), QuoteConvention("", []))
    analyzer._will_fallback_mode_work = True

    unpaired = QuotationMarkResolutionIssue.UNPAIRED_QUOTATION_MARK
    ambiguous = QuotationMarkResolutionIssue.AMBIGUOUS_QUOTATION_MARK
    too_deep = QuotationMarkResolutionIssue.TOO_DEEP_NESTING

    expectations = [
        # no issues
        (set(), QuotationMarkUpdateStrategy.APPLY_FULL),
        # one issue
        ({unpaired}, QuotationMarkUpdateStrategy.APPLY_FALLBACK),
        ({ambiguous}, QuotationMarkUpdateStrategy.SKIP),
        ({too_deep}, QuotationMarkUpdateStrategy.APPLY_FALLBACK),
        # multiple issues
        ({ambiguous, unpaired}, QuotationMarkUpdateStrategy.SKIP),
        ({ambiguous, too_deep}, QuotationMarkUpdateStrategy.SKIP),
        ({too_deep, unpaired}, QuotationMarkUpdateStrategy.APPLY_FALLBACK),
    ]

    for observed_issues, expected_strategy in expectations:
        chosen_strategy = analyzer._choose_best_strategy_based_on_observed_issues(observed_issues)
        assert chosen_strategy == expected_strategy
+ """ + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FALLBACK] + observed_actions = run_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_ambiguous_quotation_mark() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman"Has God really said, + You shall not eat of any tree of the garden? + """ + expected_actions = [QuotationMarkUpdateStrategy.SKIP] + observed_actions = run_first_pass(normalized_usfm, "typewriter_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_no_issues_in_multiple_chapters() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + \\c 2 \\v 1 He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + """ + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.APPLY_FULL] + observed_actions = run_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_unpaired_quotation_mark_in_second_chapter() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + \\c 2 \\v 1 He said to the woman, Has God really said, + You shall not eat of any tree of the garden?” + """ + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.APPLY_FALLBACK] + observed_actions = run_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_unpaired_quotation_mark_in_first_chapter() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had” made. 
+ \\c 2 \\v 1 He said to the woman, Has God really said, + “You shall not eat of any tree of the garden?” + """ + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.APPLY_FULL] + observed_actions = run_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_ambiguous_quotation_mark_in_second_chapter() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + \\c 2 \\v 1 He said to the woman, Has God really said, + You shall not"eat of any tree of the garden?" + """ + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.SKIP] + observed_actions = run_first_pass(normalized_usfm, "typewriter_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_ambiguous_quotation_mark_in_first_chapter() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field"which Yahweh God had made. + \\c 2 \\v 1 He said to the woman, Has God really said, + "You shall not eat of any tree of the garden?" + """ + expected_actions = [QuotationMarkUpdateStrategy.SKIP, QuotationMarkUpdateStrategy.APPLY_FULL] + observed_actions = run_first_pass(normalized_usfm, "typewriter_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_unpaired_quotation_mark_in_both_chapters() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had” made. 
+ \\c 2 \\v 1 He said to the woman, Has God really said, + You shall not eat of any tree of the garden?” + """ + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.APPLY_FALLBACK] + observed_actions = run_first_pass(normalized_usfm, "standard_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_ambiguous_quotation_mark_in_both_chapters() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had"made. + \\c 2 \\v 1 He said to the woman, Has God really said, + You shall not eat of any"tree of the garden? + """ + expected_actions = [QuotationMarkUpdateStrategy.SKIP, QuotationMarkUpdateStrategy.SKIP] + observed_actions = run_first_pass(normalized_usfm, "typewriter_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_unpaired_in_first_ambiguous_in_second() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made." + \\c 2 \\v 1 He said to the woman, Has God really said, + You shall not eat of any"tree of the garden? + """ + expected_actions = [QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.SKIP] + observed_actions = run_first_pass(normalized_usfm, "typewriter_english", "standard_english") + + assert expected_actions == observed_actions + + +def test_ambiguous_in_first_unpaired_in_second() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God"had made. + \\c 2 \\v 1 He said to the woman, Has God really said, + You shall not eat of any tree of the garden?" 
+ """ + expected_actions = [QuotationMarkUpdateStrategy.SKIP, QuotationMarkUpdateStrategy.APPLY_FALLBACK] + observed_actions = run_first_pass(normalized_usfm, "typewriter_english", "standard_english") + + assert expected_actions == observed_actions + + +def run_first_pass( + normalized_usfm: str, source_quote_convention_name: str, target_quote_convention_name: str +) -> List[QuotationMarkUpdateStrategy]: + source_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(source_quote_convention_name) + assert source_quote_convention is not None + + target_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(target_quote_convention_name) + assert target_quote_convention is not None + + first_pass_analyzer = QuotationMarkUpdateFirstPass(source_quote_convention, target_quote_convention) + parse_usfm(normalized_usfm, first_pass_analyzer) + + return first_pass_analyzer.find_best_chapter_strategies() + + +def run_first_pass_on_chapter( + verse_texts: List[str], source_quote_convention_name: str, target_quote_convention_name: str +) -> QuotationMarkUpdateStrategy: + source_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(source_quote_convention_name) + assert source_quote_convention is not None + + target_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(target_quote_convention_name) + assert target_quote_convention is not None + + first_pass_analyzer = QuotationMarkUpdateFirstPass(source_quote_convention, target_quote_convention) + + chapter = Chapter([Verse([TextSegment.Builder().set_text(verse_text).build() for verse_text in verse_texts])]) + + return first_pass_analyzer._find_best_strategy_for_chapter(chapter) + + +def get_quote_convention_by_name(name: str) -> QuoteConvention: + quote_convention: Union[QuoteConvention, None] = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(name) + assert quote_convention is not None + return quote_convention diff --git 
a/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py new file mode 100644 index 00000000..f4b287de --- /dev/null +++ b/tests/corpora/test_quote_convention_changing_usfm_block_update_handler.py @@ -0,0 +1,859 @@ +from typing import Generator, List, Set, Union + +from machine.corpora import ( + QuotationMarkUpdateSettings, + QuotationMarkUpdateStrategy, + QuoteConventionChangingUsfmUpdateBlockHandler, + ScriptureRef, + UpdateUsfmParserHandler, + UsfmToken, + UsfmTokenType, + UsfmUpdateBlock, + UsfmUpdateBlockElement, + UsfmUpdateBlockElementType, + parse_usfm, +) +from machine.corpora.punctuation_analysis import ( + STANDARD_QUOTE_CONVENTIONS, + QuotationMarkDirection, + QuotationMarkFinder, + QuotationMarkMetadata, + QuotationMarkResolutionIssue, + QuotationMarkResolver, + QuotationMarkStringMatch, + QuoteConventionSet, + TextSegment, + UsfmMarkerType, +) + + +def test_quotes_spanning_verses() -> None: + input_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really said, + \\v 2 “You shall not eat of any tree of the garden”?» + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, \n" + + "\\v 2 ‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = change_quotation_marks(input_usfm, "western_european", "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_single_embed() -> None: + input_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + \\f + \\ft «This is a “footnote”» \\f* + of the field which Yahweh God had made. 
+ """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal " + + "\\f + \\ft “This is a ‘footnote’” \\f* of the field which Yahweh God had made." + ) + + observed_usfm = change_quotation_marks(input_usfm, "western_european", "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_multiple_embeds() -> None: + input_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + \\f + \\ft «This is a “footnote”» \\f* + of the field \\f + \\ft Second «footnote» here \\f* which Yahweh God had made. + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal " + + "\\f + \\ft “This is a ‘footnote’” \\f* of the field \\f + \\ft Second " + + "“footnote” here \\f* which Yahweh God had made." + ) + + observed_usfm = change_quotation_marks(input_usfm, "western_european", "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_quotes_in_text_and_embed() -> None: + input_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, «Has God really \\f + \\ft a + «footnote» in the «midst of “text”» \\f* said, + “You shall not eat of any tree of the garden”?» + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really \\f + \\ft a “footnote” in the “midst of ‘text’” \\f* " + + "said, ‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = change_quotation_marks(input_usfm, "western_european", "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_quotes_in_multiple_verses_and_embed() -> None: + input_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. 
+ He said to the woman, «Has God + \\v 2 really \\f + \\ft a + «footnote» in the «midst of “text”» \\f* said, + “You shall not eat of any tree of the garden”?» + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God\n" + + "\\v 2 really \\f + \\ft a “footnote” in the “midst of ‘text’” \\f* " + + "said, ‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = change_quotation_marks(input_usfm, "western_european", "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +# Fallback mode does not consider the nesting of quotation marks, +# but only determines opening/closing marks and maps based on that. +def test_fallback_strategy_same_as_full() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ‘Has God really said, + “You shall not eat of any tree of the garden”?’ + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "british_english", + "standard_english", + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + ) + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_fallback_strategy_incorrectly_nested() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ‘Has God really said, + ‘You shall not eat of any tree of the garden’?’ + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, “You shall not eat of any tree of the garden”?”" + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "british_english", + "standard_english", + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + ) + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_fallback_strategy_incorrectly_nested_second_case() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?’ + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, ‘Has God really said, “You shall not eat of any tree of the garden”?”" + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "british_english", + "standard_english", + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + ) + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_fallback_strategy_unclosed_quote() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, ‘Has God really said, + You shall not eat of any tree of the garden”?’ + """ + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "british_english", + "standard_english", + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + ) + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_default_quotation_mark_update_strategy() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + You shall not eat of any tree of the garden'?" + """ + expected_full_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ) + + expected_basic_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + expected_skipped_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + 'the woman, "Has God really said, You shall not eat of any tree of the garden\'?"' + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + ) + assert_usfm_equal(observed_usfm, expected_full_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FULL), + ) + assert_usfm_equal(observed_usfm, expected_full_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.APPLY_FALLBACK), + ) + assert_usfm_equal(observed_usfm, expected_basic_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings(default_chapter_strategy=QuotationMarkUpdateStrategy.SKIP), + ) + assert_usfm_equal(observed_usfm, expected_skipped_usfm) + + +def test_single_chapter_quotation_mark_update_strategy() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, "Has God really said, + You shall not eat of any tree of the garden'?" + """ + expected_full_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ) + + expected_basic_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + expected_skipped_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. 
He said to " + + 'the woman, "Has God really said, You shall not eat of any tree of the garden\'?"' + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings(chapter_strategies=[QuotationMarkUpdateStrategy.APPLY_FULL]), + ) + assert_usfm_equal(observed_usfm, expected_full_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings(chapter_strategies=[QuotationMarkUpdateStrategy.APPLY_FALLBACK]), + ) + assert_usfm_equal(observed_usfm, expected_basic_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings(chapter_strategies=[QuotationMarkUpdateStrategy.SKIP]), + ) + assert_usfm_equal(observed_usfm, expected_skipped_usfm) + + +def test_multiple_chapter_same_strategy() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle" than any animal + of the field which Yahweh God had made. + \\c 2 + \\v 1 He said to the woman, "Has God really said, + You shall not eat of any tree of the garden'?" 
+ """ + expected_full_usfm = ( + "\\c 1\n" + + '\\v 1 Now the serpent was more subtle" than any animal of the field which Yahweh God had made.\n' + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ) + + expected_fallback_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings( + chapter_strategies=[QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.APPLY_FULL] + ), + ) + assert_usfm_equal(observed_usfm, expected_full_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings( + chapter_strategies=[QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.APPLY_FALLBACK] + ), + ) + assert_usfm_equal(observed_usfm, expected_fallback_usfm) + + +def test_multiple_chapter_multiple_strategies() -> None: + normalized_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle" than any animal + of the field which Yahweh God had made. + \\c 2 + \\v 1 He said to the woman, "Has God really said, + You shall not eat of any tree of the garden'?" 
+ """ + expected_full_then_fallback_usfm = ( + "\\c 1\n" + + '\\v 1 Now the serpent was more subtle" than any animal of the field which Yahweh God had made.\n' + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden’?”" + ) + + expected_fallback_then_full_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + "\\v 1 He said to the woman, “Has God really said, You shall not eat of any tree of the garden'?”" + ) + + expected_fallback_then_skip_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle” than any animal of the field which Yahweh God had made.\n" + + "\\c 2\n" + + '\\v 1 He said to the woman, "Has God really said, You shall not eat of any tree of the garden\'?"' + ) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings( + chapter_strategies=[QuotationMarkUpdateStrategy.APPLY_FULL, QuotationMarkUpdateStrategy.APPLY_FALLBACK] + ), + ) + assert_usfm_equal(observed_usfm, expected_full_then_fallback_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings( + chapter_strategies=[QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.APPLY_FULL] + ), + ) + assert_usfm_equal(observed_usfm, expected_fallback_then_full_usfm) + + observed_usfm = change_quotation_marks( + normalized_usfm, + "typewriter_english", + "standard_english", + QuotationMarkUpdateSettings( + chapter_strategies=[QuotationMarkUpdateStrategy.APPLY_FALLBACK, QuotationMarkUpdateStrategy.SKIP] + ), + ) + assert_usfm_equal(observed_usfm, expected_fallback_then_skip_usfm) + + +def test_multi_character_quotation_marks_in_source_quote_convention() -> None: + input_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God 
had made. + He said to the woman, <?>> + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, “Has God really said, " + + "‘You shall not eat of any tree of the garden’?”" + ) + + observed_usfm = change_quotation_marks(input_usfm, "typewriter_french", "standard_english") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_multi_character_quotation_marks_in_target_quote_convention() -> None: + input_usfm = """\\c 1 + \\v 1 Now the serpent was more subtle than any animal + of the field which Yahweh God had made. + He said to the woman, “Has God really said, + ‘You shall not eat of any tree of the garden’?” + """ + + expected_usfm = ( + "\\c 1\n" + + "\\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to " + + "the woman, <?>>" + ) + + observed_usfm = change_quotation_marks(input_usfm, "standard_english", "typewriter_french") + assert_usfm_equal(observed_usfm, expected_usfm) + + +def test_process_scripture_element() -> None: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("standard_english", "british_english") + ) + quote_convention_changer._quotation_mark_finder = MockQuotationMarkFinder() + + update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.TEXT, + tokens=[UsfmToken(UsfmTokenType.TEXT, text="test segment")], + ) + mock_quotation_mark_resolver: QuotationMarkResolver = MockQuotationMarkResolver() + quote_convention_changer._process_scripture_element(update_element, mock_quotation_mark_resolver) + + assert quote_convention_changer._quotation_mark_finder.num_times_called == 1 + assert mock_quotation_mark_resolver.num_times_called == 1 + assert quote_convention_changer._quotation_mark_finder.matches_to_return[0]._text_segment._text == "this is a ‘test" + assert ( + 
quote_convention_changer._quotation_mark_finder.matches_to_return[1]._text_segment._text + == "the test ends” here" + ) + + +def test_create_text_segments_basic() -> None: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("standard_english", "standard_english") + ) + + update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.TEXT, tokens=[UsfmToken(UsfmTokenType.TEXT, text="test segment")] + ) + text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) + + assert len(text_segments) == 1 + assert text_segments[0]._text == "test segment" + assert text_segments[0]._immediate_preceding_marker is UsfmMarkerType.NO_MARKER + assert text_segments[0]._markers_in_preceding_context == set() + assert text_segments[0].previous_segment is None + assert text_segments[0].next_segment is None + + +def test_create_text_segments_with_preceding_markers() -> None: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("standard_english", "standard_english") + ) + + update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.TEXT, + tokens=[ + UsfmToken(UsfmTokenType.VERSE), + UsfmToken(UsfmTokenType.PARAGRAPH), + UsfmToken(UsfmTokenType.TEXT, text="test segment"), + ], + ) + text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) + + assert len(text_segments) == 1 + assert text_segments[0]._text == "test segment" + assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH + assert text_segments[0]._markers_in_preceding_context == { + UsfmMarkerType.VERSE, + UsfmMarkerType.PARAGRAPH, + } + assert text_segments[0].previous_segment is None + assert text_segments[0].next_segment is None + + +def test_create_text_segments_with_multiple_text_tokens() 
-> None: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("standard_english", "standard_english") + ) + + update_element: UsfmUpdateBlockElement = UsfmUpdateBlockElement( + UsfmUpdateBlockElementType.TEXT, + tokens=[ + UsfmToken(UsfmTokenType.VERSE), + UsfmToken(UsfmTokenType.PARAGRAPH), + UsfmToken(UsfmTokenType.TEXT, text="test segment1"), + UsfmToken(UsfmTokenType.VERSE), + UsfmToken(UsfmTokenType.CHARACTER), + UsfmToken(UsfmTokenType.TEXT, text="test segment2"), + UsfmToken(UsfmTokenType.PARAGRAPH), + ], + ) + text_segments: List[TextSegment] = quote_convention_changer._create_text_segments(update_element) + + assert len(text_segments) == 2 + assert text_segments[0]._text == "test segment1" + assert text_segments[0]._immediate_preceding_marker == UsfmMarkerType.PARAGRAPH + assert text_segments[0]._markers_in_preceding_context == { + UsfmMarkerType.VERSE, + UsfmMarkerType.PARAGRAPH, + } + assert text_segments[0].previous_segment is None + assert text_segments[0].next_segment == text_segments[1] + assert text_segments[1]._text == "test segment2" + assert text_segments[1]._immediate_preceding_marker == UsfmMarkerType.CHARACTER + assert text_segments[1]._markers_in_preceding_context == { + UsfmMarkerType.VERSE, + UsfmMarkerType.CHARACTER, + } + assert text_segments[1].previous_segment == text_segments[0] + assert text_segments[1].next_segment is None + + +def test_create_text_segment() -> None: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("standard_english", "standard_english") + ) + + usfm_token: UsfmToken = UsfmToken(UsfmTokenType.TEXT, text="test segment") + segment: Union[TextSegment, None] = quote_convention_changer._create_text_segment(usfm_token) + + assert segment is not None + assert segment._text == "test segment" + assert segment._immediate_preceding_marker is 
UsfmMarkerType.NO_MARKER + assert segment._markers_in_preceding_context == set() + assert segment._usfm_token == usfm_token + + +def test_set_previous_and_next_for_segments() -> None: + quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("standard_english", "standard_english") + ) + + segments: List[TextSegment] = [ + TextSegment.Builder().set_text("segment 1 text").build(), + TextSegment.Builder().set_text("segment 2 text").build(), + TextSegment.Builder().set_text("segment 3 text").build(), + ] + + quote_convention_changer._set_previous_and_next_for_segments(segments) + + assert segments[0].previous_segment is None + assert segments[0].next_segment == segments[1] + assert segments[1].previous_segment == segments[0] + assert segments[1].next_segment == segments[2] + assert segments[2].previous_segment == segments[1] + assert segments[2].next_segment is None + + +def test_update_quotation_marks() -> None: + multi_char_to_single_char_quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("typewriter_french", "standard_english") + ) + + multi_character_text_segment: TextSegment = TextSegment.Builder().set_text("this < >>").build() + multi_character_quotation_marks: List[QuotationMarkMetadata] = [ + QuotationMarkMetadata( + quotation_mark="<<", + depth=1, + direction=QuotationMarkDirection.OPENING, + text_segment=multi_character_text_segment, + start_index=5, + end_index=7, + ), + QuotationMarkMetadata( + quotation_mark="<", + depth=2, + direction=QuotationMarkDirection.OPENING, + text_segment=multi_character_text_segment, + start_index=10, + end_index=11, + ), + QuotationMarkMetadata( + quotation_mark=">", + depth=2, + direction=QuotationMarkDirection.CLOSING, + text_segment=multi_character_text_segment, + start_index=25, + end_index=26, + ), + QuotationMarkMetadata( + quotation_mark=">>", + depth=1, + 
direction=QuotationMarkDirection.CLOSING, + text_segment=multi_character_text_segment, + start_index=27, + end_index=29, + ), + ] + + multi_char_to_single_char_quote_convention_changer._update_quotation_marks(multi_character_quotation_marks) + + assert multi_character_text_segment.text == "this “is ‘a test segment’ ”" + + assert multi_character_quotation_marks[0].start_index == 5 + assert multi_character_quotation_marks[0].end_index == 6 + assert multi_character_quotation_marks[0].text_segment == multi_character_text_segment + + assert multi_character_quotation_marks[1].start_index == 9 + assert multi_character_quotation_marks[1].end_index == 10 + assert multi_character_quotation_marks[1].text_segment == multi_character_text_segment + + assert multi_character_quotation_marks[2].start_index == 24 + assert multi_character_quotation_marks[2].end_index == 25 + assert multi_character_quotation_marks[2].text_segment == multi_character_text_segment + + assert multi_character_quotation_marks[3].start_index == 26 + assert multi_character_quotation_marks[3].end_index == 27 + assert multi_character_quotation_marks[3].text_segment == multi_character_text_segment + + single_char_to_multi_char_quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = ( + create_quote_convention_changing_usfm_update_block_handler("standard_english", "typewriter_french") + ) + + single_character_text_segment: TextSegment = TextSegment.Builder().set_text("this “is ‘a test segment’ ”").build() + single_character_quotation_marks: List[QuotationMarkMetadata] = [ + QuotationMarkMetadata( + quotation_mark="“", + depth=1, + direction=QuotationMarkDirection.OPENING, + text_segment=single_character_text_segment, + start_index=5, + end_index=6, + ), + QuotationMarkMetadata( + quotation_mark="‘", + depth=2, + direction=QuotationMarkDirection.OPENING, + text_segment=single_character_text_segment, + start_index=9, + end_index=10, + ), + QuotationMarkMetadata( + quotation_mark="’", + depth=2, + 
def test_check_for_chapter_change() -> None:
    """Check that the handler tracks the chapter number of the blocks it is given."""
    quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = (
        create_quote_convention_changing_usfm_update_block_handler("standard_english", "standard_english")
    )

    # A freshly built handler has not seen any chapter yet.
    assert quote_convention_changer._current_chapter_number == 0

    quote_convention_changer._check_for_chapter_change(UsfmUpdateBlock([ScriptureRef.parse("MAT 1:1")], []))

    assert quote_convention_changer._current_chapter_number == 1

    # A block from a different book and chapter replaces the tracked chapter number.
    quote_convention_changer._check_for_chapter_change(UsfmUpdateBlock([ScriptureRef.parse("ISA 15:22")], []))

    assert quote_convention_changer._current_chapter_number == 15


def test_start_new_chapter() -> None:
    """Check that starting a chapter resets per-chapter state and selects that chapter's strategy."""
    quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = (
        create_quote_convention_changing_usfm_update_block_handler(
            "standard_english",
            "standard_english",
            QuotationMarkUpdateSettings(
                chapter_strategies=[
                    QuotationMarkUpdateStrategy.SKIP,
                    QuotationMarkUpdateStrategy.APPLY_FULL,
                    QuotationMarkUpdateStrategy.APPLY_FALLBACK,
                ]
            ),
        )
    )

    # Seed state that is expected to be gone once a new chapter starts.
    quote_convention_changer._next_scripture_text_segment_builder.add_preceding_marker(UsfmMarkerType.EMBED).set_text(
        "this text should be erased"
    )
    quote_convention_changer._verse_text_quotation_mark_resolver._issues.add(
        QuotationMarkResolutionIssue.INCOMPATIBLE_QUOTATION_MARK
    )

    quote_convention_changer._start_new_chapter(1)
    segment = quote_convention_changer._next_scripture_text_segment_builder.build()
    assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.SKIP
    assert segment._immediate_preceding_marker == UsfmMarkerType.CHAPTER
    assert segment._text == ""
    assert UsfmMarkerType.EMBED not in segment._markers_in_preceding_context
    assert quote_convention_changer._verse_text_quotation_mark_resolver._issues == set()

    # Chapters 2 and 3 pick up the second and third configured strategies.
    quote_convention_changer._start_new_chapter(2)
    assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.APPLY_FULL

    quote_convention_changer._start_new_chapter(3)
    assert quote_convention_changer._current_strategy == QuotationMarkUpdateStrategy.APPLY_FALLBACK


def change_quotation_marks(
    normalized_usfm: str,
    source_quote_convention_name: str,
    target_quote_convention_name: str,
    quotation_mark_update_settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(),
) -> str:
    """Parse ``normalized_usfm`` through a quote-convention-changing handler and return the updated USFM.

    Args:
        normalized_usfm: The USFM text to parse.
        source_quote_convention_name: Name of the standard quote convention to convert from.
        target_quote_convention_name: Name of the standard quote convention to convert to.
        quotation_mark_update_settings: Settings forwarded to the handler.

    Returns:
        The USFM produced by the update handler after parsing.
    """
    quote_convention_changer: QuoteConventionChangingUsfmUpdateBlockHandler = (
        create_quote_convention_changing_usfm_update_block_handler(
            source_quote_convention_name, target_quote_convention_name, quotation_mark_update_settings
        )
    )

    updater = UpdateUsfmParserHandler(update_block_handlers=[quote_convention_changer])
    parse_usfm(normalized_usfm, updater)

    return updater.get_usfm()


def create_quote_convention_changing_usfm_update_block_handler(
    source_quote_convention_name: str,
    target_quote_convention_name: str,
    quotation_mark_update_settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(),
) -> QuoteConventionChangingUsfmUpdateBlockHandler:
    """Build a handler from two quote convention names looked up in ``STANDARD_QUOTE_CONVENTIONS``.

    Args:
        source_quote_convention_name: Name of the convention to convert from.
        target_quote_convention_name: Name of the convention to convert to.
        quotation_mark_update_settings: Settings passed through to the handler.

    Returns:
        A handler configured with the two looked-up conventions and the given settings.

    Raises:
        AssertionError: If either name does not resolve to a quote convention.
    """
    source_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(source_quote_convention_name)
    assert source_quote_convention is not None

    target_quote_convention = STANDARD_QUOTE_CONVENTIONS.get_quote_convention_by_name(target_quote_convention_name)
    assert target_quote_convention is not None

    return QuoteConventionChangingUsfmUpdateBlockHandler(
        source_quote_convention,
        target_quote_convention,
        quotation_mark_update_settings,
    )


def assert_usfm_equal(observed_usfm: str, expected_usfm: str) -> None:
    """Assert that two USFM strings match line by line, ignoring per-line surrounding whitespace.

    Trailing whitespace at the very end of either string (for example a final
    line break) is ignored. Apart from that, both strings must contain the same
    number of lines.

    Args:
        observed_usfm: The USFM produced by the code under test.
        expected_usfm: The USFM the test expects.

    Raises:
        AssertionError: If the line counts differ or any pair of lines differs.
    """
    observed_lines = observed_usfm.rstrip().split("\n")
    expected_lines = expected_usfm.rstrip().split("\n")

    # zip() stops at the shorter input, so without this check missing or extra
    # lines in the observed USFM would go unnoticed.
    assert len(observed_lines) == len(
        expected_lines
    ), f"USFM line count differs: observed {len(observed_lines)}, expected {len(expected_lines)}"

    for observed_line, expected_line in zip(observed_lines, expected_lines):
        assert observed_line.strip() == expected_line.strip()


class MockQuotationMarkFinder(QuotationMarkFinder):
    """Finder stub that counts calls and always returns the same two canned matches."""

    def __init__(self) -> None:
        super().__init__(QuoteConventionSet([]))
        # Number of times the find method has been invoked.
        self.num_times_called: int = 0
        # Canned matches; each index range covers the '"' character in its segment text.
        self.matches_to_return: List[QuotationMarkStringMatch] = [
            QuotationMarkStringMatch(TextSegment.Builder().set_text('this is a "test').build(), 10, 11),
            QuotationMarkStringMatch(TextSegment.Builder().set_text('the test ends" here').build(), 13, 14),
        ]

    def find_all_potential_quotation_marks_in_text_segments(
        self, text_segments: List[TextSegment]
    ) -> List[QuotationMarkStringMatch]:
        """Record the call and return the canned matches; ``text_segments`` is ignored."""
        self.num_times_called += 1
        return self.matches_to_return


class MockQuotationMarkResolver(QuotationMarkResolver):
    """Resolver stub that counts calls and assigns increasing depths with alternating directions."""

    def __init__(self) -> None:
        # Number of times resolution has been started since construction or the last reset.
        self.num_times_called: int = 0

    def reset(self) -> None:
        """Reset the call counter to zero."""
        self.num_times_called = 0

    def resolve_quotation_marks(
        self, quote_matches: List[QuotationMarkStringMatch]
    ) -> Generator[QuotationMarkMetadata, None, None]:
        """Yield one resolved mark per match, starting at depth 1 with an opening direction.

        This is a generator, so the call counter is only incremented once the
        caller begins iterating over the result.
        """
        self.num_times_called += 1
        current_depth = 1
        current_direction = QuotationMarkDirection.OPENING
        for quote_match in quote_matches:
            yield quote_match.resolve(current_depth, current_direction)
            current_depth += 1
            # Flip between opening and closing for each successive match.
            current_direction = (
                QuotationMarkDirection.CLOSING
                if current_direction == QuotationMarkDirection.OPENING
                else QuotationMarkDirection.OPENING
            )

    def get_issues(self) -> Set[QuotationMarkResolutionIssue]:
        """Return an empty set; this stub never reports issues."""
        return set()