diff --git a/machine/corpora/scripture_ref.py b/machine/corpora/scripture_ref.py index 1c4c7bf3..3c952372 100644 --- a/machine/corpora/scripture_ref.py +++ b/machine/corpora/scripture_ref.py @@ -85,9 +85,7 @@ def to_relaxed(self) -> ScriptureRef: return ScriptureRef(self.verse_ref, [pe.to_relaxed() for pe in self.path]) def change_versification(self, versification: Versification) -> ScriptureRef: - vr: VerseRef = self.verse_ref.copy() - vr.change_versification(versification) - return ScriptureRef(vr, self.path) + return ScriptureRef(self.verse_ref.to_versification(versification), self.path) def compare_to(self, other: object, compare_segments: bool = True) -> int: if not isinstance(other, ScriptureRef): diff --git a/machine/corpora/text_corpus_enumerator.py b/machine/corpora/text_corpus_enumerator.py index 5267d3e9..79d1f2df 100644 --- a/machine/corpora/text_corpus_enumerator.py +++ b/machine/corpora/text_corpus_enumerator.py @@ -15,6 +15,7 @@ def __init__( versification: Optional[Versification], ): self._generator = generator + self._versification = versification self._ref_versification = ref_versification self._is_scripture = ( ref_versification is not None and versification is not None and ref_versification != versification @@ -54,17 +55,19 @@ def __exit__(self, type: Any, value: Any, traceback: Any) -> None: def _collect_verses(self): assert self._ref_versification is not None + assert self._versification is not None + has_cross_book_mappings = self._versification.has_cross_book_mappings(self._ref_versification) rows: List[Tuple[ScriptureRef, TextRow]] = [] - out_of_order = False + verses_out_of_order = False prev_ref = EMPTY_SCRIPTURE_REF range_start_offset = -1 while self._row is not None: row = cast(TextRow, self._row) ref = cast(ScriptureRef, row.ref) - if not prev_ref.is_empty and ref.book_num != prev_ref.book_num: + ref = ref.change_versification(self._ref_versification) + if not has_cross_book_mappings and not prev_ref.is_empty and ref.book_num != prev_ref.book_num: break - ref = ref.change_versification(self._ref_versification) # convert one-to-many mapping to a verse range if ref == prev_ref: range_start_ref, range_start_row = rows[range_start_offset] @@ -88,12 +91,12 @@ def _collect_verses(self): else: range_start_offset = -1 rows.append((ref, row)) - if not out_of_order and ref < prev_ref: - out_of_order = True + if not verses_out_of_order and ref < prev_ref: + verses_out_of_order = True prev_ref = ref self._row = next(self._generator, None) - if out_of_order: + if verses_out_of_order: rows.sort(key=lambda t: t[0]) for _, row in rows: diff --git a/machine/scripture/canon.py b/machine/scripture/canon.py index 04ae8f02..3a87638f 100644 --- a/machine/scripture/canon.py +++ b/machine/scripture/canon.py @@ -151,9 +151,9 @@ def book_number_to_id(number: int, error_value: str = "***") -> str: - if number < 1 or number >= len(ALL_BOOK_IDS): - return error_value index = number - 1 + if index < 0 or index >= len(ALL_BOOK_IDS): + return error_value return ALL_BOOK_IDS[index] diff --git a/machine/scripture/verse_ref.py b/machine/scripture/verse_ref.py index 5732fcfc..11a3931e 100644 --- a/machine/scripture/verse_ref.py +++ b/machine/scripture/verse_ref.py @@ -4,7 +4,7 @@ from enum import Enum, IntEnum, auto from io import TextIOWrapper from pathlib import Path, PurePath -from typing import BinaryIO, Dict, Iterable, List, Optional, Set, TextIO, Tuple, Union, cast +from typing import BinaryIO, Dict, Generator, Iterable, List, Optional, Set, TextIO, Tuple, Union, cast import regex as re @@ -876,6 +876,32 @@ def change_versification(self, vref: VerseRef, ignore_segments: bool = False) -> vref.versification = self return True + def all_included_verses(self) -> Generator[VerseRef, None, None]: + for book, chapters in enumerate(self.book_list): + book = book + 1 + if not is_canonical(book) or (book > 86 and book < 93): + continue + for chapter, last_verse in enumerate(chapters): + chapter = chapter + 1 + first_verse = self.first_included_verse(book, chapter) + yielded_first_verse = False + for verse_number in range(2, last_verse + 1): + verse = VerseRef(book=book, chapter=chapter, verse=verse_number, versification=self) + if self.is_excluded(verse.bbbcccvvv): + continue + if not yielded_first_verse and first_verse is not None: + yield first_verse + yielded_first_verse = True + yield verse + + def has_cross_book_mappings(self, reference_versification: Optional[Versification] = None) -> bool: + reference_versification = reference_versification or Versification.get_builtin("Original") + for verse_ref in self.all_included_verses(): + standard_ref = verse_ref.to_versification(reference_versification) + if verse_ref.book_num != standard_ref.book_num: + return True + return False + def __eq__(self, other: Versification) -> bool: if self is other: return True diff --git a/pyproject.toml b/pyproject.toml index 4f72c865..e7d2d8b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ reportMissingModuleSource = false [tool.poetry] name = "sil-machine" -version = "1.8.9" +version = "1.8.10" description = "A natural language processing library that is focused on providing tools for resource-poor languages." license = "MIT" authors = ["SIL International"] diff --git a/tests/corpora/test_parallel_text_corpus.py b/tests/corpora/test_parallel_text_corpus.py index e9647061..33062084 100644 --- a/tests/corpora/test_parallel_text_corpus.py +++ b/tests/corpora/test_parallel_text_corpus.py @@ -17,7 +17,12 @@ TextRow, TextRowFlags, ) -from machine.scripture import ENGLISH_VERSIFICATION, ORIGINAL_VERSIFICATION, Versification +from machine.scripture import ( + ENGLISH_VERSIFICATION, + ORIGINAL_VERSIFICATION, + RUSSIAN_ORTHODOX_VERSIFICATION, + Versification, +) def test_get_rows_no_rows() -> None: @@ -1330,6 +1335,104 @@ def test_get_rows_different_versifications_with_verse_segments(): assert rows[5].target_segment == "target chapter sixteen, verse thirty nine b .".split() +def test_get_rows_different_versifications_with_cross_book_mappings(): + source_corpus = DictionaryTextCorpus( + MemoryText( + "DAN", + [ + text_row( + "DAN", + ScriptureRef.parse("DAN 3:23", ORIGINAL_VERSIFICATION), + "DAN source chapter three, verse twenty three .", + ), + text_row( + "DAN", + ScriptureRef.parse("DAN 3:24", ORIGINAL_VERSIFICATION), + "DAN source chapter three, verse twenty four .", + ), + ], + ), + MemoryText( + "S3Y", + [ + text_row( + "S3Y", + ScriptureRef.parse("S3Y 1:1", ORIGINAL_VERSIFICATION), + "S3Y source chapter one, verse one .", + ), + text_row( + "S3Y", + ScriptureRef.parse("S3Y 1:68", ORIGINAL_VERSIFICATION), + "S3Y source chapter one, verse sixty eight .", + ), + ], + ), + ) + source_corpus.versification = ORIGINAL_VERSIFICATION + + target_corpus = DictionaryTextCorpus( + MemoryText( + "DAN", + [ + text_row( + "DAN", + ScriptureRef.parse("DAN 3:23", RUSSIAN_ORTHODOX_VERSIFICATION), + "DAN target chapter three, verse twenty three .", + ), + text_row( + "DAN", + ScriptureRef.parse("DAN 3:24", RUSSIAN_ORTHODOX_VERSIFICATION), + "DAN target chapter three, verse twenty four .", + ), + text_row( + "DAN", + ScriptureRef.parse("DAN 3:90", RUSSIAN_ORTHODOX_VERSIFICATION), + "DAN target chapter three, verse ninety .", + ), + text_row( + "DAN", + ScriptureRef.parse("DAN 3:91", RUSSIAN_ORTHODOX_VERSIFICATION), + "DAN target chapter three, verse ninety one .", + ), + ], + ) + ) + target_corpus.versification = RUSSIAN_ORTHODOX_VERSIFICATION + + # Russian Orthodox vs. Original + # DAN 3:24-90 = DAG 3:24-90 + # DAN 3:91-100 = DAN 3:24-33 + # Original + # S3Y 1:1-29 = DAG 3:24-52 + # ... + # S3Y 1:38-68 = DAG 3:60-90 + + parallel_corpus = source_corpus.align_rows(target_corpus, all_source_rows=True) + rows = list(parallel_corpus.get_rows()) + + assert len(rows) == 4 + + assert rows[0].source_refs == [ScriptureRef.parse("DAN 3:23", ORIGINAL_VERSIFICATION)] + assert rows[0].target_refs == [ScriptureRef.parse("DAN 3:23", RUSSIAN_ORTHODOX_VERSIFICATION)] + assert rows[0].source_segment == "DAN source chapter three, verse twenty three .".split() + assert rows[0].target_segment == "DAN target chapter three, verse twenty three .".split() + + assert rows[1].source_refs == [ScriptureRef.parse("DAN 3:24", ORIGINAL_VERSIFICATION)] + assert rows[1].target_refs == [ScriptureRef.parse("DAN 3:91", RUSSIAN_ORTHODOX_VERSIFICATION)] + assert rows[1].source_segment == "DAN source chapter three, verse twenty four .".split() + assert rows[1].target_segment == "DAN target chapter three, verse ninety one .".split() + + assert rows[2].source_refs == [ScriptureRef.parse("S3Y 1:1", ORIGINAL_VERSIFICATION)] + assert rows[2].target_refs == [ScriptureRef.parse("DAN 3:24", RUSSIAN_ORTHODOX_VERSIFICATION)] + assert rows[2].source_segment == "S3Y source chapter one, verse one .".split() + assert rows[2].target_segment == "DAN target chapter three, verse twenty four .".split() + + assert rows[3].source_refs == [ScriptureRef.parse("S3Y 1:68", ORIGINAL_VERSIFICATION)] + assert rows[3].target_refs == [ScriptureRef.parse("DAN 3:90", RUSSIAN_ORTHODOX_VERSIFICATION)] + assert rows[3].source_segment == "S3Y source chapter one, verse sixty eight .".split() + assert rows[3].target_segment == "DAN target chapter three, verse ninety .".split() + + def test_to_pandas() -> None: source_corpus = DictionaryTextCorpus( MemoryText( diff --git a/tests/scripture/test_versification.py b/tests/scripture/test_versification.py index 506fc560..0a42e80b 100644 --- a/tests/scripture/test_versification.py +++ b/tests/scripture/test_versification.py @@ -96,3 +96,24 @@ def test_builtin_versification_type() -> None: assert RUSSIAN_PROTESTANT_VERSIFICATION.type == VersificationType.RUSSIAN_PROTESTANT assert VULGATE_VERSIFICATION.type == VersificationType.VULGATE assert SEPTUAGINT_VERSIFICATION.type == VersificationType.SEPTUAGINT + + +def test_all_included_verses() -> None: + original_verses = list(ORIGINAL_VERSIFICATION.all_included_verses()) + assert len(original_verses) == 41899 + assert original_verses[21899].bbbcccvvv == 27003024 + english_verses = list(ENGLISH_VERSIFICATION.all_included_verses()) + assert len(english_verses) == 38393 + assert english_verses[-1].bbbcccvvv == 123001020 + russian_orthodox_verses = list(RUSSIAN_ORTHODOX_VERSIFICATION.all_included_verses()) + assert len(russian_orthodox_verses) == 37280 + assert russian_orthodox_verses[-1].bbbcccvvv == 83001015 + + +def test_has_cross_book_mappings() -> None: + assert not ORIGINAL_VERSIFICATION.has_cross_book_mappings() + assert ENGLISH_VERSIFICATION.has_cross_book_mappings() + assert RUSSIAN_ORTHODOX_VERSIFICATION.has_cross_book_mappings() + assert not RUSSIAN_PROTESTANT_VERSIFICATION.has_cross_book_mappings() + assert VULGATE_VERSIFICATION.has_cross_book_mappings() + assert VULGATE_VERSIFICATION.has_cross_book_mappings(ENGLISH_VERSIFICATION)