Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions machine/corpora/scripture_ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,7 @@ def to_relaxed(self) -> ScriptureRef:
return ScriptureRef(self.verse_ref, [pe.to_relaxed() for pe in self.path])

def change_versification(self, versification: Versification) -> ScriptureRef:
    """Return a new reference whose verse reference is expressed in ``versification``.

    The conversion is delegated to ``self.verse_ref.to_versification`` and the
    result is paired with this reference's ``path`` (the same object, not a copy).

    Args:
        versification: The versification to convert the verse reference to.

    Returns:
        A new ``ScriptureRef`` built from the converted verse reference and ``self.path``.
    """
    # A single expression replaces the earlier copy-then-mutate sequence, which
    # had left a second, unreachable ``return`` behind it.
    # NOTE(review): presumably ``to_versification`` returns a converted copy and
    # leaves ``self.verse_ref`` untouched -- confirm against VerseRef.
    return ScriptureRef(self.verse_ref.to_versification(versification), self.path)

def compare_to(self, other: object, compare_segments: bool = True) -> int:
if not isinstance(other, ScriptureRef):
Expand Down
15 changes: 9 additions & 6 deletions machine/corpora/text_corpus_enumerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def __init__(
versification: Optional[Versification],
):
self._generator = generator
self._versification = versification
self._ref_versification = ref_versification
self._is_scripture = (
ref_versification is not None and versification is not None and ref_versification != versification
Expand Down Expand Up @@ -54,17 +55,19 @@ def __exit__(self, type: Any, value: Any, traceback: Any) -> None:

def _collect_verses(self):
assert self._ref_versification is not None
assert self._versification is not None
has_cross_book_mappings = self._versification.has_cross_book_mappings(self._ref_versification)
rows: List[Tuple[ScriptureRef, TextRow]] = []
out_of_order = False
verses_out_of_order = False
prev_ref = EMPTY_SCRIPTURE_REF
range_start_offset = -1
while self._row is not None:
row = cast(TextRow, self._row)
ref = cast(ScriptureRef, row.ref)
if not prev_ref.is_empty and ref.book_num != prev_ref.book_num:
ref = ref.change_versification(self._ref_versification)
if not has_cross_book_mappings and not prev_ref.is_empty and ref.book_num != prev_ref.book_num:
break

ref = ref.change_versification(self._ref_versification)
# convert one-to-many mapping to a verse range
if ref == prev_ref:
range_start_ref, range_start_row = rows[range_start_offset]
Expand All @@ -88,12 +91,12 @@ def _collect_verses(self):
else:
range_start_offset = -1
rows.append((ref, row))
if not out_of_order and ref < prev_ref:
out_of_order = True
if not verses_out_of_order and ref < prev_ref:
verses_out_of_order = True
prev_ref = ref
self._row = next(self._generator, None)

if out_of_order:
if verses_out_of_order:
rows.sort(key=lambda t: t[0])

for _, row in rows:
Expand Down
4 changes: 2 additions & 2 deletions machine/scripture/canon.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,9 @@


def book_number_to_id(number: int, error_value: str = "***") -> str:
    """Return the entry of ``ALL_BOOK_IDS`` selected by a 1-based book number.

    Args:
        number: 1-based book number; ``1`` selects the first entry of ``ALL_BOOK_IDS``.
        error_value: Value returned when ``number`` does not select any entry.

    Returns:
        ``ALL_BOOK_IDS[number - 1]``, or ``error_value`` when ``number`` is out of range.
    """
    # Validate the 0-based index rather than the 1-based number: comparing the
    # number itself against len(ALL_BOOK_IDS) wrongly rejected the last book.
    index = number - 1
    if not 0 <= index < len(ALL_BOOK_IDS):
        return error_value
    return ALL_BOOK_IDS[index]


Expand Down
28 changes: 27 additions & 1 deletion machine/scripture/verse_ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from enum import Enum, IntEnum, auto
from io import TextIOWrapper
from pathlib import Path, PurePath
from typing import BinaryIO, Dict, Iterable, List, Optional, Set, TextIO, Tuple, Union, cast
from typing import BinaryIO, Dict, Generator, Iterable, List, Optional, Set, TextIO, Tuple, Union, cast

import regex as re

Expand Down Expand Up @@ -876,6 +876,32 @@ def change_versification(self, vref: VerseRef, ignore_segments: bool = False) ->
vref.versification = self
return True

def all_included_verses(self) -> Generator[VerseRef, None, None]:
    """Lazily yield the verse references this versification includes.

    ``self.book_list`` is walked in order, with books and chapters numbered
    from 1. A book is skipped when ``is_canonical`` rejects it or when its
    number lies in 87..92. Within a chapter, the reference returned by
    ``first_included_verse`` is emitted once, immediately before the first
    non-excluded verse numbered 2 or higher; every later non-excluded verse
    follows it.

    NOTE(review): because the scan starts at verse 2, a chapter in which no
    verse numbered 2 or higher is included yields nothing at all, not even its
    first verse -- confirm this is intended.
    """
    for book_num, chapter_lengths in enumerate(self.book_list, start=1):
        if not is_canonical(book_num) or 86 < book_num < 93:
            continue
        for chapter_num, final_verse in enumerate(chapter_lengths, start=1):
            leading_ref = self.first_included_verse(book_num, chapter_num)
            leading_emitted = False
            for verse_num in range(2, final_verse + 1):
                candidate = VerseRef(book=book_num, chapter=chapter_num, verse=verse_num, versification=self)
                if self.is_excluded(candidate.bbbcccvvv):
                    continue
                # Emit the chapter's leading verse lazily, only once a later
                # included verse has actually been found.
                if leading_ref is not None and not leading_emitted:
                    yield leading_ref
                    leading_emitted = True
                yield candidate

def has_cross_book_mappings(self, reference_versification: Optional[Versification] = None) -> bool:
    """Report whether any included verse lands in a different book after conversion.

    Args:
        reference_versification: Versification to convert each verse to. When
            it is falsy (for example ``None``), the built-in versification
            named ``"Original"`` is used.

    Returns:
        ``True`` as soon as one verse from ``all_included_verses`` has a
        ``book_num`` that differs from that of its converted counterpart;
        ``False`` when every verse stays in its own book.
    """
    if reference_versification:
        baseline = reference_versification
    else:
        baseline = Versification.get_builtin("Original")
    # any() stops at the first mismatch, like the early return of an explicit loop.
    return any(
        candidate.to_versification(baseline).book_num != candidate.book_num
        for candidate in self.all_included_verses()
    )

def __eq__(self, other: Versification) -> bool:
if self is other:
return True
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ reportMissingModuleSource = false

[tool.poetry]
name = "sil-machine"
version = "1.8.9"
version = "1.8.10"
description = "A natural language processing library that is focused on providing tools for resource-poor languages."
license = "MIT"
authors = ["SIL International"]
Expand Down
105 changes: 104 additions & 1 deletion tests/corpora/test_parallel_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@
TextRow,
TextRowFlags,
)
from machine.scripture import ENGLISH_VERSIFICATION, ORIGINAL_VERSIFICATION, Versification
from machine.scripture import (
ENGLISH_VERSIFICATION,
ORIGINAL_VERSIFICATION,
RUSSIAN_ORTHODOX_VERSIFICATION,
Versification,
)


def test_get_rows_no_rows() -> None:
Expand Down Expand Up @@ -1330,6 +1335,104 @@ def test_get_rows_different_versifications_with_verse_segments():
assert rows[5].target_segment == "target chapter sixteen, verse thirty nine b .".split()


def test_get_rows_different_versifications_with_cross_book_mappings():
    """Align an Original-versification source with a Russian Orthodox target across the DAN/S3Y boundary."""

    def scripture_row(book: str, reference: str, versification: Versification, segment: str):
        # Build one text row whose ref is parsed in the given versification.
        return text_row(book, ScriptureRef.parse(reference, versification), segment)

    source_corpus = DictionaryTextCorpus(
        MemoryText(
            "DAN",
            [
                scripture_row(
                    "DAN", "DAN 3:23", ORIGINAL_VERSIFICATION, "DAN source chapter three, verse twenty three ."
                ),
                scripture_row(
                    "DAN", "DAN 3:24", ORIGINAL_VERSIFICATION, "DAN source chapter three, verse twenty four ."
                ),
            ],
        ),
        MemoryText(
            "S3Y",
            [
                scripture_row("S3Y", "S3Y 1:1", ORIGINAL_VERSIFICATION, "S3Y source chapter one, verse one ."),
                scripture_row(
                    "S3Y", "S3Y 1:68", ORIGINAL_VERSIFICATION, "S3Y source chapter one, verse sixty eight ."
                ),
            ],
        ),
    )
    source_corpus.versification = ORIGINAL_VERSIFICATION

    target_corpus = DictionaryTextCorpus(
        MemoryText(
            "DAN",
            [
                scripture_row(
                    "DAN",
                    "DAN 3:23",
                    RUSSIAN_ORTHODOX_VERSIFICATION,
                    "DAN target chapter three, verse twenty three .",
                ),
                scripture_row(
                    "DAN",
                    "DAN 3:24",
                    RUSSIAN_ORTHODOX_VERSIFICATION,
                    "DAN target chapter three, verse twenty four .",
                ),
                scripture_row(
                    "DAN", "DAN 3:90", RUSSIAN_ORTHODOX_VERSIFICATION, "DAN target chapter three, verse ninety ."
                ),
                scripture_row(
                    "DAN",
                    "DAN 3:91",
                    RUSSIAN_ORTHODOX_VERSIFICATION,
                    "DAN target chapter three, verse ninety one .",
                ),
            ],
        )
    )
    target_corpus.versification = RUSSIAN_ORTHODOX_VERSIFICATION

    # Mappings that drive the expected alignment.
    # Russian Orthodox vs. Original:
    #   DAN 3:24-90  = DAG 3:24-90
    #   DAN 3:91-100 = DAN 3:24-33
    # Original:
    #   S3Y 1:1-29  = DAG 3:24-52
    #   ...
    #   S3Y 1:38-68 = DAG 3:60-90

    parallel_corpus = source_corpus.align_rows(target_corpus, all_source_rows=True)
    rows = list(parallel_corpus.get_rows())

    # Each entry: (source ref, target ref, source text, target text), in row order.
    expected = [
        (
            "DAN 3:23",
            "DAN 3:23",
            "DAN source chapter three, verse twenty three .",
            "DAN target chapter three, verse twenty three .",
        ),
        (
            "DAN 3:24",
            "DAN 3:91",
            "DAN source chapter three, verse twenty four .",
            "DAN target chapter three, verse ninety one .",
        ),
        (
            "S3Y 1:1",
            "DAN 3:24",
            "S3Y source chapter one, verse one .",
            "DAN target chapter three, verse twenty four .",
        ),
        (
            "S3Y 1:68",
            "DAN 3:90",
            "S3Y source chapter one, verse sixty eight .",
            "DAN target chapter three, verse ninety .",
        ),
    ]

    # Check the count first so that zip() below cannot silently truncate.
    assert len(rows) == 4

    for row, (source_ref, target_ref, source_text, target_text) in zip(rows, expected):
        assert row.source_refs == [ScriptureRef.parse(source_ref, ORIGINAL_VERSIFICATION)]
        assert row.target_refs == [ScriptureRef.parse(target_ref, RUSSIAN_ORTHODOX_VERSIFICATION)]
        assert row.source_segment == source_text.split()
        assert row.target_segment == target_text.split()


def test_to_pandas() -> None:
source_corpus = DictionaryTextCorpus(
MemoryText(
Expand Down
21 changes: 21 additions & 0 deletions tests/scripture/test_versification.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,24 @@ def test_builtin_versification_type() -> None:
assert RUSSIAN_PROTESTANT_VERSIFICATION.type == VersificationType.RUSSIAN_PROTESTANT
assert VULGATE_VERSIFICATION.type == VersificationType.VULGATE
assert SEPTUAGINT_VERSIFICATION.type == VersificationType.SEPTUAGINT


def test_all_included_verses() -> None:
    """Check the verse count and one probe entry for three built-in versifications."""
    # Each case: (versification, expected count, probe index, expected bbbcccvvv at the probe).
    cases = (
        (ORIGINAL_VERSIFICATION, 41899, 21899, 27003024),
        (ENGLISH_VERSIFICATION, 38393, -1, 123001020),
        (RUSSIAN_ORTHODOX_VERSIFICATION, 37280, -1, 83001015),
    )
    for versification, expected_count, probe_index, expected_bbbcccvvv in cases:
        verses = list(versification.all_included_verses())
        assert len(verses) == expected_count
        assert verses[probe_index].bbbcccvvv == expected_bbbcccvvv


def test_has_cross_book_mappings() -> None:
    """Check which built-in versifications report cross-book mappings."""
    # Checked against the default reference versification, in this order.
    default_reference_cases = (
        (ORIGINAL_VERSIFICATION, False),
        (ENGLISH_VERSIFICATION, True),
        (RUSSIAN_ORTHODOX_VERSIFICATION, True),
        (RUSSIAN_PROTESTANT_VERSIFICATION, False),
        (VULGATE_VERSIFICATION, True),
    )
    for versification, expected in default_reference_cases:
        assert bool(versification.has_cross_book_mappings()) == expected

    # An explicit reference versification is honoured as well.
    assert VULGATE_VERSIFICATION.has_cross_book_mappings(ENGLISH_VERSIFICATION)
Loading