Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase, filter_tokens_by_chapter
from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase
from .paratext_text_corpus import ParatextTextCorpus
from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
Expand Down
39 changes: 36 additions & 3 deletions machine/corpora/paratext_project_text_updater_base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC
from typing import Callable, Iterable, Optional, Sequence, Union
from typing import Callable, Iterable, List, Optional, Sequence, Union

from .paratext_project_file_handler import ParatextProjectFileHandler
from .paratext_project_settings import ParatextProjectSettings
Expand All @@ -10,7 +10,9 @@
UpdateUsfmRow,
UpdateUsfmTextBehavior,
)
from .usfm_parser import parse_usfm
from .usfm_parser import UsfmParser
from .usfm_token import UsfmTokenType
from .usfm_tokenizer import UsfmToken, UsfmTokenizer
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError


Expand All @@ -30,6 +32,7 @@ def update_usfm(
self,
book_id: str,
rows: Optional[Sequence[UpdateUsfmRow]] = None,
chapters: Optional[Sequence[int]] = None,
full_name: Optional[str] = None,
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
Expand Down Expand Up @@ -60,7 +63,11 @@ def update_usfm(
compare_segments=compare_segments,
)
try:
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
tokenizer = UsfmTokenizer(self._settings.stylesheet)
tokens = tokenizer.tokenize(usfm)
tokens = filter_tokens_by_chapter(tokens, chapters)
parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification)
parser.process_tokens()
return handler.get_usfm(self._settings.stylesheet)
except Exception as e:
error_message = (
Expand All @@ -69,3 +76,29 @@ def update_usfm(
f". Error: '{e}'"
)
raise RuntimeError(error_message) from e


def filter_tokens_by_chapter(
    tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
) -> Sequence[UsfmToken]:
    """Return only the tokens that belong to the requested chapters.

    The leading ``\\id`` marker and the tokens that immediately follow it (its
    text, up to the next marker token) are always kept so the result remains a
    valid book. Tokens that precede the first ``\\c`` marker (e.g. ``\\h``)
    are kept only when chapter 1 is requested, since they belong to the front
    matter of chapter 1.

    Args:
        tokens: The tokenized USFM for a whole book.
        chapters: Chapter numbers to keep; ``None`` keeps every token.

    Returns:
        The filtered tokens (the original sequence when ``chapters`` is
        ``None``).
    """
    if chapters is None:
        return tokens
    wanted = set(chapters)  # O(1) membership tests inside the loop
    kept: List[UsfmToken] = []
    in_chapter = False
    in_id_marker = False
    for index, token in enumerate(tokens):
        # Check chapter markers first: a \c token must always update the
        # chapter state, even when it is the first marker after the \id line.
        # (Ordering the id-marker branch before this one would swallow such a
        # token and silently drop the first requested chapter.)
        if token.type == UsfmTokenType.CHAPTER:
            in_id_marker = False
            in_chapter = bool(token.data) and int(token.data) in wanted
        elif index == 0 and token.marker == "id":
            in_id_marker = True
            # Front matter between \id and \c 1 belongs to chapter 1.
            if 1 in wanted:
                in_chapter = True
        elif in_id_marker and token.marker is not None and token.marker != "id":
            # First marker token after the \id line ends the id section.
            in_id_marker = False

        if in_id_marker or in_chapter:
            kept.append(token)
    return kept
16 changes: 8 additions & 8 deletions machine/corpora/update_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,14 +345,14 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
if len(tokens) > 0:
index = 0
markers_to_skip = {"id", "ide", "rem"}
while tokens[index].marker in markers_to_skip:
index += 1
if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT:
index += 1
for remark_token in reversed(remark_tokens):
tokens.insert(index, remark_token)
for index, token in enumerate(tokens):
if token.type == UsfmTokenType.CHAPTER:
insertion_index = index + 1
while insertion_index < len(tokens) and tokens[insertion_index].marker == "rem":
insertion_index += 1
if insertion_index < len(tokens) and tokens[insertion_index].type == UsfmTokenType.TEXT:
insertion_index += 1
tokens[insertion_index:insertion_index] = remark_tokens
return tokenizer.detokenize(tokens)

def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
Expand Down
78 changes: 74 additions & 4 deletions tests/corpora/test_update_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
UpdateUsfmParserHandler,
UpdateUsfmRow,
UpdateUsfmTextBehavior,
UsfmParser,
UsfmTokenizer,
UsfmUpdateBlock,
UsfmUpdateBlockElementType,
UsfmUpdateBlockHandler,
parse_usfm,
filter_tokens_by_chapter,
)


Expand Down Expand Up @@ -1387,17 +1389,22 @@ def test_pass_remark():
\v 1 Some text
\v 2
\v 3 Other text
\c 2
\v 1 More text
"""

target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark"])
result = r"""\id MAT - Test
\ide UTF-8
\rem Existing remark
\rem New remark
\c 1
\rem New remark
\v 1 Some text
\v 2 Update 2
\v 3 Other text
\c 2
\rem New remark
\v 1 More text
"""

assert_usfm_equals(target, result)
Expand All @@ -1406,12 +1413,16 @@ def test_pass_remark():
result = r"""\id MAT - Test
\ide UTF-8
\rem Existing remark
\c 1
\rem New remark
\rem New remark 2
\c 1
\v 1 Some text
\v 2 Update 2
\v 3 Other text
\c 2
\rem New remark
\rem New remark 2
\v 1 More text
"""

assert_usfm_equals(target, result)
Expand Down Expand Up @@ -1494,13 +1505,67 @@ def test_update_block_footnote_at_start_of_chapter_with_preceding_text():
)


def test_filter_chapters() -> None:
    """Requesting chapters [2, 4] keeps only those chapters plus the \\id line.

    The \\h header belongs to chapter-1 front matter, so it is dropped here.
    """
    source_usfm = r"""\id MAT - Test
\h Matthew
\c 1
\v 1 Some text
\v 2
\v 3 Other text
\c 2
\v 1 Some text
\c 3
\v 1 Some text
\c 4
\v 1 Some text
"""
    expected = r"""\id MAT - Test
\c 2
\v 1 Some text
\c 4
\v 1 Some text
"""
    updated = update_usfm(chapters=[2, 4], source=source_usfm)
    assert_usfm_equals(updated, expected)


def test_filter_chapters_with_chapter_1_and_header() -> None:
    """Requesting chapter 1 also keeps the \\h header that precedes \\c 1."""
    source_usfm = r"""\id MAT - Test
\h Matthew
\c 1
\v 1 Some text
\v 2
\v 3 Other text
\c 2
\v 1 Some text
\c 3
\v 1 Some text
\c 4
\v 1 Some text
"""
    expected = r"""\id MAT - Test
\h Matthew
\c 1
\v 1 Some text
\v 2
\v 3 Other text
\c 3
\v 1 Some text
"""
    updated = update_usfm(chapters=[1, 3], source=source_usfm)
    assert_usfm_equals(updated, expected)


def scr_ref(*refs: str) -> List[ScriptureRef]:
    """Parse each reference string into a ScriptureRef."""
    return list(map(ScriptureRef.parse, refs))


def update_usfm(
rows: Optional[Sequence[UpdateUsfmRow]] = None,
source: Optional[str] = None,
chapters: Optional[Sequence[int]] = None,
id_text: Optional[str] = None,
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW,
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
Expand All @@ -1516,6 +1581,7 @@ def update_usfm(
return updater.update_usfm(
"MAT",
rows,
chapters,
id_text,
text_behavior,
paragraph_behavior,
Expand All @@ -1542,7 +1608,11 @@ def update_usfm(
lambda _: False,
compare_segments,
)
parse_usfm(source, updater)
tokenizer = UsfmTokenizer()
tokens = tokenizer.tokenize(source)
tokens = filter_tokens_by_chapter(tokens, chapters)
parser = UsfmParser(tokens, updater)
parser.process_tokens()
return updater.get_usfm()


Expand Down
Loading