diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index d07e52e..7cbc288 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -27,7 +27,7 @@ from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase -from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase +from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase, filter_tokens_by_chapter from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase from .paratext_text_corpus import ParatextTextCorpus from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 0e7bfdf..77e4ec3 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,5 +1,5 @@ from abc import ABC -from typing import Callable, Iterable, Optional, Sequence, Union +from typing import Callable, Iterable, List, Optional, Sequence, Union from .paratext_project_file_handler import ParatextProjectFileHandler from .paratext_project_settings import ParatextProjectSettings @@ -10,7 +10,9 @@ UpdateUsfmRow, UpdateUsfmTextBehavior, ) -from .usfm_parser import parse_usfm +from .usfm_parser import UsfmParser +from .usfm_token import UsfmTokenType +from .usfm_tokenizer import UsfmToken, UsfmTokenizer from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError @@ -30,6 +32,7 @@ def update_usfm( self, book_id: str, rows: Optional[Sequence[UpdateUsfmRow]] = None, + chapters: Optional[Sequence[int]] = None, full_name: Optional[str] = None, text_behavior: 
def filter_tokens_by_chapter(
    tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
) -> Sequence[UsfmToken]:
    """Return only the tokens that belong to the requested chapters.

    The leading ``\\id`` token, together with any marker-less tokens that
    directly follow it, is always kept. When chapter 1 is requested, every
    token between the ``\\id`` line and the first chapter token is kept as
    well.

    Args:
        tokens: The tokens of a single book, in document order.
        chapters: Chapter numbers to keep. ``None`` disables filtering and
            returns ``tokens`` unchanged (the same object, not a copy).

    Returns:
        ``tokens`` itself when ``chapters`` is ``None``; otherwise a new list
        holding the retained tokens in their original order.

    Raises:
        ValueError: If a chapter token carries non-empty data that cannot be
            converted with ``int()``.
    """
    if chapters is None:
        return tokens

    # Build the lookup once instead of scanning the sequence per chapter token.
    selected_chapters = frozenset(chapters)

    kept_tokens: List[UsfmToken] = []
    in_chapter = False
    in_id_marker = False
    for index, token in enumerate(tokens):
        if index == 0 and token.marker == "id":
            in_id_marker = True
            # Material ahead of the first chapter token travels with chapter 1.
            in_chapter = 1 in selected_chapters
        else:
            # The \id block ends at the first token carrying another marker.
            if in_id_marker and token.marker is not None and token.marker != "id":
                in_id_marker = False
            # Evaluated independently of the check above: a chapter token that
            # also terminates the \id block must still update the selection.
            # Chaining the two with `elif` left `in_chapter` stale in that case.
            if token.type == UsfmTokenType.CHAPTER:
                in_chapter = bool(token.data) and int(token.data) in selected_chapters

        if in_id_marker or in_chapter:
            kept_tokens.append(token)
    return kept_tokens
text=remark)) if len(tokens) > 0: - index = 0 - markers_to_skip = {"id", "ide", "rem"} - while tokens[index].marker in markers_to_skip: - index += 1 - if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT: - index += 1 - for remark_token in reversed(remark_tokens): - tokens.insert(index, remark_token) + for index, token in enumerate(tokens): + if token.type == UsfmTokenType.CHAPTER: + insertion_index = index + 1 + while insertion_index < len(tokens) and tokens[insertion_index].marker == "rem": + insertion_index += 1 + if insertion_index < len(tokens) and tokens[insertion_index].type == UsfmTokenType.TEXT: + insertion_index += 1 + tokens[insertion_index:insertion_index] = remark_tokens return tokenizer.detokenize(tokens) def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]: diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index a9c1cdc..90ba2bb 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -9,10 +9,12 @@ UpdateUsfmParserHandler, UpdateUsfmRow, UpdateUsfmTextBehavior, + UsfmParser, + UsfmTokenizer, UsfmUpdateBlock, UsfmUpdateBlockElementType, UsfmUpdateBlockHandler, - parse_usfm, + filter_tokens_by_chapter, ) @@ -1387,17 +1389,22 @@ def test_pass_remark(): \v 1 Some text \v 2 \v 3 Other text +\c 2 +\v 1 More text """ target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark"]) result = r"""\id MAT - Test \ide UTF-8 \rem Existing remark -\rem New remark \c 1 +\rem New remark \v 1 Some text \v 2 Update 2 \v 3 Other text +\c 2 +\rem New remark +\v 1 More text """ assert_usfm_equals(target, result) @@ -1406,12 +1413,16 @@ def test_pass_remark(): result = r"""\id MAT - Test \ide UTF-8 \rem Existing remark +\c 1 \rem New remark \rem New remark 2 -\c 1 \v 1 Some text \v 2 Update 2 \v 3 Other text +\c 2 +\rem New remark 
+\rem New remark 2 +\v 1 More text """ assert_usfm_equals(target, result) @@ -1494,6 +1505,59 @@ def test_update_block_footnote_at_start_of_chapter_with_preceding_text(): ) +def test_filter_chapters() -> None: + usfm = r"""\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 2 +\v 1 Some text +\c 3 +\v 1 Some text +\c 4 +\v 1 Some text +""" + chapters = [2, 4] + target = update_usfm(chapters=chapters, source=usfm) + result = r"""\id MAT - Test +\c 2 +\v 1 Some text +\c 4 +\v 1 Some text +""" + assert_usfm_equals(target, result) + + +def test_filter_chapters_with_chapter_1_and_header() -> None: + usfm = r"""\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 2 +\v 1 Some text +\c 3 +\v 1 Some text +\c 4 +\v 1 Some text +""" + chapters = [1, 3] + target = update_usfm(chapters=chapters, source=usfm) + result = r"""\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 3 +\v 1 Some text +""" + assert_usfm_equals(target, result) + + def scr_ref(*refs: str) -> List[ScriptureRef]: return [ScriptureRef.parse(ref) for ref in refs] @@ -1501,6 +1565,7 @@ def scr_ref(*refs: str) -> List[ScriptureRef]: def update_usfm( rows: Optional[Sequence[UpdateUsfmRow]] = None, source: Optional[str] = None, + chapters: Optional[Sequence[int]] = None, id_text: Optional[str] = None, text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW, paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, @@ -1516,6 +1581,7 @@ def update_usfm( return updater.update_usfm( "MAT", rows, + chapters, id_text, text_behavior, paragraph_behavior, @@ -1542,7 +1608,11 @@ def update_usfm( lambda _: False, compare_segments, ) - parse_usfm(source, updater) + tokenizer = UsfmTokenizer() + tokens = tokenizer.tokenize(source) + tokens = filter_tokens_by_chapter(tokens, chapters) + parser = UsfmParser(tokens, updater) + parser.process_tokens() return updater.get_usfm()