From d8ca02da8aada71b266f56c41dd14b179d76fc04 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Thu, 26 Mar 2026 17:28:43 -0400 Subject: [PATCH 1/6] modify usfm for chapter-level drafting to avoid import issues; move remarks to chapters --- .../paratext_project_text_updater_base.py | 3 +- machine/corpora/update_usfm_parser_handler.py | 34 ++++++++++++++----- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 0e7bfdfd..0a80c407 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -29,6 +29,7 @@ def __init__( def update_usfm( self, book_id: str, + chapters: Optional[Sequence[int]] = None, rows: Optional[Sequence[UpdateUsfmRow]] = None, full_name: Optional[str] = None, text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING, @@ -61,7 +62,7 @@ def update_usfm( ) try: parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification) - return handler.get_usfm(self._settings.stylesheet) + return handler.get_usfm(self._settings.stylesheet, chapters) except Exception as e: error_message = ( f"An error occurred while parsing the usfm for '{book_id}'" diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 9d95850c..4c187ac7 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -334,27 +334,43 @@ def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) - if embed_outside_of_block: self._end_update_block(state, [scripture_ref]) - def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: + def get_usfm( + self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty", chapters: Optional[Sequence[int]] = None + ) -> str: if isinstance(stylesheet, str): stylesheet = UsfmStylesheet(stylesheet) tokenizer = UsfmTokenizer(stylesheet) tokens = list(self._tokens) + if chapters is not None: + tokens = self._get_incremental_draft_tokens(tokens, chapters) if len(self._remarks) > 0: remark_tokens: List[UsfmToken] = [] for remark in self._remarks: remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem")) remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark)) if len(tokens) > 0: - index = 0 - markers_to_skip = {"id", "ide", "rem"} - while tokens[index].marker in markers_to_skip: - index += 1 - if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT: - index += 1 - for remark_token in reversed(remark_tokens): - tokens.insert(index, remark_token) + for index, token in enumerate(tokens): + if token.type == UsfmTokenType.CHAPTER: + tokens[index + 1 : index + 1] = remark_tokens return tokenizer.detokenize(tokens) + def _get_incremental_draft_tokens(self, tokens: List[UsfmToken], chapters: Sequence[int]) -> List[UsfmToken]: + incremental_draft_tokens: List[UsfmToken] = [] + in_chapter: bool = False + for index, token in enumerate(tokens): + if index == 0 and token.marker == "id": + incremental_draft_tokens.append(token) + continue + elif token.type == UsfmTokenType.CHAPTER: + if token.data and int(token.data) in chapters: + in_chapter = True + incremental_draft_tokens.append(token) + else: + in_chapter = False + elif in_chapter: + incremental_draft_tokens.append(token) + return incremental_draft_tokens + def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]: row_texts: List[str] = [] row_metadata = None From aef5d5d71a892fac1b299a5dfbda9ad84768f91b Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Wed, 15 Apr 2026 14:02:28 -0400 Subject: [PATCH 2/6] move filtering before token processing --- .../paratext_project_text_updater_base.py | 36 ++++++++++++++++--- machine/corpora/update_usfm_parser_handler.py | 23 +----------- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index 0a80c407..a7dc464d 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -1,5 +1,5 @@ from abc import ABC -from typing import Callable, Iterable, Optional, Sequence, Union +from typing import Callable, Iterable, List, Optional, Sequence, Union from .paratext_project_file_handler import ParatextProjectFileHandler from .paratext_project_settings import ParatextProjectSettings @@ -10,7 +10,9 @@ UpdateUsfmRow, UpdateUsfmTextBehavior, ) -from .usfm_parser import parse_usfm +from .usfm_parser import UsfmParser +from .usfm_token import UsfmTokenType +from .usfm_tokenizer import UsfmToken, UsfmTokenizer from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError @@ -61,8 +63,12 @@ def update_usfm( compare_segments=compare_segments, ) try: - parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification) - return handler.get_usfm(self._settings.stylesheet, chapters) + tokenizer = UsfmTokenizer(self._settings.stylesheet) + tokens = tokenizer.tokenize(usfm) + tokens = self.filter_tokens_by_chapter(tokens, chapters) + parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification) + parser.process_tokens() + return handler.get_usfm(self._settings.stylesheet) except Exception as e: error_message = ( f"An error occurred while parsing the usfm for '{book_id}'" @@ -70,3 +76,25 @@ def update_usfm( f". Error: '{e}'" ) raise RuntimeError(error_message) from e + + def filter_tokens_by_chapter( + self, tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None + ) -> Sequence[UsfmToken]: + if chapters is None: + return tokens + tokens_within_chapters: List[UsfmToken] = [] + in_chapter: bool = False + for index, token in enumerate(tokens): + if index == 0 and token.marker == "id": + tokens_within_chapters.append(token) + if 1 in chapters: + in_chapter = True + elif token.type == UsfmTokenType.CHAPTER: + if token.data and int(token.data) in chapters: + in_chapter = True + tokens_within_chapters.append(token) + else: + in_chapter = False + elif in_chapter: + tokens_within_chapters.append(token) + return tokens_within_chapters diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 4c187ac7..78fd85f4 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -334,15 +334,11 @@ def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) - if embed_outside_of_block: self._end_update_block(state, [scripture_ref]) - def get_usfm( - self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty", chapters: Optional[Sequence[int]] = None - ) -> str: + def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: if isinstance(stylesheet, str): stylesheet = UsfmStylesheet(stylesheet) tokenizer = UsfmTokenizer(stylesheet) tokens = list(self._tokens) - if chapters is not None: - tokens = self._get_incremental_draft_tokens(tokens, chapters) if len(self._remarks) > 0: remark_tokens: List[UsfmToken] = [] for remark in self._remarks: @@ -354,23 +350,6 @@ def get_usfm( tokens[index + 1 : index + 1] = remark_tokens return tokenizer.detokenize(tokens) - def _get_incremental_draft_tokens(self, tokens: List[UsfmToken], chapters: Sequence[int]) -> List[UsfmToken]: - incremental_draft_tokens: List[UsfmToken] = [] - in_chapter: bool = False - for index, token in enumerate(tokens): - if index == 0 and token.marker == "id": - incremental_draft_tokens.append(token) - continue - elif token.type == UsfmTokenType.CHAPTER: - if token.data and int(token.data) in chapters: - in_chapter = True - incremental_draft_tokens.append(token) - else: - in_chapter = False - elif in_chapter: - incremental_draft_tokens.append(token) - return incremental_draft_tokens - def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]: row_texts: List[str] = [] row_metadata = None From e42370887ef0b161edd325abbad82b795efd423d Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Wed, 15 Apr 2026 17:08:20 -0400 Subject: [PATCH 3/6] add test case for chapter filtering --- machine/corpora/__init__.py | 2 +- .../paratext_project_text_updater_base.py | 45 ++++++++++--------- .../test_update_usfm_parser_handler.py | 37 ++++++++++++++- 3 files changed, 59 insertions(+), 25 deletions(-) diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py index d07e52ee..7cbc2889 100644 --- a/machine/corpora/__init__.py +++ b/machine/corpora/__init__.py @@ -27,7 +27,7 @@ from .paratext_project_settings import ParatextProjectSettings from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase -from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase +from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase, filter_tokens_by_chapter from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase from .paratext_text_corpus import ParatextTextCorpus from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index a7dc464d..a32bdb99 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -31,8 +31,8 @@ def __init__( def update_usfm( self, book_id: str, - chapters: Optional[Sequence[int]] = None, rows: Optional[Sequence[UpdateUsfmRow]] = None, + chapters: Optional[Sequence[int]] = None, full_name: Optional[str] = None, text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING, paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, @@ -65,7 +65,7 @@ def update_usfm( try: tokenizer = UsfmTokenizer(self._settings.stylesheet) tokens = tokenizer.tokenize(usfm) - tokens = self.filter_tokens_by_chapter(tokens, chapters) + tokens = filter_tokens_by_chapter(tokens, chapters) parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification) parser.process_tokens() return handler.get_usfm(self._settings.stylesheet) @@ -77,24 +77,25 @@ def update_usfm( ) raise RuntimeError(error_message) from e - def filter_tokens_by_chapter( - self, tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None - ) -> Sequence[UsfmToken]: - if chapters is None: - return tokens - tokens_within_chapters: List[UsfmToken] = [] - in_chapter: bool = False - for index, token in enumerate(tokens): - if index == 0 and token.marker == "id": - tokens_within_chapters.append(token) - if 1 in chapters: - in_chapter = True - elif token.type == UsfmTokenType.CHAPTER: - if token.data and int(token.data) in chapters: - in_chapter = True - tokens_within_chapters.append(token) - else: - in_chapter = False - elif in_chapter: + +def filter_tokens_by_chapter( + tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None +) -> Sequence[UsfmToken]: + if chapters is None: + return tokens + tokens_within_chapters: List[UsfmToken] = [] + in_chapter: bool = False + for index, token in enumerate(tokens): + if index == 0 and token.marker == "id": + tokens_within_chapters.append(token) + if 1 in chapters: + in_chapter = True + elif token.type == UsfmTokenType.CHAPTER: + if token.data and int(token.data) in chapters: + in_chapter = True tokens_within_chapters.append(token) - return tokens_within_chapters + else: + in_chapter = False + elif in_chapter: + tokens_within_chapters.append(token) + return tokens_within_chapters diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index a9c1cdc1..940878bf 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -9,10 +9,12 @@ UpdateUsfmParserHandler, UpdateUsfmRow, UpdateUsfmTextBehavior, + UsfmParser, + UsfmTokenizer, UsfmUpdateBlock, UsfmUpdateBlockElementType, UsfmUpdateBlockHandler, - parse_usfm, + filter_tokens_by_chapter, ) @@ -1494,6 +1496,31 @@ def test_update_block_footnote_at_start_of_chapter_with_preceding_text(): ) +def test_filter_chapters() -> None: + usfm = r"""\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 2 +\v 1 Some text +\c 3 +\v 1 Some text +\c 4 +\v 1 Some text +""" + chapters = [2, 4] + target = update_usfm(chapters=chapters, source=usfm) + result = r"""\id MAT +\c 2 +\v 1 Some text +\c 4 +\v 1 Some text +""" + assert_usfm_equals(target, result) + + def scr_ref(*refs: str) -> List[ScriptureRef]: return [ScriptureRef.parse(ref) for ref in refs] @@ -1501,6 +1528,7 @@ def scr_ref(*refs: str) -> List[ScriptureRef]: def update_usfm( rows: Optional[Sequence[UpdateUsfmRow]] = None, source: Optional[str] = None, + chapters: Optional[Sequence[int]] = None, id_text: Optional[str] = None, text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW, paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE, @@ -1516,6 +1544,7 @@ def update_usfm( return updater.update_usfm( "MAT", rows, + chapters, id_text, text_behavior, paragraph_behavior, @@ -1542,7 +1571,11 @@ def update_usfm( lambda _: False, compare_segments, ) - parse_usfm(source, updater) + tokenizer = UsfmTokenizer() + tokens = tokenizer.tokenize(source) + tokens = filter_tokens_by_chapter(tokens, chapters) + parser = UsfmParser(tokens, updater) + parser.process_tokens() return updater.get_usfm() From 1e2e99956e51c0620fb4b2876516d8498e902692 Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Thu, 16 Apr 2026 10:02:53 -0400 Subject: [PATCH 4/6] make sure all text in \id is included --- machine/corpora/paratext_project_text_updater_base.py | 9 ++++++--- tests/corpora/test_update_usfm_parser_handler.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py index a32bdb99..77e4ec3a 100644 --- a/machine/corpora/paratext_project_text_updater_base.py +++ b/machine/corpora/paratext_project_text_updater_base.py @@ -85,17 +85,20 @@ def filter_tokens_by_chapter( return tokens tokens_within_chapters: List[UsfmToken] = [] in_chapter: bool = False + in_id_marker: bool = False for index, token in enumerate(tokens): if index == 0 and token.marker == "id": - tokens_within_chapters.append(token) + in_id_marker = True if 1 in chapters: in_chapter = True + elif in_id_marker and token.marker is not None and token.marker != "id": + in_id_marker = False elif token.type == UsfmTokenType.CHAPTER: if token.data and int(token.data) in chapters: in_chapter = True - tokens_within_chapters.append(token) else: in_chapter = False - elif in_chapter: + + if in_id_marker or in_chapter: tokens_within_chapters.append(token) return tokens_within_chapters diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 940878bf..1505444b 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1512,7 +1512,7 @@ def test_filter_chapters() -> None: """ chapters = [2, 4] target = update_usfm(chapters=chapters, source=usfm) - result = r"""\id MAT + result = r"""\id MAT - Test \c 2 \v 1 Some text \c 4 From 707119c38bb2038fa7c6c19b3a4f178b3a9a0c0a Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Thu, 16 Apr 2026 11:19:18 -0400 Subject: [PATCH 5/6] update remark test and ensure remarks are added at the end of existing chapter remarks --- machine/corpora/update_usfm_parser_handler.py | 7 ++++++- tests/corpora/test_update_usfm_parser_handler.py | 13 +++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py index 78fd85f4..5317a85b 100644 --- a/machine/corpora/update_usfm_parser_handler.py +++ b/machine/corpora/update_usfm_parser_handler.py @@ -347,7 +347,12 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str: if len(tokens) > 0: for index, token in enumerate(tokens): if token.type == UsfmTokenType.CHAPTER: - tokens[index + 1 : index + 1] = remark_tokens + insertion_index = index + 1 + while insertion_index < len(tokens) and tokens[insertion_index].marker == "rem": + insertion_index += 1 + if insertion_index < len(tokens) and tokens[insertion_index].type == UsfmTokenType.TEXT: + insertion_index += 1 + tokens[insertion_index:insertion_index] = remark_tokens return tokenizer.detokenize(tokens) def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]: diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index 1505444b..e896e21b 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1389,17 +1389,22 @@ def test_pass_remark(): \v 1 Some text \v 2 \v 3 Other text +\c 2 +\v 1 More text """ target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark"]) result = r"""\id MAT - Test \ide UTF-8 \rem Existing remark -\rem New remark \c 1 +\rem New remark \v 1 Some text \v 2 Update 2 \v 3 Other text +\c 2 +\rem New remark +\v 1 More text """ assert_usfm_equals(target, result) @@ -1408,12 +1413,16 @@ def test_pass_remark(): result = r"""\id MAT - Test \ide UTF-8 \rem Existing remark +\c 1 \rem New remark \rem New remark 2 -\c 1 \v 1 Some text \v 2 Update 2 \v 3 Other text +\c 2 +\rem New remark +\rem New remark 2 +\v 1 More text """ assert_usfm_equals(target, result) From e1865ea7afd4405b87e78b7ef1bc78df0246511c Mon Sep 17 00:00:00 2001 From: mshannon-sil Date: Thu, 16 Apr 2026 11:57:28 -0400 Subject: [PATCH 6/6] add test case for including chapter 1 and header information --- .../test_update_usfm_parser_handler.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py index e896e21b..90ba2bba 100644 --- a/tests/corpora/test_update_usfm_parser_handler.py +++ b/tests/corpora/test_update_usfm_parser_handler.py @@ -1530,6 +1530,34 @@ def test_filter_chapters() -> None: assert_usfm_equals(target, result) +def test_filter_chapters_with_chapter_1_and_header() -> None: + usfm = r"""\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 2 +\v 1 Some text +\c 3 +\v 1 Some text +\c 4 +\v 1 Some text +""" + chapters = [1, 3] + target = update_usfm(chapters=chapters, source=usfm) + result = r"""\id MAT - Test +\h Matthew +\c 1 +\v 1 Some text +\v 2 +\v 3 Other text +\c 3 +\v 1 Some text +""" + assert_usfm_equals(target, result) + + def scr_ref(*refs: str) -> List[ScriptureRef]: return [ScriptureRef.parse(ref) for ref in refs]