Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase, filter_tokens_by_chapter
from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase
from .paratext_text_corpus import ParatextTextCorpus
from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
Expand Down
39 changes: 36 additions & 3 deletions machine/corpora/paratext_project_text_updater_base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC
from typing import Callable, Iterable, Optional, Sequence, Union
from typing import Callable, Iterable, List, Optional, Sequence, Union

from .paratext_project_file_handler import ParatextProjectFileHandler
from .paratext_project_settings import ParatextProjectSettings
Expand All @@ -10,7 +10,9 @@
UpdateUsfmRow,
UpdateUsfmTextBehavior,
)
from .usfm_parser import parse_usfm
from .usfm_parser import UsfmParser
from .usfm_token import UsfmTokenType
from .usfm_tokenizer import UsfmToken, UsfmTokenizer
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError


Expand All @@ -30,6 +32,7 @@ def update_usfm(
self,
book_id: str,
rows: Optional[Sequence[UpdateUsfmRow]] = None,
chapters: Optional[Sequence[int]] = None,
full_name: Optional[str] = None,
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
Expand Down Expand Up @@ -60,7 +63,11 @@ def update_usfm(
compare_segments=compare_segments,
)
try:
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
tokenizer = UsfmTokenizer(self._settings.stylesheet)
tokens = tokenizer.tokenize(usfm)
tokens = filter_tokens_by_chapter(tokens, chapters)
parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification)
parser.process_tokens()
return handler.get_usfm(self._settings.stylesheet)
except Exception as e:
error_message = (
Expand All @@ -69,3 +76,29 @@ def update_usfm(
f". Error: '{e}'"
)
raise RuntimeError(error_message) from e


def filter_tokens_by_chapter(
    tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
) -> Sequence[UsfmToken]:
    """Return only the tokens that belong to the requested chapters.

    The leading ``\\id`` marker and the tokens that immediately follow it (its
    text, up to the next marker token) are always kept so the result remains a
    valid book. Tokens that precede the first ``\\c`` marker (e.g. ``\\h``)
    are kept only when chapter 1 is requested, since they belong to the front
    matter of chapter 1.

    Args:
        tokens: The tokenized USFM for a whole book.
        chapters: Chapter numbers to keep; ``None`` keeps every token.

    Returns:
        The filtered tokens (the original sequence when ``chapters`` is
        ``None``).
    """
    if chapters is None:
        return tokens
    wanted = set(chapters)  # O(1) membership tests inside the loop
    kept: List[UsfmToken] = []
    in_chapter = False
    in_id_marker = False
    for index, token in enumerate(tokens):
        # Check chapter markers first: a \c token must always update the
        # chapter state, even when it is the first marker after the \id line.
        # (Ordering the id-marker branch before this one would swallow such a
        # token and silently drop the first requested chapter.)
        if token.type == UsfmTokenType.CHAPTER:
            in_id_marker = False
            in_chapter = bool(token.data) and int(token.data) in wanted
        elif index == 0 and token.marker == "id":
            in_id_marker = True
            # Front matter between \id and \c 1 belongs to chapter 1.
            if 1 in wanted:
                in_chapter = True
        elif in_id_marker and token.marker is not None and token.marker != "id":
            # First marker token after the \id line ends the id section.
            in_id_marker = False

        if in_id_marker or in_chapter:
            kept.append(token)
    return kept
16 changes: 8 additions & 8 deletions machine/corpora/update_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,14 +345,14 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
if len(tokens) > 0:
index = 0
markers_to_skip = {"id", "ide", "rem"}
while tokens[index].marker in markers_to_skip:
index += 1
if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT:
index += 1
for remark_token in reversed(remark_tokens):
tokens.insert(index, remark_token)
for index, token in enumerate(tokens):
if token.type == UsfmTokenType.CHAPTER:
insertion_index = index + 1
while insertion_index < len(tokens) and tokens[insertion_index].marker == "rem":
insertion_index += 1
if insertion_index < len(tokens) and tokens[insertion_index].type == UsfmTokenType.TEXT:
insertion_index += 1
tokens[insertion_index:insertion_index] = remark_tokens
return tokenizer.detokenize(tokens)

def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
Expand Down
78 changes: 74 additions & 4 deletions tests/corpora/test_update_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
UpdateUsfmParserHandler,
UpdateUsfmRow,
UpdateUsfmTextBehavior,
UsfmParser,
UsfmTokenizer,
UsfmUpdateBlock,
UsfmUpdateBlockElementType,
UsfmUpdateBlockHandler,
parse_usfm,
filter_tokens_by_chapter,
)


Expand Down Expand Up @@ -1387,17 +1389,22 @@ def test_pass_remark():
\v 1 Some text
\v 2
\v 3 Other text
\c 2
\v 1 More text
"""

target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark"])
result = r"""\id MAT - Test
\ide UTF-8
\rem Existing remark
\rem New remark
\c 1
\rem New remark
\v 1 Some text
\v 2 Update 2
\v 3 Other text
\c 2
\rem New remark
\v 1 More text
"""

assert_usfm_equals(target, result)
Expand All @@ -1406,12 +1413,16 @@ def test_pass_remark():
result = r"""\id MAT - Test
\ide UTF-8
\rem Existing remark
\c 1
\rem New remark
\rem New remark 2
\c 1
\v 1 Some text
\v 2 Update 2
\v 3 Other text
\c 2
\rem New remark
\rem New remark 2
\v 1 More text
"""

assert_usfm_equals(target, result)
Expand Down Expand Up @@ -1494,13 +1505,67 @@ def test_update_block_footnote_at_start_of_chapter_with_preceding_text():
)


def test_filter_chapters() -> None:
    """Requesting chapters [2, 4] keeps only those chapters plus the \\id line.

    The \\h header belongs to chapter-1 front matter, so it is dropped here.
    """
    source_usfm = r"""\id MAT - Test
\h Matthew
\c 1
\v 1 Some text
\v 2
\v 3 Other text
\c 2
\v 1 Some text
\c 3
\v 1 Some text
\c 4
\v 1 Some text
"""
    expected = r"""\id MAT - Test
\c 2
\v 1 Some text
\c 4
\v 1 Some text
"""
    updated = update_usfm(chapters=[2, 4], source=source_usfm)
    assert_usfm_equals(updated, expected)


def test_filter_chapters_with_chapter_1_and_header() -> None:
    """Requesting chapter 1 also keeps the \\h header that precedes \\c 1."""
    source_usfm = r"""\id MAT - Test
\h Matthew
\c 1
\v 1 Some text
\v 2
\v 3 Other text
\c 2
\v 1 Some text
\c 3
\v 1 Some text
\c 4
\v 1 Some text
"""
    expected = r"""\id MAT - Test
\h Matthew
\c 1
\v 1 Some text
\v 2
\v 3 Other text
\c 3
\v 1 Some text
"""
    updated = update_usfm(chapters=[1, 3], source=source_usfm)
    assert_usfm_equals(updated, expected)


def scr_ref(*refs: str) -> List[ScriptureRef]:
    """Parse each reference string into a ScriptureRef."""
    return list(map(ScriptureRef.parse, refs))


def update_usfm(
rows: Optional[Sequence[UpdateUsfmRow]] = None,
source: Optional[str] = None,
chapters: Optional[Sequence[int]] = None,
id_text: Optional[str] = None,
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW,
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
Expand All @@ -1516,6 +1581,7 @@ def update_usfm(
return updater.update_usfm(
"MAT",
rows,
chapters,
id_text,
text_behavior,
paragraph_behavior,
Expand All @@ -1542,7 +1608,11 @@ def update_usfm(
lambda _: False,
compare_segments,
)
parse_usfm(source, updater)
tokenizer = UsfmTokenizer()
tokens = tokenizer.tokenize(source)
tokens = filter_tokens_by_chapter(tokens, chapters)
parser = UsfmParser(tokens, updater)
parser.process_tokens()
return updater.get_usfm()


Expand Down
Loading