From d8ca02da8aada71b266f56c41dd14b179d76fc04 Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Thu, 26 Mar 2026 17:28:43 -0400
Subject: [PATCH 1/6] modify usfm for chapter-level drafting to avoid import
 issues; move remarks to chapters

---
 .../paratext_project_text_updater_base.py     |  3 +-
 machine/corpora/update_usfm_parser_handler.py | 34 ++++++++++++++-----
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
index 0e7bfdfd..0a80c407 100644
--- a/machine/corpora/paratext_project_text_updater_base.py
+++ b/machine/corpora/paratext_project_text_updater_base.py
@@ -29,6 +29,7 @@ def __init__(
     def update_usfm(
         self,
         book_id: str,
+        chapters: Optional[Sequence[int]] = None,
         rows: Optional[Sequence[UpdateUsfmRow]] = None,
         full_name: Optional[str] = None,
         text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
@@ -61,7 +62,7 @@ def update_usfm(
         )
         try:
             parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
-            return handler.get_usfm(self._settings.stylesheet)
+            return handler.get_usfm(self._settings.stylesheet, chapters)
         except Exception as e:
             error_message = (
                 f"An error occurred while parsing the usfm for '{book_id}'"
diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py
index 9d95850c..4c187ac7 100644
--- a/machine/corpora/update_usfm_parser_handler.py
+++ b/machine/corpora/update_usfm_parser_handler.py
@@ -334,27 +334,43 @@ def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -
         if embed_outside_of_block:
             self._end_update_block(state, [scripture_ref])
 
-    def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
+    def get_usfm(
+        self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty", chapters: Optional[Sequence[int]] = None
+    ) -> str:
         if isinstance(stylesheet, str):
             stylesheet = UsfmStylesheet(stylesheet)
         tokenizer = UsfmTokenizer(stylesheet)
         tokens = list(self._tokens)
+        if chapters is not None:
+            tokens = self._get_incremental_draft_tokens(tokens, chapters)
         if len(self._remarks) > 0:
             remark_tokens: List[UsfmToken] = []
             for remark in self._remarks:
                 remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
                 remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
             if len(tokens) > 0:
-                index = 0
-                markers_to_skip = {"id", "ide", "rem"}
-                while tokens[index].marker in markers_to_skip:
-                    index += 1
-                    if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT:
-                        index += 1
-                for remark_token in reversed(remark_tokens):
-                    tokens.insert(index, remark_token)
+                for index, token in enumerate(tokens):
+                    if token.type == UsfmTokenType.CHAPTER:
+                        tokens[index + 1 : index + 1] = remark_tokens
         return tokenizer.detokenize(tokens)
 
+    def _get_incremental_draft_tokens(self, tokens: List[UsfmToken], chapters: Sequence[int]) -> List[UsfmToken]:
+        incremental_draft_tokens: List[UsfmToken] = []
+        in_chapter: bool = False
+        for index, token in enumerate(tokens):
+            if index == 0 and token.marker == "id":
+                incremental_draft_tokens.append(token)
+                continue
+            elif token.type == UsfmTokenType.CHAPTER:
+                if token.data and int(token.data) in chapters:
+                    in_chapter = True
+                    incremental_draft_tokens.append(token)
+                else:
+                    in_chapter = False
+            elif in_chapter:
+                incremental_draft_tokens.append(token)
+        return incremental_draft_tokens
+
     def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
         row_texts: List[str] = []
         row_metadata = None

From aef5d5d71a892fac1b299a5dfbda9ad84768f91b Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Wed, 15 Apr 2026 14:02:28 -0400
Subject: [PATCH 2/6] move filtering before token processing

---
 .../paratext_project_text_updater_base.py     | 36 ++++++++++++++++---
 machine/corpora/update_usfm_parser_handler.py | 23 +-----------
 2 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
index 0a80c407..a7dc464d 100644
--- a/machine/corpora/paratext_project_text_updater_base.py
+++ b/machine/corpora/paratext_project_text_updater_base.py
@@ -1,5 +1,5 @@
 from abc import ABC
-from typing import Callable, Iterable, Optional, Sequence, Union
+from typing import Callable, Iterable, List, Optional, Sequence, Union
 
 from .paratext_project_file_handler import ParatextProjectFileHandler
 from .paratext_project_settings import ParatextProjectSettings
@@ -10,7 +10,9 @@
     UpdateUsfmRow,
     UpdateUsfmTextBehavior,
 )
-from .usfm_parser import parse_usfm
+from .usfm_parser import UsfmParser
+from .usfm_token import UsfmTokenType
+from .usfm_tokenizer import UsfmToken, UsfmTokenizer
 from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError
 
 
@@ -61,8 +63,12 @@ def update_usfm(
             compare_segments=compare_segments,
         )
         try:
-            parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
-            return handler.get_usfm(self._settings.stylesheet, chapters)
+            tokenizer = UsfmTokenizer(self._settings.stylesheet)
+            tokens = tokenizer.tokenize(usfm)
+            tokens = self.filter_tokens_by_chapter(tokens, chapters)
+            parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification)
+            parser.process_tokens()
+            return handler.get_usfm(self._settings.stylesheet)
         except Exception as e:
             error_message = (
                 f"An error occurred while parsing the usfm for '{book_id}'"
@@ -70,3 +76,25 @@ def update_usfm(
                 f". Error: '{e}'"
             )
             raise RuntimeError(error_message) from e
+
+    def filter_tokens_by_chapter(
+        self, tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
+    ) -> Sequence[UsfmToken]:
+        if chapters is None:
+            return tokens
+        tokens_within_chapters: List[UsfmToken] = []
+        in_chapter: bool = False
+        for index, token in enumerate(tokens):
+            if index == 0 and token.marker == "id":
+                tokens_within_chapters.append(token)
+                if 1 in chapters:
+                    in_chapter = True
+            elif token.type == UsfmTokenType.CHAPTER:
+                if token.data and int(token.data) in chapters:
+                    in_chapter = True
+                    tokens_within_chapters.append(token)
+                else:
+                    in_chapter = False
+            elif in_chapter:
+                tokens_within_chapters.append(token)
+        return tokens_within_chapters
diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py
index 4c187ac7..78fd85f4 100644
--- a/machine/corpora/update_usfm_parser_handler.py
+++ b/machine/corpora/update_usfm_parser_handler.py
@@ -334,15 +334,11 @@ def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -
         if embed_outside_of_block:
             self._end_update_block(state, [scripture_ref])
 
-    def get_usfm(
-        self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty", chapters: Optional[Sequence[int]] = None
-    ) -> str:
+    def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
         if isinstance(stylesheet, str):
             stylesheet = UsfmStylesheet(stylesheet)
         tokenizer = UsfmTokenizer(stylesheet)
         tokens = list(self._tokens)
-        if chapters is not None:
-            tokens = self._get_incremental_draft_tokens(tokens, chapters)
         if len(self._remarks) > 0:
             remark_tokens: List[UsfmToken] = []
             for remark in self._remarks:
@@ -354,23 +350,6 @@ def get_usfm(
                         tokens[index + 1 : index + 1] = remark_tokens
         return tokenizer.detokenize(tokens)
 
-    def _get_incremental_draft_tokens(self, tokens: List[UsfmToken], chapters: Sequence[int]) -> List[UsfmToken]:
-        incremental_draft_tokens: List[UsfmToken] = []
-        in_chapter: bool = False
-        for index, token in enumerate(tokens):
-            if index == 0 and token.marker == "id":
-                incremental_draft_tokens.append(token)
-                continue
-            elif token.type == UsfmTokenType.CHAPTER:
-                if token.data and int(token.data) in chapters:
-                    in_chapter = True
-                    incremental_draft_tokens.append(token)
-                else:
-                    in_chapter = False
-            elif in_chapter:
-                incremental_draft_tokens.append(token)
-        return incremental_draft_tokens
-
     def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
         row_texts: List[str] = []
         row_metadata = None

From e42370887ef0b161edd325abbad82b795efd423d Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Wed, 15 Apr 2026 17:08:20 -0400
Subject: [PATCH 3/6] add test case for chapter filtering

---
 machine/corpora/__init__.py                   |  2 +-
 .../paratext_project_text_updater_base.py     | 45 ++++++++++---------
 .../test_update_usfm_parser_handler.py        | 37 ++++++++++++++-
 3 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/machine/corpora/__init__.py b/machine/corpora/__init__.py
index d07e52ee..7cbc2889 100644
--- a/machine/corpora/__init__.py
+++ b/machine/corpora/__init__.py
@@ -27,7 +27,7 @@
 from .paratext_project_settings import ParatextProjectSettings
 from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
 from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase
-from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
+from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase, filter_tokens_by_chapter
 from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase
 from .paratext_text_corpus import ParatextTextCorpus
 from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
index a7dc464d..a32bdb99 100644
--- a/machine/corpora/paratext_project_text_updater_base.py
+++ b/machine/corpora/paratext_project_text_updater_base.py
@@ -31,8 +31,8 @@ def __init__(
     def update_usfm(
         self,
         book_id: str,
-        chapters: Optional[Sequence[int]] = None,
         rows: Optional[Sequence[UpdateUsfmRow]] = None,
+        chapters: Optional[Sequence[int]] = None,
         full_name: Optional[str] = None,
         text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
         paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
@@ -65,7 +65,7 @@ def update_usfm(
         try:
             tokenizer = UsfmTokenizer(self._settings.stylesheet)
             tokens = tokenizer.tokenize(usfm)
-            tokens = self.filter_tokens_by_chapter(tokens, chapters)
+            tokens = filter_tokens_by_chapter(tokens, chapters)
             parser = UsfmParser(tokens, handler, self._settings.stylesheet, self._settings.versification)
             parser.process_tokens()
             return handler.get_usfm(self._settings.stylesheet)
@@ -77,24 +77,25 @@ def update_usfm(
             )
             raise RuntimeError(error_message) from e
 
-    def filter_tokens_by_chapter(
-        self, tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
-    ) -> Sequence[UsfmToken]:
-        if chapters is None:
-            return tokens
-        tokens_within_chapters: List[UsfmToken] = []
-        in_chapter: bool = False
-        for index, token in enumerate(tokens):
-            if index == 0 and token.marker == "id":
-                tokens_within_chapters.append(token)
-                if 1 in chapters:
-                    in_chapter = True
-            elif token.type == UsfmTokenType.CHAPTER:
-                if token.data and int(token.data) in chapters:
-                    in_chapter = True
-                    tokens_within_chapters.append(token)
-                else:
-                    in_chapter = False
-            elif in_chapter:
+
+def filter_tokens_by_chapter(
+    tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
+) -> Sequence[UsfmToken]:
+    if chapters is None:
+        return tokens
+    tokens_within_chapters: List[UsfmToken] = []
+    in_chapter: bool = False
+    for index, token in enumerate(tokens):
+        if index == 0 and token.marker == "id":
+            tokens_within_chapters.append(token)
+            if 1 in chapters:
+                in_chapter = True
+        elif token.type == UsfmTokenType.CHAPTER:
+            if token.data and int(token.data) in chapters:
+                in_chapter = True
                 tokens_within_chapters.append(token)
-        return tokens_within_chapters
+            else:
+                in_chapter = False
+        elif in_chapter:
+            tokens_within_chapters.append(token)
+    return tokens_within_chapters
diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py
index a9c1cdc1..940878bf 100644
--- a/tests/corpora/test_update_usfm_parser_handler.py
+++ b/tests/corpora/test_update_usfm_parser_handler.py
@@ -9,10 +9,12 @@
     UpdateUsfmParserHandler,
     UpdateUsfmRow,
     UpdateUsfmTextBehavior,
+    UsfmParser,
+    UsfmTokenizer,
     UsfmUpdateBlock,
     UsfmUpdateBlockElementType,
     UsfmUpdateBlockHandler,
-    parse_usfm,
+    filter_tokens_by_chapter,
 )
 
 
@@ -1494,6 +1496,31 @@ def test_update_block_footnote_at_start_of_chapter_with_preceding_text():
     )
 
 
+def test_filter_chapters() -> None:
+    usfm = r"""\id MAT - Test
+\h Matthew
+\c 1
+\v 1 Some text
+\v 2
+\v 3 Other text
+\c 2
+\v 1 Some text
+\c 3
+\v 1 Some text
+\c 4
+\v 1 Some text
+"""
+    chapters = [2, 4]
+    target = update_usfm(chapters=chapters, source=usfm)
+    result = r"""\id MAT
+\c 2
+\v 1 Some text
+\c 4
+\v 1 Some text
+"""
+    assert_usfm_equals(target, result)
+
+
 def scr_ref(*refs: str) -> List[ScriptureRef]:
     return [ScriptureRef.parse(ref) for ref in refs]
 
@@ -1501,6 +1528,7 @@ def scr_ref(*refs: str) -> List[ScriptureRef]:
 def update_usfm(
     rows: Optional[Sequence[UpdateUsfmRow]] = None,
     source: Optional[str] = None,
+    chapters: Optional[Sequence[int]] = None,
     id_text: Optional[str] = None,
     text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_NEW,
     paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
@@ -1516,6 +1544,7 @@ def update_usfm(
         return updater.update_usfm(
             "MAT",
             rows,
+            chapters,
             id_text,
             text_behavior,
             paragraph_behavior,
@@ -1542,7 +1571,11 @@ def update_usfm(
             lambda _: False,
             compare_segments,
         )
-        parse_usfm(source, updater)
+        tokenizer = UsfmTokenizer()
+        tokens = tokenizer.tokenize(source)
+        tokens = filter_tokens_by_chapter(tokens, chapters)
+        parser = UsfmParser(tokens, updater)
+        parser.process_tokens()
         return updater.get_usfm()
 
 

From 1e2e99956e51c0620fb4b2876516d8498e902692 Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Thu, 16 Apr 2026 10:02:53 -0400
Subject: [PATCH 4/6] make sure all text in \id is included

---
 machine/corpora/paratext_project_text_updater_base.py | 9 ++++++---
 tests/corpora/test_update_usfm_parser_handler.py      | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/machine/corpora/paratext_project_text_updater_base.py b/machine/corpora/paratext_project_text_updater_base.py
index a32bdb99..77e4ec3a 100644
--- a/machine/corpora/paratext_project_text_updater_base.py
+++ b/machine/corpora/paratext_project_text_updater_base.py
@@ -85,17 +85,22 @@ def filter_tokens_by_chapter(
         return tokens
     tokens_within_chapters: List[UsfmToken] = []
     in_chapter: bool = False
+    in_id_marker: bool = False
     for index, token in enumerate(tokens):
         if index == 0 and token.marker == "id":
-            tokens_within_chapters.append(token)
+            in_id_marker = True
             if 1 in chapters:
                 in_chapter = True
-        elif token.type == UsfmTokenType.CHAPTER:
+        elif in_id_marker and token.marker is not None and token.marker != "id":
+            in_id_marker = False
+        # Evaluate chapter markers on their own rather than as an elif of the \id bookkeeping above;
+        # otherwise a \c that immediately follows \id only ends the \id span and never updates in_chapter.
+        if token.type == UsfmTokenType.CHAPTER:
             if token.data and int(token.data) in chapters:
                 in_chapter = True
-                tokens_within_chapters.append(token)
             else:
                 in_chapter = False
-        elif in_chapter:
+
+        if in_id_marker or in_chapter:
             tokens_within_chapters.append(token)
     return tokens_within_chapters
diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py
index 940878bf..1505444b 100644
--- a/tests/corpora/test_update_usfm_parser_handler.py
+++ b/tests/corpora/test_update_usfm_parser_handler.py
@@ -1512,7 +1512,7 @@ def test_filter_chapters() -> None:
 """
     chapters = [2, 4]
     target = update_usfm(chapters=chapters, source=usfm)
-    result = r"""\id MAT
+    result = r"""\id MAT - Test
 \c 2
 \v 1 Some text
 \c 4

From 707119c38bb2038fa7c6c19b3a4f178b3a9a0c0a Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Thu, 16 Apr 2026 11:19:18 -0400
Subject: [PATCH 5/6] update remark test and ensure remarks are added at the
 end of existing chapter remarks

---
 machine/corpora/update_usfm_parser_handler.py    |  7 ++++++-
 tests/corpora/test_update_usfm_parser_handler.py | 13 +++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py
index 78fd85f4..5317a85b 100644
--- a/machine/corpora/update_usfm_parser_handler.py
+++ b/machine/corpora/update_usfm_parser_handler.py
@@ -347,7 +347,12 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
             if len(tokens) > 0:
                 for index, token in enumerate(tokens):
                     if token.type == UsfmTokenType.CHAPTER:
-                        tokens[index + 1 : index + 1] = remark_tokens
+                        insertion_index = index + 1
+                        while insertion_index < len(tokens) and tokens[insertion_index].marker == "rem":
+                            insertion_index += 1
+                            if insertion_index < len(tokens) and tokens[insertion_index].type == UsfmTokenType.TEXT:
+                                insertion_index += 1
+                        tokens[insertion_index:insertion_index] = remark_tokens
         return tokenizer.detokenize(tokens)
 
     def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py
index 1505444b..e896e21b 100644
--- a/tests/corpora/test_update_usfm_parser_handler.py
+++ b/tests/corpora/test_update_usfm_parser_handler.py
@@ -1389,17 +1389,22 @@ def test_pass_remark():
 \v 1 Some text
 \v 2
 \v 3 Other text
+\c 2
+\v 1 More text
 """
 
     target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.PREFER_EXISTING, remarks=["New remark"])
     result = r"""\id MAT - Test
 \ide UTF-8
 \rem Existing remark
-\rem New remark
 \c 1
+\rem New remark
 \v 1 Some text
 \v 2 Update 2
 \v 3 Other text
+\c 2
+\rem New remark
+\v 1 More text
 """
 
     assert_usfm_equals(target, result)
@@ -1408,12 +1413,16 @@ def test_pass_remark():
     result = r"""\id MAT - Test
 \ide UTF-8
 \rem Existing remark
+\c 1
 \rem New remark
 \rem New remark 2
-\c 1
 \v 1 Some text
 \v 2 Update 2
 \v 3 Other text
+\c 2
+\rem New remark
+\rem New remark 2
+\v 1 More text
 """
 
     assert_usfm_equals(target, result)

From e1865ea7afd4405b87e78b7ef1bc78df0246511c Mon Sep 17 00:00:00 2001
From: mshannon-sil <matthew_shannon@sil.org>
Date: Thu, 16 Apr 2026 11:57:28 -0400
Subject: [PATCH 6/6] add test case for including chapter 1 and header
 information

---
 .../test_update_usfm_parser_handler.py        | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/corpora/test_update_usfm_parser_handler.py b/tests/corpora/test_update_usfm_parser_handler.py
index e896e21b..90ba2bba 100644
--- a/tests/corpora/test_update_usfm_parser_handler.py
+++ b/tests/corpora/test_update_usfm_parser_handler.py
@@ -1530,6 +1530,34 @@ def test_filter_chapters() -> None:
     assert_usfm_equals(target, result)
 
 
+def test_filter_chapters_with_chapter_1_and_header() -> None:
+    usfm = r"""\id MAT - Test
+\h Matthew
+\c 1
+\v 1 Some text
+\v 2
+\v 3 Other text
+\c 2
+\v 1 Some text
+\c 3
+\v 1 Some text
+\c 4
+\v 1 Some text
+"""
+    chapters = [1, 3]
+    target = update_usfm(chapters=chapters, source=usfm)
+    result = r"""\id MAT - Test
+\h Matthew
+\c 1
+\v 1 Some text
+\v 2
+\v 3 Other text
+\c 3
+\v 1 Some text
+"""
+    assert_usfm_equals(target, result)
+
+
 def scr_ref(*refs: str) -> List[ScriptureRef]:
     return [ScriptureRef.parse(ref) for ref in refs]