From df9db6ba002538882f0f6aa367393523dea51dd3 Mon Sep 17 00:00:00 2001 From: Peter Chapman Date: Thu, 16 Apr 2026 12:36:27 +1200 Subject: [PATCH] Handle invalid book id tags more robustly --- src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs | 12 ++- src/SIL.Machine/Corpora/UsfmParser.cs | 2 + src/SIL.Machine/Corpora/UsfmTextBase.cs | 5 + .../Corpora/TestData/usfm/Tes/03LEVTes.SFM | 2 +- .../Corpora/TestData/usfm/Tes/131CHTes.SFM | 2 +- .../Corpora/UsfmMemoryTextTests.cs | 93 +++++++++++++++++++ 6 files changed, 110 insertions(+), 6 deletions(-) diff --git a/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs b/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs index abb336dfa..ce4795a26 100644 --- a/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs +++ b/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs @@ -1,4 +1,5 @@ -using System.IO; +using System; +using System.IO; using System.Text; using SIL.Scripture; @@ -46,12 +47,15 @@ private static string GetId(string fileName, Encoding encoding) while ((line = reader.ReadLine()) != null) { line = line.Trim(); - if (line.StartsWith("\\id ")) + if (line.StartsWith("\\id ", StringComparison.InvariantCulture)) { string id = line.Substring(4); - int index = id.IndexOf(" "); + int index = id.IndexOf(" ", StringComparison.OrdinalIgnoreCase); + // If the id is longer than 3 characters, truncate it to 3 characters. + if ((index == -1 || index > 3) && id.Length >= 3) + index = 3; if (index != -1) - id = id.Substring(0, index); + id = id.Substring(0, index).ToUpperInvariant(); return id.Trim(); } } diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs index 302b79a2d..d40e288a6 100644 --- a/src/SIL.Machine/Corpora/UsfmParser.cs +++ b/src/SIL.Machine/Corpora/UsfmParser.cs @@ -301,6 +301,8 @@ public bool ProcessToken() // Code is always upper case string code = token.Data.ToUpperInvariant(); + if (code.Length > 3) + code = code.Substring(0, 3); vref = State.VerseRef; // Update verse ref. Leave book alone if not empty to prevent parsing errors diff --git a/src/SIL.Machine/Corpora/UsfmTextBase.cs b/src/SIL.Machine/Corpora/UsfmTextBase.cs index 40061f3f0..4c27fef4b 100644 --- a/src/SIL.Machine/Corpora/UsfmTextBase.cs +++ b/src/SIL.Machine/Corpora/UsfmTextBase.cs @@ -108,6 +108,11 @@ public TextRowCollector(UsfmTextBase text) public override void StartBook(UsfmParserState state, string marker, string code) { base.StartBook(state, marker, code); + if (!string.IsNullOrEmpty(state.VerseRef.Book) && state.VerseRef.Book != code) + { + // Ignore \id markers that don't match the book code in the verse ref, if it was set + return; + } if (!Canon.AllBookIds.Contains(code, StringComparison.InvariantCulture)) { throw new ArgumentException($"The book {code} is not a valid book id."); diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/03LEVTes.SFM b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/03LEVTes.SFM index 6d57d1474..2ced0843e 100644 --- a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/03LEVTes.SFM +++ b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/03LEVTes.SFM @@ -1,4 +1,4 @@ -\id LEV - Test +\id Leviticus \h Leviticus \mt Leviticus \c 14 diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/131CHTes.SFM b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/131CHTes.SFM index 328b513aa..f080c42b6 100644 --- a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/131CHTes.SFM +++ b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/131CHTes.SFM @@ -1,4 +1,4 @@ -\id 1CH - Test +\id 1CH \h 1 Chronicles \mt 1 Chronicles \c 12 diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index b9699dd71..40172f7f1 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -758,6 +758,99 @@ public void GetRows_IncompleteVerseRange() }); } + [Test] + public void GetRows_BookCodeDifferentToFilename() => + Assert.Throws(() => + GetRows( + @"\id LUK - Test +\c 1 +\v 1 Verse 1 Text +", + includeAllText: true + ) + ); + + [Test] + public void GetRows_BookCodeInvalid() => + Assert.Throws(() => + GetRows( + @"\id ZZZ - Test +\c 1 +\v 1 Verse 1 Text +", + includeAllText: true + ) + ); + + [Test] + public void GetRows_BookCodeTruncated() => + Assert.Throws(() => + GetRows( + @"\id MA +\c 1 +\v 1 Verse 1 Text +", + includeAllText: true + ) + ); + + [Test] + public void GetRows_BookCodeMultiple() + { + TextRow[] rows = GetRows( + @"\id MAT +\id LUK +\c 1 +\v 1 Verse 1 Text +", + includeAllText: true + ); + + Assert.Multiple(() => + { + Assert.That(rows, Has.Length.EqualTo(1)); + + Assert.That( + rows[0].Ref, + Is.EqualTo(ScriptureRef.Parse("MAT 1:1")), + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) + ); + Assert.That( + rows[0].Text, + Is.EqualTo("Verse 1 Text"), + string.Join(",", rows.ToList().Select(tr => tr.Text)) + ); + }); + } + + [Test] + public void GetRows_BookCodeNoSpace() + { + TextRow[] rows = GetRows( + @"\id Matthew +\c 1 +\v 1 Verse 1 Text +", + includeAllText: true + ); + + Assert.Multiple(() => + { + Assert.That(rows, Has.Length.EqualTo(1)); + + Assert.That( + rows[0].Ref, + Is.EqualTo(ScriptureRef.Parse("MAT 1:1")), + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) + ); + Assert.That( + rows[0].Text, + Is.EqualTo("Verse 1 Text"), + string.Join(",", rows.ToList().Select(tr => tr.Text)) + ); + }); + } + private static TextRow[] GetRows(string usfm, bool includeMarkers = false, bool includeAllText = false) { UsfmMemoryText text = new(