diff --git a/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs b/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs index abb336df..ce4795a2 100644 --- a/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs +++ b/src/SIL.Machine/Corpora/UsfmFileTextCorpus.cs @@ -1,4 +1,5 @@ -using System.IO; +using System; +using System.IO; using System.Text; using SIL.Scripture; @@ -46,12 +47,15 @@ private static string GetId(string fileName, Encoding encoding) while ((line = reader.ReadLine()) != null) { line = line.Trim(); - if (line.StartsWith("\\id ")) + if (line.StartsWith("\\id ", StringComparison.InvariantCulture)) { string id = line.Substring(4); - int index = id.IndexOf(" "); + int index = id.IndexOf(" ", StringComparison.OrdinalIgnoreCase); + // If the id is longer than 3 characters, truncate it to 3 characters. + if ((index == -1 || index > 3) && id.Length >= 3) + index = 3; if (index != -1) - id = id.Substring(0, index); + id = id.Substring(0, index).ToUpperInvariant(); return id.Trim(); } } diff --git a/src/SIL.Machine/Corpora/UsfmParser.cs b/src/SIL.Machine/Corpora/UsfmParser.cs index 302b79a2..d40e288a 100644 --- a/src/SIL.Machine/Corpora/UsfmParser.cs +++ b/src/SIL.Machine/Corpora/UsfmParser.cs @@ -301,6 +301,8 @@ public bool ProcessToken() // Code is always upper case string code = token.Data.ToUpperInvariant(); + if (code.Length > 3) + code = code.Substring(0, 3); vref = State.VerseRef; // Update verse ref. Leave book alone if not empty to prevent parsing errors diff --git a/src/SIL.Machine/Corpora/UsfmTextBase.cs b/src/SIL.Machine/Corpora/UsfmTextBase.cs index 40061f3f..4c27fef4 100644 --- a/src/SIL.Machine/Corpora/UsfmTextBase.cs +++ b/src/SIL.Machine/Corpora/UsfmTextBase.cs @@ -108,6 +108,11 @@ public TextRowCollector(UsfmTextBase text) public override void StartBook(UsfmParserState state, string marker, string code) { base.StartBook(state, marker, code); + if (!string.IsNullOrEmpty(state.VerseRef.Book) && state.VerseRef.Book != code) + { + // Ignore \id markers that don't match the book code in the verse ref, if it was set + return; + } if (!Canon.AllBookIds.Contains(code, StringComparison.InvariantCulture)) { throw new ArgumentException($"The book {code} is not a valid book id."); diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/03LEVTes.SFM b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/03LEVTes.SFM index 6d57d147..2ced0843 100644 --- a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/03LEVTes.SFM +++ b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/03LEVTes.SFM @@ -1,4 +1,4 @@ -\id LEV - Test +\id Leviticus \h Leviticus \mt Leviticus \c 14 diff --git a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/131CHTes.SFM b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/131CHTes.SFM index 328b513a..f080c42b 100644 --- a/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/131CHTes.SFM +++ b/tests/SIL.Machine.Tests/Corpora/TestData/usfm/Tes/131CHTes.SFM @@ -1,4 +1,4 @@ -\id 1CH - Test +\id 1CH \h 1 Chronicles \mt 1 Chronicles \c 12 diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index b9699dd7..40172f7f 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -758,6 +758,99 @@ public void GetRows_IncompleteVerseRange() }); } + [Test] + public void GetRows_BookCodeDifferentToFilename() => + Assert.Throws(() => + GetRows( + @"\id LUK - Test +\c 1 +\v 1 Verse 1 Text +", + includeAllText: true + ) + ); + + [Test] + public void GetRows_BookCodeInvalid() => + Assert.Throws(() => + GetRows( + @"\id ZZZ - Test +\c 1 +\v 1 Verse 1 Text +", + includeAllText: true + ) + ); + + [Test] + public void GetRows_BookCodeTruncated() => + Assert.Throws(() => + GetRows( + @"\id MA +\c 1 +\v 1 Verse 1 Text +", + includeAllText: true + ) + ); + + [Test] + public void GetRows_BookCodeMultiple() + { + TextRow[] rows = GetRows( + @"\id MAT +\id LUK +\c 1 +\v 1 Verse 1 Text +", + includeAllText: true + ); + + Assert.Multiple(() => + { + Assert.That(rows, Has.Length.EqualTo(1)); + + Assert.That( + rows[0].Ref, + Is.EqualTo(ScriptureRef.Parse("MAT 1:1")), + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) + ); + Assert.That( + rows[0].Text, + Is.EqualTo("Verse 1 Text"), + string.Join(",", rows.ToList().Select(tr => tr.Text)) + ); + }); + } + + [Test] + public void GetRows_BookCodeNoSpace() + { + TextRow[] rows = GetRows( + @"\id Matthew +\c 1 +\v 1 Verse 1 Text +", + includeAllText: true + ); + + Assert.Multiple(() => + { + Assert.That(rows, Has.Length.EqualTo(1)); + + Assert.That( + rows[0].Ref, + Is.EqualTo(ScriptureRef.Parse("MAT 1:1")), + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) + ); + Assert.That( + rows[0].Text, + Is.EqualTo("Verse 1 Text"), + string.Join(",", rows.ToList().Select(tr => tr.Text)) + ); + }); + } + private static TextRow[] GetRows(string usfm, bool includeMarkers = false, bool includeAllText = false) { UsfmMemoryText text = new(