Skip to content

Commit 92d868b

Browse files
committed
Remove RTL markers from verse token data; add tests
1 parent 58392e4 commit 92d868b

File tree

3 files changed

+73
-1
lines changed

3 files changed

+73
-1
lines changed

src/SIL.Machine/Corpora/UsfmTokenizer.cs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ public IReadOnlyList<UsfmToken> Tokenize(string usfm, bool preserveWhitespace =
175175
marker,
176176
null,
177177
null,
178-
GetNextWord(usfm, ref index, preserveWhitespace)
178+
SanitizeVerseData(GetNextWord(usfm, ref index, preserveWhitespace))
179179
)
180180
{
181181
LineNumber = lineNum,
@@ -563,6 +563,11 @@ private static string GetNextWord(string usfm, ref int index, bool preserveWhite
563563
return data;
564564
}
565565

566+
/// <summary>
/// Removes every right-to-left mark (U+200F) from a verse data string.
/// </summary>
/// <param name="verseData">The verse data string to clean; may be null or empty.</param>
/// <returns>
/// <paramref name="verseData"/> with all U+200F characters removed. Null or empty input is returned
/// unchanged.
/// </returns>
private static string SanitizeVerseData(string verseData)
{
    // Written as an escape sequence rather than the literal character: the mark is invisible in
    // editors and diffs, and losing it would turn the search string into "" (which makes
    // string.Replace throw an ArgumentException).
    const string RightToLeftMark = "\u200F";

    if (string.IsNullOrEmpty(verseData))
    {
        return verseData;
    }

    return verseData.Replace(RightToLeftMark, string.Empty);
}
570+
566571
/// <summary>
567572
/// Converts all control characters, carriage returns and tabs into
568573
/// spaces, and then strips duplicate spaces.

tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -906,6 +906,39 @@ public void UpdateBlock_Verse_Range()
906906
);
907907
}
908908

909+
/// <summary>
/// Verifies that a verse range whose number contains a right-to-left mark (U+200F) is updated as the
/// range 1-3 and that the mark does not appear in the rewritten USFM.
/// </summary>
[Test]
public void UpdateBlock_Verse_Range_RightToLeftMarker()
{
    // The mark is injected through an escape sequence so the invisible character cannot be silently
    // stripped from this fixture by an editor or formatter (which would leave the test passing
    // without exercising the right-to-left handling).
    const string RightToLeftMark = "\u200F";

    var rows = new List<UpdateUsfmRow> { new UpdateUsfmRow(ScrRef("MAT 1:1", "MAT 1:2", "MAT 1:3"), "Update 1-3") };
    string usfm =
        $@"\id MAT - Test
\c 1
\v 1{RightToLeftMark}-3 verse 1 through 3
";
    TestUsfmUpdateBlockHandler usfmUpdateBlockHandler = new TestUsfmUpdateBlockHandler();
    string updatedUsfm = UpdateUsfm(
        rows,
        usfm,
        embedBehavior: UpdateUsfmMarkerBehavior.Preserve,
        usfmUpdateBlockHandlers: [usfmUpdateBlockHandler]
    );

    // The expected output carries a plain "1-3" verse number: no right-to-left mark.
    string expectedUsfm =
        @"\id MAT - Test
\c 1
\v 1-3 Update 1-3
";
    Assert.That(updatedUsfm.Replace("\r\n", "\n"), Is.EqualTo(expectedUsfm));
    Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(1));

    UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0];
    AssertUpdateBlockEquals(
        usfmUpdateBlock,
        ["MAT 1:1", "MAT 1:2", "MAT 1:3"],
        (UsfmUpdateBlockElementType.Text, "Update 1-3 ", false),
        (UsfmUpdateBlockElementType.Text, "verse 1 through 3 ", true)
    );
}
941+
909942
[Test]
910943
public void UpdateBlock_Footnote_PreserveEmbeds()
911944
{

tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,40 @@ public void GetRows_PrivateUseMarker()
425425
});
426426
}
427427

428+
/// <summary>
/// Verifies that a verse range whose number contains a right-to-left mark (U+200F) yields two rows,
/// referenced as MAT 1:1 and MAT 1:2, with the verse text on the first row.
/// </summary>
[Test]
public void GetRows_VerseRangeWithRightToLeftMarker()
{
    // The mark is injected through an escape sequence so the invisible character cannot be silently
    // stripped from this fixture by an editor or formatter (which would leave the test passing
    // without exercising the right-to-left handling).
    const string RightToLeftMark = "\u200F";

    TextRow[] rows = GetRows(
        $@"\id MAT - Test
\h
\mt
\c 1
\v 1{RightToLeftMark}-2 Verse one and two.
"
    );

    Assert.Multiple(() =>
    {
        Assert.That(rows, Has.Length.EqualTo(2));

        // The third argument of each assertion is the failure message: a dump of all row
        // references (or texts) to make a mismatch easier to diagnose.
        Assert.That(
            rows[0].Ref,
            Is.EqualTo(ScriptureRef.Parse("MAT 1:1")),
            string.Join(",", rows.Select(tr => tr.Ref.ToString()))
        );
        Assert.That(
            rows[0].Text,
            Is.EqualTo("Verse one and two."),
            string.Join(",", rows.Select(tr => tr.Text))
        );
        Assert.That(
            rows[1].Ref,
            Is.EqualTo(ScriptureRef.Parse("MAT 1:2")),
            string.Join(",", rows.Select(tr => tr.Ref.ToString()))
        );
    });
}
461+
428462
private static TextRow[] GetRows(string usfm, bool includeMarkers = false, bool includeAllText = false)
429463
{
430464
UsfmMemoryText text =

0 commit comments

Comments
 (0)