Skip to content

Commit bc5f656

Browse files
committed
Custom string processing to combine only surrogate pairs (not combining characters)
1 parent 0a81571 commit bc5f656

File tree

6 files changed

+107
-31
lines changed

6 files changed

+107
-31
lines changed

src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
using System.Collections.Generic;
2-
using System.Globalization;
32
using System.Linq;
43
using PCRE;
54

@@ -46,17 +45,9 @@ public List<QuotationMarkStringMatch> FindAllPotentialQuotationMarksInTextSegmen
4645
)
4746
.Select(m =>
4847
{
49-
int[] textElementIndices = StringInfo.ParseCombiningCharacters(textSegment.Text);
50-
int startIndex = 0;
51-
int endIndex = textElementIndices.Length;
52-
for (int textElementIndex = 0; textElementIndex < textElementIndices.Length; textElementIndex++)
53-
{
54-
int stringIndex = textElementIndices[textElementIndex];
55-
if (stringIndex == m.Groups[0].Index)
56-
startIndex = textElementIndex;
57-
if (stringIndex == m.Groups[0].EndIndex)
58-
endIndex = textElementIndex;
59-
}
48+
CodePointString codePointString = new CodePointString(textSegment.Text);
49+
int startIndex = codePointString.GetCodePointIndexForStringIndex(m.Groups[0].Index);
50+
int endIndex = codePointString.GetCodePointIndexForStringIndex(m.Groups[0].EndIndex);
6051
return new QuotationMarkStringMatch(textSegment, startIndex, endIndex);
6152
})
6253
.ToList();

src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
using System;
2-
using System.Globalization;
32
using System.Text.RegularExpressions;
43
using PCRE;
54

@@ -42,8 +41,7 @@ public override int GetHashCode()
4241
return code;
4342
}
4443

45-
public string QuotationMark =>
46-
new StringInfo(TextSegment.Text).SubstringByTextElements(StartIndex, EndIndex - StartIndex);
44+
public string QuotationMark => TextSegment.Substring(StartIndex, EndIndex - StartIndex);
4745

4846
public bool IsValidOpeningQuotationMark(QuoteConventionSet quoteConventions) =>
4947
quoteConventions.IsValidOpeningQuotationMark(QuotationMark);
@@ -74,14 +72,11 @@ public string PreviousCharacter
7472
TextSegment previousSegment = TextSegment.PreviousSegment;
7573
if (previousSegment != null && !TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph))
7674
{
77-
return new StringInfo(previousSegment.Text).SubstringByTextElements(
78-
StringInfo.ParseCombiningCharacters(previousSegment.Text).Length - 1,
79-
1
80-
);
75+
return previousSegment.Substring(previousSegment.Length - 1, 1);
8176
}
8277
return null;
8378
}
84-
return new StringInfo(TextSegment.Text).SubstringByTextElements(StartIndex - 1, 1);
79+
return TextSegment.Substring(StartIndex - 1, 1);
8580
}
8681
}
8782

@@ -94,11 +89,11 @@ public string NextCharacter
9489
TextSegment nextSegment = TextSegment.NextSegment;
9590
if (nextSegment != null && !TextSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Paragraph))
9691
{
97-
return new StringInfo(nextSegment.Text).SubstringByTextElements(0, 1);
92+
return nextSegment.Substring(0, 1);
9893
}
9994
return null;
10095
}
101-
return new StringInfo(TextSegment.Text).SubstringByTextElements(EndIndex, 1);
96+
return TextSegment.Substring(EndIndex, 1);
10297
}
10398
}
10499

src/SIL.Machine/PunctuationAnalysis/TextSegment.cs

Lines changed: 95 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
11
using System;
22
using System.Collections.Generic;
3-
using System.Globalization;
3+
using System.Linq;
44
using SIL.Machine.Corpora;
55

66
namespace SIL.Machine.PunctuationAnalysis
77
{
88
public class TextSegment : IEquatable<TextSegment>
99
{
10-
public string Text { get; private set; }
10+
public string Text
11+
{
12+
get => _text;
13+
private set
14+
{
15+
_codePointString = new CodePointString(value);
16+
_text = value;
17+
}
18+
}
1119
public UsfmMarkerType ImmediatePrecedingMarker { get; private set; }
1220
public HashSet<UsfmMarkerType> MarkersInPrecedingContext { get; private set; }
1321
public TextSegment PreviousSegment { get; set; }
@@ -16,6 +24,9 @@ public class TextSegment : IEquatable<TextSegment>
1624
public int NumSegmentsInVerse { get; set; }
1725
public UsfmToken UsfmToken { get; private set; }
1826

27+
private string _text;
28+
private CodePointString _codePointString;
29+
1930
public TextSegment()
2031
{
2132
Text = "";
@@ -71,16 +82,21 @@ public override int GetHashCode()
7182
return hashCode * 31 + ImmediatePrecedingMarker.GetHashCode();
7283
}
7384

74-
public int Length => StringInfo.ParseCombiningCharacters(Text).Length;
85+
public int Length => _codePointString.Length;
86+
87+
public string Substring(int startIndex, int length)
88+
{
89+
return _codePointString.Substring(startIndex, length);
90+
}
7591

7692
public string SubstringBefore(int index)
7793
{
78-
return Text.Substring(0, index);
94+
return Substring(0, index);
7995
}
8096

8197
public string SubstringAfter(int index)
8298
{
83-
return Text.Substring(index);
99+
return Substring(index, Length - index);
84100
}
85101

86102
public bool MarkerIsInPrecedingContext(UsfmMarkerType marker)
@@ -147,4 +163,78 @@ public TextSegment Build()
147163
}
148164
}
149165
}
166+
167+
/// <summary>
/// Wraps a UTF-16 string and exposes it indexed by code point rather than by <see cref="char"/>.
/// A surrogate pair counts as a single element; combining characters are NOT merged with their
/// base character (each one is its own element).
/// </summary>
/// <remarks>
/// A code point is considered to start at every <see cref="char"/> that is not a low surrogate.
/// Consequently an unpaired low surrogate does not start an element of its own.
/// </remarks>
public class CodePointString
{
    /// <summary>The underlying UTF-16 string.</summary>
    public string String => _stringValue;

    /// <summary>The number of code points in the string.</summary>
    public int Length => _stringIndexByCodePointIndex.Count;

    private readonly string _stringValue;

    // Maps the UTF-16 index at which a code point starts to that code point's index.
    private readonly Dictionary<int, int> _codePointIndexByStringIndex;

    // Element i is the UTF-16 index at which code point i starts.
    private readonly List<int> _stringIndexByCodePointIndex;

    /// <summary>Builds the index maps for <paramref name="stringValue"/> in a single pass.</summary>
    /// <param name="stringValue">The string to wrap.</param>
    /// <exception cref="ArgumentNullException"><paramref name="stringValue"/> is null.</exception>
    public CodePointString(string stringValue)
    {
        _stringValue = stringValue ?? throw new ArgumentNullException(nameof(stringValue));
        _codePointIndexByStringIndex = new Dictionary<int, int>();
        _stringIndexByCodePointIndex = new List<int>();

        for (int stringIndex = 0; stringIndex < _stringValue.Length; stringIndex++)
        {
            // A low surrogate is the second half of a pair, so it never starts a code point.
            if (char.IsLowSurrogate(_stringValue[stringIndex]))
            {
                continue;
            }
            _codePointIndexByStringIndex[stringIndex] = _stringIndexByCodePointIndex.Count;
            _stringIndexByCodePointIndex.Add(stringIndex);
        }
    }

    /// <summary>Returns the code point at <paramref name="codePointIndex"/> as a string.</summary>
    /// <param name="codePointIndex">Zero-based code point index; must be less than <see cref="Length"/>.</param>
    /// <returns>A two-char string for a surrogate pair, otherwise a one-char string.</returns>
    /// <exception cref="IndexOutOfRangeException">The index is negative or not less than <see cref="Length"/>.</exception>
    public string this[int codePointIndex]
    {
        get
        {
            if (codePointIndex < 0 || codePointIndex >= Length)
            {
                throw new IndexOutOfRangeException(
                    $"Index {codePointIndex} is out of bounds for CodePointString with length {Length}."
                );
            }
            int stringIndex = _stringIndexByCodePointIndex[codePointIndex];
            char characterAtStringIndex = _stringValue[stringIndex];
            // Only look at the following char when there is one; the last char cannot begin a pair.
            if (
                stringIndex + 1 < _stringValue.Length
                && char.IsSurrogatePair(characterAtStringIndex, _stringValue[stringIndex + 1])
            )
            {
                return _stringValue.Substring(stringIndex, 2);
            }
            return characterAtStringIndex.ToString();
        }
    }

    /// <summary>Converts a UTF-16 index into the index of the code point that starts there.</summary>
    /// <param name="stringIndex">
    /// A UTF-16 index at which a code point starts, or the string length (one past the end).
    /// </param>
    /// <returns>The code point index; <see cref="Length"/> when given the string length.</returns>
    /// <exception cref="ArgumentException">No code point starts at <paramref name="stringIndex"/>.</exception>
    public int GetCodePointIndexForStringIndex(int stringIndex)
    {
        if (stringIndex == _stringValue.Length)
        {
            return Length;
        }
        if (!_codePointIndexByStringIndex.TryGetValue(stringIndex, out int codePointIndex))
        {
            throw new ArgumentException($"No non-surrogate code point begins at index {stringIndex}");
        }
        return codePointIndex;
    }

    /// <summary>Returns <paramref name="length"/> code points starting at <paramref name="startCodePointIndex"/>.</summary>
    /// <param name="startCodePointIndex">Zero-based index of the first code point to include.</param>
    /// <param name="length">Number of code points to include.</param>
    /// <returns>The corresponding slice of the underlying string.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="length"/> is negative, or the requested range is outside the string.
    /// </exception>
    public string Substring(int startCodePointIndex, int length)
    {
        if (length < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(length), length, "Length cannot be negative.");
        }
        int startStringIndex = GetStringIndexForCodePointIndex(startCodePointIndex);
        int endStringIndex = GetStringIndexForCodePointIndex(startCodePointIndex + length);
        return _stringValue.Substring(startStringIndex, endStringIndex - startStringIndex);
    }

    /// <summary>Converts a code point index into the UTF-16 index at which that code point starts.</summary>
    /// <param name="codePointIndex">A code point index in the range 0..<see cref="Length"/> inclusive.</param>
    /// <returns>The UTF-16 index; the string length when given <see cref="Length"/>.</returns>
    /// <exception cref="ArgumentOutOfRangeException">The index is negative or greater than <see cref="Length"/>.</exception>
    public int GetStringIndexForCodePointIndex(int codePointIndex)
    {
        if (codePointIndex == Length)
        {
            return _stringValue.Length;
        }
        if (codePointIndex < 0 || codePointIndex > Length)
        {
            throw new ArgumentOutOfRangeException(
                nameof(codePointIndex),
                codePointIndex,
                $"Code point index must be between 0 and {Length}."
            );
        }
        return _stringIndexByCodePointIndex[codePointIndex];
    }
}
150240
}

tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -292,8 +292,8 @@ public void ThatAllPossibleQuotationMarksAreIdentified()
292292
[
293293
new QuotationMarkStringMatch(
294294
new TextSegment.Builder().SetText("उत्पत्ति \"पुस्तकले").Build(),
295-
6,
296-
7
295+
9,
296+
10
297297
),
298298
]
299299
)

tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkStringMatchTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ public void GetPreviousCharacter()
205205
0,
206206
1
207207
);
208-
Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.EqualTo("ले"));
208+
Assert.That(quotationMarkStringMatch.PreviousCharacter, Is.EqualTo("\u0947"));
209209
}
210210

211211
[Test]

tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ public void Length()
191191

192192
//Combining characters
193193
textSegment = new TextSegment.Builder().SetText("उत्पत्ति पुस्तकले").Build();
194-
Assert.That(textSegment.Length, Is.EqualTo(11));
194+
Assert.That(textSegment.Length, Is.EqualTo(17));
195195

196196
//Surrogate pairs
197197
textSegment = new TextSegment.Builder().SetText("𝜺𝜺").Build();

0 commit comments

Comments
 (0)