Skip to content

Commit d519508

Browse files
committed
Change code point to surrogate pair
1 parent b279bc7 commit d519508

File tree

2 files changed

+34
-34
lines changed

2 files changed

+34
-34
lines changed

src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ public List<QuotationMarkStringMatch> FindAllPotentialQuotationMarksInTextSegmen
4545
)
4646
.Select(m =>
4747
{
48-
CodePointString codePointString = new CodePointString(textSegment.Text);
49-
int startIndex = codePointString.GetCodePointIndexForStringIndex(m.Groups[0].Index);
50-
int endIndex = codePointString.GetCodePointIndexForStringIndex(m.Groups[0].EndIndex);
48+
SurrogatePairString surrogatePairString = new SurrogatePairString(textSegment.Text);
49+
int startIndex = surrogatePairString.GetSurrogatePairIndexForStringIndex(m.Groups[0].Index);
50+
int endIndex = surrogatePairString.GetSurrogatePairIndexForStringIndex(m.Groups[0].EndIndex);
5151
return new QuotationMarkStringMatch(textSegment, startIndex, endIndex);
5252
})
5353
.ToList();

src/SIL.Machine/PunctuationAnalysis/TextSegment.cs

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ public class TextSegment : IEquatable<TextSegment>
99
{
1010
/// <summary>
/// The segment's text as a plain string. Reading returns the string form of the backing
/// <c>SurrogatePairString</c>; assigning (private) wraps the new value in a fresh
/// <c>SurrogatePairString</c>.
/// </summary>
public string Text
{
    get
    {
        return _surrogatePairString.ToString();
    }
    private set
    {
        _surrogatePairString = new SurrogatePairString(value);
    }
}
1515
public UsfmMarkerType ImmediatePrecedingMarker { get; private set; }
1616
public HashSet<UsfmMarkerType> MarkersInPrecedingContext { get; private set; }
@@ -19,7 +19,7 @@ public string Text
1919
public int IndexInVerse { get; set; }
2020
public int NumSegmentsInVerse { get; set; }
2121
public UsfmToken UsfmToken { get; private set; }
22-
private CodePointString _codePointString;
22+
private SurrogatePairString _surrogatePairString;
2323

2424
public TextSegment()
2525
{
@@ -76,11 +76,11 @@ public override int GetHashCode()
7676
return hashCode * 31 + ImmediatePrecedingMarker.GetHashCode();
7777
}
7878

79-
/// <summary>
/// Length of the segment's text as reported by the backing <c>SurrogatePairString</c>,
/// i.e. counted in its surrogate-pair-aware index units rather than UTF-16 chars.
/// </summary>
public int Length
{
    get { return _surrogatePairString.Length; }
}
8080

8181
/// <summary>
/// Returns part of the segment's text by delegating to the backing
/// <c>SurrogatePairString</c>.
/// </summary>
/// <param name="startIndex">Start position, in the backing string's surrogate-pair-aware index units.</param>
/// <param name="length">Number of index units to include.</param>
/// <returns>The selected portion of the text.</returns>
public string Substring(int startIndex, int length) =>
    _surrogatePairString.Substring(startIndex, length);
8585

8686
public string SubstringBefore(int index)
@@ -161,28 +161,28 @@ public TextSegment Build()
161161
/// <summary>
162162
/// Class to handle indexing of strings by Unicode code point, treating surrogate pairs as single characters.
163163
/// </summary>
164-
public class CodePointString
164+
public class SurrogatePairString
165165
{
166166
public string String => _stringValue;
167-
public int Length => _stringIndexByCodePointIndex.Count;
167+
public int Length => _stringIndexBySurrogatePairIndex.Count;
168168

169169
private readonly string _stringValue;
170-
private readonly Dictionary<int, int> _codePointIndexByStringIndex;
171-
private readonly Dictionary<int, int> _stringIndexByCodePointIndex;
170+
private readonly Dictionary<int, int> _surrogatePairIndexByStringIndex;
171+
private readonly Dictionary<int, int> _stringIndexBySurrogatePairIndex;
172172

173-
/// <summary>
/// Wraps <paramref name="stringValue"/> and builds two lookup tables relating UTF-16 char
/// offsets to ordinals, where an ordinal is the position of a char among the chars of the
/// string that are not low surrogates (so a surrogate pair contributes one ordinal).
/// </summary>
/// <param name="stringValue">The text to index.</param>
public SurrogatePairString(string stringValue)
{
    _stringValue = stringValue;
    _surrogatePairIndexByStringIndex = new Dictionary<int, int>();
    _stringIndexBySurrogatePairIndex = new Dictionary<int, int>();

    // Char offsets at which an indexable unit starts: every char except low surrogates.
    IEnumerable<int> unitStartOffsets = _stringValue
        .Select((ch, charOffset) => (ch, charOffset))
        .Where(entry => !char.IsLowSurrogate(entry.ch))
        .Select(entry => entry.charOffset);

    // NOTE(review): the contents written below are exactly what this constructor has always
    // produced, but they are the reverse of what the field names suggest:
    //   _surrogatePairIndexByStringIndex : ordinal     -> char offset
    //   _stringIndexBySurrogatePairIndex : char offset -> ordinal
    // GetStringIndexForSurrogatePairIndex reads _surrogatePairIndexByStringIndex by ordinal
    // and depends on this. Any swap here must be made together with every reader of the
    // two dictionaries.
    int ordinal = 0;
    foreach (int charOffset in unitStartOffsets)
    {
        _surrogatePairIndexByStringIndex[ordinal] = charOffset;
        _stringIndexBySurrogatePairIndex[charOffset] = ordinal;
        ordinal++;
    }
}
188188

@@ -191,17 +191,17 @@ public override string ToString()
191191
return _stringValue;
192192
}
193193

194-
public string this[int codePointIndex]
194+
public string this[int surrogatePairIndex]
195195
{
196196
get
197197
{
198-
if (codePointIndex < 0 || codePointIndex > Length)
198+
if (surrogatePairIndex < 0 || surrogatePairIndex > Length)
199199
{
200200
throw new IndexOutOfRangeException(
201-
$"Index {codePointIndex} is out of bounds for CodePointString with length {Length}."
201+
$"Index {surrogatePairIndex} is out of bounds for SurrogatePairString with length {Length}."
202202
);
203203
}
204-
int stringIndex = _stringIndexByCodePointIndex[codePointIndex];
204+
int stringIndex = _stringIndexBySurrogatePairIndex[surrogatePairIndex];
205205
char characterAtStringIndex = _stringValue[stringIndex];
206206
if (
207207
stringIndex < _stringValue.Length
@@ -214,34 +214,34 @@ public string this[int codePointIndex]
214214
}
215215
}
216216

217-
/// <summary>
/// Converts a UTF-16 char offset into the surrogate-pair-aware index of the unit that
/// starts there, i.e. the number of chars before <paramref name="stringIndex"/> that are
/// not low surrogates.
/// </summary>
/// <param name="stringIndex">
/// A char offset into the wrapped string. The string's length is accepted and maps to the
/// total number of units.
/// </param>
/// <returns>The surrogate-pair-aware index corresponding to <paramref name="stringIndex"/>.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="stringIndex"/> is outside the string or points at a low surrogate
/// (the second half of a pair), where no unit begins.
/// </exception>
public int GetSurrogatePairIndexForStringIndex(int stringIndex)
{
    bool outsideString = stringIndex < 0 || stringIndex > _stringValue.Length;
    if (outsideString || (stringIndex < _stringValue.Length && char.IsLowSurrogate(_stringValue[stringIndex])))
    {
        throw new ArgumentException($"No non-surrogate code point begins at index {stringIndex}");
    }

    // The answer is derived from the text itself rather than from
    // _surrogatePairIndexByStringIndex: the constructor fills that dictionary keyed by
    // surrogate-pair index (not by string index), so looking a string index up in it throws
    // or returns the wrong value as soon as a surrogate pair precedes the position
    // (e.g. "a😀b", string index 3 must map to 2).
    int surrogatePairIndex = 0;
    for (int i = 0; i < stringIndex; i++)
    {
        if (!char.IsLowSurrogate(_stringValue[i]))
        {
            surrogatePairIndex++;
        }
    }
    return surrogatePairIndex;
}
229229

230-
/// <summary>
/// Extracts a substring addressed in surrogate-pair-aware index units: both ends of the
/// range are translated to UTF-16 char offsets and the slice is taken from the wrapped string.
/// </summary>
/// <param name="startSurrogatePairIndex">Index of the first unit to include.</param>
/// <param name="length">Number of units to include.</param>
/// <returns>The chars of the wrapped string covering the requested units.</returns>
public string Substring(int startSurrogatePairIndex, int length)
{
    int firstCharOffset = GetStringIndexForSurrogatePairIndex(startSurrogatePairIndex);
    int pastLastCharOffset = GetStringIndexForSurrogatePairIndex(startSurrogatePairIndex + length);
    return _stringValue.Substring(firstCharOffset, pastLastCharOffset - firstCharOffset);
}
237237

238-
/// <summary>
/// Converts a surrogate-pair-aware index into the UTF-16 char offset at which that unit
/// starts. An index equal to the number of units maps to the length of the wrapped string.
/// </summary>
/// <param name="surrogatePairIndex">The surrogate-pair-aware index to translate.</param>
/// <returns>The corresponding char offset in the wrapped string.</returns>
public int GetStringIndexForSurrogatePairIndex(int surrogatePairIndex)
{
    // NOTE(review): despite its name, _surrogatePairIndexByStringIndex is the right table
    // here. The constructor stores entries in it as [surrogate-pair index] = string index,
    // so this lookup returns a string index. Keep this in mind before "correcting" the
    // field reference.
    return surrogatePairIndex == _surrogatePairIndexByStringIndex.Count
        ? _stringValue.Length
        : _surrogatePairIndexByStringIndex[surrogatePairIndex];
}
246246
}
247247
}

0 commit comments

Comments
 (0)