11using System ;
22using System . Collections . Generic ;
3- using System . Globalization ;
3+ using System . Linq ;
44using SIL . Machine . Corpora ;
55
66namespace SIL . Machine . PunctuationAnalysis
77{
88 public class TextSegment : IEquatable < TextSegment >
99 {
/// <summary>
/// The raw text of this segment. Assigning it also rebuilds the <see cref="CodePointString"/>
/// view of the text that backs <see cref="Length"/> and <see cref="Substring"/>.
/// </summary>
public string Text
{
    get
    {
        return _text;
    }
    private set
    {
        // Build the code-point view first, then store the raw string.
        CodePointString codePoints = new CodePointString(value);
        _codePointString = codePoints;
        _text = value;
    }
}
1119 public UsfmMarkerType ImmediatePrecedingMarker { get ; private set ; }
1220 public HashSet < UsfmMarkerType > MarkersInPrecedingContext { get ; private set ; }
1321 public TextSegment PreviousSegment { get ; set ; }
@@ -16,6 +24,9 @@ public class TextSegment : IEquatable<TextSegment>
1624 public int NumSegmentsInVerse { get ; set ; }
1725 public UsfmToken UsfmToken { get ; private set ; }
1826
27+ private string _text ;
28+ private CodePointString _codePointString ;
29+
1930 public TextSegment ( )
2031 {
2132 Text = "" ;
@@ -71,16 +82,21 @@ public override int GetHashCode()
7182 return hashCode * 31 + ImmediatePrecedingMarker . GetHashCode ( ) ;
7283 }
7384
/// <summary>
/// Length of the segment text as reported by the backing <see cref="CodePointString"/>,
/// i.e. a surrogate pair counts once rather than as two UTF-16 code units.
/// </summary>
public int Length
{
    get { return _codePointString.Length; }
}
86+
/// <summary>
/// Returns the part of the segment text that starts at <paramref name="startIndex"/> and spans
/// <paramref name="length"/> positions, both measured by the backing <see cref="CodePointString"/>.
/// </summary>
/// <param name="startIndex">Index of the first position to include.</param>
/// <param name="length">Number of positions to include.</param>
/// <returns>The selected portion of the text.</returns>
public string Substring(int startIndex, int length) => _codePointString.Substring(startIndex, length);
7591
/// <summary>
/// Returns everything in the segment text that precedes <paramref name="index"/>.
/// </summary>
/// <param name="index">Exclusive end position, in the same units as <see cref="Length"/>.</param>
/// <returns>The text from the start of the segment up to, but not including, <paramref name="index"/>.</returns>
public string SubstringBefore(int index) => Substring(0, index);
8096
/// <summary>
/// Returns everything in the segment text from <paramref name="index"/> to the end.
/// </summary>
/// <param name="index">Inclusive start position, in the same units as <see cref="Length"/>.</param>
/// <returns>The text from <paramref name="index"/> through the end of the segment.</returns>
public string SubstringAfter(int index)
{
    // Everything that remains once the first `index` positions are skipped.
    int remaining = Length - index;
    return Substring(index, remaining);
}
85101
86102 public bool MarkerIsInPrecedingContext ( UsfmMarkerType marker )
@@ -147,4 +163,78 @@ public TextSegment Build()
147163 }
148164 }
149165 }
166+
/// <summary>
/// Wraps a UTF-16 <see cref="string"/> and exposes it by code-point position instead of by
/// <see cref="char"/> position, so that a surrogate pair occupies a single index.
/// </summary>
/// <remarks>
/// Every <see cref="char"/> that is not a low surrogate starts a new code point; a low surrogate
/// is treated as the continuation of the code point before it.
/// </remarks>
public class CodePointString
{
    /// <summary>The wrapped UTF-16 string.</summary>
    public string String => _stringValue;

    /// <summary>The number of code points in the wrapped string.</summary>
    public int Length => _stringIndexByCodePointIndex.Count;

    private readonly string _stringValue;

    // Maps the char index at which a code point starts to that code point's index.
    private readonly Dictionary<int, int> _codePointIndexByStringIndex;

    // Entry i is the char index at which code point i starts.
    private readonly List<int> _stringIndexByCodePointIndex;

    /// <summary>Builds the code-point index for <paramref name="stringValue"/>.</summary>
    /// <param name="stringValue">The UTF-16 string to wrap.</param>
    /// <exception cref="ArgumentNullException"><paramref name="stringValue"/> is null.</exception>
    public CodePointString(string stringValue)
    {
        if (stringValue == null)
        {
            throw new ArgumentNullException(nameof(stringValue));
        }

        _stringValue = stringValue;
        _codePointIndexByStringIndex = new Dictionary<int, int>(stringValue.Length);
        _stringIndexByCodePointIndex = new List<int>(stringValue.Length);

        // Single pass: both lookups are filled together so they can never disagree about
        // which direction they map in.
        for (int stringIndex = 0; stringIndex < stringValue.Length; stringIndex++)
        {
            if (char.IsLowSurrogate(stringValue[stringIndex]))
            {
                continue;
            }
            _codePointIndexByStringIndex[stringIndex] = _stringIndexByCodePointIndex.Count;
            _stringIndexByCodePointIndex.Add(stringIndex);
        }
    }

    /// <summary>
    /// Gets the code point at <paramref name="codePointIndex"/> as a string: two chars for a
    /// surrogate pair, one char otherwise.
    /// </summary>
    /// <exception cref="IndexOutOfRangeException">
    /// <paramref name="codePointIndex"/> is negative or not less than <see cref="Length"/>.
    /// </exception>
    public string this[int codePointIndex]
    {
        get
        {
            if (codePointIndex < 0 || codePointIndex >= Length)
            {
                throw new IndexOutOfRangeException(
                    $"Index {codePointIndex} is out of bounds for CodePointString with length {Length}."
                );
            }

            int stringIndex = _stringIndexByCodePointIndex[codePointIndex];
            char characterAtStringIndex = _stringValue[stringIndex];

            // Only look at the following char when there is one; the last char of the string
            // can never start a surrogate pair.
            if (
                stringIndex + 1 < _stringValue.Length
                && char.IsSurrogatePair(characterAtStringIndex, _stringValue[stringIndex + 1])
            )
            {
                return _stringValue.Substring(stringIndex, 2);
            }
            return characterAtStringIndex.ToString();
        }
    }

    /// <summary>
    /// Converts a char index in the wrapped string to the index of the code point that starts there.
    /// </summary>
    /// <param name="stringIndex">
    /// A char index at which a code point starts, or the string's length (one past the end).
    /// </param>
    /// <returns>The code-point index; <see cref="Length"/> when given the string's length.</returns>
    /// <exception cref="ArgumentException">No code point starts at <paramref name="stringIndex"/>.</exception>
    public int GetCodePointIndexForStringIndex(int stringIndex)
    {
        if (stringIndex == _stringValue.Length)
        {
            return Length;
        }
        if (!_codePointIndexByStringIndex.TryGetValue(stringIndex, out int codePointIndex))
        {
            throw new ArgumentException($"No non-surrogate code point begins at index {stringIndex}");
        }
        return codePointIndex;
    }

    /// <summary>
    /// Returns <paramref name="length"/> code points starting at <paramref name="startCodePointIndex"/>.
    /// </summary>
    /// <param name="startCodePointIndex">Code-point index of the first code point to include.</param>
    /// <param name="length">Number of code points to include.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// The requested range does not lie within the string.
    /// </exception>
    public string Substring(int startCodePointIndex, int length)
    {
        int endCodePointIndex = startCodePointIndex + length;
        int startStringIndex = GetStringIndexForCodePointIndex(startCodePointIndex);
        int endStringIndex = GetStringIndexForCodePointIndex(endCodePointIndex);
        return _stringValue.Substring(startStringIndex, endStringIndex - startStringIndex);
    }

    /// <summary>
    /// Converts a code-point index to the char index at which that code point starts.
    /// </summary>
    /// <param name="codePointIndex">
    /// A code-point index in the range 0..<see cref="Length"/>; <see cref="Length"/> itself maps
    /// to the end of the string.
    /// </param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="codePointIndex"/> is negative or greater than <see cref="Length"/>.
    /// </exception>
    public int GetStringIndexForCodePointIndex(int codePointIndex)
    {
        if (codePointIndex < 0 || codePointIndex > Length)
        {
            throw new ArgumentOutOfRangeException(
                nameof(codePointIndex),
                codePointIndex,
                $"Code point index must be between 0 and {Length}."
            );
        }
        if (codePointIndex == Length)
        {
            return _stringValue.Length;
        }
        return _stringIndexByCodePointIndex[codePointIndex];
    }
}
150240}
0 commit comments