|
1 | 1 | using System; |
2 | 2 | using System.Globalization; |
3 | 3 | using System.Text.RegularExpressions; |
4 | | -using System.Unicode; |
| 4 | +using PCRE; |
5 | 5 |
|
6 | 6 | namespace SIL.Machine.PunctuationAnalysis |
7 | 7 | { |
8 | 8 | public class QuotationMarkStringMatch |
9 | 9 | { |
10 | | - // No LatinLetterPattern or LetterPattern because C# does not support it in the same way as Python. Using UnicodeInfo to mirror machine.py |
| 10 | + private static readonly PcreRegex LetterPattern = new PcreRegex(@"[\p{L}\N{U+0001E200}-\N{U+0001E28F}]"); |
| 11 | + private static readonly PcreRegex LatinLetterPattern = new PcreRegex(@"^\p{Script_Extensions=Latin}$"); |
11 | 12 | private static readonly Regex WhitespacePattern = new Regex(@"[\s~]", RegexOptions.Compiled); |
12 | 13 | private static readonly Regex PunctuationPattern = new Regex(@"[\.,;\?!\)\]\-—۔،؛]", RegexOptions.Compiled); |
13 | 14 | private static readonly Regex QuoteIntroducerPattern = new Regex(@"[:,]\s*$", RegexOptions.Compiled); |
@@ -55,9 +56,15 @@ public bool IsValidClosingQuotationMark(QuoteConventionSet quoteConventions) => |
55 | 56 | public bool NextCharacterMatches(Regex regexPattern) => |
56 | 57 | NextCharacter != null && regexPattern.IsMatch(NextCharacter); |
57 | 58 |
|
| 59 | + public bool NextCharacterMatches(PcreRegex regexPattern) => |
| 60 | + NextCharacter != null && regexPattern.IsMatch(NextCharacter); |
| 61 | + |
58 | 62 | public bool PreviousCharacterMatches(Regex regexPattern) => |
59 | 63 | PreviousCharacter != null && regexPattern.IsMatch(PreviousCharacter); |
60 | 64 |
|
| 65 | + public bool PreviousCharacterMatches(PcreRegex regexPattern) => |
| 66 | + PreviousCharacter != null && regexPattern.IsMatch(PreviousCharacter); |
| 67 | + |
61 | 68 | public string PreviousCharacter |
62 | 69 | { |
63 | 70 | get |
@@ -98,9 +105,15 @@ public string NextCharacter |
98 | 105 | public bool LeadingSubstringMatches(Regex regexPattern) => |
99 | 106 | regexPattern.IsMatch(TextSegment.SubstringBefore(StartIndex)); |
100 | 107 |
|
| 108 | + public bool LeadingSubstringMatches(PcreRegex regexPattern) => |
| 109 | + regexPattern.IsMatch(TextSegment.SubstringBefore(StartIndex)); |
| 110 | + |
101 | 111 | public bool TrailingSubstringMatches(Regex regexPattern) => |
102 | 112 | regexPattern.IsMatch(TextSegment.SubstringAfter(EndIndex)); |
103 | 113 |
|
| 114 | + public bool TrailingSubstringMatches(PcreRegex regexPattern) => |
| 115 | + regexPattern.IsMatch(TextSegment.SubstringAfter(EndIndex)); |
| 116 | + |
104 | 117 | // This assumes that the two matches occur in the same verse |
105 | 118 | public bool Precedes(QuotationMarkStringMatch other) |
106 | 119 | { |
@@ -151,72 +164,27 @@ public bool HasTrailingPunctuation() |
151 | 164 |
|
152 | 165 | public bool HasLetterInLeadingSubstring() |
153 | 166 | { |
154 | | - string leadingSubstring = TextSegment.SubstringBefore(StartIndex); |
155 | | - if (leadingSubstring.Length == 0) |
156 | | - return false; |
157 | | - |
158 | | - TextElementEnumerator charactersEnumerator = StringInfo.GetTextElementEnumerator(leadingSubstring); |
159 | | - while (charactersEnumerator.MoveNext()) |
160 | | - { |
161 | | - if (!IsLetter(charactersEnumerator.Current.ToString())) |
162 | | - return false; |
163 | | - } |
164 | | - return true; |
| 167 | + return LeadingSubstringMatches(LetterPattern); |
165 | 168 | } |
166 | 169 |
|
167 | 170 | public bool HasLetterInTrailingSubstring() |
168 | 171 | { |
169 | | - string trailingSubstring = TextSegment.SubstringAfter(EndIndex); |
170 | | - if (trailingSubstring.Length == 0) |
171 | | - return false; |
172 | | - TextElementEnumerator charactersEnumerator = StringInfo.GetTextElementEnumerator(trailingSubstring); |
173 | | - while (charactersEnumerator.MoveNext()) |
174 | | - { |
175 | | - if (!IsLetter(charactersEnumerator.Current.ToString())) |
176 | | - return false; |
177 | | - } |
178 | | - return true; |
| 172 | + return TrailingSubstringMatches(LetterPattern); |
179 | 173 | } |
180 | 174 |
|
181 | 175 | public bool HasLeadingLatinLetter() |
182 | 176 | { |
183 | | - return PreviousCharacter != null && IsLatinScript(PreviousCharacter); |
| 177 | + return PreviousCharacterMatches(LatinLetterPattern); |
184 | 178 | } |
185 | 179 |
|
186 | 180 | public bool HasTrailingLatinLetter() |
187 | 181 | { |
188 | | - return NextCharacter != null && IsLatinScript(NextCharacter); |
| 182 | + return NextCharacterMatches(LatinLetterPattern); |
189 | 183 | } |
190 | 184 |
|
191 | 185 | public bool HasQuoteIntroducerInLeadingSubstring() |
192 | 186 | { |
193 | 187 | return LeadingSubstringMatches(QuoteIntroducerPattern); |
194 | 188 | } |
195 | | - |
196 | | - public static bool HasUnicodeProperty(string characterString, string attribute) |
197 | | - { |
198 | | - if (characterString.Length == 1) |
199 | | - { |
200 | | - return UnicodeInfo.GetName(characterString[0]).Contains(attribute); |
201 | | - } |
202 | | - else if (char.IsSurrogatePair(characterString[0], characterString[1])) |
203 | | - { |
204 | | - //Get true unicode value |
205 | | - int combinedCharacterValue = |
206 | | - (((int)characterString[0] - 0xD800) * 0x400) + ((int)characterString[1] - 0xDC00) + 0x10000; |
207 | | - return UnicodeInfo.GetName(combinedCharacterValue).Contains(attribute); |
208 | | - } |
209 | | - return false; |
210 | | - } |
211 | | - |
212 | | - private bool IsLatinScript(string characterString) |
213 | | - { |
214 | | - return HasUnicodeProperty(characterString, "LATIN"); |
215 | | - } |
216 | | - |
217 | | - private bool IsLetter(string characterString) |
218 | | - { |
219 | | - return HasUnicodeProperty(characterString, "LETTER"); |
220 | | - } |
221 | 189 | } |
222 | 190 | } |
0 commit comments