Skip to content

Commit a92dae8

Browse files
committed
Use PCRE.NET to mirror python regexes
1 parent 5788e3a commit a92dae8

File tree

3 files changed: +24 -83 lines changed

3 files changed

+24
-83
lines changed
src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs

Lines changed: 4 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
using System.Collections.Generic;
2-
using System.Globalization;
32
using System.Linq;
4-
using System.Text.RegularExpressions;
3+
using PCRE;
54

65
namespace SIL.Machine.PunctuationAnalysis
76
{
87
public class QuotationMarkFinder
98
{
10-
private static readonly Regex TypewriterGuillemetsPattern = new Regex(@"(<<|>>|<|>)", RegexOptions.Compiled);
9+
private static readonly PcreRegex QuotationMarkPattern = new PcreRegex(@"(\p{Quotation_Mark}|<<|>>|<|>)");
1110
private readonly QuoteConventionSet _quoteConventions;
1211

1312
public QuotationMarkFinder(QuoteConventionSet quoteConventions)
@@ -37,30 +36,9 @@ IReadOnlyList<TextSegment> textSegments
3736

3837
public List<QuotationMarkStringMatch> FindAllPotentialQuotationMarksInTextSegment(TextSegment textSegment)
3938
{
40-
TextElementEnumerator charactersEnumerator = StringInfo.GetTextElementEnumerator(textSegment.Text);
41-
int index = 0;
42-
List<QuotationMarkStringMatch> quotationMarkStringMatches = new List<QuotationMarkStringMatch>();
43-
while (charactersEnumerator.MoveNext())
44-
{
45-
string currentCharacterString = charactersEnumerator.Current.ToString();
46-
if (
47-
(
48-
QuotationMarkStringMatch.HasUnicodeProperty(currentCharacterString, "QUOTATION MARK")
49-
|| QuotationMarkStringMatch.HasUnicodeProperty(currentCharacterString, "APOSTROPHE")
50-
)
51-
&& (
52-
_quoteConventions.IsValidOpeningQuotationMark(currentCharacterString)
53-
|| _quoteConventions.IsValidClosingQuotationMark(currentCharacterString)
54-
)
55-
)
56-
{
57-
quotationMarkStringMatches.Add(new QuotationMarkStringMatch(textSegment, index, index + 1));
58-
}
59-
index++;
60-
}
61-
List<QuotationMarkStringMatch> typewriterGuillemetMatches = TypewriterGuillemetsPattern
39+
return QuotationMarkPattern
6240
.Matches(textSegment.Text)
63-
.Cast<Match>()
41+
.Cast<PcreMatch>()
6442
.Where(match =>
6543
_quoteConventions.IsValidOpeningQuotationMark(match.Groups[0].Value)
6644
|| _quoteConventions.IsValidClosingQuotationMark(match.Groups[0].Value)
@@ -71,11 +49,6 @@ public List<QuotationMarkStringMatch> FindAllPotentialQuotationMarksInTextSegmen
7149
m.Groups[0].Index + m.Groups[0].Length
7250
))
7351
.ToList();
74-
75-
return quotationMarkStringMatches
76-
.Concat(typewriterGuillemetMatches)
77-
.OrderBy(match => match.StartIndex)
78-
.ToList();
7952
}
8053
}
8154
}

src/SIL.Machine/PunctuationAnalysis/QuotationMarkStringMatch.cs

Lines changed: 19 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
using System;
22
using System.Globalization;
33
using System.Text.RegularExpressions;
4-
using System.Unicode;
4+
using PCRE;
55

66
namespace SIL.Machine.PunctuationAnalysis
77
{
88
public class QuotationMarkStringMatch
99
{
10-
// No LatinLetterPattern or LetterPattern because C# does not support it in the same way as Python. Using UnicodeInfo to mirror machine.py
10+
private static readonly PcreRegex LetterPattern = new PcreRegex(@"[\p{L}\N{U+0001E200}-\N{U+0001E28F}]");
11+
private static readonly PcreRegex LatinLetterPattern = new PcreRegex(@"^\p{Script_Extensions=Latin}$");
1112
private static readonly Regex WhitespacePattern = new Regex(@"[\s~]", RegexOptions.Compiled);
1213
private static readonly Regex PunctuationPattern = new Regex(@"[\.,;\?!\)\]\-—۔،؛]", RegexOptions.Compiled);
1314
private static readonly Regex QuoteIntroducerPattern = new Regex(@"[:,]\s*$", RegexOptions.Compiled);
@@ -55,9 +56,15 @@ public bool IsValidClosingQuotationMark(QuoteConventionSet quoteConventions) =>
5556
public bool NextCharacterMatches(Regex regexPattern) =>
5657
NextCharacter != null && regexPattern.IsMatch(NextCharacter);
5758

59+
public bool NextCharacterMatches(PcreRegex regexPattern) =>
60+
NextCharacter != null && regexPattern.IsMatch(NextCharacter);
61+
5862
public bool PreviousCharacterMatches(Regex regexPattern) =>
5963
PreviousCharacter != null && regexPattern.IsMatch(PreviousCharacter);
6064

65+
public bool PreviousCharacterMatches(PcreRegex regexPattern) =>
66+
PreviousCharacter != null && regexPattern.IsMatch(PreviousCharacter);
67+
6168
public string PreviousCharacter
6269
{
6370
get
@@ -98,9 +105,15 @@ public string NextCharacter
98105
public bool LeadingSubstringMatches(Regex regexPattern) =>
99106
regexPattern.IsMatch(TextSegment.SubstringBefore(StartIndex));
100107

108+
public bool LeadingSubstringMatches(PcreRegex regexPattern) =>
109+
regexPattern.IsMatch(TextSegment.SubstringBefore(StartIndex));
110+
101111
public bool TrailingSubstringMatches(Regex regexPattern) =>
102112
regexPattern.IsMatch(TextSegment.SubstringAfter(EndIndex));
103113

114+
public bool TrailingSubstringMatches(PcreRegex regexPattern) =>
115+
regexPattern.IsMatch(TextSegment.SubstringAfter(EndIndex));
116+
104117
// This assumes that the two matches occur in the same verse
105118
public bool Precedes(QuotationMarkStringMatch other)
106119
{
@@ -151,72 +164,27 @@ public bool HasTrailingPunctuation()
151164

152165
public bool HasLetterInLeadingSubstring()
153166
{
154-
string leadingSubstring = TextSegment.SubstringBefore(StartIndex);
155-
if (leadingSubstring.Length == 0)
156-
return false;
157-
158-
TextElementEnumerator charactersEnumerator = StringInfo.GetTextElementEnumerator(leadingSubstring);
159-
while (charactersEnumerator.MoveNext())
160-
{
161-
if (!IsLetter(charactersEnumerator.Current.ToString()))
162-
return false;
163-
}
164-
return true;
167+
return LeadingSubstringMatches(LetterPattern);
165168
}
166169

167170
public bool HasLetterInTrailingSubstring()
168171
{
169-
string trailingSubstring = TextSegment.SubstringAfter(EndIndex);
170-
if (trailingSubstring.Length == 0)
171-
return false;
172-
TextElementEnumerator charactersEnumerator = StringInfo.GetTextElementEnumerator(trailingSubstring);
173-
while (charactersEnumerator.MoveNext())
174-
{
175-
if (!IsLetter(charactersEnumerator.Current.ToString()))
176-
return false;
177-
}
178-
return true;
172+
return TrailingSubstringMatches(LetterPattern);
179173
}
180174

181175
public bool HasLeadingLatinLetter()
182176
{
183-
return PreviousCharacter != null && IsLatinScript(PreviousCharacter);
177+
return PreviousCharacterMatches(LatinLetterPattern);
184178
}
185179

186180
public bool HasTrailingLatinLetter()
187181
{
188-
return NextCharacter != null && IsLatinScript(NextCharacter);
182+
return NextCharacterMatches(LatinLetterPattern);
189183
}
190184

191185
public bool HasQuoteIntroducerInLeadingSubstring()
192186
{
193187
return LeadingSubstringMatches(QuoteIntroducerPattern);
194188
}
195-
196-
public static bool HasUnicodeProperty(string characterString, string attribute)
197-
{
198-
if (characterString.Length == 1)
199-
{
200-
return UnicodeInfo.GetName(characterString[0]).Contains(attribute);
201-
}
202-
else if (char.IsSurrogatePair(characterString[0], characterString[1]))
203-
{
204-
//Get true unicode value
205-
int combinedCharacterValue =
206-
(((int)characterString[0] - 0xD800) * 0x400) + ((int)characterString[1] - 0xDC00) + 0x10000;
207-
return UnicodeInfo.GetName(combinedCharacterValue).Contains(attribute);
208-
}
209-
return false;
210-
}
211-
212-
private bool IsLatinScript(string characterString)
213-
{
214-
return HasUnicodeProperty(characterString, "LATIN");
215-
}
216-
217-
private bool IsLetter(string characterString)
218-
{
219-
return HasUnicodeProperty(characterString, "LETTER");
220-
}
221189
}
222190
}

src/SIL.Machine/SIL.Machine.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,12 @@
3838
<ItemGroup>
3939
<PackageReference Include="Newtonsoft.Json" Version="13.0.2" />
4040
<PackageReference Include="Nito.AsyncEx" Version="5.1.2" />
41+
<PackageReference Include="PCRE.NET" Version="1.2.0" />
4142
<PackageReference Include="Sandwych.QuickGraph.Core" Version="1.0.0" />
4243
<PackageReference Include="SIL.Scripture" Version="12.0.1" />
4344
<PackageReference Include="System.Text.Encoding.CodePages" Version="6.0.0" />
4445
<PackageReference Include="System.Threading.Tasks.Dataflow" Version="6.0.0" />
4546
<PackageReference Include="CaseExtensions" Version="1.1.0" />
46-
<PackageReference Include="UnicodeInformation" Version="2.7.1" />
4747
</ItemGroup>
4848

4949
<ItemGroup Condition="'$(TargetFramework)' == 'net461'">

0 commit comments

Comments
 (0)