Skip to content

Commit e9c5d19

Browse files
authored
Fix starting element error when analyzing corpora (#325)
1 parent 8e35c45 commit e9c5d19

File tree

6 files changed

+63
-6
lines changed

6 files changed

+63
-6
lines changed

src/SIL.Machine/PunctuationAnalysis/QuotationMarkFinder.cs

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System.Collections.Generic;
2+
using System.Globalization;
23
using System.Linq;
34
using PCRE;
45

@@ -43,11 +44,21 @@ public List<QuotationMarkStringMatch> FindAllPotentialQuotationMarksInTextSegmen
4344
_quoteConventions.IsValidOpeningQuotationMark(match.Groups[0].Value)
4445
|| _quoteConventions.IsValidClosingQuotationMark(match.Groups[0].Value)
4546
)
46-
.Select(m => new QuotationMarkStringMatch(
47-
textSegment,
48-
m.Groups[0].Index,
49-
m.Groups[0].Index + m.Groups[0].Length
50-
))
47+
.Select(m =>
48+
{
49+
int[] textElementIndices = StringInfo.ParseCombiningCharacters(textSegment.Text);
50+
int startIndex = 0;
51+
int endIndex = textElementIndices.Length;
52+
for (int textElementIndex = 0; textElementIndex < textElementIndices.Length; textElementIndex++)
53+
{
54+
int stringIndex = textElementIndices[textElementIndex];
55+
if (stringIndex == m.Groups[0].Index)
56+
startIndex = textElementIndex;
57+
if (stringIndex == m.Groups[0].EndIndex)
58+
endIndex = textElementIndex;
59+
}
60+
return new QuotationMarkStringMatch(textSegment, startIndex, endIndex);
61+
})
5162
.ToList();
5263
}
5364
}

src/SIL.Machine/PunctuationAnalysis/TextSegment.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using System;
22
using System.Collections.Generic;
3+
using System.Globalization;
34
using SIL.Machine.Corpora;
45

56
namespace SIL.Machine.PunctuationAnalysis
@@ -70,7 +71,7 @@ public override int GetHashCode()
7071
return hashCode * 31 + ImmediatePrecedingMarker.GetHashCode();
7172
}
7273

73-
public int Length => Text.Length;
74+
/// <summary>
/// Gets the length of <see cref="Text"/> counted in text elements rather than in UTF-16 code units:
/// the number of start indices that <see cref="StringInfo.ParseCombiningCharacters(string)"/>
/// reports for the text.
/// </summary>
public int Length
{
    get
    {
        // One entry per text element; the array size is therefore the element count.
        int[] textElementStarts = StringInfo.ParseCombiningCharacters(Text);
        return textElementStarts.Length;
    }
}
7475

7576
public string SubstringBefore(int index)
7677
{

tests/SIL.Machine.Tests/Corpora/CorporaTestHelpers.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@ internal static class CorporaTestHelpers
1616
);
1717
public static readonly string UsfmTestProjectPath = Path.Combine(TestDataPath, "usfm", "Tes");
1818
public static readonly string UsfmTargetProjectPath = Path.Combine(TestDataPath, "usfm", "target");
19+
public static readonly string UsfmTargetProjectZipPath = Path.Combine(TestDataPath, "project", "target");
1920
public static readonly string UsfmTargetCustomVrsPath = Path.Combine(TestDataPath, "usfm", "target", "custom.vrs");
2021
public static readonly string UsfmSourceProjectPath = Path.Combine(TestDataPath, "usfm", "source");
22+
public static readonly string UsfmSourceProjectZipPath = Path.Combine(TestDataPath, "project", "source");
2123
public static readonly string UsxTestProjectPath = Path.Combine(TestDataPath, "usx", "Tes");
2224
public static readonly string TextTestProjectPath = Path.Combine(TestDataPath, "txt");
2325
public static readonly string DeuterocanonicalsSourcePath = Path.Combine(

tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
using System.IO.Compression;
22
using System.Text.Json;
33
using NUnit.Framework;
4+
using SIL.Machine.PunctuationAnalysis;
45

56
namespace SIL.Machine.Corpora;
67

@@ -170,4 +171,28 @@ async Task GetUsfmAsync(string projectPath)
170171
await GetUsfmAsync(ParatextProjectPath);
171172
}
172173
}
174+
175+
// Manual test (ignored by default): opens the source and target project zip archives,
// feeds each one through a ZipParatextProjectQuoteConventionDetector into its own
// QuoteConventionDetector, then asks each detector for its quote convention analysis
// and asserts that both results are non-null.
[Test]
176+
[Ignore("This is for manual testing only. Remove this tag to run the test.")]
177+
public void AnalyzeCorporaQuoteConventions()
178+
{
179+
var sourceHandler = new QuoteConventionDetector();
180+
using ZipArchive zipArchive = ZipFile.OpenRead(CorporaTestHelpers.UsfmSourceProjectZipPath);
181+
var quoteConventionDetector = new ZipParatextProjectQuoteConventionDetector(zipArchive);
182+
quoteConventionDetector.GetQuoteConventionAnalysis(sourceHandler);
183+
184+
var targetHandler = new QuoteConventionDetector();
185+
using ZipArchive zipArchive2 = ZipFile.OpenRead(CorporaTestHelpers.UsfmTargetProjectZipPath);
186+
var quoteConventionDetector2 = new ZipParatextProjectQuoteConventionDetector(zipArchive2);
187+
quoteConventionDetector2.GetQuoteConventionAnalysis(targetHandler);
188+
189+
QuoteConventionAnalysis sourceAnalysis = sourceHandler.DetectQuotationConvention();
190+
QuoteConventionAnalysis targetAnalysis = targetHandler.DetectQuotationConvention();
191+
192+
Assert.Multiple(() =>
193+
{
194+
Assert.NotNull(sourceAnalysis);
195+
Assert.NotNull(targetAnalysis);
196+
});
197+
}
173198
}

tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,22 @@ public void ThatAllPossibleQuotationMarksAreIdentified()
282282
]
283283
)
284284
);
285+
286+
Assert.That(
287+
quotationMarkFinder
288+
.FindAllPotentialQuotationMarksInTextSegment(
289+
new TextSegment.Builder().SetText("उत्पत्ति \"पुस्तकले").Build()
290+
)
291+
.SequenceEqual(
292+
[
293+
new QuotationMarkStringMatch(
294+
new TextSegment.Builder().SetText("उत्पत्ति \"पुस्तकले").Build(),
295+
6,
296+
7
297+
),
298+
]
299+
)
300+
);
285301
}
286302

287303
[Test]

tests/SIL.Machine.Tests/PunctuationAnalysis/TextSegmentTests.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,8 @@ public void Length()
188188

189189
textSegment = new TextSegment.Builder().SetText("new example text").Build();
190190
Assert.That(textSegment.Length, Is.EqualTo("new example text".Length));
191+
textSegment = new TextSegment.Builder().SetText("उत्पत्ति पुस्तकले").Build();
192+
Assert.That(textSegment.Length, Is.EqualTo(11));
191193
}
192194

193195
[Test]

0 commit comments

Comments
 (0)