Skip to content

Commit 754c97d

Browse files
authored
Port improvements to quote denormalization (#359)
1 parent 6e1caae commit 754c97d

12 files changed

+289
-117
lines changed

src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs

Lines changed: 8 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -22,29 +22,12 @@ ParatextProjectSettings settings
2222
_paratextProjectFileHandler = paratextProjectFileHandler;
2323
}
2424

25-
public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetector handler = null)
26-
{
27-
Dictionary<int, List<int>> includeChapters = null;
28-
return GetQuoteConventionAnalysis(handler, includeChapters);
29-
}
30-
31-
public QuoteConventionAnalysis GetQuoteConventionAnalysis(
32-
QuoteConventionDetector handler = null,
33-
IReadOnlyDictionary<string, List<int>> includeChapters = null
34-
)
35-
{
36-
return GetQuoteConventionAnalysis(
37-
handler,
38-
includeChapters?.ToDictionary(kvp => Canon.BookIdToNumber(kvp.Key), kvp => kvp.Value)
39-
);
40-
}
41-
4225
public QuoteConventionAnalysis GetQuoteConventionAnalysis(
43-
QuoteConventionDetector handler = null,
4426
IReadOnlyDictionary<int, List<int>> includeChapters = null
4527
)
4628
{
47-
handler = handler ?? new QuoteConventionDetector();
29+
var bookQuoteConventionsAnalyses = new List<QuoteConventionAnalysis>();
30+
4831
foreach (
4932
string bookId in Canon
5033
.AllBookNumbers.Where(num => Canon.IsCanonical(num))
@@ -54,12 +37,14 @@ string bookId in Canon
5437
if (includeChapters != null && !includeChapters.ContainsKey(Canon.BookIdToNumber(bookId)))
5538
continue;
5639

40+
var handler = new QuoteConventionDetector();
41+
5742
string fileName = _settings.GetBookFileName(bookId);
58-
if (!Exists(fileName))
43+
if (!_paratextProjectFileHandler.Exists(fileName))
5944
continue;
6045

6146
string usfm;
62-
using (var reader = new StreamReader(Open(fileName)))
47+
using (var reader = new StreamReader(_paratextProjectFileHandler.Open(fileName)))
6348
{
6449
usfm = reader.ReadToEnd();
6550
}
@@ -77,12 +62,9 @@ string bookId in Canon
7762
sb.Append($". Error: '{ex.Message}'");
7863
throw new InvalidOperationException(sb.ToString(), ex);
7964
}
65+
bookQuoteConventionsAnalyses.Add(handler.DetectQuoteConvention(includeChapters));
8066
}
81-
return handler.DetectQuoteConvention(includeChapters);
67+
return QuoteConventionAnalysis.CombineWithWeightedAverage(bookQuoteConventionsAnalyses);
8268
}
83-
84-
private bool Exists(string fileName) => _paratextProjectFileHandler.Exists(fileName);
85-
86-
private Stream Open(string fileName) => _paratextProjectFileHandler.Open(fileName);
8769
}
8870
}

src/SIL.Machine/PunctuationAnalysis/QuotationMarkTabulator.cs

Lines changed: 72 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
using System;
21
using System.Collections.Generic;
2+
using System.Linq;
33
using System.Text;
44
using SIL.Extensions;
55

@@ -23,6 +23,16 @@ public void CountQuotationMark(string quotationMark)
2323
TotalCount++;
2424
}
2525

26+
/// <summary>
/// Adds the per-mark tallies and the total count of another
/// <see cref="QuotationMarkCounts"/> to this instance.
/// </summary>
/// <param name="quotationMarkCounts">The counts to merge into this instance.</param>
public void CountFrom(QuotationMarkCounts quotationMarkCounts)
{
    foreach (KeyValuePair<string, int> entry in quotationMarkCounts._quotationMarkCounter)
    {
        int increment = entry.Value;
        // `() => 0` is the starting value handed to UpdateValue - presumably used when the
        // mark has no entry yet (verify against SIL.Extensions); the tally is then increased.
        _quotationMarkCounter.UpdateValue(entry.Key, () => 0, current => current + increment);
    }
    TotalCount += quotationMarkCounts.TotalCount;
}
35+
2636
public (string BestString, int BestStringCount, int TotalStringCount) FindBestQuotationMarkProportion()
2737
{
2838
string bestString = _quotationMarkCounter.MaxBy(kvp => kvp.Value).Key;
@@ -60,6 +70,29 @@ public void Tabulate(List<QuotationMarkMetadata> quotationMarks)
6070
}
6171
}
6272

73+
/// <summary>
/// Merges the counts held by another tabulator into this one, one
/// (depth, direction) bucket at a time.
/// </summary>
/// <param name="tabulatedQuotationMarks">The tabulator whose counts are added to this instance.</param>
public void TabulateFrom(QuotationMarkTabulator tabulatedQuotationMarks)
{
    foreach (var entry in tabulatedQuotationMarks._quotationCountsByDepthAndDirection)
    {
        var key = entry.Key;
        if (!_quotationCountsByDepthAndDirection.TryGetValue(key, out QuotationMarkCounts target))
        {
            // First time this (depth, direction) pair is seen here: start an empty bucket.
            target = new QuotationMarkCounts();
            _quotationCountsByDepthAndDirection[key] = target;
        }
        target.CountFrom(entry.Value);
    }
}
95+
6396
private void CountQuotationMark(QuotationMarkMetadata quote)
6497
{
6598
(int Depth, QuotationMarkDirection Direction) key = (quote.Depth, quote.Direction);
@@ -75,26 +108,52 @@ private void CountQuotationMark(QuotationMarkMetadata quote)
75108
);
76109
}
77110

111+
/// <summary>
/// Returns the sum of <c>TotalCount</c> over every (depth, direction) bucket in this tabulator.
/// </summary>
public int GetTotalQuotationMarkCount() =>
    _quotationCountsByDepthAndDirection.Values.Sum(counts => counts.TotalCount);
115+
78116
public double CalculateSimilarity(QuoteConvention quoteConvention)
79117
{
80-
double weightedDifference = 0.0;
81-
double totalWeight = 0.0;
82-
foreach ((int depth, QuotationMarkDirection direction) in _quotationCountsByDepthAndDirection.Keys)
118+
var numMarksByDepth = new Dictionary<int, int>();
119+
var numMatchingMarksByDepth = new Dictionary<int, int>();
120+
foreach (
121+
(int depth, QuotationMarkDirection direction) in _quotationCountsByDepthAndDirection.Keys.OrderBy(k =>
122+
k
123+
)
124+
)
83125
{
84126
string expectedQuotationMark = quoteConvention.GetExpectedQuotationMark(depth, direction);
85-
86-
// Give higher weight to shallower depths, since deeper marks are more likely to be mistakes
87-
weightedDifference += (
88-
_quotationCountsByDepthAndDirection[(depth, direction)]
89-
.CalculateNumDifferences(expectedQuotationMark) * Math.Pow(2, -depth)
127+
int numMatchingMarks = _quotationCountsByDepthAndDirection[(depth, direction)].TotalCount;
128+
numMarksByDepth.UpdateValue(depth, () => 0, i => i + numMatchingMarks);
129+
numMatchingMarksByDepth.UpdateValue(
130+
depth,
131+
() => 0,
132+
i =>
133+
i
134+
+ numMatchingMarks
135+
- _quotationCountsByDepthAndDirection[(depth, direction)]
136+
.CalculateNumDifferences(expectedQuotationMark)
90137
);
91-
totalWeight += _quotationCountsByDepthAndDirection[(depth, direction)].TotalCount * Math.Pow(2, -depth);
92138
}
93-
if (totalWeight == 0.0)
139+
140+
// The scores of greater depths depend on the scores of shallower depths
141+
var scoresByDepth = new Dictionary<int, double>();
142+
foreach (int depth in numMarksByDepth.Keys.OrderBy(k => k))
94143
{
95-
return 0.0;
144+
double previousDepthScore = 1;
145+
if (scoresByDepth.TryGetValue(depth - 1, out double score))
146+
{
147+
previousDepthScore = score / numMarksByDepth[depth - 1];
148+
}
149+
scoresByDepth[depth] = previousDepthScore * numMatchingMarksByDepth[depth];
96150
}
97-
return 1 - (weightedDifference / totalWeight);
151+
int totalMarks = numMarksByDepth.Values.Sum();
152+
double totalScore = scoresByDepth.Values.Sum();
153+
154+
if (totalMarks == 0)
155+
return 0;
156+
return totalScore / totalMarks;
98157
}
99158

100159
private bool DepthAndDirectionObserved(int depth, QuotationMarkDirection direction)

src/SIL.Machine/PunctuationAnalysis/QuoteConvention.cs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,14 @@ out char quote
4242
: ClosingQuotationMark;
4343
return new SingleLevelQuoteConvention(normalizedOpeningQuotationMark, normalizedClosingQuotationMark);
4444
}
45+
46+
/// <summary>
/// Combines the hash codes of the opening and closing quotation marks.
/// </summary>
/// <returns>A hash code derived from both marks, in opening-then-closing order.</returns>
// NOTE(review): a matching Equals override is not visible in this view - confirm one exists.
public override int GetHashCode()
{
    // Wrap-around is intended for hash mixing; without `unchecked` this would throw
    // OverflowException when the assembly is compiled with overflow checking enabled.
    unchecked
    {
        int hashCode = 23;
        hashCode = hashCode * 31 + OpeningQuotationMark.GetHashCode();
        hashCode = hashCode * 31 + ClosingQuotationMark.GetHashCode();
        return hashCode;
    }
}
4553
}
4654

4755
public class QuoteConvention
@@ -150,5 +158,15 @@ public QuoteConvention Normalize()
150158
{
151159
return new QuoteConvention(Name + "_normalized", LevelConventions.Select(l => l.Normalize()).ToList());
152160
}
161+
162+
/// <summary>
/// Combines the hash codes of all level conventions, in order. The convention's
/// <c>Name</c> does not contribute to the hash.
/// </summary>
/// <returns>A hash code derived from <c>LevelConventions</c>.</returns>
// NOTE(review): a matching Equals override is not visible in this view - confirm one exists.
public override int GetHashCode()
{
    // Wrap-around is intended for hash mixing; without `unchecked` this would throw
    // OverflowException when the assembly is compiled with overflow checking enabled.
    unchecked
    {
        int hashCode = 23;
        foreach (SingleLevelQuoteConvention quoteConvention in LevelConventions)
        {
            hashCode = hashCode * 31 + quoteConvention.GetHashCode();
        }
        return hashCode;
    }
}
153171
}
154172
}
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
using System.Collections.Generic;
2+
using System.Linq;
3+
using SIL.Extensions;
4+
5+
namespace SIL.Machine.PunctuationAnalysis
6+
{
7+
/// <summary>
/// Result of scoring quote conventions against a set of tabulated quotation marks:
/// the per-convention scores, the best-scoring convention, the tabulator the scores
/// were computed from, and a weight used when several analyses are combined.
/// </summary>
public class QuoteConventionAnalysis
{
    /// <summary>The highest-scoring convention, or null when no scores were supplied.</summary>
    public QuoteConvention BestQuoteConvention { get; private set; }

    /// <summary>The score of <see cref="BestQuoteConvention"/>, or 0 when no scores were supplied.</summary>
    public double BestQuoteConventionScore { get; private set; }

    /// <summary>
    /// Summary message produced by the tabulated quotation marks, or null when no tabulator
    /// was supplied. Derived on demand: previously this property was declared with a private
    /// setter that was never assigned, so it always returned null.
    /// </summary>
    public string AnalysisSummary => TabulatedQuotationMarks?.GetSummaryMessage();

    /// <summary>Score recorded for each convention.</summary>
    public IReadOnlyDictionary<QuoteConvention, double> ConventionScores { get; private set; }

    /// <summary>The tabulator passed to the constructor.</summary>
    public QuotationMarkTabulator TabulatedQuotationMarks { get; private set; }

    /// <summary>Weight of this analysis when combined with others.</summary>
    public double AnalysisWeight { get; private set; }

    /// <summary>
    /// Creates an analysis from a set of convention scores.
    /// </summary>
    /// <param name="conventionScores">Score per convention; may be empty.</param>
    /// <param name="tabulatedQuotationMarks">The tabulator associated with the scores.</param>
    /// <param name="analysisWeight">Weight used by <see cref="CombineWithWeightedAverage"/>.</param>
    public QuoteConventionAnalysis(
        Dictionary<QuoteConvention, double> conventionScores,
        QuotationMarkTabulator tabulatedQuotationMarks,
        double analysisWeight = 1.0
    )
    {
        ConventionScores = conventionScores;
        if (ConventionScores.Count > 0)
        {
            KeyValuePair<QuoteConvention, double> maxKvp = ConventionScores.MaxBy(kvp => kvp.Value);
            (BestQuoteConvention, BestQuoteConventionScore) = (maxKvp.Key, maxKvp.Value);
        }
        else
        {
            BestQuoteConventionScore = 0;
            BestQuoteConvention = null;
        }
        TabulatedQuotationMarks = tabulatedQuotationMarks;
        AnalysisWeight = analysisWeight;
    }

    /// <summary>
    /// Collects convention scores and builds a <see cref="QuoteConventionAnalysis"/>
    /// weighted by the tabulator's total quotation mark count.
    /// </summary>
    public class Builder
    {
        /// <summary>Scores recorded so far.</summary>
        public Dictionary<QuoteConvention, double> ConventionScores { get; private set; }

        /// <summary>The tabulator the built analysis will reference.</summary>
        public QuotationMarkTabulator TabulatedQuotationMarks { get; private set; }

        /// <summary>Creates a builder with no recorded scores.</summary>
        /// <param name="tabulatedQuotationMarks">The tabulator the built analysis will reference.</param>
        public Builder(QuotationMarkTabulator tabulatedQuotationMarks)
        {
            ConventionScores = new Dictionary<QuoteConvention, double>();
            TabulatedQuotationMarks = tabulatedQuotationMarks;
        }

        /// <summary>Records (or overwrites) the score for a convention.</summary>
        public void RecordConventionScore(QuoteConvention quoteConvention, double score)
        {
            ConventionScores[quoteConvention] = score;
        }

        /// <summary>
        /// Builds the analysis, using the tabulator's total quotation mark count as its weight.
        /// </summary>
        public QuoteConventionAnalysis Build()
        {
            return new QuoteConventionAnalysis(
                ConventionScores,
                TabulatedQuotationMarks,
                TabulatedQuotationMarks.GetTotalQuotationMarkCount()
            );
        }
    }

    /// <summary>
    /// Combines several analyses into one. Each convention's combined score is the sum of
    /// (score * analysis weight) over all analyses divided by the sum of all analysis weights;
    /// the tabulated quotation marks of all analyses are merged into a new tabulator.
    /// </summary>
    /// <param name="quoteConventionAnalyses">The analyses to combine.</param>
    /// <returns>
    /// A new analysis containing only conventions whose weighted score sum is positive.
    /// </returns>
    public static QuoteConventionAnalysis CombineWithWeightedAverage(
        List<QuoteConventionAnalysis> quoteConventionAnalyses
    )
    {
        double totalWeight = 0;
        Dictionary<string, double> conventionVotes = new Dictionary<string, double>();
        Dictionary<string, QuoteConvention> quoteConventionsByName = new Dictionary<string, QuoteConvention>();
        QuotationMarkTabulator totalTabulatedQuotationMarks = new QuotationMarkTabulator();
        foreach (QuoteConventionAnalysis quoteConventionAnalysis in quoteConventionAnalyses)
        {
            totalTabulatedQuotationMarks.TabulateFrom(quoteConventionAnalysis.TabulatedQuotationMarks);
            totalWeight += quoteConventionAnalysis.AnalysisWeight;
            foreach (
                (QuoteConvention convention, double score) in quoteConventionAnalysis.ConventionScores.Select(kvp =>
                    (kvp.Key, kvp.Value)
                )
            )
            {
                // Votes are keyed by convention name; the last convention seen with a given
                // name is the one reported in the combined result.
                quoteConventionsByName[convention.Name] = convention;
                conventionVotes.UpdateValue(
                    convention.Name,
                    () => 0,
                    s => s + score * quoteConventionAnalysis.AnalysisWeight
                );
            }
        }
        QuoteConventionAnalysis.Builder builder = new QuoteConventionAnalysis.Builder(totalTabulatedQuotationMarks);
        foreach ((string conventionName, double totalScore) in conventionVotes.Select(kvp => (kvp.Key, kvp.Value)))
        {
            // A positive vote total implies a positive weight was added, so totalWeight is
            // non-zero whenever this division runs.
            if (totalScore > 0)
            {
                builder.RecordConventionScore(quoteConventionsByName[conventionName], totalScore / totalWeight);
            }
        }
        return builder.Build();
    }
}
100+
}

src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs

Lines changed: 1 addition & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,6 @@
33

44
namespace SIL.Machine.PunctuationAnalysis
55
{
6-
public class QuoteConventionAnalysis
7-
{
8-
public QuoteConvention BestQuoteConvention { get; private set; }
9-
public double BestQuoteConventionScore { get; private set; }
10-
public string AnalysisSummary { get; private set; }
11-
12-
public QuoteConventionAnalysis(
13-
QuoteConvention bestQuoteConvention,
14-
double bestQuoteConventionScore,
15-
string analysisSummary
16-
)
17-
{
18-
BestQuoteConvention = bestQuoteConvention;
19-
BestQuoteConventionScore = bestQuoteConventionScore;
20-
AnalysisSummary = analysisSummary;
21-
}
22-
}
23-
246
public class QuoteConventionDetector : UsfmStructureExtractor
257
{
268
private readonly QuotationMarkTabulator _quotationMarkTabulator;
@@ -60,19 +42,7 @@ public QuoteConventionAnalysis DetectQuoteConvention(IReadOnlyDictionary<int, Li
6042
{
6143
CountQuotationMarksInChapters(GetChapters(includeChapters));
6244

63-
(QuoteConvention bestQuoteConvention, double score) = QuoteConventions.Standard.FindMostSimilarConvention(
64-
_quotationMarkTabulator
65-
);
66-
67-
if (score > 0 && bestQuoteConvention != null)
68-
{
69-
return new QuoteConventionAnalysis(
70-
bestQuoteConvention,
71-
score,
72-
_quotationMarkTabulator.GetSummaryMessage()
73-
);
74-
}
75-
return null;
45+
return QuoteConventions.Standard.ScoreAllQuoteConventions(_quotationMarkTabulator);
7646
}
7747
}
7848
}

src/SIL.Machine/PunctuationAnalysis/QuoteConventionSet.cs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,5 +230,16 @@ QuotationMarkTabulator tabulatedQuotationMarks
230230
}
231231
return (bestQuoteConvention, bestSimilarity);
232232
}
233+
234+
/// <summary>
/// Scores every convention in this set against the tabulated quotation marks and
/// returns the resulting analysis.
/// </summary>
/// <param name="tabulatedQuotationMarks">The tabulated marks each convention is compared with.</param>
/// <returns>An analysis holding one similarity score per convention in this set.</returns>
public QuoteConventionAnalysis ScoreAllQuoteConventions(QuotationMarkTabulator tabulatedQuotationMarks)
{
    var analysisBuilder = new QuoteConventionAnalysis.Builder(tabulatedQuotationMarks);
    foreach (QuoteConvention candidate in Conventions)
    {
        analysisBuilder.RecordConventionScore(
            candidate,
            tabulatedQuotationMarks.CalculateSimilarity(candidate)
        );
    }
    return analysisBuilder.Build();
}
233244
}
234245
}

src/SIL.Machine/PunctuationAnalysis/StandardQuoteConventions.cs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,15 @@ public static class QuoteConventions
215215
new SingleLevelQuoteConvention("\u2019", "\u2018"),
216216
}
217217
),
218+
new QuoteConvention(
219+
"arabic_inspired_western_european",
220+
new List<SingleLevelQuoteConvention>
221+
{
222+
new SingleLevelQuoteConvention("\u00ab", "\u00bb"),
223+
new SingleLevelQuoteConvention("\u201d", "\u201c"),
224+
new SingleLevelQuoteConvention("\u2019", "\u2018"),
225+
}
226+
),
218227
}
219228
);
220229
}

0 commit comments

Comments
 (0)