Skip to content

Commit 333171a

Browse files
authored
Filter quote convention analysis by books/chapter (#349)
* Port sillsdev/machine.py#236
* Add an overload to take a chapter-by-id dict for convenience in Serval
1 parent 6f7d447 commit 333171a

File tree

8 files changed

+266
-24
lines changed

8 files changed

+266
-24
lines changed

src/SIL.Machine/PunctuationAnalysis/ParatextProjectQuoteConventionDetector.cs

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
using System;
2+
using System.Collections.Generic;
23
using System.IO;
4+
using System.Linq;
35
using System.Text;
46
using SIL.Machine.Corpora;
7+
using SIL.Scripture;
58

69
namespace SIL.Machine.PunctuationAnalysis
710
{
@@ -20,10 +23,38 @@ protected ParatextProjectQuoteConventionDetector(ParatextProjectSettingsParserBa
2023
}
2124

2225
/// <summary>
/// Runs the quote convention analysis without any book or chapter filter.
/// </summary>
/// <param name="handler">Detector forwarded unchanged to the filtering overload; may be null.</param>
/// <returns>The result returned by the filtering overload.</returns>
public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetector handler = null)
26+
{
27+
// A typed local (rather than a bare null literal) is passed so the call binds to the
// int-keyed overload; a null filter means "do not filter".
Dictionary<int, List<int>> includeChapters = null;
28+
return GetQuoteConventionAnalysis(handler, includeChapters);
29+
}
30+
31+
/// <summary>
/// Convenience overload that accepts the chapter filter keyed by book ID (for example "MAT")
/// and forwards it, re-keyed by book number, to the int-keyed overload.
/// </summary>
/// <param name="handler">Detector forwarded unchanged to the int-keyed overload; may be null.</param>
/// <param name="includeChapters">
/// Chapters to include per book ID, or null to apply no filter. Each key is converted with
/// <c>Canon.BookIdToNumber</c>.
/// </param>
/// <returns>The result returned by the int-keyed overload.</returns>
public QuoteConventionAnalysis GetQuoteConventionAnalysis(
32+
QuoteConventionDetector handler = null,
33+
IReadOnlyDictionary<string, List<int>> includeChapters = null
34+
)
35+
{
36+
return GetQuoteConventionAnalysis(
37+
handler,
38+
// includeChapters defaults to null, and Enumerable.ToDictionary throws ArgumentNullException
// on a null source. The null-conditional call passes null through as "no filter" instead.
includeChapters?.ToDictionary(kvp => Canon.BookIdToNumber(kvp.Key), kvp => kvp.Value)
39+
);
40+
}
41+
42+
public QuoteConventionAnalysis GetQuoteConventionAnalysis(
43+
QuoteConventionDetector handler = null,
44+
IReadOnlyDictionary<int, List<int>> includeChapters = null
45+
)
2346
{
2447
handler = handler ?? new QuoteConventionDetector();
25-
foreach (string fileName in _settings.GetAllScriptureBookFileNames())
48+
foreach (
49+
string bookId in Canon
50+
.AllBookNumbers.Where(num => Canon.IsCanonical(num))
51+
.Select(num => Canon.BookNumberToId(num))
52+
)
2653
{
54+
if (includeChapters != null && !includeChapters.ContainsKey(Canon.BookIdToNumber(bookId)))
55+
continue;
56+
57+
string fileName = _settings.GetBookFileName(bookId);
2758
if (!Exists(fileName))
2859
continue;
2960

@@ -47,7 +78,7 @@ public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetecto
4778
throw new InvalidOperationException(sb.ToString(), ex);
4879
}
4980
}
50-
return handler.DetectQuotationConvention();
81+
return handler.DetectQuoteConvention(includeChapters);
5182
}
5283

5384
protected abstract bool Exists(string fileName);

src/SIL.Machine/PunctuationAnalysis/QuoteConventionDetector.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,9 @@ private void CountQuotationMarksInChapter(Chapter chapter, QuoteConventionSet po
5656
_quotationMarkTabulator.Tabulate(resolvedQuotationMarks);
5757
}
5858

59-
public QuoteConventionAnalysis DetectQuotationConvention()
59+
public QuoteConventionAnalysis DetectQuoteConvention(IReadOnlyDictionary<int, List<int>> includeChapters = null)
6060
{
61-
CountQuotationMarksInChapters(GetChapters());
61+
CountQuotationMarksInChapters(GetChapters(includeChapters));
6262

6363
(QuoteConvention bestQuoteConvention, double score) = QuoteConventions.Standard.FindMostSimilarConvention(
6464
_quotationMarkTabulator

src/SIL.Machine/PunctuationAnalysis/TextSegment.cs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ public string Text
1212
get => _surrogatePairString.ToString();
1313
private set => _surrogatePairString = new SurrogatePairString(value);
1414
}
15+
public string Book { get; private set; }
16+
public int Chapter { get; private set; }
1517
public UsfmMarkerType ImmediatePrecedingMarker { get; private set; }
1618
public HashSet<UsfmMarkerType> MarkersInPrecedingContext { get; private set; }
1719
public TextSegment PreviousSegment { get; set; }
@@ -139,6 +141,18 @@ public Builder AddPrecedingMarker(UsfmMarkerType marker)
139141
return this;
140142
}
141143

144+
/// <summary>
/// Stores the given book code on the text segment being built.
/// </summary>
/// <param name="code">Book code to assign to the segment's <c>Book</c> property.</param>
/// <returns>This builder, so calls can be chained.</returns>
public Builder SetBook(string code)
145+
{
146+
_textSegment.Book = code;
147+
return this;
148+
}
149+
150+
/// <summary>
/// Stores the given chapter number on the text segment being built.
/// </summary>
/// <param name="number">Chapter number to assign to the segment's <c>Chapter</c> property.</param>
/// <returns>This builder, so calls can be chained.</returns>
public Builder SetChapter(int number)
151+
{
152+
_textSegment.Chapter = number;
153+
return this;
154+
}
155+
142156
public Builder SetUsfmToken(UsfmToken token)
143157
{
144158
_textSegment.UsfmToken = token;

src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using System.Collections.Generic;
22
using SIL.Machine.Corpora;
3+
using SIL.Scripture;
34

45
namespace SIL.Machine.PunctuationAnalysis
56
{
@@ -14,9 +15,15 @@ public UsfmStructureExtractor()
1415
_nextTextSegmentBuilder = new TextSegment.Builder();
1516
}
1617

18+
/// <summary>
/// Parser callback for the start of a book: records the book code on the builder for the
/// next text segment. The <paramref name="state"/> and <paramref name="marker"/> arguments
/// are not used here.
/// </summary>
public void StartBook(UsfmParserState state, string marker, string code)
19+
{
20+
_nextTextSegmentBuilder.SetBook(code);
21+
}
22+
1723
/// <summary>
/// Parser callback for a chapter marker: flags the next text segment as preceded by a
/// chapter marker and records the chapter number on it.
/// </summary>
public void Chapter(UsfmParserState state, string number, string marker, string altNumber, string pubNumber)
1824
{
1925
_nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Chapter);
26+
// The chapter number is read from the parser state's verse reference, not from the
// string "number" argument.
_nextTextSegmentBuilder.SetChapter(state.VerseRef.ChapterNum);
2027
}
2128

2229
public void EndBook(UsfmParserState state, string marker) { }
@@ -65,8 +72,6 @@ public void Ref(UsfmParserState state, string marker, string display, string tar
6572
_nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Embed);
6673
}
6774

68-
public void StartBook(UsfmParserState state, string marker, string code) { }
69-
7075
public void StartCell(UsfmParserState state, string marker, string align, int colspan) { }
7176

7277
public void StartChar(
@@ -127,13 +132,26 @@ public void Verse(UsfmParserState state, string number, string marker, string al
127132
_nextTextSegmentBuilder.AddPrecedingMarker(UsfmMarkerType.Verse);
128133
}
129134

130-
public List<Chapter> GetChapters()
135+
public List<Chapter> GetChapters(IReadOnlyDictionary<int, List<int>> includeChapters = null)
131136
{
132137
var chapters = new List<Chapter>();
138+
int currentBook = 0;
139+
int currentChapter = 0;
133140
var currentChapterVerses = new List<Verse>();
134141
var currentVerseSegments = new List<TextSegment>();
135142
foreach (TextSegment textSegment in _textSegments)
136143
{
144+
if (textSegment.Book != null)
145+
currentBook = Canon.BookIdToNumber(textSegment.Book);
146+
if (textSegment.Chapter > 0)
147+
currentChapter = textSegment.Chapter;
148+
if (includeChapters != null && currentBook > 0)
149+
{
150+
if (!includeChapters.TryGetValue(currentBook, out List<int> bookChapters))
151+
continue;
152+
if (currentChapter > 0 && bookChapters.Count > 0 && !bookChapters.Contains(currentChapter))
153+
continue;
154+
}
137155
if (textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse))
138156
{
139157
if (currentVerseSegments.Count > 0)

tests/SIL.Machine.Tests/Corpora/ParatextProjectQuoteConvetionDetectorTests.cs

Lines changed: 129 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ namespace SIL.Machine.Corpora;
88
[TestFixture]
99
public class ParatextProjectQuoteConventionDetectorTests
1010
{
11+
private static readonly QuoteConvention StandardEnglishQuoteConvention =
12+
QuoteConventions.Standard.GetQuoteConventionByName("standard_english");
13+
private static readonly QuoteConvention StandardFrenchQuoteConvention =
14+
QuoteConventions.Standard.GetQuoteConventionByName("standard_french");
15+
1116
[Test]
1217
public void TestGetQuotationAnalysis()
1318
{
@@ -16,18 +21,9 @@ public void TestGetQuotationAnalysis()
1621
{
1722
{
1823
"41MATTest.SFM",
19-
@"\id MAT
20-
\c 1
21-
\v 1 Someone said, “This is something I am saying!
22-
\v 2 This is also something I am saying” (that is, “something I am speaking”).
23-
\p
24-
\v 3 Other text, and someone else said,
25-
\q1
26-
\v 4 “Things
27-
\q2 someone else said!
28-
\q3 and more things someone else said.”
29-
\m That is why he said “things someone else said.”
30-
\v 5 Then someone said, “More things someone said.”"
24+
$@"\id MAT
25+
{GetTestChapter(1, StandardEnglishQuoteConvention)}
26+
"
3127
}
3228
}
3329
);
@@ -37,6 +33,100 @@ public void TestGetQuotationAnalysis()
3733
Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_english"));
3834
}
3935

36+
// Two books with different conventions (MAT: standard English, MRK: standard French).
// Restricting the analysis to "MRK" must detect the French convention only.
[Test]
37+
public void TestGetQuotationByBook()
38+
{
39+
var env = new TestEnvironment(
40+
files: new Dictionary<string, string>()
41+
{
42+
{
43+
"41MATTest.SFM",
44+
$@"\id MAT
45+
{GetTestChapter(1, StandardEnglishQuoteConvention)}
46+
"
47+
},
48+
{
49+
"42MRKTest.SFM",
50+
$@"\id MRK
51+
{GetTestChapter(1, StandardFrenchQuoteConvention)}
52+
"
53+
}
54+
}
55+
);
56+
QuoteConventionAnalysis analysis = env.GetQuoteConvention("MRK");
57+
Assert.That(analysis, Is.Not.Null);
58+
Assert.That(analysis.BestQuoteConventionScore, Is.GreaterThan(0.8));
59+
Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_french"));
60+
}
61+
62+
// MRK mixes conventions by chapter (1, 3, 4 English; 2, 5 French). The range "MRK2,4-5" is
// presumably parsed as chapters 2, 4 and 5 (two French, one English), so French is expected
// to win with a score above 0.66.
[Test]
63+
public void TestGetQuotationConventionByChapter()
64+
{
65+
var env = new TestEnvironment(
66+
files: new Dictionary<string, string>()
67+
{
68+
{
69+
"41MATTest.SFM",
70+
$@"\id MAT
71+
{GetTestChapter(1, StandardEnglishQuoteConvention)}
72+
"
73+
},
74+
{
75+
"42MRKTest.SFM",
76+
$@"\id MRK
77+
{GetTestChapter(1, StandardEnglishQuoteConvention)}
78+
{GetTestChapter(2, StandardFrenchQuoteConvention)}
79+
{GetTestChapter(3, StandardEnglishQuoteConvention)}
80+
{GetTestChapter(4, StandardEnglishQuoteConvention)}
81+
{GetTestChapter(5, StandardFrenchQuoteConvention)}
82+
"
83+
}
84+
}
85+
);
86+
QuoteConventionAnalysis analysis = env.GetQuoteConvention("MRK2,4-5");
87+
Assert.That(analysis, Is.Not.Null);
88+
Assert.That(analysis.BestQuoteConventionScore, Is.GreaterThan(0.66));
89+
Assert.That(analysis.BestQuoteConvention.Name, Is.EqualTo("standard_french"));
90+
}
91+
92+
// Only chapter 2 of MAT contains quotation marks (chapters 1 and 3 are generated with no
// convention, i.e. empty quote strings). Selecting "MAT1,3" leaves nothing to analyze, so
// the analysis is expected to be null.
[Test]
93+
public void TestGetQuotationConventionByChapterIndeterminate()
94+
{
95+
var env = new TestEnvironment(
96+
files: new Dictionary<string, string>()
97+
{
98+
{
99+
"41MATTest.SFM",
100+
$@"\id MAT
101+
{GetTestChapter(1)}
102+
{GetTestChapter(2, StandardEnglishQuoteConvention)}
103+
{GetTestChapter(3)}
104+
"
105+
}
106+
}
107+
);
108+
QuoteConventionAnalysis analysis = env.GetQuoteConvention("MAT1,3");
109+
Assert.That(analysis, Is.Null);
110+
}
111+
112+
// The file is named like the MAT book file, but its \id line declares LUK. Requesting "MAT"
// is expected to yield no analysis (null) because the content's book code does not match.
[Test]
113+
public void TestGetQuotationConventionInvalidBookCode()
114+
{
115+
var env = new TestEnvironment(
116+
files: new Dictionary<string, string>()
117+
{
118+
{
119+
"41MATTest.SFM",
120+
$@"\id LUK
121+
{GetTestChapter(1, StandardEnglishQuoteConvention)}
122+
"
123+
}
124+
}
125+
);
126+
QuoteConventionAnalysis analysis = env.GetQuoteConvention("MAT");
127+
Assert.That(analysis, Is.Null);
128+
}
129+
40130
private class TestEnvironment(ParatextProjectSettings? settings = null, Dictionary<string, string>? files = null)
41131
{
42132
public ParatextProjectQuoteConventionDetector Detector { get; } =
@@ -45,12 +135,37 @@ private class TestEnvironment(ParatextProjectSettings? settings = null, Dictiona
45135
files ?? new()
46136
);
47137

48-
public QuoteConventionAnalysis GetQuoteConvention()
138+
public QuoteConventionAnalysis GetQuoteConvention(string? scriptureRange = null)
49139
{
50-
return Detector.GetQuoteConventionAnalysis();
140+
Dictionary<int, List<int>>? chapters = null;
141+
if (scriptureRange != null)
142+
{
143+
chapters = ScriptureRangeParser
144+
.GetChapters(scriptureRange)
145+
.ToDictionary(kvp => Canon.BookIdToNumber(kvp.Key), kvp => kvp.Value);
146+
}
147+
return Detector.GetQuoteConventionAnalysis(includeChapters: chapters);
51148
}
52149
}
53150

151+
// Builds the USFM text of one five-verse test chapter numbered `number`. Quotation marks are
// the depth-1 opening/closing marks of `quoteConvention`; when it is null, both are empty
// strings, so the chapter contains no quotation marks at all.
private static string GetTestChapter(int number, QuoteConvention? quoteConvention = null)
152+
{
153+
string leftQuote = quoteConvention != null ? quoteConvention.GetOpeningQuotationMarkAtDepth(1) : "";
154+
string rightQuote = quoteConvention != null ? quoteConvention.GetClosingQuotationMarkAtDepth(1) : "";
155+
return $@"\c {number}
156+
\v 1 Someone said, {leftQuote}This is something I am saying!
157+
\v 2 This is also something I am saying{rightQuote} (that is, {leftQuote}something I am speaking{rightQuote}).
158+
\p
159+
\v 3 Other text, and someone else said,
160+
\q1
161+
\v 4 {leftQuote}Things
162+
\q2 someone else said!
163+
\q3 and more things someone else said.{rightQuote}
164+
\m That is why he said {leftQuote}things someone else said.{rightQuote}
165+
\v 5 Then someone said, {leftQuote}More things someone said.{rightQuote}
166+
";
167+
}
168+
54169
private class DefaultParatextProjectSettings(
55170
string name = "Test",
56171
string fullName = "TestProject",

tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,8 +186,8 @@ public void AnalyzeCorporaQuoteConventions()
186186
var quoteConventionDetector2 = new ZipParatextProjectQuoteConventionDetector(zipArchive2);
187187
quoteConventionDetector2.GetQuoteConventionAnalysis(targetHandler);
188188

189-
QuoteConventionAnalysis sourceAnalysis = sourceHandler.DetectQuotationConvention();
190-
QuoteConventionAnalysis targetAnalysis = targetHandler.DetectQuotationConvention();
189+
QuoteConventionAnalysis sourceAnalysis = sourceHandler.DetectQuoteConvention();
190+
QuoteConventionAnalysis targetAnalysis = targetHandler.DetectQuoteConvention();
191191

192192
Assert.Multiple(() =>
193193
{

tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationConventionDetectorTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,6 @@ public QuoteConventionAnalysis DetectQuotationConvention(string usfm)
368368
{
369369
var quoteConventionDetector = new QuoteConventionDetector();
370370
UsfmParser.Parse(usfm, quoteConventionDetector);
371-
return quoteConventionDetector.DetectQuotationConvention();
371+
return quoteConventionDetector.DetectQuoteConvention();
372372
}
373373
}

0 commit comments

Comments
 (0)