Skip to content

Commit 2670829

Browse files
authored
Port quotation denormalization (#316)
* Port quotation denormalization
* Add Paratext zip quotation convention detector
* Fix remark adding when textBehavior is PreferExisting
* Port add metadata to update block and marker behavior metadata
* Move PunctuationAnalysis out of Corpora
1 parent 66ce53a commit 2670829

File tree

62 files changed

+16792
-400
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+16792
-400
lines changed
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
using System.Collections.Generic;
2+
using System.Linq;
3+
using SIL.Machine.PunctuationAnalysis;
4+
5+
namespace SIL.Machine.Corpora
6+
{
7+
/// <summary>
/// An <see cref="IQuotationMarkResolver"/> that resolves each quotation mark match on its own, using
/// only the supplied settings, the match's immediate context, and the previously resolved mark.
/// Every resolved mark is assigned the smallest depth the settings allow for its direction.
/// </summary>
public class FallbackQuotationMarkResolver : IQuotationMarkResolver
{
    private readonly IQuotationMarkResolutionSettings _settings;

    /// <summary>
    /// The most recently resolved quotation mark, or <c>null</c> when nothing has been resolved since
    /// construction or the last <see cref="Reset"/>.
    /// </summary>
    public QuotationMarkMetadata LastQuotationMark { get; set; }

    /// <summary>The distinct issues recorded while resolving quotation marks.</summary>
    public HashSet<QuotationMarkResolutionIssue> Issues { get; }

    /// <summary>Creates a resolver that consults <paramref name="settings"/> for validity and depths.</summary>
    /// <param name="settings">The resolution settings used by every check in this class.</param>
    public FallbackQuotationMarkResolver(IQuotationMarkResolutionSettings settings)
    {
        _settings = settings;
        LastQuotationMark = null;
        Issues = new HashSet<QuotationMarkResolutionIssue>();
    }

    /// <summary>Forgets the last resolved mark and clears all recorded issues.</summary>
    public void Reset()
    {
        LastQuotationMark = null;
        Issues.Clear();
    }

    /// <summary>
    /// Resolves the given matches in order. This is an iterator: matches are resolved, and
    /// <see cref="Issues"/> and <see cref="LastQuotationMark"/> are updated, only as the returned
    /// sequence is enumerated.
    /// </summary>
    /// <param name="quotationMarkMatches">The matches to resolve, in document order.</param>
    /// <returns>The successfully resolved quotation marks.</returns>
    public IEnumerable<QuotationMarkMetadata> ResolveQuotationMarks(
        IReadOnlyList<QuotationMarkStringMatch> quotationMarkMatches
    )
    {
        for (int index = 0; index < quotationMarkMatches.Count; index++)
        {
            foreach (QuotationMarkMetadata resolved in ResolveQuotationMark(quotationMarkMatches[index]))
            {
                yield return resolved;
            }
        }
    }

    /// <summary>
    /// Resolves a single match, yielding at most one quotation mark.
    /// </summary>
    /// <remarks>
    /// A match classified as opening or closing that cannot be resolved records
    /// <see cref="QuotationMarkResolutionIssue.UnexpectedQuotationMark"/>. A match that is neither is
    /// resolved by guessing its direction from <see cref="LastQuotationMark"/> and always records
    /// <see cref="QuotationMarkResolutionIssue.AmbiguousQuotationMark"/>.
    /// </remarks>
    /// <param name="quotationMarkMatch">The match to resolve.</param>
    /// <returns>A lazily evaluated sequence containing the resolved mark, or nothing.</returns>
    public IEnumerable<QuotationMarkMetadata> ResolveQuotationMark(QuotationMarkStringMatch quotationMarkMatch)
    {
        QuotationMarkMetadata resolved;

        if (IsOpeningQuotationMark(quotationMarkMatch))
        {
            resolved = ResolveOpeningMark(quotationMarkMatch);
            if (resolved == null)
            {
                Issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark);
            }
            else
            {
                yield return resolved;
            }
            yield break;
        }

        if (IsClosingQuotationMark(quotationMarkMatch))
        {
            resolved = ResolveClosingMark(quotationMarkMatch);
            if (resolved == null)
            {
                Issues.Add(QuotationMarkResolutionIssue.UnexpectedQuotationMark);
            }
            else
            {
                yield return resolved;
            }
            yield break;
        }

        // Make a reasonable guess about the direction of the quotation mark: open a quote when
        // nothing is known or the previous mark closed one, otherwise close.
        bool guessOpening =
            LastQuotationMark == null || LastQuotationMark.Direction == QuotationMarkDirection.Closing;
        resolved = guessOpening
            ? ResolveOpeningMark(quotationMarkMatch)
            : ResolveClosingMark(quotationMarkMatch);
        if (resolved != null)
        {
            yield return resolved;
        }

        // Recorded after the yield, so it is only added once the caller continues enumerating.
        Issues.Add(QuotationMarkResolutionIssue.AmbiguousQuotationMark);
    }

    /// <summary>
    /// Decides whether <paramref name="match"/> should be treated as an opening quotation mark.
    /// </summary>
    /// <param name="match">The match to classify.</param>
    /// <returns>
    /// <c>true</c> if the mark is valid only as an opening mark, or if it is valid in both directions
    /// and its surrounding context looks like the start of a quotation; otherwise <c>false</c>.
    /// </returns>
    public bool IsOpeningQuotationMark(QuotationMarkStringMatch match)
    {
        bool validInBothDirections =
            _settings.IsValidOpeningQuotationMark(match) && _settings.IsValidClosingQuotationMark(match);
        if (!validInBothDirections)
        {
            return _settings.IsValidOpeningQuotationMark(match);
        }

        bool hasOpeningLeadingContext =
            match.IsAtStartOfSegment
            || match.HasLeadingWhitespace()
            || DoesMostRecentOpeningMarkImmediatelyPrecede(match)
            || match.HasQuoteIntroducerInLeadingSubstring();
        if (!hasOpeningLeadingContext)
        {
            return false;
        }

        return !match.HasTrailingWhitespace() && !match.HasTrailingPunctuation();
    }

    /// <summary>
    /// Checks whether the last resolved mark is an opening mark in the same text segment that ends
    /// exactly where <paramref name="match"/> starts.
    /// </summary>
    /// <param name="match">The match to compare against <see cref="LastQuotationMark"/>.</param>
    public bool DoesMostRecentOpeningMarkImmediatelyPrecede(QuotationMarkStringMatch match)
    {
        QuotationMarkMetadata previous = LastQuotationMark;
        return previous != null
            && previous.Direction == QuotationMarkDirection.Opening
            && previous.TextSegment.Equals(match.TextSegment)
            && previous.EndIndex == match.StartIndex;
    }

    /// <summary>
    /// Decides whether <paramref name="match"/> should be treated as a closing quotation mark.
    /// </summary>
    /// <param name="match">The match to classify.</param>
    /// <returns>
    /// <c>true</c> if the mark is valid only as a closing mark, or if it is valid in both directions,
    /// is followed by whitespace, punctuation, or the end of the segment, and has no leading
    /// whitespace; otherwise <c>false</c>.
    /// </returns>
    public bool IsClosingQuotationMark(QuotationMarkStringMatch match)
    {
        bool validInBothDirections =
            _settings.IsValidOpeningQuotationMark(match) && _settings.IsValidClosingQuotationMark(match);
        if (!validInBothDirections)
        {
            return _settings.IsValidClosingQuotationMark(match);
        }

        bool hasClosingTrailingContext =
            match.HasTrailingWhitespace() || match.HasTrailingPunctuation() || match.IsAtEndOfSegment;
        if (!hasClosingTrailingContext)
        {
            return false;
        }

        return !match.HasLeadingWhitespace();
    }

    /// <summary>
    /// Resolves <paramref name="quotationMarkMatch"/> as an opening mark at the smallest possible
    /// depth and remembers it as <see cref="LastQuotationMark"/>.
    /// </summary>
    /// <returns>The resolved mark, or <c>null</c> if the settings allow no opening depth for it.</returns>
    public QuotationMarkMetadata ResolveOpeningMark(QuotationMarkStringMatch quotationMarkMatch)
    {
        return ResolveMark(quotationMarkMatch, QuotationMarkDirection.Opening);
    }

    /// <summary>
    /// Resolves <paramref name="quotationMarkMatch"/> as a closing mark at the smallest possible
    /// depth and remembers it as <see cref="LastQuotationMark"/>.
    /// </summary>
    /// <returns>The resolved mark, or <c>null</c> if the settings allow no closing depth for it.</returns>
    public QuotationMarkMetadata ResolveClosingMark(QuotationMarkStringMatch quotationMarkMatch)
    {
        return ResolveMark(quotationMarkMatch, QuotationMarkDirection.Closing);
    }

    /// <summary>Returns the same set instance exposed by <see cref="Issues"/>.</summary>
    public HashSet<QuotationMarkResolutionIssue> GetIssues()
    {
        return Issues;
    }

    // Shared implementation of ResolveOpeningMark/ResolveClosingMark: picks the minimum depth the
    // settings allow for the given direction, or returns null (leaving LastQuotationMark untouched)
    // when there is none.
    private QuotationMarkMetadata ResolveMark(QuotationMarkStringMatch match, QuotationMarkDirection direction)
    {
        HashSet<int> candidateDepths = _settings.GetPossibleDepths(match.QuotationMark, direction);
        if (candidateDepths.Count == 0)
        {
            return null;
        }

        QuotationMarkMetadata resolved = match.Resolve(candidateDepths.Min(), direction);
        LastQuotationMark = resolved;
        return resolved;
    }
}
167+
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
using System;
2+
using System.IO;
3+
using System.Text;
4+
using SIL.Machine.PunctuationAnalysis;
5+
6+
namespace SIL.Machine.Corpora
7+
{
8+
/// <summary>
/// Base class for detecting the quote convention used by a Paratext project. Subclasses supply
/// access to the project's files through <see cref="Exists"/> and <see cref="Open"/>.
/// </summary>
public abstract class ParatextProjectQuoteConventionDetector
{
    private readonly ParatextProjectSettings _settings;

    /// <summary>Creates a detector for a project with already-parsed settings.</summary>
    /// <param name="settings">The project settings.</param>
    protected ParatextProjectQuoteConventionDetector(ParatextProjectSettings settings)
    {
        _settings = settings;
    }

    /// <summary>Creates a detector whose settings are obtained by running the given parser.</summary>
    /// <param name="settingsParser">The parser used to produce the project settings.</param>
    protected ParatextProjectQuoteConventionDetector(ParatextProjectSettingsParserBase settingsParser)
    {
        _settings = settingsParser.Parse();
    }

    /// <summary>
    /// Parses every scripture book file that exists in the project with <paramref name="handler"/>
    /// and returns the handler's quote convention analysis.
    /// </summary>
    /// <param name="handler">
    /// The detector that receives the parsed USFM. When <c>null</c>, a new
    /// <see cref="QuoteConventionDetector"/> is created.
    /// </param>
    /// <returns>The result of <c>handler.DetectQuotationConvention()</c> after all books are parsed.</returns>
    /// <exception cref="InvalidOperationException">
    /// Parsing a book failed. The message names the file (and the project, when it has a name) and
    /// the original exception is preserved as the inner exception.
    /// </exception>
    public QuoteConventionAnalysis GetQuoteConventionAnalysis(QuoteConventionDetector handler = null)
    {
        handler = handler ?? new QuoteConventionDetector();
        foreach (string fileName in _settings.GetAllScriptureBookFileNames())
        {
            // Books that are not present in the project are simply skipped.
            if (!Exists(fileName))
                continue;

            string usfm;
            // NOTE(review): the text is decoded with StreamReader's default encoding; confirm that
            // this matches the encoding the project files are stored in.
            using (var reader = new StreamReader(Open(fileName)))
            {
                usfm = reader.ReadToEnd();
            }

            try
            {
                UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification);
            }
            catch (Exception ex)
            {
                // Wrap the failure so the caller can tell which file (and project) was being parsed.
                var sb = new StringBuilder();
                sb.Append($"An error occurred while parsing the usfm for '{fileName}'");
                if (!string.IsNullOrEmpty(_settings.Name))
                    sb.Append($" in project '{_settings.Name}'");
                sb.Append($". Error: '{ex.Message}'");
                throw new InvalidOperationException(sb.ToString(), ex);
            }
        }
        return handler.DetectQuotationConvention();
    }

    /// <summary>Returns whether the project contains a file with the given name.</summary>
    protected abstract bool Exists(string fileName);

    /// <summary>Opens the project file with the given name for reading.</summary>
    protected abstract Stream Open(string fileName);
}
56+
}

src/SIL.Machine/Corpora/ParatextProjectSettings.cs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using System.Globalization;
1+
using System.Collections.Generic;
2+
using System.Globalization;
23
using System.Text;
34
using SIL.Scripture;
45

@@ -103,6 +104,16 @@ public string GetBookFileName(string bookId)
103104
return FileNamePrefix + bookPart + FileNameSuffix;
104105
}
105106

107+
/// <summary>
/// Lazily enumerates the file name, as produced by <see cref="GetBookFileName"/>, for every book id
/// selected in <c>Canon.ScriptureBooks</c> after all of its books have been selected.
/// </summary>
/// <returns>One file name per selected book id, in the order the book set reports them.</returns>
public IEnumerable<string> GetAllScriptureBookFileNames()
{
    // NOTE(review): SelectAll() mutates the BookSet returned by Canon.ScriptureBooks; confirm that
    // the property hands out a fresh instance rather than a shared one.
    BookSet allBooks = Canon.ScriptureBooks;
    allBooks.SelectAll();
    foreach (string selectedId in allBooks.SelectedBookIds)
    {
        yield return GetBookFileName(selectedId);
    }
}
116+
106117
private static string GetBookFileNameDigits(string bookId)
107118
{
108119
int bookNum = Canon.BookIdToNumber(bookId);

src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ protected ParatextProjectTextUpdaterBase(ParatextProjectSettingsParserBase setti
2121

2222
public string UpdateUsfm(
2323
string bookId,
24-
IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> rows,
24+
IReadOnlyList<UpdateUsfmRow> rows,
2525
string fullName = null,
2626
UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting,
2727
UpdateUsfmMarkerBehavior paragraphBehavior = UpdateUsfmMarkerBehavior.Preserve,

src/SIL.Machine/Corpora/PlaceMarkersUsfmUpdateBlockHandler.cs

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,49 +8,60 @@ namespace SIL.Machine.Corpora
88
{
99
public class PlaceMarkersAlignmentInfo
1010
{
11-
public IReadOnlyList<string> Refs { get; }
11+
public const string MetadataKey = "alignment_info";
12+
1213
public IReadOnlyList<string> SourceTokens { get; }
1314
public IReadOnlyList<string> TranslationTokens { get; }
1415
public WordAlignmentMatrix Alignment { get; }
16+
public UpdateUsfmMarkerBehavior ParagraphBehavior { get; }
17+
public UpdateUsfmMarkerBehavior StyleBehavior { get; }
1518

1619
public PlaceMarkersAlignmentInfo(
17-
IReadOnlyList<string> refs,
1820
IReadOnlyList<string> sourceTokens,
1921
IReadOnlyList<string> translationTokens,
20-
WordAlignmentMatrix alignment
22+
WordAlignmentMatrix alignment,
23+
UpdateUsfmMarkerBehavior paragraphBehavior,
24+
UpdateUsfmMarkerBehavior styleBehavior
2125
)
2226
{
23-
Refs = refs;
2427
SourceTokens = sourceTokens;
2528
TranslationTokens = translationTokens;
2629
Alignment = alignment;
30+
ParagraphBehavior = paragraphBehavior;
31+
StyleBehavior = styleBehavior;
2732
}
2833
}
2934

3035
public class PlaceMarkersUsfmUpdateBlockHandler : IUsfmUpdateBlockHandler
3136
{
32-
private readonly IDictionary<string, PlaceMarkersAlignmentInfo> _alignmentInfo;
33-
34-
public PlaceMarkersUsfmUpdateBlockHandler(IEnumerable<PlaceMarkersAlignmentInfo> alignmentInfo)
35-
{
36-
_alignmentInfo = alignmentInfo.ToDictionary(info => info.Refs.First(), info => info);
37-
}
38-
3937
public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block)
4038
{
4139
string reference = block.Refs.FirstOrDefault().ToString();
4240
var elements = block.Elements.ToList();
4341

4442
// Nothing to do if there are no markers to place or no alignment to use
43+
if (!block.Metadata.TryGetValue(PlaceMarkersAlignmentInfo.MetadataKey, out object alignmentObject))
44+
{
45+
return block;
46+
}
47+
if (!(alignmentObject is PlaceMarkersAlignmentInfo alignmentInfo))
48+
{
49+
return block;
50+
}
4551
if (
4652
elements.Count == 0
47-
|| !_alignmentInfo.TryGetValue(reference, out PlaceMarkersAlignmentInfo alignmentInfo)
4853
|| alignmentInfo.Alignment.RowCount == 0
4954
|| alignmentInfo.Alignment.ColumnCount == 0
5055
|| !elements.Any(e =>
51-
e.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style)
52-
&& !e.MarkedForRemoval
53-
&& e.Tokens.Count == 1
56+
(
57+
e.Type == UsfmUpdateBlockElementType.Paragraph
58+
&& alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Preserve
59+
&& e.Tokens.Count == 1
60+
)
61+
|| (
62+
e.Type == UsfmUpdateBlockElementType.Style
63+
&& alignmentInfo.StyleBehavior == UpdateUsfmMarkerBehavior.Preserve
64+
)
5465
)
5566
)
5667
{
@@ -112,7 +123,13 @@ public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block)
112123
{
113124
if (element.Type == UsfmUpdateBlockElementType.Text)
114125
{
115-
if (element.MarkedForRemoval)
126+
if (
127+
element.MarkedForRemoval
128+
|| (
129+
element.Type == UsfmUpdateBlockElementType.Paragraph
130+
&& alignmentInfo.ParagraphBehavior == UpdateUsfmMarkerBehavior.Strip
131+
)
132+
)
116133
{
117134
string text = element.Tokens[0].ToUsfm();
118135
sourceSentence += text;
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
using SIL.Machine.PunctuationAnalysis;
2+
3+
namespace SIL.Machine.Corpora
4+
{
5+
/// <summary>
/// Convenience wrapper around <see cref="QuotationMarkUpdateFirstPass"/> so that users don't have to
/// know to normalize the source quote convention: the source convention is normalized here before
/// being handed to the base class.
/// </summary>
public class QuotationMarkDenormalizationFirstPass : QuotationMarkUpdateFirstPass
{
    /// <param name="sourceQuoteConvention">The source convention; its normalized form is passed on.</param>
    /// <param name="targetQuoteConvention">The target convention, passed on unchanged.</param>
    public QuotationMarkDenormalizationFirstPass(
        QuoteConvention sourceQuoteConvention,
        QuoteConvention targetQuoteConvention
    )
        : base(sourceQuoteConvention.Normalize(), targetQuoteConvention)
    {
    }
}
14+
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
using SIL.Machine.PunctuationAnalysis;
2+
3+
namespace SIL.Machine.Corpora
4+
{
5+
/// <summary>
/// Convenience wrapper around <see cref="QuoteConventionChangingUsfmUpdateBlockHandler"/> so that
/// users don't have to know to normalize the source quote convention: the source convention is
/// normalized here before being handed to the base class.
/// </summary>
public class QuotationMarkDenormalizationUsfmUpdateBlockHandler : QuoteConventionChangingUsfmUpdateBlockHandler
{
    /// <param name="sourceQuoteConvention">The source convention; its normalized form is passed on.</param>
    /// <param name="targetQuoteConvention">The target convention, passed on unchanged.</param>
    /// <param name="settings">
    /// Optional update settings. When <c>null</c>, a default-constructed
    /// <see cref="QuotationMarkUpdateSettings"/> is used.
    /// </param>
    public QuotationMarkDenormalizationUsfmUpdateBlockHandler(
        QuoteConvention sourceQuoteConvention,
        QuoteConvention targetQuoteConvention,
        QuotationMarkUpdateSettings settings = null
    )
        : base(
            sourceQuoteConvention.Normalize(),
            targetQuoteConvention,
            settings ?? new QuotationMarkUpdateSettings()
        )
    {
    }
}
19+
}

0 commit comments

Comments
 (0)