Skip to content

Commit 41dc5cb

Browse files
committed
Add detector classes and manual test
1 parent 6f7d447 commit 41dc5cb

File tree

5 files changed

+247
-0
lines changed

5 files changed

+247
-0
lines changed
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Text;
5+
6+
namespace SIL.Machine.Corpora
7+
{
8+
/// <summary>
/// Base class that runs a <see cref="UsfmVersificationMismatchDetector"/> over the scripture book
/// files of a Paratext project. Subclasses provide file access through <see cref="Exists(string)"/>
/// and <see cref="Open(string)"/>.
/// </summary>
public abstract class ParatextProjectVersificationMismatchDetector
{
    private readonly ParatextProjectSettings _settings;

    /// <summary>Creates a detector that uses the supplied project settings.</summary>
    /// <param name="settings">The settings of the project to check.</param>
    protected ParatextProjectVersificationMismatchDetector(ParatextProjectSettings settings)
    {
        _settings = settings;
    }

    /// <summary>Creates a detector whose settings are the result of <c>settingsParser.Parse()</c>.</summary>
    /// <param name="settingsParser">The parser used to obtain the project settings.</param>
    protected ParatextProjectVersificationMismatchDetector(ParatextProjectSettingsParserBase settingsParser)
    {
        _settings = settingsParser.Parse();
    }

    /// <summary>
    /// Parses every scripture book file of the project that exists and returns the mismatches
    /// collected by the handler.
    /// </summary>
    /// <param name="handler">
    /// The handler that receives the parse events. When <c>null</c>, a new
    /// <see cref="UsfmVersificationMismatchDetector"/> is created from the project's versification.
    /// </param>
    /// <returns>The handler's <c>Errors</c> list after all files have been parsed.</returns>
    /// <exception cref="InvalidOperationException">
    /// Parsing a file threw; the original exception is attached as the inner exception.
    /// </exception>
    public IReadOnlyList<UsfmVersificationMismatch> GetUsfmVersificationMismatches(
        UsfmVersificationMismatchDetector handler = null
    )
    {
        handler = handler ?? new UsfmVersificationMismatchDetector(_settings.Versification);
        foreach (string fileName in _settings.GetAllScriptureBookFileNames())
        {
            // Books that are not present in the project are skipped rather than reported.
            if (!Exists(fileName))
                continue;

            string usfm;
            using (var reader = new StreamReader(Open(fileName)))
            {
                usfm = reader.ReadToEnd();
            }

            try
            {
                UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification);
            }
            catch (Exception ex)
            {
                // Add the file (and project, when named) so the failure can be located.
                var sb = new StringBuilder();
                sb.Append($"An error occurred while parsing the usfm for '{fileName}'");
                if (!string.IsNullOrEmpty(_settings.Name))
                    sb.Append($" in project '{_settings.Name}'");
                sb.Append($". Error: '{ex.Message}'");
                throw new InvalidOperationException(sb.ToString(), ex);
            }
        }
        return handler.Errors;
    }

    /// <summary>Returns whether the project contains a file with the given name.</summary>
    protected abstract bool Exists(string fileName);

    /// <summary>Opens the project file with the given name for reading.</summary>
    protected abstract Stream Open(string fileName);
}
58+
}
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
using System.Collections.Generic;
2+
using System.Linq;
3+
using SIL.Scripture;
4+
5+
namespace SIL.Machine.Corpora
6+
{
7+
/// <summary>
/// Categories of versification mismatch.
/// NOTE(review): nothing visible in this file assigns these values yet, so the exact meaning of
/// each member follows from its name only - confirm once the type is actually set.
/// </summary>
public enum UsfmVersificationMismatchType
{
    MissingChapter = 0,
    ExtraChapter = 1,
    MissingVerse = 2,
    ExtraVerse = 3,
    MissingSegment = 4,
    ExtraSegment = 5,
}
16+
17+
/// <summary>
/// Pairs the chapter/verse/segment that was expected with the one actually found in a book.
/// </summary>
public class UsfmVersificationMismatch //TODO Better name
{
    public int BookNum { get; set; }
    public int ExpectedChapter { get; set; }
    public int ExpectedVerse { get; set; }
    public int ActualChapter { get; set; }
    public int ActualVerse { get; set; }

    public string ExpectedSegment { get; set; }
    public string ActualSegment { get; set; }
    public UsfmVersificationMismatchType Type { get; set; }

    /// <summary>
    /// Reports whether the expected and actual positions differ. Segments are only compared
    /// when an expected segment has been set.
    /// </summary>
    /// <returns><c>true</c> if the chapter, verse, or (when expected) segment differs.</returns>
    public bool IsMismatch()
    {
        //TODO set type
        bool chapterDiffers = ExpectedChapter != ActualChapter;
        bool verseDiffers = ExpectedVerse != ActualVerse;
        bool segmentDiffers = ExpectedSegment != null && ExpectedSegment != ActualSegment;
        return chapterDiffers || verseDiffers || segmentDiffers;
    }

    /// <summary>The expected position formatted as a verse reference string.</summary>
    public string ExpectedVerseRef
    {
        get { return FormatVerseRef(ExpectedChapter, ExpectedVerse); }
    }

    /// <summary>The actual position formatted as a verse reference string.</summary>
    public string ActualVerseRef
    {
        get { return FormatVerseRef(ActualChapter, ActualVerse); }
    }

    // Builds a VerseRef in this mismatch's book and returns its string form.
    private string FormatVerseRef(int chapter, int verse)
    {
        var verseRef = new VerseRef(BookNum, chapter, verse);
        return verseRef.ToString();
    }
}
43+
44+
/// <summary>
/// USFM parser handler that records a <see cref="UsfmVersificationMismatch"/> when the last verse
/// of a chapter, or the last chapter/verse of a book, differs from what the supplied
/// versification expects. Only the final verse seen in each chapter is compared.
/// </summary>
public class UsfmVersificationMismatchDetector : UsfmParserHandlerBase
{
    private readonly ScrVers _versification;
    private int _currentBook;
    private int _currentChapter;
    private VerseRef _currentVerse;
    private readonly List<UsfmVersificationMismatch> _errors;

    /// <summary>Creates a detector that checks text against the given versification.</summary>
    /// <param name="versification">The versification that supplies the expected last chapter and verse.</param>
    public UsfmVersificationMismatchDetector(ScrVers versification)
    {
        _versification = versification;
        _currentBook = 0;
        _currentChapter = 0;
        _currentVerse = new VerseRef();
        _errors = new List<UsfmVersificationMismatch>();
    }

    /// <summary>Whether at least one mismatch has been recorded.</summary>
    public bool HasError => _errors.Count > 0;

    /// <summary>The mismatches recorded so far, in the order they were detected.</summary>
    public IReadOnlyList<UsfmVersificationMismatch> Errors => _errors;

    /// <summary>Closes out any book still being tracked, then starts tracking the new book.</summary>
    public override void StartBook(UsfmParserState state, string marker, string code)
    {
        // Does nothing if the previous book was already checked and cleared in EndUsfm.
        CheckEndOfBook();

        _currentBook = state.VerseRef.BookNum;
        _currentChapter = 0;
        _currentVerse = new VerseRef();
    }

    /// <summary>
    /// Checks the book that was being tracked when the document ended, so that the final book
    /// parsed is validated as well.
    /// </summary>
    public override void EndUsfm(UsfmParserState state)
    {
        // NOTE(review): assumes the parser invokes EndUsfm once at the end of each parsed
        // document - confirm against UsfmParser.
        CheckEndOfBook();
    }

    /// <summary>Remembers the most recent verse reference reported by the parser.</summary>
    public override void Verse(
        UsfmParserState state,
        string number,
        string marker,
        string altNumber,
        string pubNumber
    )
    {
        _currentVerse = state.VerseRef;
    }

    /// <summary>
    /// Compares the last verse seen in the chapter that just ended with the versification's last
    /// verse for that chapter, then starts tracking the new chapter.
    /// </summary>
    public override void Chapter(
        UsfmParserState state,
        string number,
        string marker,
        string altNumber,
        string pubNumber
    )
    {
        // The first chapter marker of a book has no preceding chapter to check.
        if (_currentChapter != 0)
        {
            var versificationMismatch = new UsfmVersificationMismatch()
            {
                BookNum = _currentBook,
                ExpectedChapter = _currentChapter,
                ExpectedVerse = _versification.GetLastVerse(_currentBook, _currentChapter),
                ActualChapter = _currentChapter,
                ActualVerse = _currentVerse.AllVerses().Last().VerseNum,
            };
            if (versificationMismatch.IsMismatch())
                _errors.Add(versificationMismatch);
        }

        _currentChapter = state.VerseRef.ChapterNum;
        _currentVerse = new VerseRef();
    }

    // Compares where the tracked book ended with the versification's last chapter and that
    // chapter's last verse, then clears the tracking state so the book is not checked twice.
    private void CheckEndOfBook()
    {
        if (_currentBook > 0 && Canon.IsCanonical(_currentBook))
        {
            int lastChapter = _versification.GetLastChapter(_currentBook);
            var versificationMismatch = new UsfmVersificationMismatch()
            {
                BookNum = _currentBook,
                ExpectedChapter = lastChapter,
                // Use the last verse of the expected chapter so the expected reference is consistent.
                ExpectedVerse = _versification.GetLastVerse(_currentBook, lastChapter),
                ActualChapter = _currentChapter,
                ActualVerse = _currentVerse.AllVerses().Last().VerseNum,
            };
            if (versificationMismatch.IsMismatch())
                _errors.Add(versificationMismatch);
        }

        _currentBook = 0;
        _currentChapter = 0;
        _currentVerse = new VerseRef();
    }
}
122+
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
using System.IO;
2+
using System.IO.Compression;
3+
4+
namespace SIL.Machine.Corpora
5+
{
6+
/// <summary>
/// Versification mismatch detector that reads a Paratext project's files from a
/// <see cref="ZipArchive"/>.
/// </summary>
public class ZipParatextProjectVersificationMismatchDetector : ParatextProjectVersificationMismatchDetector
{
    private readonly ZipArchive _archive;

    /// <summary>Creates a detector over the given archive; settings are parsed from the same archive.</summary>
    /// <param name="archive">The zip archive containing the project.</param>
    public ZipParatextProjectVersificationMismatchDetector(ZipArchive archive)
        : base(new ZipParatextProjectSettingsParser(archive))
    {
        _archive = archive;
    }

    /// <summary>Returns whether the archive has an entry with the given name.</summary>
    protected override bool Exists(string fileName)
    {
        ZipArchiveEntry match = _archive.GetEntry(fileName);
        return match != null;
    }

    /// <summary>Opens the named entry, or returns <c>null</c> when the archive has no such entry.</summary>
    protected override Stream Open(string fileName)
    {
        return _archive.GetEntry(fileName)?.Open();
    }
}
29+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
using System.Text;
2+
3+
namespace SIL.Machine.Corpora;
4+
5+
/// <summary>
/// Versification mismatch detector whose project files are held in an in-memory dictionary
/// mapping file name to file contents.
/// </summary>
public class MemoryParatextProjectVersificationMismatchDetector(
    ParatextProjectSettings settings,
    IDictionary<string, string> files
) : ParatextProjectVersificationMismatchDetector(settings)
{
    /// <summary>The file contents keyed by file name.</summary>
    public IDictionary<string, string> Files { get; } = files;

    /// <summary>Returns whether <see cref="Files"/> has an entry for the given name.</summary>
    protected override bool Exists(string fileName) => Files.ContainsKey(fileName);

    /// <summary>
    /// Returns the named file's contents as a UTF-8 encoded stream, or <c>null</c> when
    /// <see cref="Files"/> has no entry for it.
    /// </summary>
    protected override Stream? Open(string fileName)
    {
        if (Files.TryGetValue(fileName, out string? text))
        {
            byte[] utf8 = Encoding.UTF8.GetBytes(text);
            return new MemoryStream(utf8);
        }

        return null;
    }
}

tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,4 +195,19 @@ public void AnalyzeCorporaQuoteConventions()
195195
Assert.NotNull(targetAnalysis);
196196
});
197197
}
198+
199+
[Test]
[Ignore("This is for manual testing only. Remove this tag to run the test.")]
public void ValidateUsfmVersification()
{
    // Manual check: run the versification mismatch detector over the source project zip and
    // expect no mismatches. Any that are found are dumped as indented JSON in the failure message.
    using ZipArchive projectArchive = ZipFile.OpenRead(CorporaTestHelpers.UsfmSourceProjectZipPath);
    var detector = new ZipParatextProjectVersificationMismatchDetector(projectArchive);
    IReadOnlyList<UsfmVersificationMismatch> found = detector.GetUsfmVersificationMismatches();

    var jsonOptions = new JsonSerializerOptions { WriteIndented = true };
    string failureDetails = JsonSerializer.Serialize(found, jsonOptions);

    Assert.That(found, Has.Count.EqualTo(0), failureDetails);
}
198213
}

0 commit comments

Comments
 (0)