Skip to content

Commit ba5915c

Browse files
Enkidu93 and ddaspit authored
Get update rows by reference (#341)
* Get update rows by reference; implement SF comparer; handle duplicate verses; add remark after ide tag Fix marker placement test given new proper out of order verse handling Add comment regarding row maps * Create rows map based on VerseRef instead of ScriptureRef - add compareSegments parameter to UpdateUsfmParserHandler * Move scripture ref comparer to separate file; sort rows * Pass error handler in UpdateUsfm * Sort by verse ref not scripture ref; add error handler in tests * Do not sort rows * Edit comment since rows will no longer necessarily be sorted * Revert comment * Add comment regarding the rows parameter --------- Co-authored-by: Damien Daspit <[email protected]>
1 parent 1662968 commit ba5915c

File tree

7 files changed

+294
-36
lines changed

7 files changed

+294
-36
lines changed

src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@ public string UpdateUsfm(
2929
UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip,
3030
IEnumerable<string> preserveParagraphStyles = null,
3131
IEnumerable<IUsfmUpdateBlockHandler> updateBlockHandlers = null,
32-
IEnumerable<string> remarks = null
32+
IEnumerable<string> remarks = null,
33+
Func<UsfmUpdateBlockHandlerException, bool> errorHandler = null,
34+
bool compareSegments = false
3335
)
3436
{
3537
string fileName = _settings.GetBookFileName(bookId);
@@ -51,7 +53,9 @@ public string UpdateUsfm(
5153
styleBehavior,
5254
preserveParagraphStyles,
5355
updateBlockHandlers,
54-
remarks
56+
remarks,
57+
errorHandler,
58+
compareSegments
5559
);
5660
try
5761
{
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
using System.Collections.Generic;
2+
using SIL.Extensions;
3+
using SIL.Machine.Corpora;
4+
5+
/// <summary>
/// Compares and equates <see cref="ScriptureRef"/> values by delegating to
/// <c>ScriptureRef.CompareTo(other, compareSegments)</c>. Whether verse segments take part in the
/// comparison is fixed when the comparer is constructed.
/// </summary>
public class ScriptureRefComparer : IComparer<ScriptureRef>, IEqualityComparer<ScriptureRef>
{
    /// <summary>Shared instance that takes verse segments into account.</summary>
    public static ScriptureRefComparer Default { get; } = new ScriptureRefComparer(compareSegments: true);

    /// <summary>Shared instance that ignores verse segments.</summary>
    public static ScriptureRefComparer IgnoreSegments { get; } = new ScriptureRefComparer(compareSegments: false);

    private readonly bool _compareSegments;

    /// <param name="compareSegments">
    /// Passed through to <c>ScriptureRef.CompareTo</c> and used to choose which verse key is hashed.
    /// </param>
    public ScriptureRefComparer(bool compareSegments = true)
    {
        _compareSegments = compareSegments;
    }

    /// <summary>
    /// Orders two references. A null reference sorts before any non-null reference and two nulls are
    /// considered equal, following the usual <see cref="IComparer{T}"/> convention.
    /// </summary>
    public int Compare(ScriptureRef x, ScriptureRef y)
    {
        // ReferenceEquals is used (rather than ==) so that no user-defined equality operator is invoked.
        if (ReferenceEquals(x, null))
            return ReferenceEquals(y, null) ? 0 : -1;
        if (ReferenceEquals(y, null))
            return 1;
        return x.CompareTo(y, _compareSegments);
    }

    /// <summary>
    /// Returns true when both references are null, or when both are non-null and compare as equal
    /// under this comparer's segment setting.
    /// </summary>
    public bool Equals(ScriptureRef x, ScriptureRef y)
    {
        bool xIsNull = ReferenceEquals(x, null);
        bool yIsNull = ReferenceEquals(y, null);
        if (xIsNull || yIsNull)
            return xIsNull && yIsNull;
        return x.CompareTo(y, _compareSegments) == 0;
    }

    /// <summary>
    /// Computes a hash from the verse key (with or without segment, matching the comparer setting),
    /// the versification and the relaxed path.
    /// </summary>
    /// <exception cref="System.ArgumentNullException"><paramref name="obj"/> is null.</exception>
    public int GetHashCode(ScriptureRef obj)
    {
        if (ReferenceEquals(obj, null))
            throw new System.ArgumentNullException(nameof(obj));

        int hashCode = 23;
        hashCode =
            hashCode * 31
            + (_compareSegments ? obj.VerseRef.BBBCCCVVVS.GetHashCode() : obj.VerseRef.BBBCCCVVV.GetHashCode());
        // NOTE(review): the versification is part of the hash. Confirm that CompareTo never reports
        // references with different versifications as equal; otherwise equal references could hash
        // differently.
        hashCode = hashCode * 31 + obj.Versification.GetHashCode();
        // Using ToRelaxed is necessary to maintain equality across relaxed refs, Equals properly handles relaxed ref comparison
        hashCode = hashCode * 31 + obj.ToRelaxed().Path.GetSequenceHashCode();
        return hashCode;
    }
}

src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase
1818
private VerseRef _curVerseRef;
1919
private readonly Stack<ScriptureElement> _curElements;
2020
private readonly Stack<ScriptureTextType> _curTextType;
21-
private bool _duplicateVerse = false;
2221

2322
protected ScriptureRefUsfmParserHandlerBase()
2423
{
@@ -29,6 +28,8 @@ protected ScriptureRefUsfmParserHandlerBase()
2928
protected ScriptureTextType CurrentTextType =>
3029
_curTextType.Count == 0 ? ScriptureTextType.None : _curTextType.Peek();
3130

31+
protected bool DuplicateVerse { get; private set; }
32+
3233
private static readonly string[] EmbedStyles = new[] { "f", "fe", "x", "fig" };
3334

3435
private static bool IsEmbedStyle(string marker)
@@ -66,13 +67,13 @@ public override void Verse(
6667
string pubNumber
6768
)
6869
{
69-
if (state.VerseRef.Equals(_curVerseRef) && !_duplicateVerse)
70+
if (state.VerseRef.Equals(_curVerseRef) && !DuplicateVerse)
7071
{
7172
if (state.VerseRef.VerseNum > 0)
7273
{
7374
EndVerseText(state, CreateVerseRefs());
7475
// ignore duplicate verses
75-
_duplicateVerse = true;
76+
DuplicateVerse = true;
7677
}
7778
}
7879
else if (VerseRef.AreOverlappingVersesRanges(verse1: number, verse2: _curVerseRef.Verse))
@@ -251,14 +252,14 @@ protected virtual void EndEmbedText(UsfmParserState state, ScriptureRef scriptur
251252

252253
private void StartVerseText(UsfmParserState state)
253254
{
254-
_duplicateVerse = false;
255+
DuplicateVerse = false;
255256
_curTextType.Push(ScriptureTextType.Verse);
256257
StartVerseText(state, CreateVerseRefs());
257258
}
258259

259260
private void EndVerseText(UsfmParserState state)
260261
{
261-
if (!_duplicateVerse && _curVerseRef.VerseNum > 0)
262+
if (!DuplicateVerse && _curVerseRef.VerseNum > 0)
262263
EndVerseText(state, CreateVerseRefs());
263264
if (_curVerseRef.VerseNum > 0)
264265
_curTextType.Pop();
@@ -291,7 +292,7 @@ private void StartEmbedText(UsfmParserState state, string marker)
291292
{
292293
if (_curVerseRef.IsDefault)
293294
UpdateVerseRef(state.VerseRef, marker);
294-
if (!_duplicateVerse)
295+
if (!DuplicateVerse)
295296
{
296297
CheckConvertVerseParaToNonVerse(state);
297298
NextElement(marker);
@@ -302,7 +303,7 @@ private void StartEmbedText(UsfmParserState state, string marker)
302303

303304
private void EndEmbedText(UsfmParserState state)
304305
{
305-
if (!_duplicateVerse && _curTextType.Count > 0 && _curTextType.Peek() == ScriptureTextType.Embed)
306+
if (!DuplicateVerse && _curTextType.Count > 0 && _curTextType.Peek() == ScriptureTextType.Embed)
306307
{
307308
EndEmbedText(state, CreateNonVerseRef());
308309
_curTextType.Pop();

src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs

Lines changed: 117 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
using System;
22
using System.Collections.Generic;
33
using System.Linq;
4+
using SIL.Scripture;
45

56
namespace SIL.Machine.Corpora
67
{
@@ -42,6 +43,12 @@ public UpdateUsfmRow(
4243
public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
4344
{
4445
private readonly IReadOnlyList<UpdateUsfmRow> _rows;
46+
private int _rowIndex;
47+
private VerseRef _verseRowsRef;
48+
private readonly List<int> _verseRows;
49+
private int _verseRowIndex;
50+
private readonly Dictionary<VerseRef, List<RowInfo>> _verseRowsMap;
51+
private readonly ScrVers _updateRowsVersification;
4552
private readonly List<UsfmToken> _tokens;
4653
private readonly List<UsfmToken> _updatedText;
4754
private readonly List<UsfmToken> _embedTokens;
@@ -55,10 +62,11 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
5562
private readonly Stack<IUsfmUpdateBlockHandler> _updateBlockHandlers;
5663
private readonly List<string> _remarks;
5764
private readonly Stack<bool> _replace;
58-
private int _rowIndex;
5965
private int _tokenIndex;
6066
private readonly Func<UsfmUpdateBlockHandlerException, bool> _errorHandler;
67+
private readonly bool _compareSegments;
6168

69+
/// <param name="rows">UpdateUsfmRows must be in order</param>
6270
public UpdateUsfmParserHandler(
6371
IReadOnlyList<UpdateUsfmRow> rows = null,
6472
string idText = null,
@@ -69,10 +77,18 @@ public UpdateUsfmParserHandler(
6977
IEnumerable<string> preserveParagraphStyles = null,
7078
IEnumerable<IUsfmUpdateBlockHandler> updateBlockHandlers = null,
7179
IEnumerable<string> remarks = null,
72-
Func<UsfmUpdateBlockHandlerException, bool> errorHandler = null
80+
Func<UsfmUpdateBlockHandlerException, bool> errorHandler = null,
81+
bool compareSegments = false
7382
)
7483
{
7584
_rows = rows ?? Array.Empty<UpdateUsfmRow>();
85+
_verseRows = new List<int>();
86+
_verseRowsMap = new Dictionary<VerseRef, List<RowInfo>>(
87+
compareSegments ? VerseRefComparer.Default : VerseRefComparer.IgnoreSegments
88+
);
89+
_updateRowsVersification = ScrVers.English;
90+
if (_rows.Count > 0)
91+
_updateRowsVersification = _rows.First(r => r.Refs.Count > 0).Refs[0].Versification;
7692
_tokens = new List<UsfmToken>();
7793
_updatedText = new List<UsfmToken>();
7894
_updateBlocks = new Stack<UsfmUpdateBlock>();
@@ -95,6 +111,7 @@ public UpdateUsfmParserHandler(
95111
_errorHandler = errorHandler;
96112
if (_errorHandler == null)
97113
_errorHandler = (error) => false;
114+
_compareSegments = compareSegments;
98115
}
99116

100117
public IReadOnlyList<UsfmToken> Tokens => _tokens;
@@ -107,6 +124,10 @@ public override void EndUsfm(UsfmParserState state)
107124

108125
public override void StartBook(UsfmParserState state, string marker, string code)
109126
{
127+
_verseRowsRef = state.VerseRef;
128+
UpdateVerseRowsMap();
129+
UpdateVerseRows();
130+
110131
CollectReadonlyTokens(state);
111132
_updateBlocks.Push(new UsfmUpdateBlock());
112133
var startBookTokens = new List<UsfmToken>();
@@ -137,7 +158,7 @@ IReadOnlyList<UsfmAttribute> attributes
137158
if (state.IsVerseText)
138159
{
139160
// Only strip paragraph markers in a verse
140-
if (_paragraphBehavior == UpdateUsfmMarkerBehavior.Preserve)
161+
if (_paragraphBehavior == UpdateUsfmMarkerBehavior.Preserve && !DuplicateVerse)
141162
{
142163
CollectUpdatableTokens(state);
143164
}
@@ -193,6 +214,13 @@ string pubNumber
193214
{
194215
UseUpdatedText();
195216

217+
if (!_verseRowsRef.Equals(state.VerseRef))
218+
{
219+
_verseRowsRef = state.VerseRef;
220+
UpdateVerseRowsMap();
221+
UpdateVerseRows();
222+
}
223+
196224
base.Chapter(state, number, marker, altNumber, pubNumber);
197225

198226
CollectReadonlyTokens(state);
@@ -230,16 +258,31 @@ string pubNumber
230258
}
231259
}
232260

261+
if (!_verseRowsRef.Equals(state.VerseRef))
262+
{
263+
_verseRowsRef = state.VerseRef;
264+
UpdateVerseRows();
265+
}
266+
233267
base.Verse(state, number, marker, altNumber, pubNumber);
234268

235-
CollectReadonlyTokens(state);
269+
if (DuplicateVerse)
270+
{
271+
SkipUpdatableTokens(state);
272+
}
273+
else
274+
{
275+
CollectReadonlyTokens(state);
276+
}
236277
}
237278

238279
public override void StartNote(UsfmParserState state, string marker, string caller, string category)
239280
{
240281
base.StartNote(state, marker, caller, category);
241-
242-
CollectUpdatableTokens(state);
282+
if (!DuplicateVerse)
283+
CollectUpdatableTokens(state);
284+
else
285+
SkipUpdatableTokens(state);
243286
}
244287

245288
public override void EndNote(UsfmParserState state, string marker, bool closed)
@@ -319,7 +362,7 @@ public override void Text(UsfmParserState state, string text)
319362
base.Text(state, text);
320363

321364
// strip out text in verses that are being replaced
322-
if (ReplaceWithNewTokens(state))
365+
if (ReplaceWithNewTokens(state) || (DuplicateVerse && CurrentTextType == ScriptureTextType.Verse))
323366
SkipUpdatableTokens(state);
324367
else
325368
CollectUpdatableTokens(state);
@@ -390,15 +433,11 @@ public string GetUsfm(UsfmStylesheet stylesheet)
390433
remarkTokens.Add(new UsfmToken(UsfmTokenType.Paragraph, "rem", null, null));
391434
remarkTokens.Add(new UsfmToken(remark));
392435
}
393-
394-
if (tokens.Count > 0 && tokens[0].Marker == "id")
436+
if (tokens.Count > 0)
395437
{
396-
int index = 1;
397-
if (tokens.Count > 1 && tokens[1].Type == UsfmTokenType.Text)
398-
{
399-
index = 2;
400-
}
401-
while (tokens[index].Marker == "rem")
438+
int index = 0;
439+
HashSet<string> markersToSkip = new HashSet<string>() { "id", "ide", "rem" };
440+
while (markersToSkip.Contains(tokens[index].Marker))
402441
{
403442
index++;
404443
if (tokens.Count > index && tokens[index].Type == UsfmTokenType.Text)
@@ -407,6 +446,7 @@ public string GetUsfm(UsfmStylesheet stylesheet)
407446
tokens.InsertRange(index, remarkTokens);
408447
}
409448
}
449+
410450
return tokenizer.Detokenize(tokens);
411451
}
412452

@@ -418,11 +458,11 @@ IReadOnlyList<ScriptureRef> segScrRefs
418458
Dictionary<string, object> rowMetadata = null;
419459
int sourceIndex = 0;
420460
// search the sorted rows with updated text, starting from where we left off last.
421-
while (_rowIndex < _rows.Count && sourceIndex < segScrRefs.Count)
461+
while (_verseRowIndex < _verseRows.Count && sourceIndex < segScrRefs.Count)
422462
{
423463
// get the set of references for the current row
424464
int compare = 0;
425-
UpdateUsfmRow row = _rows[_rowIndex];
465+
UpdateUsfmRow row = _rows[_verseRows[_verseRowIndex]];
426466
(IReadOnlyList<ScriptureRef> rowScrRefs, string text, IReadOnlyDictionary<string, object> metadata) = (
427467
row.Refs,
428468
row.Text,
@@ -432,7 +472,7 @@ IReadOnlyList<ScriptureRef> segScrRefs
432472
{
433473
while (sourceIndex < segScrRefs.Count)
434474
{
435-
compare = rowScrRef.CompareTo(segScrRefs[sourceIndex], compareSegments: false);
475+
compare = rowScrRef.CompareTo(segScrRefs[sourceIndex], compareSegments: _compareSegments);
436476
if (compare > 0)
437477
// row is ahead of source, increment source
438478
sourceIndex++;
@@ -451,7 +491,7 @@ IReadOnlyList<ScriptureRef> segScrRefs
451491
if (compare <= 0)
452492
{
453493
// source is ahead row, increment row
454-
_rowIndex++;
494+
_verseRowIndex++;
455495
}
456496
}
457497
return (rowTexts, rowMetadata);
@@ -649,5 +689,63 @@ private bool IsNonverseParagraph(UsfmParserState state, UsfmUpdateBlockElement e
649689
UsfmTag paraTag = state.Stylesheet.GetTag(paraToken.Marker);
650690
return paraTag.TextType != UsfmTextType.VerseText && paraTag.TextType != UsfmTextType.NotSpecified;
651691
}
692+
693+
/// <summary>
/// Rebuilds <c>_verseRowsMap</c> for the chapter of <c>_verseRowsRef</c>. Starting at
/// <c>_rowIndex</c>, consecutive rows whose first reference is in that chapter are registered under
/// the verse reference of every one of their references. <c>_rowIndex</c> is left at the first row
/// that belongs to a different chapter (or at the end of the rows).
/// </summary>
private void UpdateVerseRowsMap()
{
_verseRowsMap.Clear();
while (_rowIndex < _rows.Count)
{
    UpdateUsfmRow row = _rows[_rowIndex];
    if (row.Refs.Count == 0)
    {
        // A row without references can never be matched to a verse. Skip it rather than
        // failing on Refs[0] below.
        _rowIndex++;
        continue;
    }
    if (row.Refs[0].ChapterNum != _verseRowsRef.ChapterNum)
        break;

    // One RowInfo instance is shared by all verse keys of the row, so consuming the row through
    // one key marks it consumed for the others as well.
    var ri = new RowInfo(_rowIndex);
    foreach (ScriptureRef sr in row.Refs)
    {
        if (!_verseRowsMap.TryGetValue(sr.VerseRef, out List<RowInfo> rows))
        {
            rows = new List<RowInfo>();
            _verseRowsMap[sr.VerseRef] = rows;
        }
        rows.Add(ri);
    }
    _rowIndex++;
}
}
712+
713+
/// <summary>
/// Refreshes <c>_verseRows</c> with the indexes of the not-yet-consumed rows that are registered in
/// <c>_verseRowsMap</c> for any verse covered by <c>_verseRowsRef</c>, and resets
/// <c>_verseRowIndex</c> to the start of that list. Every row added is marked as consumed.
/// </summary>
private void UpdateVerseRows()
{
// The map is a dictionary and therefore relies on an equality comparer, so the lookup key has to
// be expressed in the versification of the update rows. A SortedList would not need this
// conversion, but it would be less efficient.
VerseRef lookupRef = _verseRowsRef;
lookupRef.ChangeVersification(_updateRowsVersification);

_verseRowIndex = 0;
_verseRows.Clear();

foreach (VerseRef verse in lookupRef.AllVerses())
{
    if (!_verseRowsMap.TryGetValue(verse, out List<RowInfo> candidates))
        continue;

    foreach (RowInfo candidate in candidates)
    {
        if (candidate.IsConsumed)
            continue;

        candidate.IsConsumed = true;
        _verseRows.Add(candidate.RowIndex);
    }
}
}
739+
740+
/// <summary>
/// Bookkeeping entry for a single update row: its index and whether it has been consumed.
/// </summary>
private class RowInfo
{
    /// <summary>True once the row has been marked as consumed; false on a new instance.</summary>
    public bool IsConsumed { get; set; }

    /// <summary>The index supplied at construction.</summary>
    public int RowIndex { get; set; }

    public RowInfo(int rowIndex) => RowIndex = rowIndex;
}
652750
}
653751
}

0 commit comments

Comments
 (0)