Skip to content

Commit dafe32c

Browse files
ddaspit authored and Enkidu93 committed
Create rows map based on VerseRef instead of ScriptureRef
- add compareSegments parameter to UpdateUsfmParserHandler
1 parent a52274e commit dafe32c

File tree

5 files changed

+179
-86
lines changed

5 files changed

+179
-86
lines changed

src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,8 @@ public string UpdateUsfm(
2929
UpdateUsfmMarkerBehavior styleBehavior = UpdateUsfmMarkerBehavior.Strip,
3030
IEnumerable<string> preserveParagraphStyles = null,
3131
IEnumerable<IUsfmUpdateBlockHandler> updateBlockHandlers = null,
32-
IEnumerable<string> remarks = null
32+
IEnumerable<string> remarks = null,
33+
bool compareSegments = false
3334
)
3435
{
3536
string fileName = _settings.GetBookFileName(bookId);
@@ -51,7 +52,8 @@ public string UpdateUsfm(
5152
styleBehavior,
5253
preserveParagraphStyles,
5354
updateBlockHandlers,
54-
remarks
55+
remarks,
56+
compareSegments
5557
);
5658
try
5759
{

src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@ public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase
1818
private VerseRef _curVerseRef;
1919
private readonly Stack<ScriptureElement> _curElements;
2020
private readonly Stack<ScriptureTextType> _curTextType;
21-
protected bool _duplicateVerse = false;
2221

2322
protected ScriptureRefUsfmParserHandlerBase()
2423
{
@@ -29,6 +28,8 @@ protected ScriptureRefUsfmParserHandlerBase()
2928
protected ScriptureTextType CurrentTextType =>
3029
_curTextType.Count == 0 ? ScriptureTextType.None : _curTextType.Peek();
3130

31+
protected bool DuplicateVerse { get; private set; }
32+
3233
private static readonly string[] EmbedStyles = new[] { "f", "fe", "x", "fig" };
3334

3435
private static bool IsEmbedStyle(string marker)
@@ -66,13 +67,13 @@ public override void Verse(
6667
string pubNumber
6768
)
6869
{
69-
if (state.VerseRef.Equals(_curVerseRef) && !_duplicateVerse)
70+
if (state.VerseRef.Equals(_curVerseRef) && !DuplicateVerse)
7071
{
7172
if (state.VerseRef.VerseNum > 0)
7273
{
7374
EndVerseText(state, CreateVerseRefs());
7475
// ignore duplicate verses
75-
_duplicateVerse = true;
76+
DuplicateVerse = true;
7677
}
7778
}
7879
else if (VerseRef.AreOverlappingVersesRanges(verse1: number, verse2: _curVerseRef.Verse))
@@ -251,14 +252,14 @@ protected virtual void EndEmbedText(UsfmParserState state, ScriptureRef scriptur
251252

252253
private void StartVerseText(UsfmParserState state)
253254
{
254-
_duplicateVerse = false;
255+
DuplicateVerse = false;
255256
_curTextType.Push(ScriptureTextType.Verse);
256257
StartVerseText(state, CreateVerseRefs());
257258
}
258259

259260
private void EndVerseText(UsfmParserState state)
260261
{
261-
if (!_duplicateVerse && _curVerseRef.VerseNum > 0)
262+
if (!DuplicateVerse && _curVerseRef.VerseNum > 0)
262263
EndVerseText(state, CreateVerseRefs());
263264
if (_curVerseRef.VerseNum > 0)
264265
_curTextType.Pop();
@@ -291,7 +292,7 @@ private void StartEmbedText(UsfmParserState state, string marker)
291292
{
292293
if (_curVerseRef.IsDefault)
293294
UpdateVerseRef(state.VerseRef, marker);
294-
if (!_duplicateVerse)
295+
if (!DuplicateVerse)
295296
{
296297
CheckConvertVerseParaToNonVerse(state);
297298
NextElement(marker);
@@ -302,7 +303,7 @@ private void StartEmbedText(UsfmParserState state, string marker)
302303

303304
private void EndEmbedText(UsfmParserState state)
304305
{
305-
if (!_duplicateVerse && _curTextType.Count > 0 && _curTextType.Peek() == ScriptureTextType.Embed)
306+
if (!DuplicateVerse && _curTextType.Count > 0 && _curTextType.Peek() == ScriptureTextType.Embed)
306307
{
307308
EndEmbedText(state, CreateNonVerseRef());
308309
_curTextType.Pop();

src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs

Lines changed: 132 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -42,8 +42,12 @@ public UpdateUsfmRow(
4242
*/
4343
public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
4444
{
45-
private readonly Dictionary<ScriptureRef, List<UpdateUsfmRow>> _rowMapIgnoreSegments;
46-
private readonly Dictionary<ScriptureRef, List<UpdateUsfmRow>> _rowMapCheckSegments;
45+
private readonly IReadOnlyList<UpdateUsfmRow> _rows;
46+
private int _rowIndex;
47+
private VerseRef _verseRowsRef;
48+
private readonly List<int> _verseRows;
49+
private int _verseRowIndex;
50+
private readonly Dictionary<VerseRef, List<RowInfo>> _verseRowsMap;
4751
private readonly ScrVers _updateRowsVersification;
4852
private readonly List<UsfmToken> _tokens;
4953
private readonly List<UsfmToken> _updatedText;
@@ -60,6 +64,7 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
6064
private readonly Stack<bool> _replace;
6165
private int _tokenIndex;
6266
private readonly Func<UsfmUpdateBlockHandlerException, bool> _errorHandler;
67+
private readonly bool _compareSegments;
6368

6469
public UpdateUsfmParserHandler(
6570
IReadOnlyList<UpdateUsfmRow> rows = null,
@@ -71,17 +76,18 @@ public UpdateUsfmParserHandler(
7176
IEnumerable<string> preserveParagraphStyles = null,
7277
IEnumerable<IUsfmUpdateBlockHandler> updateBlockHandlers = null,
7378
IEnumerable<string> remarks = null,
74-
Func<UsfmUpdateBlockHandlerException, bool> errorHandler = null
79+
Func<UsfmUpdateBlockHandlerException, bool> errorHandler = null,
80+
bool compareSegments = false
7581
)
7682
{
77-
// We need two maps so that update rows can be specified per segment
78-
// but be handled correctly whether or not the USFM has segments for that verse
79-
(_rowMapIgnoreSegments, _rowMapCheckSegments) = GetRowMap(rows ?? Array.Empty<UpdateUsfmRow>());
83+
_rows = rows ?? Array.Empty<UpdateUsfmRow>();
84+
_verseRows = new List<int>();
85+
_verseRowsMap = new Dictionary<VerseRef, List<RowInfo>>(
86+
compareSegments ? VerseRefComparer.Default : VerseRefComparer.IgnoreSegments
87+
);
8088
_updateRowsVersification = ScrVers.English;
81-
if (rows != null && rows.Count > 0)
82-
{
83-
_updateRowsVersification = rows.First(r => r.Refs.Count > 0).Refs[0].Versification;
84-
}
89+
if (_rows.Count > 0)
90+
_updateRowsVersification = _rows.First(r => r.Refs.Count > 0).Refs[0].Versification;
8591
_tokens = new List<UsfmToken>();
8692
_updatedText = new List<UsfmToken>();
8793
_updateBlocks = new Stack<UsfmUpdateBlock>();
@@ -104,6 +110,7 @@ public UpdateUsfmParserHandler(
104110
_errorHandler = errorHandler;
105111
if (_errorHandler == null)
106112
_errorHandler = (error) => false;
113+
_compareSegments = compareSegments;
107114
}
108115

109116
public IReadOnlyList<UsfmToken> Tokens => _tokens;
@@ -116,6 +123,10 @@ public override void EndUsfm(UsfmParserState state)
116123

117124
public override void StartBook(UsfmParserState state, string marker, string code)
118125
{
126+
_verseRowsRef = state.VerseRef;
127+
UpdateVerseRowsMap();
128+
UpdateVerseRows();
129+
119130
CollectReadonlyTokens(state);
120131
_updateBlocks.Push(new UsfmUpdateBlock());
121132
var startBookTokens = new List<UsfmToken>();
@@ -146,7 +157,7 @@ IReadOnlyList<UsfmAttribute> attributes
146157
if (state.IsVerseText)
147158
{
148159
// Only strip paragraph markers in a verse
149-
if (_paragraphBehavior == UpdateUsfmMarkerBehavior.Preserve && !_duplicateVerse)
160+
if (_paragraphBehavior == UpdateUsfmMarkerBehavior.Preserve && !DuplicateVerse)
150161
{
151162
CollectUpdatableTokens(state);
152163
}
@@ -202,6 +213,13 @@ string pubNumber
202213
{
203214
UseUpdatedText();
204215

216+
if (!_verseRowsRef.Equals(state.VerseRef))
217+
{
218+
_verseRowsRef = state.VerseRef;
219+
UpdateVerseRowsMap();
220+
UpdateVerseRows();
221+
}
222+
205223
base.Chapter(state, number, marker, altNumber, pubNumber);
206224

207225
CollectReadonlyTokens(state);
@@ -239,9 +257,15 @@ string pubNumber
239257
}
240258
}
241259

260+
if (!_verseRowsRef.Equals(state.VerseRef))
261+
{
262+
_verseRowsRef = state.VerseRef;
263+
UpdateVerseRows();
264+
}
265+
242266
base.Verse(state, number, marker, altNumber, pubNumber);
243267

244-
if (_duplicateVerse)
268+
if (DuplicateVerse)
245269
{
246270
SkipUpdatableTokens(state);
247271
}
@@ -254,7 +278,7 @@ string pubNumber
254278
public override void StartNote(UsfmParserState state, string marker, string caller, string category)
255279
{
256280
base.StartNote(state, marker, caller, category);
257-
if (!_duplicateVerse)
281+
if (!DuplicateVerse)
258282
CollectUpdatableTokens(state);
259283
else
260284
SkipUpdatableTokens(state);
@@ -337,7 +361,7 @@ public override void Text(UsfmParserState state, string text)
337361
base.Text(state, text);
338362

339363
// strip out text in verses that are being replaced
340-
if (ReplaceWithNewTokens(state) || (_duplicateVerse && CurrentTextType == ScriptureTextType.Verse))
364+
if (ReplaceWithNewTokens(state) || (DuplicateVerse && CurrentTextType == ScriptureTextType.Verse))
341365
SkipUpdatableTokens(state);
342366
else
343367
CollectUpdatableTokens(state);
@@ -425,63 +449,48 @@ public string GetUsfm(UsfmStylesheet stylesheet)
425449
return tokenizer.Detokenize(tokens);
426450
}
427451

428-
private (
429-
Dictionary<ScriptureRef, List<UpdateUsfmRow>> RowMapIgnoreSegments,
430-
Dictionary<ScriptureRef, List<UpdateUsfmRow>> RowMapCheckSegments
431-
) GetRowMap(IEnumerable<UpdateUsfmRow> rows)
432-
{
433-
var rowMapIgnoreSegments = new Dictionary<ScriptureRef, List<UpdateUsfmRow>>(
434-
comparer: ScriptureRefComparer.IgnoreSegments
435-
);
436-
var rowMapCheckSegments = new Dictionary<ScriptureRef, List<UpdateUsfmRow>>(
437-
comparer: ScriptureRefComparer.Default
438-
);
439-
foreach (UpdateUsfmRow row in rows)
440-
{
441-
ScriptureRef sr = row.Refs[0];
442-
if (!rowMapIgnoreSegments.ContainsKey(sr))
443-
rowMapIgnoreSegments[sr] = new List<UpdateUsfmRow>();
444-
rowMapIgnoreSegments[sr].Add(row);
445-
if (!rowMapCheckSegments.ContainsKey(sr))
446-
rowMapCheckSegments[sr] = new List<UpdateUsfmRow>();
447-
rowMapCheckSegments[sr].Add(row);
448-
}
449-
return (rowMapIgnoreSegments, rowMapCheckSegments);
450-
}
451-
452-
private List<UpdateUsfmRow> GetRowsForRef(ScriptureRef sr)
453-
{
454-
var normalizedScriptureRef = sr.ChangeVersification(_updateRowsVersification);
455-
if (_rowMapCheckSegments.TryGetValue(normalizedScriptureRef, out List<UpdateUsfmRow> rows))
456-
{
457-
return rows;
458-
}
459-
else if (_rowMapIgnoreSegments.TryGetValue(normalizedScriptureRef, out rows))
460-
{
461-
return rows;
462-
}
463-
return new List<UpdateUsfmRow>();
464-
}
465-
466-
private (IReadOnlyList<string> RowTexts, Dictionary<string, object> Metadata) GetRows(
452+
private (IReadOnlyList<string> RowTexts, Dictionary<string, object> Metadata) AdvanceRows(
467453
IReadOnlyList<ScriptureRef> segScrRefs
468454
)
469455
{
470456
var rowTexts = new List<string>();
471457
Dictionary<string, object> rowMetadata = null;
472-
foreach (ScriptureRef sr in segScrRefs)
458+
int sourceIndex = 0;
459+
// search the sorted rows with updated text, starting from where we left off last.
460+
while (_verseRowIndex < _verseRows.Count && sourceIndex < segScrRefs.Count)
473461
{
474-
List<UpdateUsfmRow> rows = GetRowsForRef(sr);
475-
foreach (UpdateUsfmRow row in rows)
462+
// get the set of references for the current row
463+
int compare = 0;
464+
UpdateUsfmRow row = _rows[_verseRows[_verseRowIndex]];
465+
(IReadOnlyList<ScriptureRef> rowScrRefs, string text, IReadOnlyDictionary<string, object> metadata) = (
466+
row.Refs,
467+
row.Text,
468+
row.Metadata
469+
);
470+
foreach (ScriptureRef rowScrRef in rowScrRefs)
471+
{
472+
while (sourceIndex < segScrRefs.Count)
473+
{
474+
compare = rowScrRef.CompareTo(segScrRefs[sourceIndex], compareSegments: _compareSegments);
475+
if (compare > 0)
476+
// row is ahead of source, increment source
477+
sourceIndex++;
478+
else
479+
break;
480+
}
481+
if (compare == 0)
482+
{
483+
// source and row match
484+
// grab the text - both source and row will be incremented in due time...
485+
rowTexts.Add(text);
486+
rowMetadata = metadata.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
487+
break;
488+
}
489+
}
490+
if (compare <= 0)
476491
{
477-
(
478-
IReadOnlyList<ScriptureRef> rowScrRefs,
479-
string text,
480-
IReadOnlyDictionary<string, object> metadata
481-
) = (row.Refs, row.Text, row.Metadata);
482-
483-
rowTexts.Add(text);
484-
rowMetadata = metadata.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
492+
// source is ahead row, increment row
493+
_verseRowIndex++;
485494
}
486495
}
487496
return (rowTexts, rowMetadata);
@@ -588,7 +597,7 @@ private bool HasNewText()
588597

589598
private void StartUpdateBlock(IReadOnlyList<ScriptureRef> scriptureRefs)
590599
{
591-
(IReadOnlyList<string> rowTexts, Dictionary<string, object> metadata) = GetRows(scriptureRefs);
600+
(IReadOnlyList<string> rowTexts, Dictionary<string, object> metadata) = AdvanceRows(scriptureRefs);
592601
_updateBlocks.Push(
593602
new UsfmUpdateBlock(scriptureRefs, metadata: metadata ?? new Dictionary<string, object>())
594603
);
@@ -679,5 +688,63 @@ private bool IsNonverseParagraph(UsfmParserState state, UsfmUpdateBlockElement e
679688
UsfmTag paraTag = state.Stylesheet.GetTag(paraToken.Marker);
680689
return paraTag.TextType != UsfmTextType.VerseText && paraTag.TextType != UsfmTextType.NotSpecified;
681690
}
691+
692+
private void UpdateVerseRowsMap()
693+
{
694+
_verseRowsMap.Clear();
695+
while (_rowIndex < _rows.Count && _rows[_rowIndex].Refs[0].ChapterNum == _verseRowsRef.ChapterNum)
696+
{
697+
UpdateUsfmRow row = _rows[_rowIndex];
698+
var ri = new RowInfo(_rowIndex);
699+
foreach (ScriptureRef sr in row.Refs)
700+
{
701+
if (!_verseRowsMap.TryGetValue(sr.VerseRef, out List<RowInfo> rows))
702+
{
703+
rows = new List<RowInfo>();
704+
_verseRowsMap[sr.VerseRef] = rows;
705+
}
706+
rows.Add(ri);
707+
}
708+
_rowIndex++;
709+
}
710+
}
711+
712+
private void UpdateVerseRows()
713+
{
714+
VerseRef vref = _verseRowsRef;
715+
// We are using a dictionary, which uses an equality comparer. As a result, we need to change the
716+
// source verse ref to use the row versification. If we used a SortedList, it wouldn't be necessary, but it
717+
// would be less efficient.
718+
vref.ChangeVersification(_updateRowsVersification);
719+
720+
_verseRows.Clear();
721+
_verseRowIndex = 0;
722+
723+
foreach (VerseRef vr in vref.AllVerses())
724+
{
725+
if (_verseRowsMap.TryGetValue(vr, out List<RowInfo> rows))
726+
{
727+
foreach (RowInfo row in rows)
728+
{
729+
if (!row.IsConsumed)
730+
{
731+
_verseRows.Add(row.RowIndex);
732+
row.IsConsumed = true;
733+
}
734+
}
735+
}
736+
}
737+
}
738+
739+
private class RowInfo
740+
{
741+
public RowInfo(int rowIndex)
742+
{
743+
RowIndex = rowIndex;
744+
}
745+
746+
public int RowIndex { get; set; }
747+
public bool IsConsumed { get; set; }
748+
}
682749
}
683750
}

0 commit comments

Comments
 (0)