11using System ;
22using System . Collections . Generic ;
33using System . Linq ;
4+ using SIL . Scripture ;
45
56namespace SIL . Machine . Corpora
67{
@@ -42,6 +43,12 @@ public UpdateUsfmRow(
4243 public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
4344 {
4445 private readonly IReadOnlyList < UpdateUsfmRow > _rows ;
46+ private int _rowIndex ;
47+ private VerseRef _verseRowsRef ;
48+ private readonly List < int > _verseRows ;
49+ private int _verseRowIndex ;
50+ private readonly Dictionary < VerseRef , List < RowInfo > > _verseRowsMap ;
51+ private readonly ScrVers _updateRowsVersification ;
4552 private readonly List < UsfmToken > _tokens ;
4653 private readonly List < UsfmToken > _updatedText ;
4754 private readonly List < UsfmToken > _embedTokens ;
@@ -55,10 +62,11 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
5562 private readonly Stack < IUsfmUpdateBlockHandler > _updateBlockHandlers ;
5663 private readonly List < string > _remarks ;
5764 private readonly Stack < bool > _replace ;
58- private int _rowIndex ;
5965 private int _tokenIndex ;
6066 private readonly Func < UsfmUpdateBlockHandlerException , bool > _errorHandler ;
67+ private readonly bool _compareSegments ;
6168
69+ /// <param name="rows">UpdateUsfmRows must be in order</param>
6270 public UpdateUsfmParserHandler (
6371 IReadOnlyList < UpdateUsfmRow > rows = null ,
6472 string idText = null ,
@@ -69,10 +77,18 @@ public UpdateUsfmParserHandler(
6977 IEnumerable < string > preserveParagraphStyles = null ,
7078 IEnumerable < IUsfmUpdateBlockHandler > updateBlockHandlers = null ,
7179 IEnumerable < string > remarks = null ,
72- Func < UsfmUpdateBlockHandlerException , bool > errorHandler = null
80+ Func < UsfmUpdateBlockHandlerException , bool > errorHandler = null ,
81+ bool compareSegments = false
7382 )
7483 {
7584 _rows = rows ?? Array . Empty < UpdateUsfmRow > ( ) ;
85+ _verseRows = new List < int > ( ) ;
86+ _verseRowsMap = new Dictionary < VerseRef , List < RowInfo > > (
87+ compareSegments ? VerseRefComparer . Default : VerseRefComparer . IgnoreSegments
88+ ) ;
89+ _updateRowsVersification = ScrVers . English ;
90+ if ( _rows . Count > 0 )
91+ _updateRowsVersification = _rows . First ( r => r . Refs . Count > 0 ) . Refs [ 0 ] . Versification ;
7692 _tokens = new List < UsfmToken > ( ) ;
7793 _updatedText = new List < UsfmToken > ( ) ;
7894 _updateBlocks = new Stack < UsfmUpdateBlock > ( ) ;
@@ -95,6 +111,7 @@ public UpdateUsfmParserHandler(
95111 _errorHandler = errorHandler ;
96112 if ( _errorHandler == null )
97113 _errorHandler = ( error ) => false ;
114+ _compareSegments = compareSegments ;
98115 }
99116
100117 public IReadOnlyList < UsfmToken > Tokens => _tokens ;
@@ -107,6 +124,10 @@ public override void EndUsfm(UsfmParserState state)
107124
108125 public override void StartBook ( UsfmParserState state , string marker , string code )
109126 {
127+ _verseRowsRef = state . VerseRef ;
128+ UpdateVerseRowsMap ( ) ;
129+ UpdateVerseRows ( ) ;
130+
110131 CollectReadonlyTokens ( state ) ;
111132 _updateBlocks . Push ( new UsfmUpdateBlock ( ) ) ;
112133 var startBookTokens = new List < UsfmToken > ( ) ;
@@ -137,7 +158,7 @@ IReadOnlyList<UsfmAttribute> attributes
137158 if ( state . IsVerseText )
138159 {
139160 // Only strip paragraph markers in a verse
140- if ( _paragraphBehavior == UpdateUsfmMarkerBehavior . Preserve )
161+ if ( _paragraphBehavior == UpdateUsfmMarkerBehavior . Preserve && ! DuplicateVerse )
141162 {
142163 CollectUpdatableTokens ( state ) ;
143164 }
@@ -193,6 +214,13 @@ string pubNumber
193214 {
194215 UseUpdatedText ( ) ;
195216
217+ if ( ! _verseRowsRef . Equals ( state . VerseRef ) )
218+ {
219+ _verseRowsRef = state . VerseRef ;
220+ UpdateVerseRowsMap ( ) ;
221+ UpdateVerseRows ( ) ;
222+ }
223+
196224 base . Chapter ( state , number , marker , altNumber , pubNumber ) ;
197225
198226 CollectReadonlyTokens ( state ) ;
@@ -230,16 +258,31 @@ string pubNumber
230258 }
231259 }
232260
261+ if ( ! _verseRowsRef . Equals ( state . VerseRef ) )
262+ {
263+ _verseRowsRef = state . VerseRef ;
264+ UpdateVerseRows ( ) ;
265+ }
266+
233267 base . Verse ( state , number , marker , altNumber , pubNumber ) ;
234268
235- CollectReadonlyTokens ( state ) ;
269+ if ( DuplicateVerse )
270+ {
271+ SkipUpdatableTokens ( state ) ;
272+ }
273+ else
274+ {
275+ CollectReadonlyTokens ( state ) ;
276+ }
236277 }
237278
/// <summary>
/// Handles the start of a note: forwards the event to the base handler and then either collects the
/// pending tokens as updatable or skips them, depending on <c>DuplicateVerse</c>.
/// </summary>
/// <param name="state">The current parser state.</param>
/// <param name="marker">The note marker.</param>
/// <param name="caller">The note caller.</param>
/// <param name="category">The note category.</param>
public override void StartNote(UsfmParserState state, string marker, string caller, string category)
{
    base.StartNote(state, marker, caller, category);

    // The pending tokens must be handled exactly once: skipped when DuplicateVerse is set,
    // collected as updatable otherwise.
    if (DuplicateVerse)
    {
        SkipUpdatableTokens(state);
    }
    else
    {
        CollectUpdatableTokens(state);
    }
}
244287
245288 public override void EndNote ( UsfmParserState state , string marker , bool closed )
@@ -319,7 +362,7 @@ public override void Text(UsfmParserState state, string text)
319362 base . Text ( state , text ) ;
320363
321364 // strip out text in verses that are being replaced
322- if ( ReplaceWithNewTokens ( state ) )
365+ if ( ReplaceWithNewTokens ( state ) || ( DuplicateVerse && CurrentTextType == ScriptureTextType . Verse ) )
323366 SkipUpdatableTokens ( state ) ;
324367 else
325368 CollectUpdatableTokens ( state ) ;
@@ -390,15 +433,11 @@ public string GetUsfm(UsfmStylesheet stylesheet)
390433 remarkTokens . Add ( new UsfmToken ( UsfmTokenType . Paragraph , "rem" , null , null ) ) ;
391434 remarkTokens . Add ( new UsfmToken ( remark ) ) ;
392435 }
393-
394- if ( tokens . Count > 0 && tokens [ 0 ] . Marker == "id" )
436+ if ( tokens . Count > 0 )
395437 {
396- int index = 1 ;
397- if ( tokens . Count > 1 && tokens [ 1 ] . Type == UsfmTokenType . Text )
398- {
399- index = 2 ;
400- }
401- while ( tokens [ index ] . Marker == "rem" )
438+ int index = 0 ;
439+ HashSet < string > markersToSkip = new HashSet < string > ( ) { "id" , "ide" , "rem" } ;
440+ while ( markersToSkip . Contains ( tokens [ index ] . Marker ) )
402441 {
403442 index ++ ;
404443 if ( tokens . Count > index && tokens [ index ] . Type == UsfmTokenType . Text )
@@ -407,6 +446,7 @@ public string GetUsfm(UsfmStylesheet stylesheet)
407446 tokens . InsertRange ( index , remarkTokens ) ;
408447 }
409448 }
449+
410450 return tokenizer . Detokenize ( tokens ) ;
411451 }
412452
@@ -418,11 +458,11 @@ IReadOnlyList<ScriptureRef> segScrRefs
418458 Dictionary < string , object > rowMetadata = null ;
419459 int sourceIndex = 0 ;
420460 // search the sorted rows with updated text, starting from where we left off last.
421- while ( _rowIndex < _rows . Count && sourceIndex < segScrRefs . Count )
461+ while ( _verseRowIndex < _verseRows . Count && sourceIndex < segScrRefs . Count )
422462 {
423463 // get the set of references for the current row
424464 int compare = 0 ;
425- UpdateUsfmRow row = _rows [ _rowIndex ] ;
465+ UpdateUsfmRow row = _rows [ _verseRows [ _verseRowIndex ] ] ;
426466 ( IReadOnlyList < ScriptureRef > rowScrRefs , string text , IReadOnlyDictionary < string , object > metadata ) = (
427467 row . Refs ,
428468 row . Text ,
@@ -432,7 +472,7 @@ IReadOnlyList<ScriptureRef> segScrRefs
432472 {
433473 while ( sourceIndex < segScrRefs . Count )
434474 {
435- compare = rowScrRef . CompareTo ( segScrRefs [ sourceIndex ] , compareSegments : false ) ;
475+ compare = rowScrRef . CompareTo ( segScrRefs [ sourceIndex ] , compareSegments : _compareSegments ) ;
436476 if ( compare > 0 )
437477 // row is ahead of source, increment source
438478 sourceIndex ++ ;
@@ -451,7 +491,7 @@ IReadOnlyList<ScriptureRef> segScrRefs
451491 if ( compare <= 0 )
452492 {
453493 // source is ahead row, increment row
454- _rowIndex ++ ;
494+ _verseRowIndex ++ ;
455495 }
456496 }
457497 return ( rowTexts , rowMetadata ) ;
@@ -649,5 +689,63 @@ private bool IsNonverseParagraph(UsfmParserState state, UsfmUpdateBlockElement e
649689 UsfmTag paraTag = state . Stylesheet . GetTag ( paraToken . Marker ) ;
650690 return paraTag . TextType != UsfmTextType . VerseText && paraTag . TextType != UsfmTextType . NotSpecified ;
651691 }
692+
/// <summary>
/// Rebuilds <c>_verseRowsMap</c> for the chapter of <c>_verseRowsRef</c>.
/// </summary>
/// <remarks>
/// Rows are consumed from <c>_rows</c>, starting at <c>_rowIndex</c>, for as long as the first reference
/// of the row has the same chapter number as <c>_verseRowsRef</c>. Every reference of a consumed row is
/// mapped to the same <see cref="RowInfo"/> instance. Rows that have no references cannot be keyed to
/// a verse and are skipped.
/// </remarks>
private void UpdateVerseRowsMap()
{
    _verseRowsMap.Clear();
    while (_rowIndex < _rows.Count)
    {
        UpdateUsfmRow row = _rows[_rowIndex];
        if (row.Refs.Count == 0)
        {
            // A row without references has nothing to look up; skip it rather than index into
            // an empty list (which would throw).
            _rowIndex++;
            continue;
        }

        // Stop at the first row that belongs to a different chapter; it is left for a later call.
        if (row.Refs[0].ChapterNum != _verseRowsRef.ChapterNum)
            break;

        // One RowInfo is shared by all references of the row.
        var ri = new RowInfo(_rowIndex);
        foreach (ScriptureRef sr in row.Refs)
        {
            if (!_verseRowsMap.TryGetValue(sr.VerseRef, out List<RowInfo> rows))
            {
                rows = new List<RowInfo>();
                _verseRowsMap[sr.VerseRef] = rows;
            }
            rows.Add(ri);
        }
        _rowIndex++;
    }
}
712+
/// <summary>
/// Refills <c>_verseRows</c> with the indices of the not-yet-consumed rows mapped to the verse(s) of
/// <c>_verseRowsRef</c>, and resets <c>_verseRowIndex</c> to the start of that list.
/// </summary>
private void UpdateVerseRows()
{
    // The map is a dictionary and therefore looks keys up by equality, so the source verse ref is
    // converted to the versification of the rows before the lookup. A SortedList would not need the
    // conversion, but would be less efficient. The conversion is applied to a local copy.
    VerseRef sourceRef = _verseRowsRef;
    sourceRef.ChangeVersification(_updateRowsVersification);

    _verseRowIndex = 0;
    _verseRows.Clear();

    foreach (VerseRef verse in sourceRef.AllVerses())
    {
        List<RowInfo> candidates;
        if (!_verseRowsMap.TryGetValue(verse, out candidates))
            continue;

        foreach (RowInfo candidate in candidates)
        {
            // Each row is added at most once, even when it is mapped to several verses.
            if (candidate.IsConsumed)
                continue;

            _verseRows.Add(candidate.RowIndex);
            candidate.IsConsumed = true;
        }
    }
}
739+
/// <summary>
/// Pairs the index of an update row with a flag recording whether the row has been consumed.
/// </summary>
private class RowInfo
{
    /// <summary>Creates the info for the row at <paramref name="rowIndex"/>, not yet consumed.</summary>
    /// <param name="rowIndex">The index of the row in the list of update rows.</param>
    public RowInfo(int rowIndex) => RowIndex = rowIndex;

    /// <summary>The index of the row in the list of update rows.</summary>
    public int RowIndex { get; set; }

    /// <summary>Whether the row has already been consumed; <c>false</c> until set.</summary>
    public bool IsConsumed { get; set; }
}
652750 }
653751}
0 commit comments