Skip to content

Commit 48e221c

Browse files
committed
Port block handler
1 parent ca8f585 commit 48e221c

File tree

1 file changed

+360
-0
lines changed

1 file changed

+360
-0
lines changed
Lines changed: 360 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,360 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using SIL.Extensions;
5+
using SIL.Machine.Translation;
6+
7+
namespace SIL.Machine.Corpora
8+
{
9+
/// <summary>
/// Read-only bundle of the tokenized source text, tokenized target text and their word alignment
/// for one set of references. <see cref="PlaceMarkersUsfmUpdateBlockHandler"/> keys each instance
/// by the first entry of <see cref="Refs"/>.
/// </summary>
public class PlaceMarkersAlignmentInfo
{
    /// <summary>
    /// Stores the given values as-is; no copying or validation is performed.
    /// </summary>
    /// <param name="refs">References this alignment belongs to.</param>
    /// <param name="sourceTokens">Tokens of the source text.</param>
    /// <param name="targetTokens">Tokens of the target text.</param>
    /// <param name="alignment">Word alignment between the source and target tokens.</param>
    public PlaceMarkersAlignmentInfo(
        IReadOnlyList<string> refs,
        IReadOnlyList<string> sourceTokens,
        IReadOnlyList<string> targetTokens,
        WordAlignmentMatrix alignment
    ) => (Refs, SourceTokens, TargetTokens, Alignment) = (refs, sourceTokens, targetTokens, alignment);

    /// <summary>References this alignment belongs to.</summary>
    public IReadOnlyList<string> Refs { get; }

    /// <summary>Tokens of the source text.</summary>
    public IReadOnlyList<string> SourceTokens { get; }

    /// <summary>Tokens of the target text.</summary>
    public IReadOnlyList<string> TargetTokens { get; }

    /// <summary>Word alignment between the source and target tokens.</summary>
    public WordAlignmentMatrix Alignment { get; }
}
29+
30+
/// <summary>
/// Update block handler that moves the paragraph and style markers of a block from their position
/// in the source text (the text elements marked for removal) to a predicted position in the target
/// text (the remaining text elements), using a word alignment between the two.
/// </summary>
public class PlaceMarkersUsfmUpdateBlockHandler : IUsfmUpdateBlockHandler
{
    // Alignment info keyed by the first reference of each entry
    private readonly IDictionary<string, PlaceMarkersAlignmentInfo> _alignmentInfo;

    /// <summary>
    /// Indexes the given alignment info by the first reference of each entry.
    /// </summary>
    /// <param name="alignmentInfo">
    /// Alignment info for the blocks to process. Every entry must have at least one reference, and
    /// first references must be unique (otherwise <c>First</c> / <c>ToDictionary</c> throw).
    /// </param>
    public PlaceMarkersUsfmUpdateBlockHandler(IEnumerable<PlaceMarkersAlignmentInfo> alignmentInfo)
    {
        _alignmentInfo = alignmentInfo.ToDictionary(info => info.Refs.First(), info => info);
    }

    /// <summary>
    /// Re-positions the paragraph and style markers of <paramref name="block"/> within its target text.
    /// </summary>
    /// <param name="block">The block to process.</param>
    /// <returns>
    /// The unchanged <paramref name="block"/> when it has no references, no elements, no usable
    /// alignment, or no paragraph/style marker that is not marked for removal; otherwise a new block
    /// with the same references whose elements are the re-assembled target text and markers followed
    /// by all elements that were marked for removal.
    /// </returns>
    public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block)
    {
        // Without a reference there is no key to look the alignment up with
        if (!block.Refs.Any())
        {
            return block;
        }
        string reference = block.Refs.First().ToString();
        List<UsfmUpdateBlockElement> elements = block.Elements.ToList();

        // Nothing to do if there are no markers to place or no alignment to use
        if (
            elements.Count == 0
            || !_alignmentInfo.TryGetValue(reference, out PlaceMarkersAlignmentInfo alignmentInfo)
            || alignmentInfo.Alignment.RowCount == 0
            || alignmentInfo.Alignment.ColumnCount == 0
            || !elements.Any(e =>
                e.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style)
                && !e.MarkedForRemoval
            )
        )
        {
            return block;
        }

        // Paragraph markers at the end of the block should stay there
        // Section headers should be ignored but re-inserted in the same position relative to other paragraph markers
        List<UsfmUpdateBlockElement> endElements = new List<UsfmUpdateBlockElement>();
        List<(int ParaMarkersLeft, UsfmUpdateBlockElement Element)> headerElements =
            new List<(int ParaMarkersLeft, UsfmUpdateBlockElement Element)>();
        bool eobEmptyParas = true;
        int paraMarkersLeft = 0;
        // Walk backwards so that removing the current element does not shift the indices still to visit
        for (int i = elements.Count - 1; i >= 0; i--)
        {
            UsfmUpdateBlockElement element = elements[i];
            if (element.Type == UsfmUpdateBlockElementType.Paragraph && !element.MarkedForRemoval)
            {
                if (element.Tokens.Count > 1)
                {
                    // Paragraph elements with more than one token are treated as headers; remember how
                    // many paragraph markers follow so the header can be re-inserted before the same one
                    headerElements.Insert(0, (paraMarkersLeft, element));
                    elements.RemoveAt(i);
                }
                else
                {
                    paraMarkersLeft++;

                    if (eobEmptyParas)
                    {
                        // Take the marker out of the list of markers to place,
                        // otherwise it would be emitted twice
                        endElements.Insert(0, element);
                        elements.RemoveAt(i);
                    }
                }
            }
            else if (
                !(
                    element.Type == UsfmUpdateBlockElementType.Embed
                    || (
                        element.Type == UsfmUpdateBlockElementType.Text
                        && element.Tokens[0].ToUsfm().Trim().Length == 0
                    )
                )
            )
            {
                // Anything other than embeds and whitespace-only text ends the trailing run of empty paragraphs
                eobEmptyParas = false;
            }
        }

        IReadOnlyList<string> sourceTokens = alignmentInfo.SourceTokens;
        IReadOnlyList<string> targetTokens = alignmentInfo.TargetTokens;
        int sourceTokenIndex = 0;

        string sourceSentence = "";
        string targetSentence = "";
        List<UsfmUpdateBlockElement> toPlace = new List<UsfmUpdateBlockElement>();
        List<int> adjacentSourceTokens = new List<int>();
        List<UsfmUpdateBlockElement> placedElements = new List<UsfmUpdateBlockElement>();
        // The list may be empty by now because headers and trailing paragraph markers were removed above
        if (elements.Count > 0 && elements[0].Type == UsfmUpdateBlockElementType.Other)
        {
            placedElements.Add(elements[0]);
            elements.RemoveAt(0);
        }
        List<UsfmUpdateBlockElement> embedElements = new List<UsfmUpdateBlockElement>();
        List<UsfmUpdateBlockElement> ignoredElements = new List<UsfmUpdateBlockElement>();
        foreach (UsfmUpdateBlockElement element in elements)
        {
            if (element.Type == UsfmUpdateBlockElementType.Text)
            {
                if (element.MarkedForRemoval)
                {
                    string text = element.Tokens[0].ToUsfm();
                    sourceSentence += text;

                    // Track seen tokens (ordinal search so the found position always matches the token text)
                    while (sourceTokenIndex < sourceTokens.Count)
                    {
                        string sourceToken = sourceTokens[sourceTokenIndex];
                        int tokenStart = text.IndexOf(sourceToken, StringComparison.Ordinal);
                        if (tokenStart < 0)
                        {
                            break;
                        }
                        text = text.Substring(tokenStart + sourceToken.Length);
                        sourceTokenIndex++;
                    }
                    // Handle tokens split across text elements
                    if (text.Trim().Length > 0)
                    {
                        sourceTokenIndex++;
                    }
                }
                else
                {
                    targetSentence += element.Tokens[0].ToUsfm();
                }
            }

            if (element.MarkedForRemoval)
            {
                ignoredElements.Add(element);
            }
            else if (element.Type == UsfmUpdateBlockElementType.Embed)
            {
                embedElements.Add(element);
            }
            else if (element.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style))
            {
                toPlace.Add(element);
                // The marker sits immediately before the next source token that has not been seen yet
                adjacentSourceTokens.Add(sourceTokenIndex);
            }
        }

        // Character offset of every target token within the target sentence
        List<int> targetTokenStarts = new List<int>(targetTokens.Count);
        int searchStart = 0;
        foreach (string token in targetTokens)
        {
            int tokenStart = targetSentence.IndexOf(token, searchStart, StringComparison.Ordinal);
            if (tokenStart < 0)
            {
                // Token text is not present in the sentence: fall back to the current search position
                // instead of recording -1, which would corrupt later offsets and substring calls
                targetTokenStarts.Add(searchStart);
                continue;
            }
            targetTokenStarts.Add(tokenStart);
            searchStart = tokenStart + token.Length;
        }

        // Predict marker placements
        List<(int Index, UsfmUpdateBlockElement Element)> predictedInsertions =
            new List<(int Index, UsfmUpdateBlockElement Element)>(toPlace.Count);
        for (int k = 0; k < toPlace.Count; k++)
        {
            int adjacentTargetToken = PredictMarkerLocation(
                alignmentInfo.Alignment,
                adjacentSourceTokens[k],
                sourceTokens,
                targetTokens
            );
            int targetStringIndex =
                adjacentTargetToken < targetTokenStarts.Count
                    ? targetTokenStarts[adjacentTargetToken]
                    : targetSentence.Length;
            predictedInsertions.Add((targetStringIndex, toPlace[k]));
        }
        // OrderBy is a stable sort, so markers predicted at the same position keep their original
        // relative order (List<T>.Sort gives no such guarantee)
        List<(int Index, UsfmUpdateBlockElement Element)> toInsert = predictedInsertions
            .OrderBy(p => p.Index)
            .ToList();
        toInsert.AddRange(embedElements.Concat(endElements).Select(e => (targetSentence.Length, e)));

        // Construct new text tokens to put between markers
        // and reincorporate headers and empty end-of-verse paragraph markers
        int firstInsertIndex = toInsert.Count > 0 ? toInsert[0].Index : targetSentence.Length;
        if (firstInsertIndex > 0)
        {
            placedElements.Add(CreateTextElement(targetSentence.Substring(0, firstInsertIndex)));
        }

        for (int j = 0; j < toInsert.Count; j++)
        {
            (int insertIndex, UsfmUpdateBlockElement element) = toInsert[j];
            if (element.Type == UsfmUpdateBlockElementType.Paragraph)
            {
                // Re-insert the headers that originally preceded this many remaining paragraph markers
                while (headerElements.Count > 0 && headerElements[0].ParaMarkersLeft == paraMarkersLeft)
                {
                    placedElements.Add(headerElements[0].Element);
                    headerElements.RemoveAt(0);
                }
                paraMarkersLeft--;
            }

            placedElements.Add(element);

            // Text runs from this marker up to the next marker, or to the end of the sentence after the last one
            int nextInsertIndex = j + 1 < toInsert.Count ? toInsert[j + 1].Index : targetSentence.Length;
            if (insertIndex < nextInsertIndex)
            {
                placedElements.Add(
                    CreateTextElement(targetSentence.Substring(insertIndex, nextInsertIndex - insertIndex))
                );
            }
        }
        while (headerElements.Count > 0)
        {
            placedElements.Add(headerElements[0].Element);
            headerElements.RemoveAt(0);
        }

        UsfmUpdateBlock processedBlock = new UsfmUpdateBlock(
            refs: block.Refs,
            elements: placedElements.Concat(ignoredElements)
        );
        return processedBlock;
    }

    // Wraps a piece of target text in a text element holding a single text token
    private static UsfmUpdateBlockElement CreateTextElement(string text)
    {
        return new UsfmUpdateBlockElement(
            UsfmUpdateBlockElementType.Text,
            new List<UsfmToken> { new UsfmToken(text) }
        );
    }

    /// <summary>
    /// Predicts the index of the target token that a marker should be placed in front of.
    /// </summary>
    /// <param name="alignment">Alignment matrix indexed as [source token, target token].</param>
    /// <param name="adjacentSourceToken">Index of the source token that directly follows the marker.</param>
    /// <param name="sourceTokens">Tokens of the source text.</param>
    /// <param name="targetTokens">Tokens of the target text.</param>
    /// <returns>
    /// The predicted target token index, or <c>targetTokens.Count</c> (end of the sentence) when no
    /// aligned token is found.
    /// </returns>
    private int PredictMarkerLocation(
        WordAlignmentMatrix alignment,
        int adjacentSourceToken,
        IReadOnlyList<string> sourceTokens,
        IReadOnlyList<string> targetTokens
    )
    {
        // Gets the number of alignment pairs that "cross the line" between
        // the src marker position and the potential trg marker position, (src_idx - .5) and (trg_idx - .5)
        int NumAlignCrossings(int sourceIndex, int targetIndex)
        {
            int crossings = 0;
            for (int i = 0; i < alignment.RowCount; i++)
            {
                for (int j = 0; j < alignment.ColumnCount; j++)
                {
                    if (
                        alignment[i, j]
                        && ((i < sourceIndex && j >= targetIndex) || (i >= sourceIndex && j < targetIndex))
                    )
                    {
                        crossings++;
                    }
                }
            }
            return crossings;
        }

        // If the token on either side of a potential target location is punctuation,
        // use it as the basis for deciding the target marker location
        int targetHypothesis = -1;
        int[] punctuationHypotheses = { -1, 0 };
        foreach (int punctuationHypothesis in punctuationHypotheses)
        {
            int sourceHypothesis = adjacentSourceToken + punctuationHypothesis;
            if (sourceHypothesis < 0 || sourceHypothesis >= sourceTokens.Count)
            {
                continue;
            }
            // Only accept aligned pairs where both the src and trg token are punctuation
            string hypothesisToken = sourceTokens[sourceHypothesis];
            if (
                hypothesisToken.Length > 0
                && !hypothesisToken.Any(char.IsLetter)
                && sourceHypothesis < alignment.RowCount
            )
            {
                List<int> alignedTargetTokens = alignment.GetRowAlignedIndices(sourceHypothesis).ToList();
                // If aligning to a token that precedes that marker,
                // the trg token predicted to be closest to the marker
                // is the last token aligned to the src rather than the first
                if (punctuationHypothesis < 0)
                {
                    alignedTargetTokens.Reverse();
                }
                foreach (int targetIndex in alignedTargetTokens)
                {
                    string targetToken = targetTokens[targetIndex];
                    if (targetToken.Length > 0 && !targetToken.Any(char.IsLetter))
                    {
                        targetHypothesis = targetIndex;
                        break;
                    }
                }
            }
            if (targetHypothesis != -1)
            {
                // Since the marker location is represented by the token after the marker,
                // adjust the index when aligning to punctuation that precedes the token
                return targetHypothesis + (punctuationHypothesis == -1 ? 1 : 0);
            }
        }

        // Otherwise try the source tokens at these offsets from the marker and keep the aligned
        // target token whose placement crosses the fewest alignment pairs
        int[] hypotheses = { 0, 1, 2 };
        int bestHypothesis = -1;
        // Sentinel that any real crossing count beats (note that `200 ^ 2` is XOR in C#, not a power)
        int bestNumCrossings = int.MaxValue;
        HashSet<int> checkedHypotheses = new HashSet<int>();
        foreach (int hypothesis in hypotheses)
        {
            int sourceHypothesis = adjacentSourceToken + hypothesis;
            if (checkedHypotheses.Contains(sourceHypothesis))
            {
                continue;
            }
            // Direction in which to keep searching when the current source token has no alignment
            int searchStep = hypothesis < 0 ? -1 : 1;
            targetHypothesis = -1;
            while (targetHypothesis == -1 && sourceHypothesis >= 0 && sourceHypothesis < alignment.RowCount)
            {
                checkedHypotheses.Add(sourceHypothesis);
                List<int> alignedTargetTokens = alignment.GetRowAlignedIndices(sourceHypothesis).ToList();
                if (alignedTargetTokens.Count > 0)
                {
                    // If aligning with a source token that precedes the marker,
                    // the target token predicted to be closest to the marker is the last aligned token rather than the first
                    targetHypothesis =
                        hypothesis < 0 ? alignedTargetTokens[alignedTargetTokens.Count - 1] : alignedTargetTokens[0];
                }
                else
                {
                    // Continue the search outwards
                    sourceHypothesis += searchStep;
                }
            }
            if (targetHypothesis != -1)
            {
                int numCrossings = NumAlignCrossings(adjacentSourceToken, targetHypothesis);
                if (numCrossings < bestNumCrossings)
                {
                    bestHypothesis = targetHypothesis;
                    bestNumCrossings = numCrossings;
                }
                if (numCrossings == 0)
                {
                    break;
                }
            }
        }

        // If no alignments found, insert at the end of the sentence
        return bestHypothesis != -1 ? bestHypothesis : targetTokens.Count;
    }
}
360+
}

0 commit comments

Comments
 (0)