|
| 1 | +using System; |
| 2 | +using System.Collections.Generic; |
| 3 | +using System.Linq; |
| 4 | +using SIL.Extensions; |
| 5 | +using SIL.Machine.Translation; |
| 6 | + |
| 7 | +namespace SIL.Machine.Corpora |
| 8 | +{ |
/// <summary>
/// Bundles the references, source tokens, target tokens and word alignment
/// that belong to one segment whose markers are to be placed.
/// </summary>
public class PlaceMarkersAlignmentInfo
{
    /// <summary>
    /// Creates a new alignment info object from the given values. The arguments are stored as-is.
    /// </summary>
    /// <param name="refs">The references associated with the segment.</param>
    /// <param name="sourceTokens">The tokens of the source segment.</param>
    /// <param name="targetTokens">The tokens of the target segment.</param>
    /// <param name="alignment">The word alignment between the source and target tokens.</param>
    public PlaceMarkersAlignmentInfo(
        IReadOnlyList<string> refs,
        IReadOnlyList<string> sourceTokens,
        IReadOnlyList<string> targetTokens,
        WordAlignmentMatrix alignment
    )
    {
        (Refs, SourceTokens, TargetTokens, Alignment) = (refs, sourceTokens, targetTokens, alignment);
    }

    /// <summary>The references associated with the segment.</summary>
    public IReadOnlyList<string> Refs { get; }

    /// <summary>The tokens of the source segment.</summary>
    public IReadOnlyList<string> SourceTokens { get; }

    /// <summary>The tokens of the target segment.</summary>
    public IReadOnlyList<string> TargetTokens { get; }

    /// <summary>The word alignment between the source and target tokens.</summary>
    public WordAlignmentMatrix Alignment { get; }
}
| 29 | + |
/// <summary>
/// An update block handler that re-positions the paragraph and style markers of a block
/// inside the new (target) text, using a word alignment between the removed (source) text
/// and the target text to decide where each marker belongs.
/// </summary>
public class PlaceMarkersUsfmUpdateBlockHandler : IUsfmUpdateBlockHandler
{
    // Alignment information keyed by the first reference of each info object.
    private readonly IDictionary<string, PlaceMarkersAlignmentInfo> _alignmentInfo;

    /// <summary>
    /// Creates a handler from a collection of alignment info objects.
    /// Each object is indexed by the first entry of its <see cref="PlaceMarkersAlignmentInfo.Refs"/>.
    /// </summary>
    /// <param name="alignmentInfo">The alignment information to use when placing markers.</param>
    /// <exception cref="InvalidOperationException">An info object has an empty reference list.</exception>
    /// <exception cref="ArgumentException">Two info objects share the same first reference.</exception>
    public PlaceMarkersUsfmUpdateBlockHandler(IEnumerable<PlaceMarkersAlignmentInfo> alignmentInfo)
    {
        _alignmentInfo = alignmentInfo.ToDictionary(info => info.Refs.First(), info => info);
    }

    /// <summary>
    /// Places the paragraph and style markers of <paramref name="block"/> into its target text.
    /// </summary>
    /// <param name="block">The update block to process.</param>
    /// <returns>
    /// The original block when there is nothing to place or no usable alignment for the block's
    /// first reference; otherwise a new block with the same references in which the target text
    /// has been split around the placed markers and the elements marked for removal come last.
    /// </returns>
    public UsfmUpdateBlock ProcessBlock(UsfmUpdateBlock block)
    {
        // Convert inside the projection so that a block without references yields null
        // instead of dereferencing a missing first reference.
        string reference = block.Refs.Select(r => r.ToString()).FirstOrDefault();
        List<UsfmUpdateBlockElement> elements = block.Elements.ToList();

        // Nothing to do if there are no markers to place or no alignment to use
        if (
            reference == null
            || elements.Count == 0
            || !_alignmentInfo.TryGetValue(reference, out PlaceMarkersAlignmentInfo alignmentInfo)
            || alignmentInfo.Alignment.RowCount == 0
            || alignmentInfo.Alignment.ColumnCount == 0
            || !elements.Any(e =>
                e.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style)
                && !e.MarkedForRemoval
            )
        )
        {
            return block;
        }

        // Paragraph markers at the end of the block should stay there
        // Section headers should be ignored but re-inserted in the same position relative to other paragraph markers
        List<UsfmUpdateBlockElement> endElements = new List<UsfmUpdateBlockElement>();
        bool eobEmptyParas = true;
        List<(int ParaMarkersLeft, UsfmUpdateBlockElement Element)> headerElements =
            new List<(int ParaMarkersLeft, UsfmUpdateBlockElement Element)>();
        int paraMarkersLeft = 0;
        // Walk backwards so that removing the current element never shifts an index still to be visited.
        for (int i = elements.Count - 1; i >= 0; i--)
        {
            UsfmUpdateBlockElement element = elements[i];
            if (element.Type == UsfmUpdateBlockElementType.Paragraph && !element.MarkedForRemoval)
            {
                if (element.Tokens.Count > 1)
                {
                    // A paragraph element carrying more than its marker token is treated as a header.
                    // Remember how many plain paragraph markers follow it.
                    headerElements.Insert(0, (paraMarkersLeft, element));
                    elements.RemoveAt(i);
                }
                else
                {
                    paraMarkersLeft++;

                    if (eobEmptyParas)
                    {
                        endElements.Insert(0, element);
                        // End-of-block markers are re-added after the text below; take them out of
                        // the working list so they are not also placed through the alignment.
                        elements.RemoveAt(i);
                    }
                }
            }
            else if (
                !(
                    element.Type == UsfmUpdateBlockElementType.Embed
                    || (
                        element.Type == UsfmUpdateBlockElementType.Text
                        && element.Tokens[0].ToUsfm().Trim().Length == 0
                    )
                )
            )
            {
                eobEmptyParas = false;
            }
        }

        IReadOnlyList<string> sourceTokens = alignmentInfo.SourceTokens;
        IReadOnlyList<string> targetTokens = alignmentInfo.TargetTokens;
        int sourceTokenIndex = 0;

        string targetSentence = "";
        List<(UsfmUpdateBlockElement Element, int AdjacentSourceToken)> toPlace =
            new List<(UsfmUpdateBlockElement Element, int AdjacentSourceToken)>();
        List<UsfmUpdateBlockElement> placedElements = new List<UsfmUpdateBlockElement>();
        // The working list can be empty here if every element was a header or an end-of-block marker.
        if (elements.Count > 0 && elements[0].Type == UsfmUpdateBlockElementType.Other)
        {
            placedElements.Add(elements[0]);
            elements.RemoveAt(0);
        }
        List<UsfmUpdateBlockElement> embedElements = new List<UsfmUpdateBlockElement>();
        List<UsfmUpdateBlockElement> ignoredElements = new List<UsfmUpdateBlockElement>();
        foreach (UsfmUpdateBlockElement element in elements)
        {
            if (element.Type == UsfmUpdateBlockElementType.Text)
            {
                if (element.MarkedForRemoval)
                {
                    string text = element.Tokens[0].ToUsfm();

                    // Track seen tokens
                    while (sourceTokenIndex < sourceTokens.Count)
                    {
                        string sourceToken = sourceTokens[sourceTokenIndex];
                        // Ordinal search: a culture-sensitive search can fail to find a substring
                        // that an ordinal containment check reports as present.
                        int tokenStart = text.IndexOf(sourceToken, StringComparison.Ordinal);
                        if (tokenStart < 0)
                        {
                            break;
                        }
                        text = text.Substring(tokenStart + sourceToken.Length);
                        sourceTokenIndex++;
                    }
                    // Handle tokens split across text elements
                    if (text.Trim().Length > 0)
                    {
                        sourceTokenIndex++;
                    }
                }
                else
                {
                    targetSentence += element.Tokens[0].ToUsfm();
                }
            }

            if (element.MarkedForRemoval)
            {
                ignoredElements.Add(element);
            }
            else if (element.Type == UsfmUpdateBlockElementType.Embed)
            {
                embedElements.Add(element);
            }
            else if (element.Type.IsOneOf(UsfmUpdateBlockElementType.Paragraph, UsfmUpdateBlockElementType.Style))
            {
                // The marker sits directly before the next source token that has not been seen yet.
                toPlace.Add((element, sourceTokenIndex));
            }
        }

        List<int> targetTokenStarts = ComputeTargetTokenStarts(targetSentence, targetTokens);

        List<(int Index, UsfmUpdateBlockElement Element)> toInsert =
            new List<(int Index, UsfmUpdateBlockElement Element)>();
        foreach ((UsfmUpdateBlockElement element, int adjacentSourceToken) in toPlace)
        {
            int adjacentTargetToken = PredictMarkerLocation(
                alignmentInfo.Alignment,
                adjacentSourceToken,
                sourceTokens,
                targetTokens
            );
            int targetStringIndex =
                adjacentTargetToken < targetTokenStarts.Count
                    ? targetTokenStarts[adjacentTargetToken]
                    : targetSentence.Length;
            toInsert.Add((targetStringIndex, element));
        }
        // OrderBy is a stable sort, so markers that map to the same offset keep their original order
        // (List<T>.Sort gives no such guarantee).
        toInsert = toInsert.OrderBy(p => p.Index).ToList();
        toInsert.AddRange(embedElements.Concat(endElements).Select(e => (targetSentence.Length, e)));

        // Construct new text tokens to put between markers
        // and reincorporate headers and empty end-of-verse paragraph markers
        int leadingTextLength = toInsert.Count > 0 ? toInsert[0].Index : targetSentence.Length;
        if (leadingTextLength > 0)
        {
            placedElements.Add(
                new UsfmUpdateBlockElement(
                    UsfmUpdateBlockElementType.Text,
                    new List<UsfmToken>() { new UsfmToken(targetSentence.Substring(0, leadingTextLength)) }
                )
            );
        }

        for (int j = 0; j < toInsert.Count; j++)
        {
            (int insertIndex, UsfmUpdateBlockElement element) = toInsert[j];
            bool isLast = j + 1 == toInsert.Count;

            if (element.Type == UsfmUpdateBlockElementType.Paragraph)
            {
                // Re-insert the headers that originally preceded this paragraph marker.
                while (headerElements.Count > 0 && headerElements[0].ParaMarkersLeft == paraMarkersLeft)
                {
                    placedElements.Add(headerElements[0].Element);
                    headerElements.RemoveAt(0);
                }
                // One fewer paragraph marker remains to the right of the current position.
                paraMarkersLeft--;
            }

            placedElements.Add(element);
            if (insertIndex < targetSentence.Length && (isLast || insertIndex < toInsert[j + 1].Index))
            {
                // Substring takes a length, not an end index.
                string text = isLast
                    ? targetSentence.Substring(insertIndex)
                    : targetSentence.Substring(insertIndex, toInsert[j + 1].Index - insertIndex);
                placedElements.Add(
                    new UsfmUpdateBlockElement(
                        UsfmUpdateBlockElementType.Text,
                        new List<UsfmToken> { new UsfmToken(text) }
                    )
                );
            }
        }
        // Any headers that were not matched to a paragraph marker go at the end.
        while (headerElements.Count > 0)
        {
            placedElements.Add(headerElements[0].Element);
            headerElements.RemoveAt(0);
        }

        UsfmUpdateBlock processedBlock = new UsfmUpdateBlock(
            refs: block.Refs,
            elements: placedElements.Concat(ignoredElements)
        );
        return processedBlock;
    }

    // Finds the character offset at which each target token starts in the target sentence.
    // Tokens are searched for in order, each search starting after the previously found token.
    // A token that cannot be found is assigned the current search position so that every
    // returned offset is a valid index into (or the length of) the sentence.
    private static List<int> ComputeTargetTokenStarts(string targetSentence, IReadOnlyList<string> targetTokens)
    {
        List<int> starts = new List<int>(targetTokens.Count);
        int searchFrom = 0;
        foreach (string token in targetTokens)
        {
            int start = targetSentence.IndexOf(token, searchFrom, StringComparison.Ordinal);
            if (start < 0)
            {
                start = searchFrom;
            }
            else
            {
                searchFrom = start + token.Length;
            }
            starts.Add(start);
        }
        return starts;
    }

    // Returns true if the token is non-empty and contains no letters.
    private static bool HasNoLetters(string token)
    {
        return token.Length > 0 && !token.Any(char.IsLetter);
    }

    // Predicts the index of the target token that a marker should be placed in front of,
    // given the index of the source token that follows the marker.
    // Returns targetTokens.Count (i.e. the end of the sentence) if no aligned token is found.
    private static int PredictMarkerLocation(
        WordAlignmentMatrix alignment,
        int adjacentSourceToken,
        IReadOnlyList<string> sourceTokens,
        IReadOnlyList<string> targetTokens
    )
    {
        // Gets the number of alignment pairs that "cross the line" between
        // the src marker position and the potential trg marker position, (src_idx - .5) and (trg_idx - .5)
        int NumAlignCrossings(int sourceIndex, int targetIndex)
        {
            int crossings = 0;
            for (int i = 0; i < alignment.RowCount; i++)
            {
                for (int j = 0; j < alignment.ColumnCount; j++)
                {
                    if (
                        alignment[i, j]
                        && ((i < sourceIndex && j >= targetIndex) || (i >= sourceIndex && j < targetIndex))
                    )
                    {
                        crossings++;
                    }
                }
            }
            return crossings;
        }

        // If the token on either side of a potential target location is punctuation,
        // use it as the basis for deciding the target marker location
        int[] punctuationHypotheses = { -1, 0 };
        foreach (int punctuationHypothesis in punctuationHypotheses)
        {
            int sourceHypothesis = adjacentSourceToken + punctuationHypothesis;
            if (sourceHypothesis < 0 || sourceHypothesis >= sourceTokens.Count)
            {
                continue;
            }
            // Only accept aligned pairs where both the src and trg token are punctuation
            if (!HasNoLetters(sourceTokens[sourceHypothesis]) || sourceHypothesis >= alignment.RowCount)
            {
                continue;
            }

            List<int> alignedTargetTokens = alignment.GetRowAlignedIndices(sourceHypothesis).ToList();
            // If aligning to a token that precedes that marker,
            // the trg token predicted to be closest to the marker
            // is the last token aligned to the src rather than the first
            if (punctuationHypothesis < 0)
            {
                alignedTargetTokens.Reverse();
            }
            foreach (int targetIndex in alignedTargetTokens)
            {
                // Skip alignment columns that have no corresponding target token.
                if (targetIndex < targetTokens.Count && HasNoLetters(targetTokens[targetIndex]))
                {
                    // Since the marker location is represented by the token after the marker,
                    // adjust the index when aligning to punctuation that precedes the token
                    return targetIndex + (punctuationHypothesis == -1 ? 1 : 0);
                }
            }
        }

        int[] hypotheses = { 0, 1, 2 };
        int bestHypothesis = -1;
        // Any real crossing count must beat the initial value.
        int bestNumCrossings = int.MaxValue;
        HashSet<int> checkedHypotheses = new HashSet<int>();
        foreach (int hypothesis in hypotheses)
        {
            int sourceHypothesis = adjacentSourceToken + hypothesis;
            if (checkedHypotheses.Contains(sourceHypothesis))
            {
                continue;
            }
            int targetHypothesis = -1;
            while (targetHypothesis == -1 && sourceHypothesis >= 0 && sourceHypothesis < alignment.RowCount)
            {
                checkedHypotheses.Add(sourceHypothesis);
                List<int> alignedTargetTokens = alignment.GetRowAlignedIndices(sourceHypothesis).ToList();
                if (alignedTargetTokens.Count > 0)
                {
                    // If aligning with a source token that precedes the marker,
                    // the target token predicted to be closest to the marker is the last aligned token rather than the first
                    targetHypothesis = alignedTargetTokens[hypothesis < 0 ? alignedTargetTokens.Count - 1 : 0];
                }
                else
                {
                    // Continue the search outwards, away from the marker
                    sourceHypothesis += hypothesis < 0 ? -1 : 1;
                }
            }
            if (targetHypothesis != -1)
            {
                int numCrossings = NumAlignCrossings(adjacentSourceToken, targetHypothesis);
                if (numCrossings < bestNumCrossings)
                {
                    bestHypothesis = targetHypothesis;
                    bestNumCrossings = numCrossings;
                }
                if (numCrossings == 0)
                {
                    break;
                }
            }
        }

        // If no alignments found, insert at the end of the sentence
        return bestHypothesis != -1 ? bestHypothesis : targetTokens.Count;
    }
}
| 360 | +} |
0 commit comments