Skip to content

Commit 878ad05

Browse files
committed
Update handler for update block refactor, handle section headers
1 parent acf7975 commit 878ad05

File tree

1 file changed

+67
-38
lines changed

1 file changed

+67
-38
lines changed

machine/corpora/place_markers_scripture_update_block_handler.py

Lines changed: 67 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,82 +1,98 @@
11
from __future__ import annotations
22

3-
from copy import copy
43
from typing import List, Sequence
54

65
from ..jobs.translation_file_service import PretranslationInfo
76
from ..tokenization import LatinWordTokenizer
87
from ..translation import WordAlignmentMatrix
98
from .aligned_word_pair import AlignedWordPair
10-
from .scripture_update_block import ScriptureUpdateBlock
11-
from .scripture_update_block_handler import ScriptureUpdateBlockHandler
12-
from .scripture_update_element import ScriptureUpdateElement, ScriptureUpdateElementType
9+
from .usfm_stylesheet import UsfmStylesheet
10+
from .usfm_tag import UsfmTextType
1311
from .usfm_token import UsfmToken, UsfmTokenType
12+
from .usfm_update_block import UsfmUpdateBlock
13+
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
14+
from .usfm_update_block_handler import UsfmUpdateBlockHandler
1415

1516
TOKENIZER = LatinWordTokenizer()
17+
STYLESHEET = UsfmStylesheet("usfm.sty")
1618

1719

18-
class PlaceMarkersScriptureUpdateBlockHandler(ScriptureUpdateBlockHandler):
20+
class PlaceMarkersScriptureUpdateBlockHandler(UsfmUpdateBlockHandler):
1921

2022
def __init__(self, pt_info: Sequence[PretranslationInfo]):
2123
# TODO: when will len(refs) be >1?
2224
self._pt_info = {info["refs"][0]: info for info in pt_info}
2325

24-
def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock:
26+
def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
2527
# Nothing to do if there are no markers to place, no alignment to use, or if the block represents an embed
2628
if (
2729
len(block.elements) == 0
28-
or str(block.ref) not in self._pt_info.keys()
29-
or len(self._pt_info[str(block.ref)]["alignment"]) == 0
30-
or block.elements[0].type == ScriptureUpdateElementType.EMBED
30+
or str(block.refs[0]) not in self._pt_info.keys()
31+
or len(self._pt_info[str(block.refs[0])]["alignment"]) == 0
32+
or block.elements[0].type == UsfmUpdateBlockElementType.EMBED
3133
or not any(
3234
(
33-
element.type in [ScriptureUpdateElementType.PARAGRAPH, ScriptureUpdateElementType.STYLE]
35+
element.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]
3436
and not element.marked_for_removal
3537
)
3638
for element in block.elements[1:]
3739
)
3840
):
3941
return block
4042

41-
# Parsing the block's elements potentially involves removing elements so they are not processed twice,
42-
# but the original block may need to be returned if the two versions of the source/target text do not match up
43-
orig_elements = copy(block.elements)
43+
# Work on a copy in case the block needs to be returned unchanged
44+
orig_elements = list(block.elements)
4445

4546
src_sent = ""
4647
trg_sent = ""
4748
to_place = []
4849
src_marker_idxs = []
4950
placed_elements = [orig_elements[0]]
50-
end_elements = []
5151
ignored_elements = []
5252

53+
# Section headers should be ignored but re-inserted in the same position relative to other paragraph markers
54+
header_elements = []
55+
para_markers_left = 0
56+
for i, element in reversed(list(enumerate(orig_elements))):
57+
if element.type == UsfmUpdateBlockElementType.PARAGRAPH and not element.marked_for_removal:
58+
if STYLESHEET.get_tag(str(element.tokens[0].marker)).text_type == UsfmTextType.SECTION:
59+
if i < len(orig_elements) - 1 and orig_elements[i + 1].type == UsfmUpdateBlockElementType.TEXT:
60+
header_elements.insert(0, (para_markers_left, [element, orig_elements.pop(i + 1)]))
61+
else:
62+
header_elements.insert(0, (para_markers_left, [element]))
63+
orig_elements.pop(i)
64+
else:
65+
para_markers_left += 1
66+
5367
# Paragraph markers at the end of the block should stay there
68+
end_elements = []
5469
for i, element in reversed(list(enumerate(orig_elements))):
55-
if element.type == ScriptureUpdateElementType.PARAGRAPH:
70+
if element.type == UsfmUpdateBlockElementType.PARAGRAPH and not element.marked_for_removal:
5671
end_elements.insert(0, element)
5772
orig_elements.pop(i)
58-
elif element.type != ScriptureUpdateElementType.EMBED_BLOCK:
73+
elif element.type != UsfmUpdateBlockElementType.EMBED:
5974
break
6075

6176
for element in orig_elements[1:]:
62-
if element.type == ScriptureUpdateElementType.EXISTING_TEXT:
63-
src_sent += element.tokens[0].to_usfm()
64-
if element.type == ScriptureUpdateElementType.INSERTED_TEXT:
65-
trg_sent += element.tokens[0].to_usfm()
77+
if element.type == UsfmUpdateBlockElementType.TEXT:
78+
if element.marked_for_removal:
79+
src_sent += element.tokens[0].to_usfm()
80+
else:
81+
trg_sent += element.tokens[0].to_usfm()
6682

67-
if element.marked_for_removal or element.type == ScriptureUpdateElementType.EMBED_BLOCK:
83+
if element.marked_for_removal or element.type == UsfmUpdateBlockElementType.EMBED:
6884
ignored_elements.append(element)
69-
elif element.type in [ScriptureUpdateElementType.PARAGRAPH, ScriptureUpdateElementType.STYLE]:
85+
elif element.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]:
7086
to_place.append(element)
7187
src_marker_idxs.append(len(src_sent))
7288

73-
src_toks = self._pt_info[str(block.ref)]["source_toks"]
74-
trg_toks = self._pt_info[str(block.ref)]["translation_toks"]
89+
src_toks = self._pt_info[str(block.refs[0])]["source_toks"]
90+
trg_toks = self._pt_info[str(block.refs[0])]["translation_toks"]
7591

7692
# Don't do anything if the source sentence or pretranslation has changed
7793
if (
7894
list(t for t in TOKENIZER.tokenize(src_sent)) != src_toks
79-
or list(t for t in TOKENIZER.tokenize(trg_sent)) != trg_toks # could just use translation for trg
95+
or list(t for t in TOKENIZER.tokenize(trg_sent)) != trg_toks
8096
):
8197
return block
8298

@@ -98,7 +114,7 @@ def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock:
98114
if i == 0:
99115
adj_src_toks.append(i)
100116

101-
alignment = to_word_alignment_matrix(self._pt_info[str(block.ref)]["alignment"])
117+
alignment = to_word_alignment_matrix(self._pt_info[str(block.refs[0])]["alignment"])
102118
adj_trg_toks = [
103119
self._predict_marker_location(alignment, adj_src_tok, src_toks, trg_toks) for adj_src_tok in adj_src_toks
104120
]
@@ -115,26 +131,39 @@ def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock:
115131
to_insert.insert(insert_pos, (trg_str_idx, element))
116132

117133
# Construct new text tokens to put between markers
118-
placed_elements.append(
119-
ScriptureUpdateElement(
120-
ScriptureUpdateElementType.INSERTED_TEXT,
121-
[UsfmToken(UsfmTokenType.TEXT, text=trg_sent[: to_insert[0][0]] if len(to_insert) > 0 else trg_sent)],
134+
# and reincorporate headers and empty end-of-verse paragraph markers
135+
if len(to_insert) == 0 or to_insert[0][0] > 0:
136+
placed_elements.append(
137+
UsfmUpdateBlockElement(
138+
UsfmUpdateBlockElementType.TEXT,
139+
[
140+
UsfmToken(
141+
UsfmTokenType.TEXT, text=trg_sent[: to_insert[0][0]] if len(to_insert) > 0 else trg_sent
142+
)
143+
],
144+
)
122145
)
123-
)
124146
for j, (insert_idx, element) in enumerate(to_insert):
147+
if element.type == UsfmUpdateBlockElementType.PARAGRAPH:
148+
while len(header_elements) > 0 and header_elements[0][0] == para_markers_left:
149+
placed_elements += header_elements.pop(0)[1]
150+
para_markers_left -= 1
151+
125152
placed_elements.append(element)
126153
text_token = UsfmToken(
127154
UsfmTokenType.TEXT,
128155
text=(trg_sent[insert_idx : to_insert[j + 1][0]] if j + 1 < len(to_insert) else trg_sent[insert_idx:]),
129156
)
130-
placed_elements.append(
131-
ScriptureUpdateElement(
132-
ScriptureUpdateElementType.INSERTED_TEXT,
133-
[text_token],
134-
)
135-
)
157+
placed_elements.append(UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, [text_token]))
158+
for element in end_elements:
159+
while len(header_elements) > 0 and header_elements[0][0] == para_markers_left:
160+
placed_elements += header_elements.pop(0)[1]
161+
para_markers_left -= 1
162+
placed_elements.append(element)
163+
while len(header_elements) > 0:
164+
placed_elements += header_elements.pop(0)[1]
136165

137-
block._elements = placed_elements + end_elements + ignored_elements
166+
block._elements = placed_elements + ignored_elements
138167
return block
139168

140169
def _predict_marker_location(

0 commit comments

Comments
 (0)