11from __future__ import annotations
22
3- from copy import copy
43from typing import List , Sequence
54
65from ..jobs .translation_file_service import PretranslationInfo
76from ..tokenization import LatinWordTokenizer
87from ..translation import WordAlignmentMatrix
98from .aligned_word_pair import AlignedWordPair
10- from .scripture_update_block import ScriptureUpdateBlock
11- from .scripture_update_block_handler import ScriptureUpdateBlockHandler
12- from .scripture_update_element import ScriptureUpdateElement , ScriptureUpdateElementType
9+ from .usfm_stylesheet import UsfmStylesheet
10+ from .usfm_tag import UsfmTextType
1311from .usfm_token import UsfmToken , UsfmTokenType
12+ from .usfm_update_block import UsfmUpdateBlock
13+ from .usfm_update_block_element import UsfmUpdateBlockElement , UsfmUpdateBlockElementType
14+ from .usfm_update_block_handler import UsfmUpdateBlockHandler
1415
1516TOKENIZER = LatinWordTokenizer ()
17+ STYLESHEET = UsfmStylesheet ("usfm.sty" )
1618
1719
18- class PlaceMarkersScriptureUpdateBlockHandler (ScriptureUpdateBlockHandler ):
20+ class PlaceMarkersScriptureUpdateBlockHandler (UsfmUpdateBlockHandler ):
1921
2022 def __init__ (self , pt_info : Sequence [PretranslationInfo ]):
2123 # Key each record by its first ref only (a later record with the same first ref replaces an earlier one). TODO: when will len(refs) be >1?
2224 self ._pt_info = {info ["refs" ][0 ]: info for info in pt_info }
2325
24- def process_block (self , block : ScriptureUpdateBlock ) -> ScriptureUpdateBlock :
26+ def process_block (self , block : UsfmUpdateBlock ) -> UsfmUpdateBlock :
2527 # Nothing to do if there are no markers to place, no alignment to use, or if the block represents an embed
2628 if (
2729 len (block .elements ) == 0
28- or str (block .ref ) not in self ._pt_info .keys ()
29- or len (self ._pt_info [str (block .ref )]["alignment" ]) == 0
30- or block .elements [0 ].type == ScriptureUpdateElementType .EMBED
30+ or str (block .refs [ 0 ] ) not in self ._pt_info .keys ()
31+ or len (self ._pt_info [str (block .refs [ 0 ] )]["alignment" ]) == 0
32+ or block .elements [0 ].type == UsfmUpdateBlockElementType .EMBED
3133 or not any (
3234 (
33- element .type in [ScriptureUpdateElementType .PARAGRAPH , ScriptureUpdateElementType .STYLE ]
35+ element .type in [UsfmUpdateBlockElementType .PARAGRAPH , UsfmUpdateBlockElementType .STYLE ]
3436 and not element .marked_for_removal
3537 )
3638 for element in block .elements [1 :]
3739 )
3840 ):
3941 return block
4042
41- # Parsing the block's elements potentially involves removing elements so they are not processed twice,
42- # but the original block may need to be returned if the two versions of the source/target text do not match up
43- orig_elements = copy (block .elements )
43+ # Work on a copy in case the block needs to be returned unchanged
44+ orig_elements = list (block .elements )
4445
4546 src_sent = ""
4647 trg_sent = ""
4748 to_place = []
4849 src_marker_idxs = []
4950 placed_elements = [orig_elements [0 ]]
50- end_elements = []
5151 ignored_elements = []
5252
53+ # Section headers should be ignored but re-inserted in the same position relative to other paragraph markers
54+ header_elements = []
55+ para_markers_left = 0
56+ for i , element in reversed (list (enumerate (orig_elements ))):
57+ if element .type == UsfmUpdateBlockElementType .PARAGRAPH and not element .marked_for_removal :
58+ if STYLESHEET .get_tag (str (element .tokens [0 ].marker )).text_type == UsfmTextType .SECTION :
59+ if i < len (orig_elements ) - 1 and orig_elements [i + 1 ].type == UsfmUpdateBlockElementType .TEXT :
60+ header_elements .insert (0 , (para_markers_left , [element , orig_elements .pop (i + 1 )]))
61+ else :
62+ header_elements .insert (0 , (para_markers_left , [element ]))
63+ orig_elements .pop (i )
64+ else :
65+ para_markers_left += 1
66+
5367 # Paragraph markers at the end of the block should stay there
68+ end_elements = []
5469 for i , element in reversed (list (enumerate (orig_elements ))):
55- if element .type == ScriptureUpdateElementType .PARAGRAPH :
70+ if element .type == UsfmUpdateBlockElementType .PARAGRAPH and not element . marked_for_removal :
5671 end_elements .insert (0 , element )
5772 orig_elements .pop (i )
58- elif element .type != ScriptureUpdateElementType . EMBED_BLOCK :
73+ elif element .type != UsfmUpdateBlockElementType . EMBED :
5974 break
6075
6176 for element in orig_elements [1 :]:
62- if element .type == ScriptureUpdateElementType .EXISTING_TEXT :
63- src_sent += element .tokens [0 ].to_usfm ()
64- if element .type == ScriptureUpdateElementType .INSERTED_TEXT :
65- trg_sent += element .tokens [0 ].to_usfm ()
77+ if element .type == UsfmUpdateBlockElementType .TEXT :
78+ if element .marked_for_removal :
79+ src_sent += element .tokens [0 ].to_usfm ()
80+ else :
81+ trg_sent += element .tokens [0 ].to_usfm ()
6682
67- if element .marked_for_removal or element .type == ScriptureUpdateElementType . EMBED_BLOCK :
83+ if element .marked_for_removal or element .type == UsfmUpdateBlockElementType . EMBED :
6884 ignored_elements .append (element )
69- elif element .type in [ScriptureUpdateElementType .PARAGRAPH , ScriptureUpdateElementType .STYLE ]:
85+ elif element .type in [UsfmUpdateBlockElementType .PARAGRAPH , UsfmUpdateBlockElementType .STYLE ]:
7086 to_place .append (element )
7187 src_marker_idxs .append (len (src_sent ))
7288
73- src_toks = self ._pt_info [str (block .ref )]["source_toks" ]
74- trg_toks = self ._pt_info [str (block .ref )]["translation_toks" ]
89+ src_toks = self ._pt_info [str (block .refs [ 0 ] )]["source_toks" ]
90+ trg_toks = self ._pt_info [str (block .refs [ 0 ] )]["translation_toks" ]
7591
7692 # Don't do anything if the source sentence or pretranslation has changed
7793 if (
7894 list (t for t in TOKENIZER .tokenize (src_sent )) != src_toks
79- or list (t for t in TOKENIZER .tokenize (trg_sent )) != trg_toks # could just use translation for trg
95+ or list (t for t in TOKENIZER .tokenize (trg_sent )) != trg_toks
8096 ):
8197 return block
8298
@@ -98,7 +114,7 @@ def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock:
98114 if i == 0 :
99115 adj_src_toks .append (i )
100116
101- alignment = to_word_alignment_matrix (self ._pt_info [str (block .ref )]["alignment" ])
117+ alignment = to_word_alignment_matrix (self ._pt_info [str (block .refs [ 0 ] )]["alignment" ])
102118 adj_trg_toks = [
103119 self ._predict_marker_location (alignment , adj_src_tok , src_toks , trg_toks ) for adj_src_tok in adj_src_toks
104120 ]
@@ -115,26 +131,39 @@ def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock:
115131 to_insert .insert (insert_pos , (trg_str_idx , element ))
116132
117133 # Construct new text tokens to put between markers
118- placed_elements .append (
119- ScriptureUpdateElement (
120- ScriptureUpdateElementType .INSERTED_TEXT ,
121- [UsfmToken (UsfmTokenType .TEXT , text = trg_sent [: to_insert [0 ][0 ]] if len (to_insert ) > 0 else trg_sent )],
134+ # and reincorporate headers and empty end-of-verse paragraph markers
135+ if len (to_insert ) == 0 or to_insert [0 ][0 ] > 0 :
136+ placed_elements .append (
137+ UsfmUpdateBlockElement (
138+ UsfmUpdateBlockElementType .TEXT ,
139+ [
140+ UsfmToken (
141+ UsfmTokenType .TEXT , text = trg_sent [: to_insert [0 ][0 ]] if len (to_insert ) > 0 else trg_sent
142+ )
143+ ],
144+ )
122145 )
123- )
124146 for j , (insert_idx , element ) in enumerate (to_insert ):
147+ if element .type == UsfmUpdateBlockElementType .PARAGRAPH :
148+ while len (header_elements ) > 0 and header_elements [0 ][0 ] == para_markers_left :
149+ placed_elements += header_elements .pop (0 )[1 ]
150+ para_markers_left -= 1
151+
125152 placed_elements .append (element )
126153 text_token = UsfmToken (
127154 UsfmTokenType .TEXT ,
128155 text = (trg_sent [insert_idx : to_insert [j + 1 ][0 ]] if j + 1 < len (to_insert ) else trg_sent [insert_idx :]),
129156 )
130- placed_elements .append (
131- ScriptureUpdateElement (
132- ScriptureUpdateElementType .INSERTED_TEXT ,
133- [text_token ],
134- )
135- )
157+ placed_elements .append (UsfmUpdateBlockElement (UsfmUpdateBlockElementType .TEXT , [text_token ]))
158+ for element in end_elements :
159+ while len (header_elements ) > 0 and header_elements [0 ][0 ] == para_markers_left :
160+ placed_elements += header_elements .pop (0 )[1 ]
161+ para_markers_left -= 1
162+ placed_elements .append (element )
163+ while len (header_elements ) > 0 :
164+ placed_elements += header_elements .pop (0 )[1 ]
136165
137- block ._elements = placed_elements + end_elements + ignored_elements
166+ block ._elements = placed_elements + ignored_elements
138167 return block
139168
140169 def _predict_marker_location (
0 commit comments