Skip to content

Commit 41e1810

Browse files
committed
Some tests pass
1 parent a3d7276 commit 41e1810

File tree

5 files changed

+178
-35
lines changed

5 files changed

+178
-35
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from __future__ import annotations
2+
3+
from .scripture_ref import ScriptureRef
4+
from .scripture_update_element import ScriptureUpdateElement, ScriptureUpdateElementType
5+
from .usfm_token import UsfmToken, UsfmTokenType
6+
7+
8+
class ScriptureUpdateBlock:
    """Ordered list of update elements collected under one scripture reference.

    Elements are appended through the ``add_*`` methods and flattened back
    into a token list with :meth:`get_tokens`.
    """

    def __init__(self) -> None:
        # Reference the block currently belongs to; replaced via update_ref().
        self._ref: ScriptureRef = ScriptureRef()
        # Elements in the order they were added.
        self._elements: list[ScriptureUpdateElement] = []

    def add_existing_text(self, token: UsfmToken, marked_for_removal: bool = False) -> None:
        """Append ``token`` as a single-token EXISTING_TEXT element."""
        element = ScriptureUpdateElement(ScriptureUpdateElementType.EXISTING_TEXT, [token], marked_for_removal)
        self._elements.append(element)

    def add_inserted_text(self, tokens: list[UsfmToken]) -> None:
        """Append a copy of ``tokens`` as one INSERTED_TEXT element."""
        # Copy so later mutation of the caller's list does not affect the block.
        element = ScriptureUpdateElement(ScriptureUpdateElementType.INSERTED_TEXT, tokens.copy())
        self._elements.append(element)

    def add_token(self, token: UsfmToken, marked_for_removal: bool = False) -> None:
        """Append ``token`` as EXISTING_TEXT if it is a TEXT token, otherwise as OTHER."""
        element_type = (
            ScriptureUpdateElementType.EXISTING_TEXT
            if token.type == UsfmTokenType.TEXT
            else ScriptureUpdateElementType.OTHER
        )
        self._elements.append(ScriptureUpdateElement(element_type, [token], marked_for_removal))

    def add_tokens(self, tokens: list[UsfmToken], marked_for_removal: bool = False) -> None:
        """Append a copy of ``tokens`` as one OTHER element; an empty list is ignored."""
        if not tokens:
            return
        element = ScriptureUpdateElement(ScriptureUpdateElementType.OTHER, tokens.copy(), marked_for_removal)
        self._elements.append(element)

    def update_ref(self, ref: ScriptureRef) -> None:
        """Replace the reference associated with this block."""
        self._ref = ref

    def clear(self) -> None:
        """Drop all elements and reset the reference to a fresh ``ScriptureRef``."""
        self._elements.clear()
        self._ref = ScriptureRef()

    def get_tokens(self) -> list[UsfmToken]:
        """Return the tokens of every element, concatenated in element order."""
        collected: list[UsfmToken] = []
        for element in self._elements:
            collected.extend(element.get_tokens())
        return collected
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from __future__ import annotations
2+
3+
from .scripture_update_block import ScriptureUpdateBlock
4+
5+
6+
class ScriptureUpdateBlockHandlerBase:
    """Base class for handlers that transform a ``ScriptureUpdateBlock``."""

    def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock:
        """Return the processed block; subclasses must override this method.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        message = "Must be implemented in subclass"
        raise NotImplementedError(message)
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from __future__ import annotations
2+
3+
from .scripture_update_block import ScriptureUpdateBlock
4+
from .scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase
5+
from .scripture_update_element import ScriptureUpdateElementType
6+
7+
8+
class ScriptureUpdateBlockHandlerFirstElementsFirst(ScriptureUpdateBlockHandlerBase):
    """Handler that moves leading non-text elements ahead of inserted text."""

    def process_block(self, block: ScriptureUpdateBlock) -> ScriptureUpdateBlock:
        """Reorder ``block`` in place and return it.

        Elements are scanned from the start until the first EXISTING_TEXT
        element. Any element in that prefix that is not INSERTED_TEXT and is
        not already at the insertion point is moved to the insertion point,
        which then advances by one.
        """
        # If a paragraph, embed or style element occurs before existing text, move it before inserted text as well.
        elements = block._elements
        current_insert_index = 0
        for current_index in range(len(elements)):
            element = elements[current_index]
            if element.type == ScriptureUpdateElementType.EXISTING_TEXT:
                # We found existing text, so we stop looking for elements to move.
                break
            if current_index != current_insert_index and element.type != ScriptureUpdateElementType.INSERTED_TEXT:
                # Move by position rather than with list.remove(): remove() matches by
                # equality, so a value-equal element earlier in the list would be removed
                # instead of this one, dropping it and duplicating the current element.
                elements.insert(current_insert_index, elements.pop(current_index))
                current_insert_index += 1
                # NOTE(review): the insertion point only advances after a move, so a
                # non-inserted element already sitting at the insertion point ends up
                # after elements moved later. Confirm whether that ordering is intended.

        return block
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
from enum import Enum, auto
5+
6+
from .usfm_token import UsfmToken
7+
8+
9+
class ScriptureUpdateElementType(Enum):
    """Kinds of element that a scripture update block can hold."""

    # Text token that was already present in the source.
    EXISTING_TEXT = auto()
    # Replacement text supplied by the update.
    INSERTED_TEXT = auto()
    # Any other group of tokens.
    OTHER = auto()
13+
14+
15+
@dataclass
class ScriptureUpdateElement:
    """A typed group of tokens that can be flagged for removal."""

    # Category of this element.
    type: ScriptureUpdateElementType
    # Tokens carried by this element.
    tokens: list[UsfmToken]
    # When True, get_tokens() yields nothing for this element.
    marked_for_removal: bool = False

    def get_tokens(self) -> list[UsfmToken]:
        """Return this element's tokens, or an empty list if it is marked for removal."""
        return [] if self.marked_for_removal else self.tokens

machine/corpora/update_usfm_parser_handler.py

Lines changed: 77 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
from enum import Enum, auto
22
from typing import List, Optional, Sequence, Tuple, Union
33

4+
from ..scripture.verse_ref import VerseRef
45
from .scripture_ref import ScriptureRef
56
from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler
7+
from .scripture_update_block import ScriptureUpdateBlock
8+
from .scripture_update_block_handler_base import ScriptureUpdateBlockHandlerBase
9+
from .scripture_update_block_handler_first_elements_first import ScriptureUpdateBlockHandlerFirstElementsFirst
610
from .usfm_parser_state import UsfmParserState
711
from .usfm_stylesheet import UsfmStylesheet
812
from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType
@@ -31,13 +35,20 @@ def __init__(
3135
embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
3236
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
3337
preserve_paragraph_styles: Optional[Sequence[str]] = None,
38+
update_block_handlers: Optional[list[ScriptureUpdateBlockHandlerBase]] = None,
3439
) -> None:
3540
super().__init__()
3641
self._rows = rows or []
3742
self._tokens: List[UsfmToken] = []
38-
self._new_tokens: List[UsfmToken] = []
39-
self._new_embed_tokens: List[UsfmToken] = []
43+
self._updated_text: List[UsfmToken] = []
44+
self._updated_embed_text: List[UsfmToken] = []
45+
self._update_block: ScriptureUpdateBlock = ScriptureUpdateBlock()
46+
self._embed_update_block: ScriptureUpdateBlock = ScriptureUpdateBlock()
4047
self._id_text = id_text
48+
if update_block_handlers is None:
49+
self._update_block_handlers = [ScriptureUpdateBlockHandlerFirstElementsFirst()]
50+
else:
51+
self._update_block_handlers = update_block_handlers
4152
if preserve_paragraph_styles is None:
4253
self._preserve_paragraph_styles = set(["r", "rem"])
4354
elif isinstance(preserve_paragraph_styles, str):
@@ -60,21 +71,20 @@ def tokens(self) -> List[UsfmToken]:
6071

6172
def end_usfm(self, state: UsfmParserState) -> None:
6273
self._collect_tokens(state)
63-
74+
self._process_update_block()
6475
super().end_usfm(state)
6576

6677
def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
6778
self._collect_tokens(state)
6879
start_book_tokens: List[UsfmToken] = []
6980
if self._id_text is not None:
7081
start_book_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=self._id_text + " "))
71-
self._push_new_tokens(start_book_tokens)
82+
self._update_block.add_tokens(start_book_tokens)
7283

7384
super().start_book(state, marker, code)
7485

7586
def end_book(self, state: UsfmParserState, marker: str) -> None:
76-
self._pop_new_tokens()
77-
87+
self._process_update_block()
7888
super().end_book(state, marker)
7989

8090
def start_para(
@@ -99,6 +109,7 @@ def start_para(
99109
super().start_para(state, marker, unknown, attributes)
100110

101111
def end_para(self, state: UsfmParserState, marker: str) -> None:
112+
self._process_update_block()
102113
super().end_para(state, marker)
103114
self._in_preserved_paragraph = False
104115

@@ -114,7 +125,7 @@ def start_cell(self, state: UsfmParserState, marker: str, align: str, colspan: i
114125

115126
def end_cell(self, state: UsfmParserState, marker: str) -> None:
116127
self._collect_tokens(state)
117-
128+
self._process_update_block()
118129
super().end_cell(state, marker)
119130

120131
def start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> None:
@@ -125,6 +136,7 @@ def start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> N
125136
def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None:
126137
if closed:
127138
self._collect_tokens(state)
139+
self._process_update_block()
128140

129141
super().end_sidebar(state, marker, closed)
130142

@@ -137,6 +149,7 @@ def chapter(
137149
pub_number: str,
138150
) -> None:
139151
self._collect_tokens(state)
152+
self._process_update_block()
140153

141154
super().chapter(state, number, marker, alt_number, pub_number)
142155

@@ -148,6 +161,7 @@ def milestone(
148161
attributes: Sequence[UsfmAttribute],
149162
) -> None:
150163
self._collect_tokens(state)
164+
self._process_update_block()
151165

152166
super().milestone(state, marker, start_milestone, attributes)
153167

@@ -160,6 +174,7 @@ def verse(
160174
pub_number: str,
161175
) -> None:
162176
self._collect_tokens(state)
177+
self._process_update_block()
163178

164179
super().verse(state, number, marker, alt_number, pub_number)
165180

@@ -196,6 +211,7 @@ def _start_embed(
196211
state: UsfmParserState,
197212
scripture_ref: ScriptureRef,
198213
) -> None:
214+
self._embed_update_block.update_ref(scripture_ref)
199215
self._embed_row_texts = self._advance_rows([scripture_ref])
200216
self._embed_updated = any(self._embed_row_texts)
201217

@@ -212,6 +228,7 @@ def _end_embed(
212228
else:
213229
self._collect_tokens(state)
214230

231+
self._process_embed_update_block()
215232
self._embed_row_texts.clear()
216233
self._embed_updated = False
217234

@@ -251,20 +268,20 @@ def unmatched(self, state: UsfmParserState, marker: str) -> None:
251268

252269
def _start_verse_text(self, state: UsfmParserState, scripture_refs: Sequence[ScriptureRef]) -> None:
253270
row_texts: List[str] = self._advance_rows(scripture_refs)
254-
self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts])
271+
self._push_updated_text([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts])
255272

256273
def _end_verse_text(self, state: UsfmParserState, scripture_refs: Sequence[ScriptureRef]) -> None:
257274
self._pop_new_tokens()
258275

259276
def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
260277
row_texts = self._advance_rows([scripture_ref])
261-
self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts])
278+
self._push_updated_text([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts])
262279

263280
def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
264281
self._pop_new_tokens()
265282

266283
def _start_note_text(self, state: UsfmParserState) -> None:
267-
self._push_new_embed_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in self._embed_row_texts])
284+
self._push_updated_embed_text([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in self._embed_row_texts])
268285

269286
def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
270287
self._embed_row_texts.clear()
@@ -301,13 +318,15 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> List[str]:
301318
return row_texts
302319

303320
def _collect_tokens(self, state: UsfmParserState) -> None:
304-
self._tokens.extend(self._new_tokens)
305-
self._new_tokens.clear()
321+
self._use_updated_text()
306322
while self._token_index <= state.index + state.special_token_count:
307-
self._tokens.append(state.tokens[self._token_index])
323+
self._update_block.add_token(state.tokens[self._token_index])
308324
self._token_index += 1
309325

310326
def _skip_tokens(self, state: UsfmParserState) -> None:
327+
while self._token_index <= state.index + state.special_token_count:
328+
self._update_block.add_token(state.tokens[self._token_index], marked_for_removal=True)
329+
self._token_index += 1
311330
self._token_index = state.index + 1 + state.special_token_count
312331

313332
def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True) -> bool:
@@ -343,24 +362,24 @@ def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True)
343362

344363
if use_new_tokens:
345364
if in_embed:
346-
self._add_new_embed_tokens()
365+
self._use_updated_embed_text()
347366
else:
348-
self._add_new_tokens()
367+
self._use_updated_text()
349368

350369
if existing_text and (
351370
self._text_behavior == UpdateUsfmTextBehavior.PREFER_EXISTING or self._is_in_preserved_paragraph(marker)
352371
):
353372
if in_embed:
354-
self._clear_new_embed_tokens()
373+
self._clear_updated_embed_text()
355374
else:
356-
self._clear_new_tokens()
375+
self._clear_updated_text()
357376

358377
embed_in_new_verse_text = (
359378
any(self._replace_stack) or self._text_behavior == UpdateUsfmTextBehavior.STRIP_EXISTING
360379
) and in_embed
361380
if embed_in_new_verse_text or self._embed_updated:
362381
if self._embed_behavior == UpdateUsfmMarkerBehavior.STRIP:
363-
self._clear_new_embed_tokens()
382+
self._clear_updated_embed_text()
364383
return True
365384
if not self._is_in_note_text() or in_nested_embed:
366385
return False
@@ -375,33 +394,56 @@ def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True)
375394
def _has_new_text(self) -> bool:
376395
return any(self._replace_stack) and self._replace_stack[-1]
377396

378-
def _push_new_tokens(self, tokens: List[UsfmToken]) -> None:
397+
def _update_verse_ref(self, verse_ref: VerseRef, marker: str) -> None:
    """Run the parent update, then point the update block at a copy of ``verse_ref``."""
    super()._update_verse_ref(verse_ref, marker)
    # Wrap a copy so the block's reference is a separate object from verse_ref.
    block_ref = ScriptureRef(verse_ref.copy())
    self._update_block.update_ref(block_ref)
400+
401+
def _create_non_verse_ref(self) -> ScriptureRef:
    """Create the non-verse reference via the parent and record it on the update block."""
    non_verse_ref = super()._create_non_verse_ref()
    self._update_block.update_ref(non_verse_ref)
    return non_verse_ref
405+
406+
def _process_update_block(self) -> None:
    """Flush pending updated text, run the handlers, and emit the block's tokens.

    Each handler's result replaces the current update block. The resulting
    tokens are appended to ``self._tokens`` and the block is then cleared.
    """
    self._use_updated_text()
    for block_handler in self._update_block_handlers:
        self._update_block = block_handler.process_block(self._update_block)
    processed = self._update_block
    self._tokens.extend(processed.get_tokens())
    processed.clear()
412+
413+
def _process_embed_update_block(self) -> None:
    """Flush pending embed text, run the handlers, and fold the embed block into the main block.

    The embed block's resulting tokens are added to ``self._update_block``
    and the embed block is then cleared.
    """
    self._use_updated_embed_text()
    for block_handler in self._update_block_handlers:
        self._embed_update_block = block_handler.process_block(self._embed_update_block)
    processed = self._embed_update_block
    self._update_block.add_tokens(processed.get_tokens())
    processed.clear()
419+
420+
def _push_updated_text(self, tokens: List[UsfmToken]) -> None:
379421
self._replace_stack.append(any(tokens))
380422
if tokens:
381-
self._new_tokens.extend(tokens)
423+
self._updated_text.extend(tokens)
382424

383-
def _add_new_tokens(self) -> None:
384-
if self._new_tokens:
385-
self._tokens.extend(self._new_tokens)
386-
self._new_tokens.clear()
425+
def _use_updated_text(self) -> None:
    """Move any pending updated text into the update block as inserted text."""
    pending = self._updated_text
    if not pending:
        return
    self._update_block.add_inserted_text(pending)
    pending.clear()
387429

388-
def _clear_new_tokens(self) -> None:
389-
self._new_tokens.clear()
430+
def _clear_updated_text(self) -> None:
    """Discard any pending updated text without adding it to the update block."""
    self._updated_text.clear()
390432

391-
def _push_new_embed_tokens(self, tokens: List[UsfmToken]) -> None:
433+
def _push_updated_embed_text(self, tokens: List[UsfmToken]) -> None:
392434
self._replace_stack.append(any(tokens))
393435
if tokens:
394-
self._new_embed_tokens.extend(tokens)
436+
self._updated_embed_text.extend(tokens)
395437

396-
def _add_new_embed_tokens(self) -> None:
397-
if self._new_embed_tokens:
398-
self._tokens.extend(self._new_embed_tokens)
399-
self._new_embed_tokens.clear()
438+
def _use_updated_embed_text(self) -> None:
    """Move any pending updated embed text into the embed update block as inserted text."""
    pending = self._updated_embed_text
    if not pending:
        return
    self._embed_update_block.add_inserted_text(pending)
    pending.clear()
400442

401-
def _clear_new_embed_tokens(self) -> None:
402-
self._new_embed_tokens.clear()
443+
def _clear_updated_embed_text(self) -> None:
    """Discard any pending updated embed text without adding it to the embed update block."""
    self._updated_embed_text.clear()
403445

404-
def _push_token_as_previous(self) -> None:
446+
def _push_updated_text_as_previous(self) -> None:
405447
self._replace_stack.append(self._replace_stack[-1])
406448

407449
def _pop_new_tokens(self) -> None:

0 commit comments

Comments
 (0)