Skip to content

Commit bc855bd

Browse files
isaac091 authored and johnml1135 committed
Add tests for beginning-of-verse embeds
Fix the embed-at-beginning issue. Add paragraph marker control. Correct behavior for stripping text.
1 parent ec71305 commit bc855bd

File tree

4 files changed

+229
-26
lines changed

4 files changed

+229
-26
lines changed

machine/corpora/paratext_project_text_updater_base.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ def update_usfm(
2222
rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
2323
full_name: Optional[str] = None,
2424
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
25+
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
2526
embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
2627
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
2728
) -> Optional[str]:
@@ -31,7 +32,12 @@ def update_usfm(
3132
with self._open(file_name) as sfm_file:
3233
usfm: str = sfm_file.read().decode(self._settings.encoding)
3334
handler = UpdateUsfmParserHandler(
34-
rows, None if full_name is None else f"- {full_name}", text_behavior, embed_behavior, style_behavior
35+
rows,
36+
None if full_name is None else f"- {full_name}",
37+
text_behavior,
38+
paragraph_behavior,
39+
embed_behavior,
40+
style_behavior,
3541
)
3642
try:
3743
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)

machine/corpora/scripture_ref_usfm_parser_handler.py

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ class ScriptureTextType(Enum):
1919
NOTE_TEXT = auto()
2020

2121

22+
PRESERVE_PARAGRAPH_STYLES = ("r", "rem")
2223
EMBED_PART_START_CHAR_STYLES = ("f", "x", "z")
2324
EMBED_STYLES = ("f", "fe", "fig", "fm", "x")
2425

@@ -29,6 +30,7 @@ def __init__(self) -> None:
2930
self._cur_elements_stack: List[ScriptureElement] = []
3031
self._cur_text_type_stack: List[ScriptureTextType] = []
3132
self._duplicate_verse: bool = False
33+
self._in_preserved_paragraph: bool = False
3234
self._in_embed: bool = False
3335
self._in_note_text: bool = False
3436
self._in_nested_embed: bool = False
@@ -74,13 +76,16 @@ def start_para(
7476
unknown: Optional[bool],
7577
attributes: Optional[Sequence[UsfmAttribute]],
7678
) -> None:
79+
if self._is_preserve_paragraph_type(marker):
80+
self._in_preserved_paragraph = True
7781
if self._cur_verse_ref.is_default:
7882
self._update_verse_ref(state.verse_ref, marker)
7983
if not state.is_verse_text:
8084
self._start_parent_element(marker)
8185
self._start_non_verse_text_wrapper(state)
8286

8387
def end_para(self, state: UsfmParserState, marker: str) -> None:
88+
self._in_preserved_paragraph = False
8489
if self._current_text_type == ScriptureTextType.NONVERSE:
8590
self._end_parent_element()
8691
self._end_non_verse_text_wrapper(state)
@@ -270,9 +275,12 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None
270275
def _is_in_embed(self, marker: Optional[str]) -> bool:
271276
return self._in_embed or self._is_embed_style(marker)
272277

278+
def _is_in_preserved_paragraph(self, marker: Optional[str]) -> bool:
279+
return self._in_preserved_paragraph or self._is_preserve_paragraph_type(marker)
280+
273281
def _is_in_nested_embed(self, marker: Optional[str]) -> bool:
274282
return self._in_nested_embed or (
275-
marker is not None and marker[0] == "+" and marker[1] in EMBED_PART_START_CHAR_STYLES
283+
marker is not None and marker.startswith("+") and marker[1] in EMBED_PART_START_CHAR_STYLES
276284
)
277285

278286
def _is_note_text(self, marker: Optional[str]) -> bool:
@@ -282,4 +290,7 @@ def _is_embed_part_style(self, marker: Optional[str]) -> bool:
282290
return marker is not None and marker.startswith(EMBED_PART_START_CHAR_STYLES)
283291

284292
def _is_embed_style(self, marker: Optional[str]) -> bool:
285-
return marker in EMBED_STYLES
293+
return marker is not None and marker.strip("*") in EMBED_STYLES
294+
295+
def _is_preserve_paragraph_type(self, marker: Optional[str]) -> bool:
296+
return marker in PRESERVE_PARAGRAPH_STYLES

machine/corpora/update_usfm_parser_handler.py

Lines changed: 52 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -27,15 +27,18 @@ def __init__(
2727
rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
2828
id_text: Optional[str] = None,
2929
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
30+
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
3031
embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
3132
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
3233
) -> None:
3334
super().__init__()
3435
self._rows = rows or []
3536
self._tokens: List[UsfmToken] = []
3637
self._new_tokens: List[UsfmToken] = []
38+
self._new_embed_tokens: List[UsfmToken] = []
3739
self._id_text = id_text
3840
self._text_behavior = text_behavior
41+
self._paragraph_behavior = paragraph_behavior
3942
self._embed_behavior = embed_behavior
4043
self._style_behavior = style_behavior
4144
self._replace_stack: List[bool] = []
@@ -74,7 +77,14 @@ def start_para(
7477
unknown: bool,
7578
attributes: Optional[Sequence[UsfmAttribute]],
7679
) -> None:
77-
self._collect_tokens(state)
80+
if (
81+
state.verse_ref.verse_num != 0
82+
and (self._has_new_text() or self._text_behavior == UpdateUsfmTextBehavior.STRIP_EXISTING)
83+
and self._paragraph_behavior == UpdateUsfmMarkerBehavior.STRIP
84+
):
85+
self._skip_tokens(state)
86+
else:
87+
self._collect_tokens(state)
7888

7989
super().start_para(state, marker, unknown, attributes)
8090

@@ -202,13 +212,13 @@ def ref(self, state: UsfmParserState, marker: str, display: str, target: str) ->
202212
super().ref(state, marker, display, target)
203213

204214
def text(self, state: UsfmParserState, text: str) -> None:
215+
super().text(state, text)
216+
205217
if self._replace_with_new_tokens(state):
206218
self._skip_tokens(state)
207219
else:
208220
self._collect_tokens(state)
209221

210-
super().text(state, text)
211-
212222
def opt_break(self, state: UsfmParserState) -> None:
213223
if self._replace_with_new_tokens(state):
214224
self._skip_tokens(state)
@@ -240,7 +250,7 @@ def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRe
240250
self._pop_new_tokens()
241251

242252
def _start_note_text(self, state: UsfmParserState) -> None:
243-
self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in self._embed_row_texts])
253+
self._push_new_embed_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in self._embed_row_texts])
244254

245255
def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
246256
self._embed_row_texts.clear()
@@ -287,13 +297,9 @@ def _skip_tokens(self, state: UsfmParserState) -> None:
287297
self._token_index = state.index + 1 + state.special_token_count
288298

289299
def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True) -> bool:
290-
if self._text_behavior == UpdateUsfmTextBehavior.STRIP_EXISTING:
291-
self._add_new_tokens()
292-
return True
293-
294-
new_text: bool = bool(self._replace_stack) and self._replace_stack[-1]
295300
marker: Optional[str] = state.token if state.token is None else state.token.marker
296301
in_embed: bool = self._is_in_embed(marker)
302+
297303
in_nested_embed: bool = self._is_in_nested_embed(marker)
298304
is_style_tag: bool = marker is not None and not self._is_embed_part_style(marker)
299305

@@ -303,8 +309,14 @@ def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True)
303309
)
304310

305311
use_new_tokens = (
306-
new_text
307-
and (not existing_text or self._text_behavior == UpdateUsfmTextBehavior.PREFER_NEW)
312+
(
313+
(self._text_behavior == UpdateUsfmTextBehavior.STRIP_EXISTING)
314+
or (
315+
self._has_new_text()
316+
and (not existing_text or self._text_behavior == UpdateUsfmTextBehavior.PREFER_NEW)
317+
)
318+
)
319+
and not self._is_in_preserved_paragraph(marker)
308320
and (
309321
not in_embed
310322
or (
@@ -316,26 +328,37 @@ def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True)
316328
)
317329

318330
if use_new_tokens:
319-
self._add_new_tokens()
331+
if in_embed:
332+
self._add_new_embed_tokens()
333+
else:
334+
self._add_new_tokens()
320335

321336
if existing_text and self._text_behavior == UpdateUsfmTextBehavior.PREFER_EXISTING:
322-
self._clear_new_tokens()
337+
if in_embed:
338+
self._clear_new_embed_tokens()
339+
else:
340+
self._clear_new_tokens()
323341

324-
embed_in_new_verse_text = any(self._replace_stack) and in_embed
342+
embed_in_new_verse_text = (
343+
any(self._replace_stack) or self._text_behavior == UpdateUsfmTextBehavior.STRIP_EXISTING
344+
) and in_embed
325345
if embed_in_new_verse_text or self._embed_updated:
326346
if self._embed_behavior == UpdateUsfmMarkerBehavior.STRIP:
327-
self._clear_new_tokens()
347+
self._clear_new_embed_tokens()
328348
return True
329349
if not self._is_in_note_text() or in_nested_embed:
330350
return False
331351

332352
skip_tokens = use_new_tokens and closed
333353

334-
if new_text and is_style_tag:
354+
if use_new_tokens and is_style_tag:
335355
skip_tokens = self._style_behavior == UpdateUsfmMarkerBehavior.STRIP
336356

337357
return skip_tokens
338358

359+
def _has_new_text(self) -> bool:
360+
return bool(self._replace_stack) and self._replace_stack[-1]
361+
339362
def _push_new_tokens(self, tokens: List[UsfmToken]) -> None:
340363
self._replace_stack.append(any(tokens))
341364
if tokens:
@@ -349,6 +372,19 @@ def _add_new_tokens(self) -> None:
349372
def _clear_new_tokens(self) -> None:
350373
self._new_tokens.clear()
351374

375+
def _push_new_embed_tokens(self, tokens: List[UsfmToken]) -> None:
376+
self._replace_stack.append(any(tokens))
377+
if tokens:
378+
self._new_embed_tokens.extend(tokens)
379+
380+
def _add_new_embed_tokens(self) -> None:
381+
if self._new_embed_tokens:
382+
self._tokens.extend(self._new_embed_tokens)
383+
self._new_embed_tokens.clear()
384+
385+
def _clear_new_embed_tokens(self) -> None:
386+
self._new_embed_tokens.clear()
387+
352388
def _push_token_as_previous(self) -> None:
353389
self._replace_stack.append(self._replace_stack[-1])
354390

0 commit comments

Comments
 (0)