Skip to content

Commit ba783fb

Browse files
committed
Move "untranslatable" to Serval/SILNLP, or at least remove it from machine.py
Updated logic: * Add "nested" embeds. * If we strip embeds and there is an updated embed, strip it. All tests pass.
1 parent d4c7802 commit ba783fb

File tree

10 files changed

+404
-143
lines changed

10 files changed

+404
-143
lines changed

machine/corpora/scripture_ref_usfm_parser_handler.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ class ScriptureTextType(Enum):
1919
NOTE_TEXT = auto()
2020

2121

22+
EMBED_STARTING_CHARS = ("f", "x", "z")
23+
24+
2225
class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC):
2326
def __init__(self) -> None:
2427
self._cur_verse_ref: VerseRef = VerseRef()
@@ -27,6 +30,7 @@ def __init__(self) -> None:
2730
self._duplicate_verse: bool = False
2831
self._in_embed: bool = False
2932
self._in_note_text: bool = False
33+
self._in_nested_embed: bool = False
3034

3135
@property
3236
def _current_text_type(self) -> ScriptureTextType:
@@ -112,21 +116,25 @@ def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None
112116

113117
def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None:
114118
self._in_embed = True
115-
self._start_embed(state, marker, caller, category)
119+
self._start_embed_wrapper(state, marker)
116120

117121
def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None:
118122
self._end_note_text_wrapper(state)
119123
self._end_embed(state, marker, None, closed)
120124
self._in_embed = False
121125

122-
def _start_embed(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None:
126+
def _start_embed_wrapper(self, state: UsfmParserState, marker: str) -> None:
123127
if self._cur_verse_ref.is_default:
124128
self._update_verse_ref(state.verse_ref, marker)
125129

126130
if not self._duplicate_verse:
127131
self._check_convert_verse_para_to_non_verse(state)
128132
self._next_element(marker)
129133

134+
self._start_embed(state, self._create_non_verse_ref())
135+
136+
def _start_embed(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
137+
130138
def _end_embed(
131139
self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool
132140
) -> None:
@@ -143,21 +151,26 @@ def opt_break(self, state: UsfmParserState) -> None:
143151
def start_char(
144152
self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]]
145153
) -> None:
146-
if self._is_embed_part(marker):
147-
self._end_note_text_wrapper(state)
154+
if self._is_embed_part(marker) and self._in_note_text:
155+
self._in_nested_embed = True
148156
# if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment
149157
self._check_convert_verse_para_to_non_verse(state)
150158

151159
if self._is_embed_character(marker):
152160
self._in_embed = True
153-
self._start_embed(state, marker, "", None)
161+
self._start_embed_wrapper(state, marker)
154162

155163
if self._is_note_text(marker):
156164
self._start_note_text_wrapper(state)
157165

158166
def end_char(
159167
self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool
160168
) -> None:
169+
if self._is_embed_part(marker):
170+
if self._in_nested_embed:
171+
self._in_nested_embed = False
172+
else:
173+
self._end_note_text_wrapper(state)
161174
if self._is_embed_character(marker):
162175
self._end_embed(state, marker, attributes, closed)
163176
self._in_embed = False
@@ -173,9 +186,9 @@ def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRe
173186
def _start_note_text_wrapper(self, state: UsfmParserState):
174187
self._in_note_text = True
175188
self._cur_text_type_stack.append(ScriptureTextType.NOTE_TEXT)
176-
self._start_note_text(state, self._create_non_verse_ref())
189+
self._start_note_text(state)
177190

178-
def _start_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
191+
def _start_note_text(self, state: UsfmParserState) -> None: ...
179192

180193
def _end_note_text_wrapper(self, state: UsfmParserState):
181194
if self._cur_text_type_stack and self._cur_text_type_stack[-1] == ScriptureTextType.NOTE_TEXT:
@@ -256,11 +269,14 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None
256269
def _is_in_embed(self, marker: Optional[str]) -> bool:
257270
return self._in_embed or self._is_embed_character(marker)
258271

272+
def _is_in_nested_embed(self, marker: Optional[str]) -> bool:
273+
return self._in_nested_embed or (marker is not None and marker[0] == "+" and marker[1] in EMBED_STARTING_CHARS)
274+
259275
def _is_note_text(self, marker: Optional[str]) -> bool:
260276
return marker == "ft"
261277

262278
def _is_embed_part(self, marker: Optional[str]) -> bool:
263-
return marker is not None and marker.startswith(("f", "x", "z"))
279+
return marker is not None and marker.startswith(EMBED_STARTING_CHARS)
264280

265281
def _is_embed_character(self, marker: Optional[str]) -> bool:
266282
return marker in ("f", "fe", "fig", "fm", "x")

machine/corpora/update_usfm_parser_handler.py

Lines changed: 31 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@ class UpdateUsfmMarkerBehavior(Enum):
2222

2323
class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler):
2424

25-
_untranslatable_paragraph_tags = ("r", "rem")
26-
2725
def __init__(
2826
self,
2927
rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
@@ -43,6 +41,8 @@ def __init__(
4341
self._replace_stack: List[bool] = []
4442
self._row_index: int = 0
4543
self._token_index: int = 0
44+
self._embed_updated: bool = False
45+
self._embed_row_texts: List[str] = []
4646

4747
@property
4848
def tokens(self) -> List[UsfmToken]:
@@ -170,17 +170,16 @@ def end_char(
170170
def _start_embed(
171171
self,
172172
state: UsfmParserState,
173-
marker: str,
174-
caller: str,
175-
category: str,
173+
scripture_ref: ScriptureRef,
176174
) -> None:
175+
self._embed_row_texts = self._advance_rows([scripture_ref])
176+
self._embed_updated = bool(self._embed_row_texts)
177+
177178
if self._replace_with_new_tokens(state):
178179
self._skip_tokens(state)
179180
else:
180181
self._collect_tokens(state)
181182

182-
super()._start_embed(state, marker, caller, category)
183-
184183
def _end_embed(
185184
self, state: UsfmParserState, marker: str, attributes: Sequence[UsfmAttribute], closed: bool
186185
) -> None:
@@ -189,6 +188,9 @@ def _end_embed(
189188
else:
190189
self._collect_tokens(state)
191190

191+
self._embed_row_texts.clear()
192+
self._embed_updated = False
193+
192194
super()._end_embed(state, marker, attributes, closed)
193195

194196
def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None:
@@ -237,11 +239,11 @@ def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: Scripture
237239
def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
238240
self._pop_new_tokens()
239241

240-
def _start_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
241-
row_texts = self._advance_rows([scripture_ref])
242-
self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts])
242+
def _start_note_text(self, state: UsfmParserState) -> None:
243+
self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in self._embed_row_texts])
243244

244245
def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
246+
self._embed_row_texts.clear()
245247
self._pop_new_tokens()
246248

247249
def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
@@ -285,47 +287,46 @@ def _skip_tokens(self, state: UsfmParserState) -> None:
285287
self._token_index = state.index + 1 + state.special_token_count
286288

287289
def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True) -> bool:
288-
untranslatable_paragraph: bool = state.para_tag is not None and self._is_untranslatable_paragraph(
289-
state.para_tag.marker
290-
)
291290
if self._text_behavior == UpdateUsfmTextBehavior.STRIP_EXISTING:
292-
if untranslatable_paragraph:
293-
self._clear_new_tokens()
294-
else:
295-
self._add_new_tokens()
291+
self._add_new_tokens()
296292
return True
297293

298294
new_text: bool = bool(self._replace_stack) and self._replace_stack[-1]
299-
in_embed: bool = state.token is not None and self._is_in_embed(state.token.marker)
300-
is_style_tag: bool = (
301-
state.token is not None and state.token.marker is not None and not self._is_embed_part(state.token.marker)
302-
)
295+
marker: Optional[str] = state.token if state.token is None else state.token.marker
296+
in_embed: bool = self._is_in_embed(marker)
297+
in_nested: bool = self._is_in_nested_embed(marker)
298+
is_style_tag: bool = marker is not None and not self._is_embed_part(marker)
303299

304300
existing_text = any(
305301
t.type == UsfmTokenType.TEXT and t.text
306302
for t in state.tokens[self._token_index : state.index + 1 + state.special_token_count]
307303
)
308304

309305
use_new_tokens = (
310-
not untranslatable_paragraph
311-
and new_text
306+
new_text
312307
and (not existing_text or self._text_behavior == UpdateUsfmTextBehavior.PREFER_NEW)
313-
and (not in_embed or self._is_in_note_text())
308+
and (
309+
not in_embed
310+
or (
311+
self._is_in_note_text()
312+
and not in_nested
313+
and self._embed_behavior == UpdateUsfmMarkerBehavior.PRESERVE
314+
)
315+
)
314316
)
315317

316318
if use_new_tokens:
317319
self._add_new_tokens()
318320

319-
if untranslatable_paragraph or (
320-
existing_text and self._text_behavior == UpdateUsfmTextBehavior.PREFER_EXISTING
321-
):
321+
if existing_text and self._text_behavior == UpdateUsfmTextBehavior.PREFER_EXISTING:
322322
self._clear_new_tokens()
323323

324-
within_new_text = any(self._replace_stack)
325-
if within_new_text and in_embed:
324+
embed_in_new_verse_text = any(self._replace_stack) and in_embed
325+
if embed_in_new_verse_text or self._embed_updated:
326326
if self._embed_behavior == UpdateUsfmMarkerBehavior.STRIP:
327+
self._clear_new_tokens()
327328
return True
328-
if not self._is_in_note_text():
329+
if not self._is_in_note_text() or in_nested:
329330
return False
330331

331332
skip_tokens = use_new_tokens and closed
@@ -353,6 +354,3 @@ def _push_token_as_previous(self) -> None:
353354

354355
def _pop_new_tokens(self) -> None:
355356
self._replace_stack.pop()
356-
357-
def _is_untranslatable_paragraph(self, marker: Optional[str]) -> bool:
358-
return marker in self._untranslatable_paragraph_tags

machine/corpora/usfm_text_base.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,11 @@ def text(self, state: UsfmParserState, text: str) -> None:
202202
text = text.lstrip()
203203
row_text += text
204204
elif len(text) > 0 and (self._current_text_type != ScriptureTextType.VERSE or state.is_verse_text):
205-
if state.token is not None and self._is_in_embed(state.token.marker) and not self._is_in_note_text():
205+
if (
206+
state.token is not None
207+
and self._is_in_embed(state.token.marker)
208+
and (not self._is_in_note_text() or self._is_in_nested_embed(state.token.marker))
209+
):
206210
return
207211

208212
if (
@@ -230,7 +234,7 @@ def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRe
230234
if self._text._include_all_text:
231235
self._rows.append(self._text._create_scripture_row(scripture_ref, text, self._sentence_start))
232236

233-
def _start_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
237+
def _start_note_text(self, state: UsfmParserState) -> None:
234238
if self._text._include_markers:
235239
return
236240
self._row_texts_stack.append("")

poetry.lock

Lines changed: 20 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ pytest-cov = "^4.1.0"
8383
ipykernel = "^6.7.0"
8484
jupyter = "^1.0.0"
8585
pandas = "^2.0.3"
86-
pyright = "^1.1.362"
86+
pyright = { extras = ["nodejs"], version = "^1.1.362" }
8787
decoy = "^2.1.0"
8888
pep8-naming = "^0.14.1"
8989

0 commit comments

Comments
 (0)