sillsdev
diff --git a/machine/corpora/scripture_ref_usfm_parser_handler.py b/machine/corpora/scripture_ref_usfm_parser_handler.py
Lines changed: 24 additions & 8 deletions
diff --git a/machine/corpora/update_usfm_parser_handler.py b/machine/corpora/update_usfm_parser_handler.py
Lines changed: 31 additions & 33 deletions
diff --git a/machine/corpora/usfm_text_base.py b/machine/corpora/usfm_text_base.py
Lines changed: 6 additions & 2 deletions
diff --git a/poetry.lock b/poetry.lock
Lines changed: 20 additions & 1 deletion
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 1 addition & 1 deletion
@@ -19,6 +19,9 @@ class ScriptureTextType(Enum):
     NOTE_TEXT = auto()
 
 
+EMBED_STARTING_CHARS = ("f", "x", "z")
+
+
 class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC):
     def __init__(self) -> None:
         self._cur_verse_ref: VerseRef = VerseRef()
@@ -27,6 +30,7 @@ def __init__(self) -> None:
         self._duplicate_verse: bool = False
         self._in_embed: bool = False
         self._in_note_text: bool = False
+        self._in_nested_embed: bool = False
 
     @property
     def _current_text_type(self) -> ScriptureTextType:
@@ -112,21 +116,25 @@ def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None
 
     def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None:
         self._in_embed = True
-        self._start_embed(state, marker, caller, category)
+        self._start_embed_wrapper(state, marker)
 
     def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None:
         self._end_note_text_wrapper(state)
         self._end_embed(state, marker, None, closed)
         self._in_embed = False
 
-    def _start_embed(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None:
+    def _start_embed_wrapper(self, state: UsfmParserState, marker: str) -> None:
         if self._cur_verse_ref.is_default:
             self._update_verse_ref(state.verse_ref, marker)
 
         if not self._duplicate_verse:
             self._check_convert_verse_para_to_non_verse(state)
             self._next_element(marker)
 
+        self._start_embed(state, self._create_non_verse_ref())
+
+    def _start_embed(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
+
     def _end_embed(
         self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool
     ) -> None:
@@ -143,21 +151,26 @@ def opt_break(self, state: UsfmParserState) -> None:
     def start_char(
         self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]]
     ) -> None:
-        if self._is_embed_part(marker):
-            self._end_note_text_wrapper(state)
+        if self._is_embed_part(marker) and self._in_note_text:
+            self._in_nested_embed = True
         # if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment
         self._check_convert_verse_para_to_non_verse(state)
 
         if self._is_embed_character(marker):
             self._in_embed = True
-            self._start_embed(state, marker, "", None)
+            self._start_embed_wrapper(state, marker)
 
         if self._is_note_text(marker):
             self._start_note_text_wrapper(state)
 
     def end_char(
         self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool
     ) -> None:
+        if self._is_embed_part(marker):
+            if self._in_nested_embed:
+                self._in_nested_embed = False
+            else:
+                self._end_note_text_wrapper(state)
         if self._is_embed_character(marker):
             self._end_embed(state, marker, attributes, closed)
             self._in_embed = False
@@ -173,9 +186,9 @@ def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRe
     def _start_note_text_wrapper(self, state: UsfmParserState):
         self._in_note_text = True
         self._cur_text_type_stack.append(ScriptureTextType.NOTE_TEXT)
-        self._start_note_text(state, self._create_non_verse_ref())
+        self._start_note_text(state)
 
-    def _start_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
+    def _start_note_text(self, state: UsfmParserState) -> None: ...
 
     def _end_note_text_wrapper(self, state: UsfmParserState):
         if self._cur_text_type_stack and self._cur_text_type_stack[-1] == ScriptureTextType.NOTE_TEXT:
@@ -256,11 +269,17 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None
     def _is_in_embed(self, marker: Optional[str]) -> bool:
         return self._in_embed or self._is_embed_character(marker)
 
+    def _is_in_nested_embed(self, marker: Optional[str]) -> bool:
+        """Return True if the nested-embed flag is set or `marker` is "+" plus one of EMBED_STARTING_CHARS."""
+        # startswith() instead of marker[0]/marker[1]: an empty or bare "+" marker must yield False, not IndexError.
+        nested_marker = marker is not None and marker.startswith("+") and marker[1:].startswith(EMBED_STARTING_CHARS)
+        return self._in_nested_embed or nested_marker
+
     def _is_note_text(self, marker: Optional[str]) -> bool:
         return marker == "ft"
 
     def _is_embed_part(self, marker: Optional[str]) -> bool:
-        return marker is not None and marker.startswith(("f", "x", "z"))
+        return marker is not None and marker.startswith(EMBED_STARTING_CHARS)
 
     def _is_embed_character(self, marker: Optional[str]) -> bool:
         return marker in ("f", "fe", "fig", "fm", "x")
@@ -22,8 +22,6 @@ class UpdateUsfmMarkerBehavior(Enum):
 
 class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler):
 
-    _untranslatable_paragraph_tags = ("r", "rem")
-
     def __init__(
         self,
         rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
@@ -43,6 +41,8 @@ def __init__(
         self._replace_stack: List[bool] = []
         self._row_index: int = 0
         self._token_index: int = 0
+        self._embed_updated: bool = False
+        self._embed_row_texts: List[str] = []
 
     @property
     def tokens(self) -> List[UsfmToken]:
@@ -170,17 +170,16 @@ def end_char(
     def _start_embed(
         self,
         state: UsfmParserState,
-        marker: str,
-        caller: str,
-        category: str,
+        scripture_ref: ScriptureRef,
     ) -> None:
+        self._embed_row_texts = self._advance_rows([scripture_ref])
+        self._embed_updated = bool(self._embed_row_texts)
+
         if self._replace_with_new_tokens(state):
             self._skip_tokens(state)
         else:
             self._collect_tokens(state)
 
-        super()._start_embed(state, marker, caller, category)
-
     def _end_embed(
         self, state: UsfmParserState, marker: str, attributes: Sequence[UsfmAttribute], closed: bool
     ) -> None:
@@ -189,6 +188,9 @@ def _end_embed(
         else:
             self._collect_tokens(state)
 
+        self._embed_row_texts.clear()
+        self._embed_updated = False
+
         super()._end_embed(state, marker, attributes, closed)
 
     def ref(self, state: UsfmParserState, marker: str, display: str, target: str) -> None:
@@ -237,11 +239,11 @@ def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: Scripture
     def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
         self._pop_new_tokens()
 
-    def _start_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
-        row_texts = self._advance_rows([scripture_ref])
-        self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in row_texts])
+    def _start_note_text(self, state: UsfmParserState) -> None:
+        self._push_new_tokens([UsfmToken(UsfmTokenType.TEXT, text=t + " ") for t in self._embed_row_texts])
 
     def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
+        self._embed_row_texts.clear()
         self._pop_new_tokens()
 
     def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
@@ -285,47 +287,46 @@ def _skip_tokens(self, state: UsfmParserState) -> None:
         self._token_index = state.index + 1 + state.special_token_count
 
     def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True) -> bool:
-        untranslatable_paragraph: bool = state.para_tag is not None and self._is_untranslatable_paragraph(
-            state.para_tag.marker
-        )
         if self._text_behavior == UpdateUsfmTextBehavior.STRIP_EXISTING:
-            if untranslatable_paragraph:
-                self._clear_new_tokens()
-            else:
-                self._add_new_tokens()
+            self._add_new_tokens()
             return True
 
         new_text: bool = bool(self._replace_stack) and self._replace_stack[-1]
-        in_embed: bool = state.token is not None and self._is_in_embed(state.token.marker)
-        is_style_tag: bool = (
-            state.token is not None and state.token.marker is not None and not self._is_embed_part(state.token.marker)
-        )
+        marker: Optional[str] = state.token if state.token is None else state.token.marker
+        in_embed: bool = self._is_in_embed(marker)
+        in_nested: bool = self._is_in_nested_embed(marker)
+        is_style_tag: bool = marker is not None and not self._is_embed_part(marker)
 
         existing_text = any(
             t.type == UsfmTokenType.TEXT and t.text
             for t in state.tokens[self._token_index : state.index + 1 + state.special_token_count]
         )
 
         use_new_tokens = (
-            not untranslatable_paragraph
-            and new_text
+            new_text
             and (not existing_text or self._text_behavior == UpdateUsfmTextBehavior.PREFER_NEW)
-            and (not in_embed or self._is_in_note_text())
+            and (
+                not in_embed
+                or (
+                    self._is_in_note_text()
+                    and not in_nested
+                    and self._embed_behavior == UpdateUsfmMarkerBehavior.PRESERVE
+                )
+            )
         )
 
         if use_new_tokens:
             self._add_new_tokens()
 
-        if untranslatable_paragraph or (
-            existing_text and self._text_behavior == UpdateUsfmTextBehavior.PREFER_EXISTING
-        ):
+        if existing_text and self._text_behavior == UpdateUsfmTextBehavior.PREFER_EXISTING:
             self._clear_new_tokens()
 
-        within_new_text = any(self._replace_stack)
-        if within_new_text and in_embed:
+        embed_in_new_verse_text = any(self._replace_stack) and in_embed
+        if embed_in_new_verse_text or self._embed_updated:
             if self._embed_behavior == UpdateUsfmMarkerBehavior.STRIP:
+                self._clear_new_tokens()
                 return True
-            if not self._is_in_note_text():
+            if not self._is_in_note_text() or in_nested:
                 return False
 
         skip_tokens = use_new_tokens and closed
@@ -353,6 +354,3 @@ def _push_token_as_previous(self) -> None:
 
     def _pop_new_tokens(self) -> None:
         self._replace_stack.pop()
-
-    def _is_untranslatable_paragraph(self, marker: Optional[str]) -> bool:
-        return marker in self._untranslatable_paragraph_tags
@@ -202,7 +202,11 @@ def text(self, state: UsfmParserState, text: str) -> None:
                     text = text.lstrip()
                 row_text += text
         elif len(text) > 0 and (self._current_text_type != ScriptureTextType.VERSE or state.is_verse_text):
-            if state.token is not None and self._is_in_embed(state.token.marker) and not self._is_in_note_text():
+            if (
+                state.token is not None
+                and self._is_in_embed(state.token.marker)
+                and (not self._is_in_note_text() or self._is_in_nested_embed(state.token.marker))
+            ):
                 return
 
             if (
@@ -230,7 +234,7 @@ def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRe
         if self._text._include_all_text:
             self._rows.append(self._text._create_scripture_row(scripture_ref, text, self._sentence_start))
 
-    def _start_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
+    def _start_note_text(self, state: UsfmParserState) -> None:
         if self._text._include_markers:
             return
         self._row_texts_stack.append("")
 
@@ -83,7 +83,7 @@ pytest-cov = "^4.1.0"
 ipykernel = "^6.7.0"
 jupyter = "^1.0.0"
 pandas = "^2.0.3"
-pyright = "^1.1.362"
+pyright = { extras = ["nodejs"], version = "^1.1.362" }
 decoy = "^2.1.0"
 pep8-naming = "^0.14.1"