Skip to content

Commit 5bdb3da

Browse files
committed
Updates from reviewer comments including:
* If there is new text, override "preserve"
* Preserve just means "don't strip" this tag
* Make "preserve" configurable and at the "update" level, not the "scripture" level
* Correct the logic so that only paragraphs within a verse are stripped - not section headers
1 parent c6b995b commit 5bdb3da

File tree

4 files changed

+129
-27
lines changed

4 files changed

+129
-27
lines changed

machine/corpora/paratext_project_text_updater_base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def update_usfm(
2525
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
2626
embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
2727
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
28+
preserve_paragraph_styles: Optional[Sequence[str]] = None,
2829
) -> Optional[str]:
2930
file_name: str = self._settings.get_book_file_name(book_id)
3031
if not self._exists(file_name):
@@ -38,6 +39,7 @@ def update_usfm(
3839
paragraph_behavior,
3940
embed_behavior,
4041
style_behavior,
42+
preserve_paragraph_styles,
4143
)
4244
try:
4345
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)

machine/corpora/scripture_ref_usfm_parser_handler.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,9 @@ class ScriptureTextType(Enum):
1515
NONE = auto()
1616
NONVERSE = auto()
1717
VERSE = auto()
18-
EMBED = auto()
1918
NOTE_TEXT = auto()
2019

2120

22-
PRESERVE_PARAGRAPH_STYLES = ("r", "rem")
2321
EMBED_PART_START_CHAR_STYLES = ("f", "x", "z")
2422
EMBED_STYLES = ("f", "fe", "fig", "fm", "x")
2523

@@ -76,16 +74,13 @@ def start_para(
7674
unknown: Optional[bool],
7775
attributes: Optional[Sequence[UsfmAttribute]],
7876
) -> None:
79-
if self._is_preserve_paragraph_type(marker):
80-
self._in_preserved_paragraph = True
8177
if self._cur_verse_ref.is_default:
8278
self._update_verse_ref(state.verse_ref, marker)
8379
if not state.is_verse_text:
8480
self._start_parent_element(marker)
8581
self._start_non_verse_text_wrapper(state)
8682

8783
def end_para(self, state: UsfmParserState, marker: str) -> None:
88-
self._in_preserved_paragraph = False
8984
if self._current_text_type == ScriptureTextType.NONVERSE:
9085
self._end_parent_element()
9186
self._end_non_verse_text_wrapper(state)
@@ -275,9 +270,6 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None
275270
def _is_in_embed(self, marker: Optional[str]) -> bool:
276271
return self._in_embed or self._is_embed_style(marker)
277272

278-
def _is_in_preserved_paragraph(self, marker: Optional[str]) -> bool:
279-
return self._in_preserved_paragraph or self._is_preserve_paragraph_type(marker)
280-
281273
def _is_in_nested_embed(self, marker: Optional[str]) -> bool:
282274
return self._in_nested_embed or (
283275
marker is not None and marker.startswith("+") and marker[1] in EMBED_PART_START_CHAR_STYLES
@@ -291,6 +283,3 @@ def _is_embed_part_style(self, marker: Optional[str]) -> bool:
291283

292284
def _is_embed_style(self, marker: Optional[str]) -> bool:
293285
return marker is not None and marker.strip("*") in EMBED_STYLES
294-
295-
def _is_preserve_paragraph_type(self, marker: Optional[str]) -> bool:
296-
return marker in PRESERVE_PARAGRAPH_STYLES

machine/corpora/update_usfm_parser_handler.py

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,20 @@ def __init__(
3030
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
3131
embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
3232
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
33+
preserve_paragraph_styles: Optional[Sequence[str]] = None,
3334
) -> None:
3435
super().__init__()
3536
self._rows = rows or []
3637
self._tokens: List[UsfmToken] = []
3738
self._new_tokens: List[UsfmToken] = []
3839
self._new_embed_tokens: List[UsfmToken] = []
3940
self._id_text = id_text
41+
if preserve_paragraph_styles is None:
42+
self._preserve_paragraph_styles = set(["r", "rem"])
43+
elif isinstance(preserve_paragraph_styles, str):
44+
self._preserve_paragraph_styles = set([preserve_paragraph_styles])
45+
else:
46+
self._preserve_paragraph_styles = set(preserve_paragraph_styles)
4047
self._text_behavior = text_behavior
4148
self._paragraph_behavior = paragraph_behavior
4249
self._embed_behavior = embed_behavior
@@ -77,8 +84,11 @@ def start_para(
7784
unknown: bool,
7885
attributes: Optional[Sequence[UsfmAttribute]],
7986
) -> None:
87+
if marker in self._preserve_paragraph_styles:
88+
self._in_preserved_paragraph = True
89+
8090
if (
81-
state.verse_ref.verse_num != 0
91+
state.is_verse_text
8292
and (self._has_new_text() or self._text_behavior == UpdateUsfmTextBehavior.STRIP_EXISTING)
8393
and self._paragraph_behavior == UpdateUsfmMarkerBehavior.STRIP
8494
):
@@ -88,6 +98,10 @@ def start_para(
8898

8999
super().start_para(state, marker, unknown, attributes)
90100

101+
def end_para(self, state: UsfmParserState, marker: str) -> None:
102+
super().end_para(state, marker)
103+
self._in_preserved_paragraph = False
104+
91105
def start_row(self, state: UsfmParserState, marker: str) -> None:
92106
self._collect_tokens(state)
93107

@@ -310,20 +324,19 @@ def _replace_with_new_tokens(self, state: UsfmParserState, closed: bool = True)
310324

311325
use_new_tokens = (
312326
(
313-
(self._text_behavior == UpdateUsfmTextBehavior.STRIP_EXISTING)
314-
or (
315-
self._has_new_text()
316-
and (not existing_text or self._text_behavior == UpdateUsfmTextBehavior.PREFER_NEW)
317-
)
327+
self._text_behavior == UpdateUsfmTextBehavior.STRIP_EXISTING
328+
and not self._is_in_preserved_paragraph(marker)
318329
)
319-
and not self._is_in_preserved_paragraph(marker)
320-
and (
321-
not in_embed
322-
or (
323-
self._is_in_note_text()
324-
and not in_nested_embed
325-
and self._embed_behavior == UpdateUsfmMarkerBehavior.PRESERVE
326-
)
330+
or (
331+
self._has_new_text()
332+
and (not existing_text or self._text_behavior != UpdateUsfmTextBehavior.PREFER_EXISTING)
333+
)
334+
) and (
335+
not in_embed
336+
or (
337+
self._is_in_note_text()
338+
and not in_nested_embed
339+
and self._embed_behavior == UpdateUsfmMarkerBehavior.PRESERVE
327340
)
328341
)
329342

@@ -390,3 +403,6 @@ def _push_token_as_previous(self) -> None:
390403

391404
def _pop_new_tokens(self) -> None:
392405
self._replace_stack.pop()
406+
407+
def _is_in_preserved_paragraph(self, marker: Optional[str]) -> bool:
408+
return self._in_preserved_paragraph or marker in self._preserve_paragraph_styles

tests/corpora/test_update_usfm_parser_handler.py

Lines changed: 97 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,93 @@ def test_get_usfm_strip_all_text() -> None:
101101
assess(target, result)
102102

103103

104+
def test_preserve_paragraphs():
105+
rows = [
106+
(
107+
scr_ref("MAT 1:0/1:rem"),
108+
str("Update remark"),
109+
),
110+
(
111+
scr_ref("MAT 1:1"),
112+
str("Update 1"),
113+
),
114+
]
115+
usfm = r"""\id MAT
116+
\c 1
117+
\rem Update remark
118+
\r reference
119+
\ip This is another remark, but with a different marker
120+
\v 1 This is a verse
121+
"""
122+
123+
target = update_usfm(rows, usfm, text_behavior=UpdateUsfmTextBehavior.STRIP_EXISTING)
124+
result = r"""\id MAT
125+
\c 1
126+
\rem Update remark
127+
\r reference
128+
\ip
129+
\v 1 Update 1
130+
"""
131+
132+
assess(target, result)
133+
134+
target_diff_paragraph = update_usfm(
135+
rows, usfm, text_behavior=UpdateUsfmTextBehavior.STRIP_EXISTING, preserve_paragraph_styles=("ip")
136+
)
137+
result_diff_paragraph = r"""\id MAT
138+
\c 1
139+
\rem Update remark
140+
\r
141+
\ip This is another remark, but with a different marker
142+
\v 1 Update 1
143+
"""
144+
145+
assess(target_diff_paragraph, result_diff_paragraph)
146+
147+
148+
def test_paragraph_in_verse():
149+
rows = [
150+
(
151+
scr_ref("MAT 1:1"),
152+
str("Update 1"),
153+
),
154+
]
155+
usfm = r"""\id MAT - Test
156+
\c 1
157+
\v 1 verse 1 \p inner verse paragraph
158+
\s1 Section Header
159+
\v 2 Verse 2 \p inner verse paragraph
160+
"""
161+
162+
target = update_usfm(rows, usfm, paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP)
163+
164+
result = r"""\id MAT - Test
165+
\c 1
166+
\v 1 Update 1
167+
\s1 Section Header
168+
\v 2 Verse 2
169+
\p inner verse paragraph
170+
"""
171+
172+
assess(target, result)
173+
174+
target_strip = update_usfm(
175+
rows,
176+
usfm,
177+
text_behavior=UpdateUsfmTextBehavior.STRIP_EXISTING,
178+
paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP,
179+
)
180+
181+
result_strip = r"""\id MAT
182+
\c 1
183+
\v 1 Update 1
184+
\s1
185+
\v 2
186+
"""
187+
188+
assess(target_strip, result_strip)
189+
190+
104191
def test_get_usfm_prefer_existing():
105192
rows = [
106193
(
@@ -856,16 +943,24 @@ def update_usfm(
856943
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
857944
embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
858945
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
946+
preserve_paragraph_styles: Optional[Sequence[str]] = None,
859947
) -> Optional[str]:
860948
if source is None:
861949
updater = FileParatextProjectTextUpdater(USFM_TEST_PROJECT_PATH)
862950
return updater.update_usfm(
863-
"MAT", rows, id_text, text_behavior, paragraph_behavior, embed_behavior, style_behavior
951+
"MAT",
952+
rows,
953+
id_text,
954+
text_behavior,
955+
paragraph_behavior,
956+
embed_behavior,
957+
style_behavior,
958+
preserve_paragraph_styles,
864959
)
865960
else:
866961
source = source.strip().replace("\r\n", "\n") + "\r\n"
867962
updater = UpdateUsfmParserHandler(
868-
rows, id_text, text_behavior, paragraph_behavior, embed_behavior, style_behavior
963+
rows, id_text, text_behavior, paragraph_behavior, embed_behavior, style_behavior, preserve_paragraph_styles
869964
)
870965
parse_usfm(source, updater)
871966
return updater.get_usfm()

0 commit comments

Comments
 (0)