Skip to content

Commit 45f24a9

Browse files
johnml1135 and ddaspit authored
Scripture Update block (#168)
- Add update block handlers to `UpdateUsfmParserHandler`
- no longer extract note text as a separate row in `UsfmTextBase`
- no longer update note text in `UpdateUsfmParserHandler`
- properly handle empty paragraphs before a verse
- update `ScriptureRefUsfmParserHandler` to recognize embed text

---------

Co-authored-by: Damien Daspit <[email protected]>
1 parent 7a4bde9 commit 45f24a9

19 files changed

+1012
-567
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ jobs:
5555
node-version: "14"
5656
- name: Lint with pyright
5757
run: |
58-
npm install -g pyright@1.1.386
58+
npm install -g pyright@1.1.400
5959
poetry run pyright
6060
- name: Test with pytest
6161
run: poetry run pytest --cov --cov-report=xml

.vscode/settings.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
"source.organizeImports": "explicit"
1414
},
1515
},
16+
"files.associations": {
17+
"*.SFM": "usfm",
18+
},
1619
"black-formatter.path": [
1720
"poetry",
1821
"run",

machine/corpora/__init__.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@
6161
from .usfm_tag import UsfmJustification, UsfmStyleAttribute, UsfmStyleType, UsfmTag, UsfmTextProperties, UsfmTextType
6262
from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType
6363
from .usfm_tokenizer import RtlReferenceOrder, UsfmTokenizer
64+
from .usfm_update_block import UsfmUpdateBlock
65+
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
66+
from .usfm_update_block_handler import UsfmUpdateBlockHandler
6467
from .usx_file_alignment_collection import UsxFileAlignmentCollection
6568
from .usx_file_alignment_corpus import UsxFileAlignmentCorpus
6669
from .usx_file_text import UsxFileText
@@ -92,8 +95,8 @@
9295
"is_scripture",
9396
"lowercase",
9497
"MemoryAlignmentCollection",
95-
"MemoryText",
9698
"MemoryStreamContainer",
99+
"MemoryText",
97100
"MultiKeyRef",
98101
"nfc_normalize",
99102
"nfd_normalize",
@@ -126,9 +129,9 @@
126129
"TextRow",
127130
"TextRowFlags",
128131
"unescape_spaces",
129-
"UpdateUsfmTextBehavior",
130132
"UpdateUsfmMarkerBehavior",
131133
"UpdateUsfmParserHandler",
134+
"UpdateUsfmTextBehavior",
132135
"UsfmAttribute",
133136
"UsfmElementType",
134137
"UsfmFileText",
@@ -148,6 +151,10 @@
148151
"UsfmToken",
149152
"UsfmTokenizer",
150153
"UsfmTokenType",
154+
"UsfmUpdateBlock",
155+
"UsfmUpdateBlockElement",
156+
"UsfmUpdateBlockElementType",
157+
"UsfmUpdateBlockHandler",
151158
"UsxFileAlignmentCollection",
152159
"UsxFileAlignmentCorpus",
153160
"UsxFileText",

machine/corpora/paratext_project_terms_parser_base.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
import re
24
from abc import ABC, abstractmethod
35
from collections import defaultdict
@@ -45,7 +47,7 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
4547
else:
4648
term_id_to_category_dict = {}
4749

48-
terms_glosses_doc: Optional[ElementTree.ElementTree] = None
50+
terms_glosses_doc: Optional[ElementTree.ElementTree[ElementTree.Element]] = None
4951
resource_name = None
5052
if self._settings.language_code is not None:
5153
resource_name = _SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS.get(self._settings.language_code)
@@ -57,7 +59,7 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
5759
with open_binary(_SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS_PACKAGE, resource_name) as stream:
5860
terms_glosses_doc = ElementTree.parse(stream)
5961

60-
term_renderings_doc: Optional[ElementTree.ElementTree] = None
62+
term_renderings_doc: Optional[ElementTree.ElementTree[ElementTree.Element]] = None
6163
if self._exists("TermRenderings.xml"):
6264
with self._open("TermRenderings.xml") as stream:
6365
term_renderings_doc = ElementTree.parse(stream)
@@ -136,7 +138,7 @@ def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
136138
return term_string
137139

138140

139-
def _get_category_per_id(biblical_terms_doc: ElementTree.ElementTree) -> Dict[str, str]:
141+
def _get_category_per_id(biblical_terms_doc: ElementTree.ElementTree[ElementTree.Element]) -> Dict[str, str]:
140142
term_id_to_category_dict: Dict[str, str] = {}
141143

142144
for term in biblical_terms_doc.findall(".//Term"):

machine/corpora/paratext_project_text_updater_base.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
from abc import ABC, abstractmethod
2-
from typing import BinaryIO, Optional, Sequence, Tuple, Union
2+
from typing import BinaryIO, Iterable, Optional, Sequence, Tuple, Union
33

44
from ..utils.typeshed import StrPath
55
from .paratext_project_settings import ParatextProjectSettings
66
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
77
from .scripture_ref import ScriptureRef
88
from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior
99
from .usfm_parser import parse_usfm
10+
from .usfm_update_block_handler import UsfmUpdateBlockHandler
1011

1112

1213
class ParatextProjectTextUpdaterBase(ABC):
@@ -25,7 +26,8 @@ def update_usfm(
2526
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
2627
embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
2728
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
28-
preserve_paragraph_styles: Optional[Sequence[str]] = None,
29+
preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
30+
update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
2931
) -> Optional[str]:
3032
file_name: str = self._settings.get_book_file_name(book_id)
3133
if not self._exists(file_name):
@@ -40,6 +42,7 @@ def update_usfm(
4042
embed_behavior,
4143
style_behavior,
4244
preserve_paragraph_styles,
45+
update_block_handlers=update_block_handlers,
4346
)
4447
try:
4548
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)

machine/corpora/scripture_ref_usfm_parser_handler.py

Lines changed: 38 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,14 @@ class ScriptureTextType(Enum):
1515
NONE = auto()
1616
NONVERSE = auto()
1717
VERSE = auto()
18-
NOTE_TEXT = auto()
18+
EMBED = auto()
1919

2020

21-
EMBED_PART_START_CHAR_STYLES = ("f", "x", "z")
22-
EMBED_STYLES = ("f", "fe", "fig", "fm", "x")
21+
_EMBED_STYLES = {"f", "fe", "x", "fig"}
22+
23+
24+
def _is_embed_style(marker: Optional[str]) -> bool:
25+
return marker is not None and (marker.strip("*") in _EMBED_STYLES or marker.startswith("z"))
2326

2427

2528
class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC):
@@ -28,18 +31,11 @@ def __init__(self) -> None:
2831
self._cur_elements_stack: List[ScriptureElement] = []
2932
self._cur_text_type_stack: List[ScriptureTextType] = []
3033
self._duplicate_verse: bool = False
31-
self._in_preserved_paragraph: bool = False
32-
self._in_embed: bool = False
33-
self._in_note_text: bool = False
34-
self._in_nested_embed: bool = False
3534

3635
@property
3736
def _current_text_type(self) -> ScriptureTextType:
3837
return ScriptureTextType.NONE if len(self._cur_text_type_stack) == 0 else self._cur_text_type_stack[-1]
3938

40-
def _is_in_note_text(self) -> bool:
41-
return self._in_note_text
42-
4339
def end_usfm(self, state: UsfmParserState) -> None:
4440
self._end_verse_text_wrapper(state)
4541

@@ -115,32 +111,6 @@ def start_sidebar(self, state: UsfmParserState, marker: str, category: str) -> N
115111
def end_sidebar(self, state: UsfmParserState, marker: str, closed: bool) -> None:
116112
self._end_parent_element()
117113

118-
def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None:
119-
self._in_embed = True
120-
self._start_embed_wrapper(state, marker)
121-
122-
def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None:
123-
self._end_note_text_wrapper(state)
124-
self._end_embed(state, marker, None, closed)
125-
self._in_embed = False
126-
127-
def _start_embed_wrapper(self, state: UsfmParserState, marker: str) -> None:
128-
if self._cur_verse_ref.is_default:
129-
self._update_verse_ref(state.verse_ref, marker)
130-
131-
if not self._duplicate_verse:
132-
self._check_convert_verse_para_to_non_verse(state)
133-
self._next_element(marker)
134-
135-
self._start_embed(state, self._create_non_verse_ref())
136-
137-
def _start_embed(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
138-
139-
def _end_embed(
140-
self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool
141-
) -> None:
142-
pass
143-
144114
def text(self, state: UsfmParserState, text: str) -> None:
145115
# if we hit text in a verse paragraph and we aren't in a verse, then start a non-verse segment
146116
if text.strip():
@@ -152,29 +122,23 @@ def opt_break(self, state: UsfmParserState) -> None:
152122
def start_char(
153123
self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]]
154124
) -> None:
155-
if self._is_embed_part_style(marker) and self._in_note_text:
156-
self._in_nested_embed = True
157125
# if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment
158126
self._check_convert_verse_para_to_non_verse(state)
159127

160-
if self._is_embed_style(marker):
161-
self._in_embed = True
162-
self._start_embed_wrapper(state, marker)
163-
164-
if self._is_note_text(marker):
165-
self._start_note_text_wrapper(state)
128+
if _is_embed_style(marker):
129+
self._start_embed_text_wrapper(state, marker)
166130

167131
def end_char(
168132
self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool
169133
) -> None:
170-
if self._is_embed_part_style(marker):
171-
if self._in_nested_embed:
172-
self._in_nested_embed = False
173-
elif self._is_note_text(marker):
174-
self._end_note_text_wrapper(state)
175-
if self._is_embed_style(marker):
176-
self._end_embed(state, marker, attributes, closed)
177-
self._in_embed = False
134+
if _is_embed_style(marker):
135+
self._end_embed_text_wrapper(state)
136+
137+
def start_note(self, state: UsfmParserState, marker: str, caller: str, category: Optional[str]) -> None:
138+
self._start_embed_text_wrapper(state, marker)
139+
140+
def end_note(self, state: UsfmParserState, marker: str, closed: bool) -> None:
141+
self._end_embed_text_wrapper(state)
178142

179143
def _start_verse_text(self, state: UsfmParserState, scripture_refs: Optional[Sequence[ScriptureRef]]) -> None: ...
180144

@@ -184,20 +148,9 @@ def _start_non_verse_text(self, state: UsfmParserState, scripture_ref: Scripture
184148

185149
def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
186150

187-
def _start_note_text_wrapper(self, state: UsfmParserState):
188-
self._in_note_text = True
189-
self._cur_text_type_stack.append(ScriptureTextType.NOTE_TEXT)
190-
self._start_note_text(state)
191-
192-
def _start_note_text(self, state: UsfmParserState) -> None: ...
151+
def _start_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
193152

194-
def _end_note_text_wrapper(self, state: UsfmParserState):
195-
if self._cur_text_type_stack and self._cur_text_type_stack[-1] == ScriptureTextType.NOTE_TEXT:
196-
self._end_note_text(state, self._create_non_verse_ref())
197-
self._cur_text_type_stack.pop()
198-
self._in_note_text = False
199-
200-
def _end_note_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
153+
def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None: ...
201154

202155
def _start_verse_text_wrapper(self, state: UsfmParserState) -> None:
203156
self._duplicate_verse = False
@@ -225,6 +178,25 @@ def _update_verse_ref(self, verse_ref: VerseRef, marker: str) -> None:
225178
self._cur_elements_stack.append(ScriptureElement(0, marker))
226179
self._cur_verse_ref = verse_ref.copy()
227180

181+
def _start_embed_text_wrapper(self, state: UsfmParserState, marker: str) -> None:
182+
if self._cur_verse_ref.is_default:
183+
self._update_verse_ref(state.verse_ref, marker)
184+
185+
if not self._duplicate_verse:
186+
self._check_convert_verse_para_to_non_verse(state)
187+
self._next_element(marker)
188+
self._cur_text_type_stack.append(ScriptureTextType.EMBED)
189+
self._start_embed_text(state, self._create_non_verse_ref())
190+
191+
def _end_embed_text_wrapper(self, state: UsfmParserState) -> None:
192+
if (
193+
not self._duplicate_verse
194+
and self._cur_text_type_stack
195+
and self._cur_text_type_stack[-1] == ScriptureTextType.EMBED
196+
):
197+
self._end_embed_text(state, self._create_non_verse_ref())
198+
self._cur_text_type_stack.pop()
199+
228200
def _next_element(self, marker: str) -> None:
229201
prev_elem: ScriptureElement = self._cur_elements_stack.pop()
230202
self._cur_elements_stack.append(ScriptureElement(prev_elem.position + 1, marker))
@@ -237,7 +209,7 @@ def _end_parent_element(self) -> None:
237209
self._cur_elements_stack.pop()
238210

239211
def _end_embed_elements(self) -> None:
240-
if self._cur_elements_stack and self._is_embed_style(self._cur_elements_stack[-1].name):
212+
if self._cur_elements_stack and _is_embed_style(self._cur_elements_stack[-1].name):
241213
self._cur_elements_stack.pop()
242214

243215
def _create_verse_refs(self) -> List[ScriptureRef]:
@@ -266,23 +238,3 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None
266238
):
267239
self._start_parent_element(para_tag.marker)
268240
self._start_non_verse_text_wrapper(state)
269-
270-
def _is_in_embed(self, marker: Optional[str]) -> bool:
271-
return self._in_embed or self._is_embed_style(marker)
272-
273-
def _is_in_nested_embed(self, marker: Optional[str]) -> bool:
274-
return self._in_nested_embed or (
275-
marker is not None
276-
and marker.startswith("+")
277-
and marker[1] in EMBED_PART_START_CHAR_STYLES
278-
and marker != "fm"
279-
)
280-
281-
def _is_note_text(self, marker: Optional[str]) -> bool:
282-
return marker == "ft"
283-
284-
def _is_embed_part_style(self, marker: Optional[str]) -> bool:
285-
return marker is not None and marker.startswith(EMBED_PART_START_CHAR_STYLES) and marker != "fm"
286-
287-
def _is_embed_style(self, marker: Optional[str]) -> bool:
288-
return marker is not None and marker.strip("*") in EMBED_STYLES

0 commit comments

Comments
 (0)