@@ -14,7 +14,7 @@ def __init__(self, language="en", clean=False, doc_type=None, char_span=False):
1414
1515 Parameters
1616 ----------
17- language : str, optional
17+ language : str, optional
1818 specify a language use its two character ISO 639-1 code,
1919 by default "en"
2020 clean : bool, optional
@@ -49,11 +49,23 @@ def processor(self, text):
4949
def sentences_with_char_spans(self, sentences):
    """Locate each segmented sentence in the original text.

    Parameters
    ----------
    sentences : iterable of str
        Segmented sentences, expected in the order in which they
        occur in ``self.original_text``.

    Returns
    -------
    list of TextSpan
        One ``TextSpan(text, start, end)`` per sentence that could be
        located, where ``text`` is the matched sentence together with
        any whitespace that directly follows it and ``start``/``end``
        are its character offsets in ``self.original_text``.
    """
    # The sentence boundary regex does not account for trailing
    # whitespace, so \s* is appended to every sentence pattern; this
    # keeps the text non-destructive when the span texts are re-joined.
    sent_spans = []
    prior_end_char_idx = 0
    for sent in sentences:
        pattern = r'{0}\s*'.format(re.escape(sent))
        for match in re.finditer(pattern, self.original_text):
            match_start_idx, match_end_idx = match.span()
            # Only accept an occurrence that ends beyond the previously
            # accepted span. Comparing against the prior *start* would
            # let a repeated sentence (e.g. "Hi. Hi.") re-select the
            # occurrence already consumed and yield a duplicate span.
            if match_end_idx > prior_end_char_idx:
                sent_spans.append(
                    TextSpan(match.group(), match_start_idx, match_end_idx))
                prior_end_char_idx = match_end_idx
                # first valid occurrence wins; move on to the next sentence
                break
    return sent_spans
5769
5870 def segment (self , text ):
5971 self .original_text = text
@@ -66,11 +78,11 @@ def segment(self, text):
6678 text = self .cleaner (text ).clean ()
6779 postprocessed_sents = self .processor (text ).process ()
6880 sentence_w_char_spans = self .sentences_with_char_spans (postprocessed_sents )
69- if self .clean :
81+ if self .char_span :
82+ return sentence_w_char_spans
83+ elif self .clean :
7084 # clean and destructed sentences
7185 return postprocessed_sents
72- elif self .char_span :
73- return sentence_w_char_spans
7486 else :
7587 # nondestructive with whitespaces
7688 return [textspan .sent for textspan in sentence_w_char_spans ]
0 commit comments