11# -*- coding: utf-8 -*-
22import re
3- import spacy
43from pysbd .utils import Text , TextSpan
54from pysbd .lists_item_replacer import ListItemReplacer
65from pysbd .exclamation_words import ExclamationWords
76from pysbd .between_punctuation import BetweenPunctuation
87from pysbd .abbreviation_replacer import AbbreviationReplacer
98
10- nlp = spacy .blank ('en' )
11-
12-
139class Processor (object ):
1410
1511 def __init__ (self , text , lang , char_span = False ):
@@ -28,7 +24,6 @@ def __init__(self, text, lang, char_span=False):
2824 self .text = text
2925 self .lang = lang
3026 self .char_span = char_span
31- self .doc = nlp .make_doc (self .text )
3227
3328 def process (self ):
3429 if not self .text :
@@ -42,20 +37,8 @@ def process(self):
4237 self .text = Text (self .text ).apply (
4338 self .lang .Abbreviation .WithMultiplePeriodsAndEmailRule ,
4439 self .lang .GeoLocationRule , self .lang .FileFormatRule )
45- processed = self .split_into_segments ()
46- if self .char_span :
47- return self .sentences_with_char_spans (processed )
48- else :
49- return processed
50-
51- def sentences_with_char_spans (self , sentences ):
52- sent_start_token_idx = [m .start () for sent in sentences for m in re .finditer (re .escape (sent ), self .doc .text )]
53- for tok in self .doc :
54- if tok .idx in sent_start_token_idx :
55- tok .is_sent_start = True
56- else :
57- tok .is_sent_start = False
58- return [TextSpan (sent .text_with_ws , sent .start_char , sent .end_char ) for sent in self .doc .sents ]
40+ postprocessed_sents = self .split_into_segments ()
41+ return postprocessed_sents
5942
6043 def rm_none_flatten (self , sents ):
6144 """Remove None values and unpack list of list sents
0 commit comments