
Commit 92362f7

Merge pull request #71 from nipunsadvilkar/npn-abbr-refactor

2 parents e6c596f + 6b84eaa

14 files changed: +164 -51 lines changed
Lines changed: 2 additions & 2 deletions

@@ -4,7 +4,6 @@
 import spacy
 import stanza
 
-import syntok
 from syntok.tokenizer import Tokenizer
 import syntok.segmenter as syntok_segmenter
 
@@ -27,7 +26,8 @@ def nltk_tokenize(text):
     return nltk.sent_tokenize(text)
 
 def pysbd_tokenize(text):
-    return pysbd_segmenter.segment(text)
+    segments = pysbd_segmenter.segment(text)
+    return [s.strip() for s in segments]
 
 def spacy_tokenize(text):
     return [sent.text for sent in nlp(text).sents]
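
The pysbd_tokenize change above strips each segment before comparison. A minimal sketch of the behaviour it compensates for, assuming pysbd 0.3.0: with clean=False the segmenter is non-destructive, so each segment keeps the whitespace that followed it in the original text.

    import pysbd

    seg = pysbd.Segmenter(language="en", clean=False)
    raw = seg.segment("First sentence.  Second sentence.")
    # non-destructive segments keep the two inter-sentence spaces
    print(raw)                       # ['First sentence.  ', 'Second sentence.']
    print([s.strip() for s in raw])  # ['First sentence.', 'Second sentence.']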
Lines changed: 75 additions & 0 deletions

@@ -0,0 +1,75 @@
+import blingfire
+import nltk
+import pysbd
+import spacy
+import stanza
+
+from syntok.tokenizer import Tokenizer
+import syntok.segmenter as syntok_segmenter
+
+pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)
+
+nlp = spacy.blank('en')
+nlp.add_pipe(nlp.create_pipe("sentencizer"))
+nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
+#stanza.download('en')
+stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')
+
+syntok_tokenizer = Tokenizer()
+
+def blingfire_tokenize(text):
+    return blingfire.text_to_sentences(text).split('\n')
+
+def nltk_tokenize(text):
+    return nltk.sent_tokenize(text)
+
+def pysbd_tokenize(text):
+    segments = pysbd_segmenter.segment(text)
+    segments = [s.strip() for s in segments]
+    return segments
+
+def spacy_tokenize(text):
+    return [sent.text.strip("\n") for sent in nlp(text).sents]
+
+def spacy_dep_tokenize(text):
+    return [sent.text.strip("\n") for sent in nlp_dep(text).sents]
+
+def stanza_tokenize(text):
+    return [e.text for e in stanza_nlp(text).sentences]
+
+def make_sentences(segmented_tokens):
+    for sentence in segmented_tokens:
+        yield "".join(str(token) for token in sentence).strip()
+
+def syntok_tokenize(text):
+    tokens = syntok_tokenizer.split(text)
+    result = syntok_segmenter.split(iter(tokens))
+    segments = [sent for sent in make_sentences(result)]
+    return segments
+
+def speed_benchmark(big_text, tokenize_func):
+    segments = tokenize_func(big_text)
+    return segments
+
+if __name__ == "__main__":
+    import time
+    libraries = (
+        blingfire_tokenize,
+        nltk_tokenize,
+        pysbd_tokenize,
+        spacy_tokenize,
+        spacy_dep_tokenize,
+        stanza_tokenize,
+        syntok_tokenize)
+
+    for tokenize_func in libraries:
+        t = time.time()
+        # wget http://www.gutenberg.org/files/1661/1661-0.txt -P benchmarks/
+        with open('benchmarks/1661-0.txt') as bigfile:
+            big_text = bigfile.read()
+        sentences = speed_benchmark(big_text, tokenize_func)
+
+        time_taken = time.time() - t
+        print()
+        print(tokenize_func.__name__)
+        print('Speed : {:>20.2f} ms'.format(time_taken * 1000))
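
In the speed benchmark above, t is set before the corpus file is opened, so each measurement also includes disk I/O. A sketch of a variant that times only segmentation, using a monotonic clock (timed_run is an illustrative name, not part of the commit):

    import time

    def timed_run(tokenize_func, big_text):
        # time.perf_counter is monotonic and better suited to short intervals
        start = time.perf_counter()
        sentences = tokenize_func(big_text)
        elapsed_ms = (time.perf_counter() - start) * 1000
        return sentences, elapsed_ms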

benchmarks/genia_benchmark.py

Lines changed: 2 additions & 2 deletions

@@ -4,7 +4,6 @@
 import spacy
 import stanza
 
-import syntok
 from syntok.tokenizer import Tokenizer
 import syntok.segmenter as syntok_segmenter
 
@@ -27,7 +26,8 @@ def nltk_tokenize(text):
     return nltk.sent_tokenize(text)
 
 def pysbd_tokenize(text):
-    return pysbd_segmenter.segment(text)
+    segments = pysbd_segmenter.segment(text)
+    return [s.strip() for s in segments]
 
 def spacy_tokenize(text):
     return [sent.text.strip("\n") for sent in nlp(text).sents]

pysbd/abbreviation_replacer.py

Lines changed: 11 additions & 9 deletions

@@ -32,7 +32,10 @@ def replace(self):
             self.lang.KommanditgesellschaftRule,
             *self.lang.SingleLetterAbbreviationRules.All
         )
-        self.text = self.search_for_abbreviations_in_string()
+        abbr_handled_text = ""
+        for line in self.text.splitlines(True):
+            abbr_handled_text += self.search_for_abbreviations_in_string(line)
+        self.text = abbr_handled_text
         self.replace_multi_period_abbreviations()
         self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
         self.text = self.replace_abbreviation_as_sentence_boundary()
@@ -72,25 +75,24 @@ def replace_period_of_abbr(self, txt, abbr):
         return txt
 
 
-    def search_for_abbreviations_in_string(self):
-        original = self.text
-        lowered = original.lower()
+    def search_for_abbreviations_in_string(self, text):
+        lowered = text.lower()
         for abbr in self.lang.Abbreviation.ABBREVIATIONS:
             stripped = abbr.strip()
             if stripped not in lowered:
                 continue
             abbrev_match = re.findall(
-                r"(?:^|\s|\r|\n){}".format(stripped), original, flags=re.IGNORECASE
+                r"(?:^|\s|\r|\n){}".format(stripped), text, flags=re.IGNORECASE
             )
             if not abbrev_match:
                 continue
             next_word_start = r"(?<={" + str(re.escape(stripped)) + "} ).{1}"
-            char_array = re.findall(next_word_start, self.text)
+            char_array = re.findall(next_word_start, text)
             for ind, match in enumerate(abbrev_match):
-                self.text = self.scan_for_replacements(
-                    self.text, match, ind, char_array
+                text = self.scan_for_replacements(
+                    text, match, ind, char_array
                 )
-        return self.text
+        return text
 
     def scan_for_replacements(self, txt, am, ind, char_array):
         try:
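
The refactor above feeds search_for_abbreviations_in_string one line at a time. splitlines(True) keeps each line's terminator, so concatenating the processed lines reproduces the original line structure. A self-contained sketch of the same pattern, where replace_in_line is a hypothetical stand-in for the real abbreviation handler:

    def handle_per_line(text, replace_in_line):
        # process each line independently, keeping '\n' terminators intact
        handled = ""
        for line in text.splitlines(True):
            handled += replace_in_line(line)
        return handled

    print(repr(handle_per_line("Dr. Watson\nMr. Holmes\n", str.upper)))
    # 'DR. WATSON\nMR. HOLMES\n'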

pysbd/about.py

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
 
 __title__ = "pysbd"
-__version__ = "0.3.0rc"
+__version__ = "0.3.0"
 __summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
 __uri__ = "http://nipunsadvilkar.github.io/"
 __author__ = "Nipun Sadvilkar"

pysbd/lang/common/common.py

Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@
 class Common(object):
 
     # added special case: r"[。..!!?].*" to handle intermittent dots, exclamation, etc.
-    # TODO: above special cases group can be updated as per developer needs
     SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!??].*|\S.*?[。..!!??ȸȹ☉☈☇☄]"
 
     # # Rubular: http://rubular.com/r/NqCqv372Ix

pysbd/lang/deutsch.py

Lines changed: 1 addition & 1 deletion

@@ -63,7 +63,7 @@ def replace(self):
             SingleLowerCaseLetterRule,
             SingleLowerCaseLetterAtStartOfLineRule)
 
-        self.text = self.search_for_abbreviations_in_string()
+        self.text = self.search_for_abbreviations_in_string(self.text)
         self.replace_multi_period_abbreviations()
         self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
         self.text = self.replace_abbreviation_as_sentence_boundary()

pysbd/processor.py

Lines changed: 6 additions & 5 deletions

@@ -77,17 +77,18 @@ def split_into_segments(self):
         sents = [self.check_for_punctuation(s) for s in sents]
         # flatten list of list of sentences
         sents = self.rm_none_flatten(sents)
-        new_sents = []
+        postprocessed_sents = []
         for sent in sents:
             sent = Text(sent).apply(*self.lang.SubSymbolsRules.All)
             post_process_sent = self.post_process_segments(sent)
             if post_process_sent and isinstance(post_process_sent, str):
-                new_sents.append(post_process_sent)
+                postprocessed_sents.append(post_process_sent)
             elif isinstance(post_process_sent, list):
                 for pps in post_process_sent:
-                    new_sents.append(pps)
-        new_sents = [Text(ns).apply(self.lang.SubSingleQuoteRule) for ns in new_sents]
-        return new_sents
+                    postprocessed_sents.append(pps)
+        postprocessed_sents = [Text(ns).apply(self.lang.SubSingleQuoteRule)
+                               for ns in postprocessed_sents]
+        return postprocessed_sents
 
     def post_process_segments(self, txt):
         if len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt):

pysbd/segmenter.py

Lines changed: 20 additions & 8 deletions

@@ -14,7 +14,7 @@ def __init__(self, language="en", clean=False, doc_type=None, char_span=False):
 
         Parameters
         ----------
-        language : str, optional
+        language : str, required
             specify a language use its two character ISO 639-1 code,
             by default "en"
         clean : bool, optional
@@ -49,11 +49,23 @@ def processor(self, text):
 
     def sentences_with_char_spans(self, sentences):
         # since SENTENCE_BOUNDARY_REGEX doesnt account
-        # for trailing whitespaces \s* is used as suffix
+        # for trailing whitespaces \s* & is used as suffix
         # to keep non-destructive text after segments joins
-        return [TextSpan(m.group(), m.start(), m.end()) for sent in sentences
-                for m in re.finditer('{0}\s*'.format(re.escape(sent)),
-                                     self.original_text)]
+        sent_spans = []
+        prior_start_char_idx = 0
+        for sent in sentences:
+            for match in re.finditer(r'{0}\s*'.format(re.escape(sent)), self.original_text):
+                match_str = match.group()
+                match_start_idx, match_end_idx = match.span()
+                if match_start_idx >= prior_start_char_idx:
+                    # making sure if curren sentence and its span
+                    # is either first sentence along with its char spans
+                    # or current sent spans adjacent to prior sentence spans
+                    sent_spans.append(
+                        TextSpan(match_str, match_start_idx, match_end_idx))
+                    prior_start_char_idx = match_start_idx
+                    break
+        return sent_spans
 
     def segment(self, text):
         self.original_text = text
@@ -66,11 +78,11 @@ def segment(self, text):
         text = self.cleaner(text).clean()
         postprocessed_sents = self.processor(text).process()
         sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents)
-        if self.clean:
+        if self.char_span:
+            return sentence_w_char_spans
+        elif self.clean:
             # clean and destructed sentences
             return postprocessed_sents
-        elif self.char_span:
-            return sentence_w_char_spans
         else:
             # nondestructive with whitespaces
             return [textspan.sent for textspan in sentence_w_char_spans]
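
The reordered branches in segment give char_span=True precedence over clean, and the rewritten sentences_with_char_spans keeps, for each sentence, the first regex match at or beyond the start of the previously matched span, so repeated sentences map to distinct spans. A quick usage check, borrowing the example from pysbd's README (exact offsets assume pysbd 0.3.0):

    import pysbd

    seg = pysbd.Segmenter(language="en", clean=False, char_span=True)
    for span in seg.segment("My name is Jonas E. Smith. Please turn to p. 55."):
        print(span)
    # TextSpan(sent='My name is Jonas E. Smith. ', start=0, end=27)
    # TextSpan(sent='Please turn to p. 55.', start=27, end=48)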

pysbd/utils.py

Lines changed: 3 additions & 4 deletions

@@ -56,13 +56,12 @@ def __init__(self, sent, start, end):
         self.end = end
 
     def __repr__(self):  # pragma: no cover
-        return "{0}(sent='{1}', start={2}, end={3})".format(
-            self.__class__.__name__, self.sent, self.start, self.end)
+        return "{0}(sent={1}, start={2}, end={3})".format(
+            self.__class__.__name__, repr(self.sent), self.start, self.end)
 
     def __eq__(self, other):
         if isinstance(self, other.__class__):
-            return self.sent == other.sent and self.start == other.start and self.end == self.end
-        return False
+            return self.sent == other.sent and self.start == other.start and self.end == other.end
 
 
 class PySBDFactory(object):
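
The __eq__ fix matters because the old comparison ended with self.end == self.end, which is always true, so two spans differing only in their end offset compared equal. A minimal check of the corrected behaviour:

    from pysbd.utils import TextSpan

    a = TextSpan('Hello. ', 0, 7)
    b = TextSpan('Hello. ', 0, 99)
    print(a == b)  # False after the fix; the old typo made this True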
