Skip to content

Commit 6ebcb9d

Browse files
author
Mark Neumann
committed
Merge branch 'mn-benchmark' of https://github.com/DeNeutoy/pySBD into mn-benchmark
2 parents d4ef3e9 + 4d3848e commit 6ebcb9d

28 files changed

+62
-35
lines changed

pysbd/exclamation_words.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,3 @@ class ExclamationWords(object):
1515
def apply_rules(cls, text):
1616
return re.sub(ExclamationWords.EXCLAMATION_REGEX, replace_punctuation,
1717
text)
18-

pysbd/processor.py

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,11 @@
11
# -*- coding: utf-8 -*-
22
import re
3-
import spacy
43
from pysbd.utils import Text, TextSpan
54
from pysbd.lists_item_replacer import ListItemReplacer
65
from pysbd.exclamation_words import ExclamationWords
76
from pysbd.between_punctuation import BetweenPunctuation
87
from pysbd.abbreviation_replacer import AbbreviationReplacer
98

10-
nlp = spacy.blank('en')
11-
12-
139
class Processor(object):
1410

1511
def __init__(self, text, lang, char_span=False):
@@ -28,7 +24,6 @@ def __init__(self, text, lang, char_span=False):
2824
self.text = text
2925
self.lang = lang
3026
self.char_span = char_span
31-
self.doc = nlp.make_doc(self.text)
3227

3328
def process(self):
3429
if not self.text:
@@ -42,20 +37,8 @@ def process(self):
4237
self.text = Text(self.text).apply(
4338
self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
4439
self.lang.GeoLocationRule, self.lang.FileFormatRule)
45-
processed = self.split_into_segments()
46-
if self.char_span:
47-
return self.sentences_with_char_spans(processed)
48-
else:
49-
return processed
50-
51-
def sentences_with_char_spans(self, sentences):
52-
sent_start_token_idx = [m.start() for sent in sentences for m in re.finditer(re.escape(sent), self.doc.text)]
53-
for tok in self.doc:
54-
if tok.idx in sent_start_token_idx:
55-
tok.is_sent_start = True
56-
else:
57-
tok.is_sent_start = False
58-
return [TextSpan(sent.text_with_ws, sent.start_char, sent.end_char) for sent in self.doc.sents]
40+
postprocessed_sents = self.split_into_segments()
41+
return postprocessed_sents
5942

6043
def rm_none_flatten(self, sents):
6144
"""Remove None values and unpack list of list sents

pysbd/segmenter.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
# -*- coding: utf-8 -*-
2+
import re
3+
24
from pysbd.languages import Language
35
from pysbd.processor import Processor
46
from pysbd.cleaner import Cleaner
7+
from pysbd.utils import TextSpan
58

69
class Segmenter(object):
710

@@ -44,16 +47,30 @@ def processor(self, text):
4447
return Processor(text, self.language_module,
4548
char_span=self.char_span)
4649

50+
def sentences_with_char_spans(self, sentences):
51+
# since SENTENCE_BOUNDARY_REGEX doesn't account
52+
# for trailing whitespace, \s* is used as a suffix
53+
# to keep the text non-destructive after the segments are joined
54+
return [TextSpan(m.group(), m.start(), m.end()) for sent in sentences
55+
for m in re.finditer('{0}\s*'.format(re.escape(sent)),
56+
self.original_text)]
57+
4758
def segment(self, text):
59+
self.original_text = text
4860
if not text:
4961
return []
5062
if self.clean and self.char_span:
5163
raise ValueError("char_span must be False if clean is True. "
5264
"Since `clean=True` will modify original text.")
53-
if self.language != 'en' and self.char_span:
54-
raise ValueError("char_span functionality not supported for "
55-
"languages other than English (`en`)")
5665
elif self.clean:
5766
text = self.cleaner(text).clean()
58-
segments = self.processor(text).process()
59-
return segments
67+
postprocessed_sents = self.processor(text).process()
68+
sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents)
69+
if self.clean:
70+
# clean and destructed sentences
71+
return postprocessed_sents
72+
elif self.char_span:
73+
return sentence_w_char_spans
74+
else:
75+
# nondestructive with whitespaces
76+
return [textspan.sent for textspan in sentence_w_char_spans]

tests/lang/test_amharic.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,5 @@
1010
def test_am_sbd(am_default_fixture, text, expected_sents):
1111
"""Amharic language SBD tests"""
1212
segments = am_default_fixture.segment(text)
13+
segments = [s.strip() for s in segments]
1314
assert segments == expected_sents

tests/lang/test_arabic.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,5 @@
2020
def test_ar_sbd(ar_default_fixture, text, expected_sents):
2121
"""Arabic language SBD tests"""
2222
segments = ar_default_fixture.segment(text)
23+
segments = [s.strip() for s in segments]
2324
assert segments == expected_sents

tests/lang/test_armenian.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,12 @@
8383
def test_hy_sbd(hy_default_fixture, text, expected_sents):
8484
"""Armenian language SBD tests"""
8585
segments = hy_default_fixture.segment(text)
86+
segments = [s.strip() for s in segments]
8687
assert segments == expected_sents
8788

8889
@pytest.mark.parametrize('text,expected_sents', HY_MORE_TEST_CASES)
8990
def test_hy_sbd_more(hy_default_fixture, text, expected_sents):
9091
"""Armenian language SBD tests"""
9192
segments = hy_default_fixture.segment(text)
93+
segments = [s.strip() for s in segments]
9294
assert segments == expected_sents

tests/lang/test_bulgarian.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,5 @@
1616
def test_bg_sbd(bg_default_fixture, text, expected_sents):
1717
"""Bulgarian language SBD tests"""
1818
segments = bg_default_fixture.segment(text)
19+
segments = [s.strip() for s in segments]
1920
assert segments == expected_sents

tests/lang/test_burmese.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,5 @@
1010
def test_my_sbd(my_default_fixture, text, expected_sents):
1111
"""Burmese language SBD tests"""
1212
segments = my_default_fixture.segment(text)
13+
segments = [s.strip() for s in segments]
1314
assert segments == expected_sents

tests/lang/test_chinese.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@
1212
def test_zsh_sbd(zh_default_fixture, text, expected_sents):
1313
"""Chinese language SBD tests from Pragmatic Segmenter"""
1414
segments = zh_default_fixture.segment(text)
15+
segments = [s.strip() for s in segments]
1516
assert segments == expected_sents

tests/lang/test_danish.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101
def test_da_sbd(da_default_fixture, text, expected_sents):
102102
"""Danish language SBD tests"""
103103
segments = da_default_fixture.segment(text)
104+
segments = [s.strip() for s in segments]
104105
assert segments == expected_sents
105106

106107
DA_RULES_CLEAN_TEST_CASES = [
@@ -117,11 +118,13 @@ def test_da_sbd(da_default_fixture, text, expected_sents):
117118
def test_da_sbd_clean(da_with_clean_no_span_fixture, text, expected_sents):
118119
"""Danish language SBD tests with text clean"""
119120
segments = da_with_clean_no_span_fixture.segment(text)
121+
segments = [s.strip() for s in segments]
120122
assert segments == expected_sents
121123

122124
@pytest.mark.parametrize('text,expected_sents', DA_PDF_TEST_DATA)
123125
def test_da_pdf_type(text, expected_sents):
124126
"""SBD tests from Pragmatic Segmenter for doctype:pdf"""
125127
seg = pysbd.Segmenter(language="da", clean=True, doc_type='pdf')
126128
segments = seg.segment(text)
129+
segments = [s.strip() for s in segments]
127130
assert segments == expected_sents

0 commit comments

Comments
 (0)