Skip to content

Commit 6006cdd

Browse files
Merge pull request #32 from nipunsadvilkar/bug-29-30-31
🐛 Bug fixes for abbreviation_replacer and lists_item_replacer
2 parents 5fa8d01 + 781843e commit 6006cdd

File tree

9 files changed

+53
-19
lines changed

9 files changed

+53
-19
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,10 @@
1010
# v0.1.2
1111

1212
- 🐛 BugFix - IndexError of `scan_lists` function
13+
14+
# v0.1.3
15+
16+
- 🐛 Fix `lists_item_replacer` - \#29
17+
- 🐛 Fix & ♻️ refactor `replace_multi_period_abbreviations` - \#30
18+
- 🐛 Fix `abbreviation_replacer` - \#31
19+
- ✅ Add regression tests for issues

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@ This project is a direct port of ruby gem - [Pragmatic Segmenter](https://github
1818

1919
```python
2020
import pysbd
21-
text = "Hello World. My name is Jonas."
21+
text = "My name is Jonas E. Smith. Please turn to p. 55."
2222
seg = pysbd.Segmenter(language="en", clean=False)
2323
print(seg.segment(text))
24-
# ['Hello World.', 'My name is Jonas.']
24+
# ['My name is Jonas E. Smith.', 'Please turn to p. 55.']
2525
```
2626

2727
## Contributing

pysbd/abbreviation_replacer.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,11 @@ def replace(self):
5050
return self.text
5151

5252
def replace_multi_period_abbreviations(self):
53-
mpa = re.findall(Common.MULTI_PERIOD_ABBREVIATION_REGEX, self.text, flags=re.IGNORECASE)
54-
if not mpa:
55-
return self.text
56-
for each in mpa:
57-
replacement = re.sub(re.escape(r'.'), '∯', each)
58-
self.text = re.sub(each, replacement, self.text)
53+
def mpa_replace(match):
54+
match = match.group()
55+
match = re.sub(re.escape(r'.'), '∯', match)
56+
return match
57+
self.text = re.sub(Common.MULTI_PERIOD_ABBREVIATION_REGEX, mpa_replace, self.text, flags=re.IGNORECASE)
5958

6059
def search_for_abbreviations_in_string(self):
6160
original = self.text
@@ -76,7 +75,10 @@ def search_for_abbreviations_in_string(self):
7675
return self.text
7776

7877
def scan_for_replacements(self, txt, am, ind, char_array):
79-
char = char_array[ind] if char_array else ''
78+
try:
79+
char = char_array[ind]
80+
except IndexError:
81+
char = ''
8082
prepositive = Abbreviation.PREPOSITIVE_ABBREVIATIONS
8183
number_abbr = Abbreviation.NUMBER_ABBREVIATIONS
8284
upper = str(char).isupper()

pysbd/about.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
33

44
__title__ = "pysbd"
5-
__version__ = "0.1.2"
5+
__version__ = "0.1.3"
66
__summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
77
__uri__ = "http://nipunsadvilkar.github.io/"
88
__author__ = "Nipun Sadvilkar"

pysbd/between_punctuation.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,5 @@ def sub_punctuation_between_quotes_slanted(self, txt):
9595

9696

9797
if __name__ == "__main__":
98-
# text = "Hello .World 'This is great.' you work for Google"
99-
text = "\"Dinah'll miss me very much to-night, I should think!\""
98+
text = "Random walk models (Skellam, 1951;Turchin, 1998) received a lot of attention"
10099
print(BetweenPunctuation(text).replace())
101-
# "Dinah&⎋&ll miss me very much to-night, I should think&ᓴ&"

pysbd/lists_item_replacer.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ def add_line_break(self):
6060
self.format_roman_numeral_lists()
6161
self.format_numbered_list_with_periods()
6262
self.format_numbered_list_with_parens()
63-
# print('###', repr(self.text))
6463
return self.text
6564

6665
def replace_parens(self):
@@ -116,8 +115,7 @@ def scan_lists(self, regex1, regex2, replacement, strip=False):
116115
for ind, item in enumerate(list_array):
117116
# to avoid IndexError
118117
# ruby returns nil if index is out of range
119-
# print(ind, item, replacement)
120-
if (ind < len(list_array) - 1) and (item + 1 == list_array[ind + 1]):
118+
if (ind < len(list_array) - 1 and item + 1 == list_array[ind + 1]):
121119
self.substitute_found_list_items(regex2, item, strip, replacement)
122120
elif ind > 0:
123121
if (((item - 1) == list_array[ind - 1]) or
@@ -131,7 +129,8 @@ def replace_item(match, val=None, strip=False, repl='♨'):
131129
match = match.group()
132130
if strip:
133131
match = str(match).strip()
134-
chomped_match = match if len(match) == 1 else match[:-1]
132+
chomped_match = match if len(match) == 1 else match.strip('.])')
133+
print(each, match, chomped_match)
135134
if str(each) == chomped_match:
136135
return "{}{}".format(each, replacement)
137136
else:

pysbd/segmenter.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ def segment(self, text):
2323

2424

2525
if __name__ == "__main__":
26-
text = "This new form of generalized PDF in (9) is generic and suitable for all the fading models presented in Table I withbranches MRC reception. In section III, (9) will be used in the derivations of the unified ABER and ACC expression."
27-
# ["Saint Maximus (died 250) is a Christian saint and martyr.[1]", "The emperor Decius published a decree ordering the veneration of busts of the deified emperors."
26+
# text = "Proof. First let v ∈ V be incident to at least three leaves and suppose there is a minimum power dominating set S of G that does not contain v. If S excludes two or more of the leaves of G incident to v, then those leaves cannot be dominated or forced at any step. Thus, S excludes at most one leaf incident to v, which means S contains at least two leaves ℓ 1 and ℓ 2 incident to v. Then, (S\{ℓ 1 , ℓ 2 }) ∪ {v} is a smaller power dominating set than S, which is a contradiction. Now consider the case in which v ∈ V is incident to exactly two leaves, ℓ 1 and ℓ 2 , and suppose there is a minimum power dominating set S of G such that {v, ℓ 1 , ℓ 2 } ∩ S = ∅. Then neither ℓ 1 nor ℓ 2 can be dominated or forced at any step, contradicting the assumption that S is a power dominating set. If S is a power dominating set that contains ℓ 1 or ℓ 2 , say ℓ 1 , then (S\{ℓ 1 }) ∪ {v} is also a power dominating set and has the same cardinality. Applying this to every vertex incident to exactly two leaves produces the minimum power dominating set required by (3). Definition 3.4. Given a graph G = (V, E) and a set X ⊆ V , define ℓ r (G, X) as the graph obtained by attaching r leaves to each vertex in X. If X = {v 1 , . . . , v k }, we denote the r leaves attached to vertex v i as ℓ"
27+
text = "Random walk models (Skellam, 1951;Turchin, 1998) received a lot of attention and were then extended to several more mathematically and statistically sophisticated approaches to interpret movement data such as State-Space Models (SSM) (Jonsen et al., 2003(Jonsen et al., , 2005 and Brownian Bridge Movement Model (BBMM) (Horne et al., 2007). Nevertheless, these models require heavy computational resources (Patterson et al., 2008) and unrealistic structural a priori hypotheses about movement, such as homogeneous movement behavior. A fundamental property of animal movements is behavioral heterogeneity (Gurarie et al., 2009) and these models poorly performed in highlighting behavioral changes in animal movements through space and time (Kranstauber et al., 2012)."
2828
print("Input String:\n{}".format(text))
2929
seg = Segmenter(language="en", clean=True)
3030
segments = seg.segment(text)

tests/regression/__init__.py

Whitespace-only changes.
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# -*- coding: utf-8 -*-
2+
import pytest
3+
import pysbd
4+
5+
TEST_ISSUE_DATA = [
6+
('#27', "This new form of generalized PDF in (9) is generic and suitable for all the fading models presented in Table I withbranches MRC reception. In section III, (9) will be used in the derivations of the unified ABER and ACC expression.",
7+
["This new form of generalized PDF in (9) is generic and suitable for all the fading models presented in Table I withbranches MRC reception.",
8+
"In section III, (9) will be used in the derivations of the unified ABER and ACC expression."]),
9+
('#29', "Random walk models (Skellam, 1951;Turchin, 1998) received a lot of attention and were then extended to several more mathematically and statistically sophisticated approaches to interpret movement data such as State-Space Models (SSM) (Jonsen et al., 2003(Jonsen et al., , 2005 and Brownian Bridge Movement Model (BBMM) (Horne et al., 2007). Nevertheless, these models require heavy computational resources (Patterson et al., 2008) and unrealistic structural a priori hypotheses about movement, such as homogeneous movement behavior. A fundamental property of animal movements is behavioral heterogeneity (Gurarie et al., 2009) and these models poorly performed in highlighting behavioral changes in animal movements through space and time (Kranstauber et al., 2012).",
10+
["Random walk models (Skellam, 1951;Turchin, 1998) received a lot of attention and were then extended to several more mathematically and statistically sophisticated approaches to interpret movement data such as State-Space Models (SSM) (Jonsen et al., 2003(Jonsen et al., , 2005 and Brownian Bridge Movement Model (BBMM) (Horne et al., 2007).",
11+
"Nevertheless, these models require heavy computational resources (Patterson et al., 2008) and unrealistic structural a priori hypotheses about movement, such as homogeneous movement behavior.",
12+
"A fundamental property of animal movements is behavioral heterogeneity (Gurarie et al., 2009) and these models poorly performed in highlighting behavioral changes in animal movements through space and time (Kranstauber et al., 2012)."]),
13+
('#30', "Thus, we first compute EMC 3 's response time-i.e., the duration from the initial of a call (from/to a participant in the target region) to the time when the decision of task assignment is made; and then, based on the computed response time, we estimate EMC 3 maximum throughput [28]-i.e., the maximum number of mobile users allowed in the MCS system. EMC 3 algorithm is implemented with the Java SE platform and is running on a Java HotSpot(TM) 64-Bit Server VM; and the implementation details are given in Appendix, available in the online supplemental material.",
14+
["Thus, we first compute EMC 3 's response time-i.e., the duration from the initial of a call (from/to a participant in the target region) to the time when the decision of task assignment is made; and then, based on the computed response time, we estimate EMC 3 maximum throughput [28]-i.e., the maximum number of mobile users allowed in the MCS system.",
15+
"EMC 3 algorithm is implemented with the Java SE platform and is running on a Java HotSpot(TM) 64-Bit Server VM; and the implementation details are given in Appendix, available in the online supplemental material."
16+
]),
17+
('#31', r"Proof. First let v ∈ V be incident to at least three leaves and suppose there is a minimum power dominating set S of G that does not contain v. If S excludes two or more of the leaves of G incident to v, then those leaves cannot be dominated or forced at any step. Thus, S excludes at most one leaf incident to v, which means S contains at least two leaves ℓ 1 and ℓ 2 incident to v. Then, (S\{ℓ 1 , ℓ 2 }) ∪ {v} is a smaller power dominating set than S, which is a contradiction. Now consider the case in which v ∈ V is incident to exactly two leaves, ℓ 1 and ℓ 2 , and suppose there is a minimum power dominating set S of G such that {v, ℓ 1 , ℓ 2 } ∩ S = ∅. Then neither ℓ 1 nor ℓ 2 can be dominated or forced at any step, contradicting the assumption that S is a power dominating set. If S is a power dominating set that contains ℓ 1 or ℓ 2 , say ℓ 1 , then (S\{ℓ 1 }) ∪ {v} is also a power dominating set and has the same cardinality. Applying this to every vertex incident to exactly two leaves produces the minimum power dominating set required by (3). Definition 3.4. Given a graph G = (V, E) and a set X ⊆ V , define ℓ r (G, X) as the graph obtained by attaching r leaves to each vertex in X. If X = {v 1 , . . . , v k }, we denote the r leaves attached to vertex v i as ℓ",
18+
['Proof.', 'First let v ∈ V be incident to at least three leaves and suppose there is a minimum power dominating set S of G that does not contain v. If S excludes two or more of the leaves of G incident to v, then those leaves cannot be dominated or forced at any step.', 'Thus, S excludes at most one leaf incident to v, which means S contains at least two leaves ℓ 1 and ℓ 2 incident to v. Then, (S\\{ℓ 1 , ℓ 2 }) ∪ {v} is a smaller power dominating set than S, which is a contradiction.', 'Now consider the case in which v ∈ V is incident to exactly two leaves, ℓ 1 and ℓ 2 , and suppose there is a minimum power dominating set S of G such that {v, ℓ 1 , ℓ 2 } ∩ S = ∅.', 'Then neither ℓ 1 nor ℓ 2 can be dominated or forced at any step, contradicting the assumption that S is a power dominating set.', 'If S is a power dominating set that contains ℓ 1 or ℓ 2 , say ℓ 1 , then (S\\{ℓ 1 }) ∪ {v} is also a power dominating set and has the same cardinality.', 'Applying this to every vertex incident to exactly two leaves produces the minimum power dominating set required by (3).', 'Definition 3.4.', 'Given a graph G = (V, E) and a set X ⊆ V , define ℓ r (G, X) as the graph obtained by attaching r leaves to each vertex in X. If X = {v 1 , . . . , v k }, we denote the r leaves attached to vertex v i as ℓ'])
19+
]
20+
21+
@pytest.mark.parametrize('issue_no,text,expected_sents', TEST_ISSUE_DATA)
22+
def test_issue(issue_no, text, expected_sents):
23+
"""pySBD issues tests from https://github.com/nipunsadvilkar/pySBD/issues/"""
24+
seg = pysbd.Segmenter(language="en", clean=False)
25+
segments = seg.segment(text)
26+
assert segments == expected_sents
27+
# clubbing sentences and matching with original text
28+
assert text == " ".join(segments)

0 commit comments

Comments
 (0)