11# -*- coding: utf-8 -*-
22import re
33from pysbd .utils import Text
4+
# TODO: SENTENCE_STARTERS should be language-specific
56from pysbd .lang .standard import Abbreviation , SENTENCE_STARTERS
6- from pysbd .lang .common .numbers import (Common , SingleLetterAbbreviationRules ,
7- AmPmRules )
7+ from pysbd .lang .common .numbers import Common , SingleLetterAbbreviationRules , AmPmRules
88
99
def replace_pre_number_abbr(txt, abbr):
    """Mask the period after an abbreviation that introduces a number.

    A ``.`` is replaced with ``∯`` when it directly follows ``abbr`` (which
    must itself be preceded by whitespace or sit at the start of ``txt``) and
    is followed either by one whitespace character and a digit, or by one or
    more whitespace characters and an opening parenthesis.

    Args:
        txt: Text to process.
        abbr: Abbreviation as it appears in the text. Surrounding whitespace
            is stripped and the remainder is matched literally
            (case-sensitively).

    Returns:
        ``txt`` with every matching period replaced by ``∯``.
    """
    # re.escape keeps characters such as "." or "+" inside the abbreviation
    # from being interpreted as regex syntax.
    pattern = r"(?<=\s{abbr})\.(?=(\s\d|\s+\())".format(
        abbr=re.escape(abbr.strip())
    )
    # Prepend a space so the "\s" lookbehind also covers an abbreviation at
    # the very start of the string; the slice drops that space again.
    return re.sub(pattern, "∯", " " + txt)[1:]
1417
1518
def replace_prepositive_abbr(txt, abbr):
    """Mask the period after a prepositive abbreviation.

    A ``.`` is replaced with ``∯`` when it directly follows ``abbr`` (which
    must itself be preceded by whitespace or sit at the start of ``txt``) and
    is followed either by a whitespace character or by ``:`` and at least one
    digit.

    Args:
        txt: Text to process.
        abbr: Abbreviation as it appears in the text. Surrounding whitespace
            is stripped and the remainder is matched literally
            (case-sensitively).

    Returns:
        ``txt`` with every matching period replaced by ``∯``.
    """
    # re.escape keeps characters such as "." inside the abbreviation from
    # being interpreted as regex syntax.
    pattern = r"(?<=\s{abbr})\.(?=(\s|:\d+))".format(
        abbr=re.escape(abbr.strip())
    )
    # Prepend a space so the "\s" lookbehind also covers an abbreviation at
    # the very start of the string; the slice drops that space again.
    return re.sub(pattern, "∯", " " + txt)[1:]
2026
2127
def replace_period_of_abbr(txt, abbr):
    """Mask the period after an abbreviation that does not end a sentence.

    A ``.`` is replaced with ``∯`` when it directly follows ``abbr`` (which
    must itself be preceded by whitespace or sit at the start of ``txt``) and
    is followed by one of:

    * ``.``, ``:``, ``-``, ``?`` or ``,``
    * a whitespace character and then a lowercase ASCII letter, ``I`` plus
      whitespace, ``I'm``, ``I'll``, a digit, or ``(``

    Args:
        txt: Text to process.
        abbr: Abbreviation as it appears in the text. Surrounding whitespace
            is stripped and the remainder is matched literally
            (case-sensitively).

    Returns:
        ``txt`` with every matching period replaced by ``∯``.
    """
    # re.escape keeps characters such as "." inside the abbreviation (e.g.
    # "i.e") from being interpreted as regex syntax.
    pattern = (
        r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))"
    ).format(abbr=re.escape(abbr.strip()))
    # Prepend a space so the "\s" lookbehind also covers an abbreviation at
    # the very start of the string; the slice drops that space again.
    return re.sub(pattern, "∯", " " + txt)[1:]
2641
2742
def replace_abbreviation_as_sentence_boundary(txt):
    """Restore the period after certain abbreviations that end a sentence.

    Where one of the abbreviations hard-coded in the pattern below (``U.S``,
    ``U.K``, ``E.U``, ``U.S.A``, ``I``, ``i.v``, ``I.V`` and their ``∯``-masked
    forms) is followed by ``∯`` and then by a whitespace-delimited word from
    ``SENTENCE_STARTERS``, the ``∯`` is turned back into ``.``.

    Args:
        txt: Text in which abbreviation periods have been masked with ``∯``.

    Returns:
        ``txt`` with the qualifying ``∯`` characters replaced by ``.``.
    """
    # With no starters the alternation would be empty and match after every
    # listed abbreviation, so there is nothing to restore.
    if not SENTENCE_STARTERS:
        return txt
    # One lookahead per starter word; escape each word so it is matched
    # literally even if it contains regex metacharacters.
    sent_starters = "|".join(
        r"(?=\s{}\s)".format(re.escape(word)) for word in SENTENCE_STARTERS
    )
    regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(
        sent_starters
    )
    # Group 1 is the abbreviation itself; re-emit it followed by a real period.
    return re.sub(regex, r"\1.", txt)
3448
3549
3650class AbbreviationReplacer (object ):
37-
38- def __init__ (self , text , language = 'en' ):
51+ def __init__ (self , text , language = "en" ):
3952 self .text = text
4053 self .language = language
4154
4255 def replace (self ):
43- self .text = Text (self .text ).apply (Common .PossessiveAbbreviationRule ,
44- Common .KommanditgesellschaftRule ,
45- * SingleLetterAbbreviationRules .All )
56+ self .text = Text (self .text ).apply (
57+ Common .PossessiveAbbreviationRule ,
58+ Common .KommanditgesellschaftRule ,
59+ * SingleLetterAbbreviationRules .All
60+ )
4661 self .text = self .search_for_abbreviations_in_string ()
4762 self .replace_multi_period_abbreviations ()
4863 self .text = Text (self .text ).apply (* AmPmRules .All )
@@ -52,9 +67,15 @@ def replace(self):
5267 def replace_multi_period_abbreviations (self ):
5368 def mpa_replace (match ):
5469 match = match .group ()
55- match = re .sub (re .escape (r'.' ), '∯' , match )
70+ match = re .sub (re .escape (r"." ), "∯" , match )
5671 return match
57- self .text = re .sub (Common .MULTI_PERIOD_ABBREVIATION_REGEX , mpa_replace , self .text , flags = re .IGNORECASE )
72+
73+ self .text = re .sub (
74+ Common .MULTI_PERIOD_ABBREVIATION_REGEX ,
75+ mpa_replace ,
76+ self .text ,
77+ flags = re .IGNORECASE ,
78+ )
5879
5980 def search_for_abbreviations_in_string (self ):
6081 original = self .text
@@ -64,25 +85,27 @@ def search_for_abbreviations_in_string(self):
6485 if stripped not in lowered :
6586 continue
6687 abbrev_match = re .findall (
67- r' (?:^|\s|\r|\n){}' .format (stripped ), original ,
68- flags = re . IGNORECASE )
88+ r" (?:^|\s|\r|\n){}" .format (stripped ), original , flags = re . IGNORECASE
89+ )
6990 if not abbrev_match :
7091 continue
7192 next_word_start = r"(?<={" + str (re .escape (stripped )) + "} ).{1}"
7293 char_array = re .findall (next_word_start , self .text )
7394 for ind , match in enumerate (abbrev_match ):
74- self .text = self .scan_for_replacements (self .text , match , ind , char_array )
95+ self .text = self .scan_for_replacements (
96+ self .text , match , ind , char_array
97+ )
7598 return self .text
7699
77100 def scan_for_replacements (self , txt , am , ind , char_array ):
78101 try :
79102 char = char_array [ind ]
80103 except IndexError :
81- char = ''
104+ char = ""
82105 prepositive = Abbreviation .PREPOSITIVE_ABBREVIATIONS
83106 number_abbr = Abbreviation .NUMBER_ABBREVIATIONS
84107 upper = str (char ).isupper ()
85- if ( not upper or am .strip ().lower () in prepositive ) :
108+ if not upper or am .strip ().lower () in prepositive :
86109 if am .strip ().lower () in prepositive :
87110 txt = replace_prepositive_abbr (txt , am )
88111 elif am .strip ().lower () in number_abbr :
0 commit comments