|
1 | 1 | # -*- coding: utf-8 -*- |
2 | 2 |
|
3 | 3 | GOLDEN_EN_RULES = [ |
| 4 | + # 1) Simple period to end sentence |
4 | 5 | ("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]), |
| 6 | + # 2) Question mark to end sentence |
5 | 7 | ("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]), |
| 8 | + # 3) Exclamation point to end sentence |
6 | 9 | ("There it is! I found it.", ["There it is!", "I found it."]), |
| 10 | + # 4) One letter upper case abbreviations |
7 | 11 | ("My name is Jonas E. Smith.", ["My name is Jonas E. Smith."]), |
| 12 | + # 5) One letter lower case abbreviations |
8 | 13 | ("Please turn to p. 55.", ["Please turn to p. 55."]), |
| 14 | + # 6) Two letter lower case abbreviations in the middle of a sentence |
9 | 15 | ("Were Jane and co. at the party?", ["Were Jane and co. at the party?"]), |
| 16 | + # 7) Two letter upper case abbreviations in the middle of a sentence |
10 | 17 | ("They closed the deal with Pitt, Briggs & Co. at noon.", |
11 | 18 | ["They closed the deal with Pitt, Briggs & Co. at noon."]), |
| 19 | + # 8) Two letter lower case abbreviations at the end of a sentence |
12 | 20 | ( |
13 | 21 | "Let's ask Jane and co. They should know.", |
14 | 22 | ["Let's ask Jane and co.", "They should know."]), |
| 23 | + # 9) Two letter upper case abbreviations at the end of a sentence |
15 | 24 | ( |
16 | 25 | "They closed the deal with Pitt, Briggs & Co. It closed yesterday.", [ |
17 | 26 | "They closed the deal with Pitt, Briggs & Co.", |
18 | 27 | "It closed yesterday." |
19 | 28 | ], |
20 | 29 | ), |
| 30 | + # 10) Two letter (prepositive) abbreviations |
21 | 31 | ("I can see Mt. Fuji from here.", ["I can see Mt. Fuji from here."]), |
| 32 | + # 11) Two letter (prepositive & postpositive) abbreviations |
22 | 33 | ( |
23 | 34 | "St. Michael's Church is on 5th st. near the light.", |
24 | 35 | ["St. Michael's Church is on 5th st. near the light."], |
25 | 36 | ), |
| 37 | + # 12) Possessive two letter abbreviations |
26 | 38 | ("That is JFK Jr.'s book.", ["That is JFK Jr.'s book."]), |
| 39 | + # 13) Multi-period abbreviations in the middle of a sentence |
27 | 40 | ("I visited the U.S.A. last year.", ["I visited the U.S.A. last year."]), |
| 41 | + # 14) Multi-period abbreviations at the end of a sentence |
28 | 42 | ( |
29 | 43 | "I live in the E.U. How about you?", |
30 | 44 | ["I live in the E.U.", "How about you?"], |
31 | 45 | ), |
| 46 | + # 15) U.S. as sentence boundary |
32 | 47 | ( |
33 | 48 | "I live in the U.S. How about you?", |
34 | 49 | ["I live in the U.S.", "How about you?"], |
35 | 50 | ), |
| 51 | + # 16) U.S. as non sentence boundary with next word capitalized |
36 | 52 | ("I work for the U.S. Government in Virginia.", |
37 | 53 | ["I work for the U.S. Government in Virginia."]), |
| 54 | + # 17) U.S. as non sentence boundary |
38 | 55 | ("I have lived in the U.S. for 20 years.", |
39 | 56 | ["I have lived in the U.S. for 20 years."]), |
40 | 57 | # Most difficult sentence to crack |
| 58 | + # 18) A.M. / P.M. as non sentence boundary and sentence boundary |
41 | 59 | ( |
42 | 60 | "At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.", |
43 | 61 | [ |
44 | 62 | "At 5 a.m. Mr. Smith went to the bank.", |
45 | 63 | "He left the bank at 6 P.M.", "Mr. Smith then went to the store." |
46 | 64 | ] |
47 | 65 | ), |
| 66 | + # 19) Number as non sentence boundary |
48 | 67 | ("She has $100.00 in her bag.", ["She has $100.00 in her bag."]), |
| 68 | + # 20) Number as sentence boundary |
49 | 69 | ("She has $100.00. It is in her bag.", ["She has $100.00.", "It is in her bag."]), |
| 70 | + # 21) Parenthetical inside sentence |
50 | 71 | ("He teaches science (He previously worked for 5 years as an engineer.) at the local University.", |
51 | 72 | ["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]), |
| 73 | + # 22) Email addresses |
52 | 74 | ( "Her email is [email protected]. I sent her an email.", |
53 | 75 | [ "Her email is [email protected].", "I sent her an email."]), |
| 76 | + # 23) Web addresses |
54 | 77 | ("The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.", |
55 | 78 | ["The site is: https://www.example.50.com/new-site/awesome_content.html.", |
56 | 79 | "Please check it out."]), |
| 80 | + # 24) Single quotations inside sentence |
57 | 81 | ( |
58 | 82 | "She turned to him, 'This is great.' she said.", |
59 | 83 | ["She turned to him, 'This is great.' she said."], |
60 | 84 | ), |
| 85 | + # 25) Double quotations inside sentence |
61 | 86 | ( |
62 | 87 | 'She turned to him, "This is great." she said.', |
63 | 88 | ['She turned to him, "This is great." she said.'], |
64 | 89 | ), |
| 90 | + # 26) Double quotations at the end of a sentence |
65 | 91 | ( |
66 | 92 | 'She turned to him, "This is great." She held the book out to show him.', |
67 | 93 | [ |
68 | 94 | 'She turned to him, "This is great."', |
69 | 95 | "She held the book out to show him." |
70 | 96 | ], |
71 | 97 | ), |
| 98 | + # 27) Double punctuation (exclamation point) |
72 | 99 | ("Hello!! Long time no see.", ["Hello!!", "Long time no see."]), |
| 100 | + # 28) Double punctuation (question mark) |
73 | 101 | ("Hello?? Who is there?", ["Hello??", "Who is there?"]), |
| 102 | + # 29) Double punctuation (exclamation point / question mark) |
74 | 103 | ("Hello!? Is that you?", ["Hello!?", "Is that you?"]), |
| 104 | + # 30) Double punctuation (question mark / exclamation point) |
75 | 105 | ("Hello?! Is that you?", ["Hello?!", "Is that you?"]), |
| 106 | + # 31) List (period followed by parens and no period to end item) |
76 | 107 | ( |
77 | 108 | "1.) The first item 2.) The second item", |
78 | 109 | ["1.) The first item", "2.) The second item"], |
79 | 110 | ), |
| 111 | + # 32) List (period followed by parens and period to end item) |
80 | 112 | ( |
81 | 113 | "1.) The first item. 2.) The second item.", |
82 | 114 | ["1.) The first item.", "2.) The second item."], |
83 | 115 | ), |
| 116 | + # 33) List (parens and no period to end item) |
84 | 117 | ( |
85 | 118 | "1) The first item 2) The second item", |
86 | 119 | ["1) The first item", "2) The second item"], |
87 | 120 | ), |
| 121 | + # 34) List (parens and period to end item) |
88 | 122 | ("1) The first item. 2) The second item.", |
89 | 123 | ["1) The first item.", "2) The second item."]), |
| 124 | + # 35) List (period to mark list and no period to end item) |
90 | 125 | ( |
91 | 126 | "1. The first item 2. The second item", |
92 | 127 | ["1. The first item", "2. The second item"], |
93 | 128 | ), |
| 129 | + # 36) List (period to mark list and period to end item) |
94 | 130 | ( |
95 | 131 | "1. The first item. 2. The second item.", |
96 | 132 | ["1. The first item.", "2. The second item."], |
97 | 133 | ), |
| 134 | + # 37) List with bullet |
98 | 135 | ( |
99 | 136 | "• 9. The first item • 10. The second item", |
100 | 137 | ["• 9. The first item", "• 10. The second item"], |
101 | 138 | ), |
| 139 | + # 38) List with hyphen |
102 | 140 | ( |
103 | 141 | "⁃9. The first item ⁃10. The second item", |
104 | 142 | ["⁃9. The first item", "⁃10. The second item"], |
105 | 143 | ), |
| 144 | + # 39) Alphabetical list |
106 | 145 | ( |
107 | 146 | "a. The first item b. The second item c. The third list item", |
108 | 147 | ["a. The first item", "b. The second item", "c. The third list item"], |
109 | 148 | ), |
| 149 | + # 40) Geo Coordinates |
110 | 150 | ( |
111 | 151 | "You can find it at N°. 1026.253.553. That is where the treasure is.", |
112 | 152 | [ |
113 | 153 | "You can find it at N°. 1026.253.553.", |
114 | 154 | "That is where the treasure is." |
115 | 155 | ], |
116 | 156 | ), |
| 157 | + # 41) Named entities with an exclamation point |
117 | 158 | ( |
118 | 159 | "She works at Yahoo! in the accounting department.", |
119 | 160 | ["She works at Yahoo! in the accounting department."], |
120 | 161 | ), |
| 162 | + # 42) I as a sentence boundary and I as an abbreviation |
121 | 163 | ( |
122 | 164 | "We make a good team, you and I. Did you see Albert I. Jones yesterday?", |
123 | 165 | [ |
124 | 166 | "We make a good team, you and I.", |
125 | 167 | "Did you see Albert I. Jones yesterday?" |
126 | 168 | ], |
127 | 169 | ), |
| 170 | + # 43) Ellipsis at end of quotation |
128 | 171 | ( |
129 | 172 | "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”", |
130 | 173 | [ |
131 | 174 | "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”" |
132 | 175 | ], |
133 | 176 | ), |
| 177 | + # 44) Ellipsis with square brackets |
134 | 178 | ( |
135 | 179 | """"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).""", |
136 | 180 | [ |
137 | 181 | '"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).' |
138 | 182 | ], |
139 | 183 | ), |
| 184 | + # 45) Ellipsis as sentence boundary (standard ellipsis rules) |
140 | 185 | ("If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.", |
141 | 186 | [ |
142 | 187 | "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", |
143 | 188 | "Next sentence." |
144 | 189 | ]), |
| 190 | + # 46) Ellipsis as sentence boundary (non-standard ellipsis rules) |
145 | 191 | ( |
146 | 192 | "I never meant that.... She left the store.", |
147 | 193 | ["I never meant that....", "She left the store."], |
148 | 194 | ), |
| 195 | + # 47) Ellipsis as non sentence boundary |
149 | 196 | ( |
150 | 197 | "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.", |
151 | 198 | [ |
152 | 199 | "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it." |
153 | 200 | ], |
154 | 201 | ), |
| 202 | + # 48) 4-dot ellipsis |
155 | 203 | ( |
156 | 204 | "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", |
157 | 205 | [ |
|
0 commit comments