Commit 89dbbd2

Author: Mark Neumann
my benchmarks

1 parent 85d33fe commit 89dbbd2

File tree

2 files changed: +184, -0 lines changed


benchmarks/genia_benchmark.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
import blingfire
import nltk
import pysbd
import spacy
import stanza

import syntok
from syntok.tokenizer import Tokenizer
import syntok.segmenter as syntok_segmenter

from pathlib import Path

# Set up one segmenter/pipeline per library under comparison.
pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
#stanza.download('en')
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

syntok_tokenizer = Tokenizer()

def blingfire_tokenize(text):
    return blingfire.text_to_sentences(text).split('\n')

def nltk_tokenize(text):
    return nltk.sent_tokenize(text)

def pysbd_tokenize(text):
    return pysbd_segmenter.segment(text)

def spacy_tokenize(text):
    return [sent.text.strip("\n") for sent in nlp(text).sents]

def spacy_dep_tokenize(text):
    return [sent.text.strip("\n") for sent in nlp_dep(text).sents]

def stanza_tokenize(text):
    return [e.text for e in stanza_nlp(text).sentences]

def make_sentences(segmented_tokens):
    for sentence in segmented_tokens:
        yield "".join(str(token) for token in sentence).strip()

def syntok_tokenize(text):
    tokens = syntok_tokenizer.split(text)
    result = syntok_segmenter.split(iter(tokens))
    segments = [sent for sent in make_sentences(result)]
    return segments

def load_genia_corpus(genia_raw_dir):
    # Each GENIA abstract is a .txt file with one gold sentence per line.
    txtfiles = Path(genia_raw_dir).glob("**/*.txt")
    txtfiles = list(txtfiles)
    all_docs = []
    for ind, txtfile in enumerate(txtfiles, start=1):
        with open(txtfile) as f:
            geniatext = f.read().strip()
            expected = geniatext.split('\n')
            all_docs.append((geniatext, expected))

    return all_docs

def benchmark(docs, tokenize_func):
    # A document only counts as correct if the predicted segmentation
    # matches the newline-split gold sentences exactly.
    correct = 0
    for (text, expected) in docs:
        segments = tokenize_func(text)
        if segments == expected:
            correct += 1
    return correct


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--genia',
        help="Path to the directory containing genia data."
    )

    args = parser.parse_args()

    libraries = (
        blingfire_tokenize,
        nltk_tokenize,
        pysbd_tokenize,
        spacy_tokenize,
        spacy_dep_tokenize,
        stanza_tokenize,
        syntok_tokenize
    )

    docs = load_genia_corpus(args.genia)
    total = len(docs)
    for tokenize_func in libraries:
        correct = benchmark(docs, tokenize_func)
        percent_score = correct/total * 100
        print()
        print(tokenize_func.__name__)
        print(f'GENIA abstract acc: {percent_score:0.2f}%')
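
For context, a minimal sketch of how the helpers above fit together, written as if appended inside genia_benchmark.py. The sample text and temporary directory are made up; the real benchmark is driven by --genia pointing at the raw GENIA abstracts, each with one gold sentence per line.

import os
import tempfile

with tempfile.TemporaryDirectory() as d:
    with open(os.path.join(d, "sample.txt"), "w") as f:
        f.write("Sentence one.\nSentence two.")
    docs = load_genia_corpus(d)
    text, expected = docs[0]
    # The gold segmentation is simply the newline-split abstract.
    assert expected == ["Sentence one.", "Sentence two."]
    # A tokenizer scores a document only when it reproduces that list exactly.
    print(benchmark(docs, lambda t: t.split("\n")))  # -> 1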

benchmarks/mark_benchmark.py

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
import blingfire
import nltk
import pysbd
import spacy
import stanza

import syntok
from syntok.tokenizer import Tokenizer
import syntok.segmenter as syntok_segmenter

from english_golden_rules import GOLDEN_EN_RULES

# Set up one segmenter/pipeline per library under comparison.
pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
#stanza.download('en')
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

syntok_tokenizer = Tokenizer()

def blingfire_tokenize(text):
    return blingfire.text_to_sentences(text).split('\n')

def nltk_tokenize(text):
    return nltk.sent_tokenize(text)

def pysbd_tokenize(text):
    return pysbd_segmenter.segment(text)

def spacy_tokenize(text):
    return [sent.text for sent in nlp(text).sents]

def spacy_dep_tokenize(text):
    return [sent.text for sent in nlp_dep(text).sents]

def stanza_tokenize(text):
    return [e.text for e in stanza_nlp(text).sentences]

def make_sentences(segmented_tokens):
    for sentence in segmented_tokens:
        yield "".join(str(token) for token in sentence).strip()

def syntok_tokenize(text):
    tokens = syntok_tokenizer.split(text)
    result = syntok_segmenter.split(iter(tokens))
    segments = [sent for sent in make_sentences(result)]
    return segments


total_rules = len(GOLDEN_EN_RULES)

def benchmark(golden_rules, tokenize_func):
    # Each rule is a (text, expected sentence list) pair; a rule passes only on
    # an exact match. Note the score is normalised by the full GOLDEN_EN_RULES
    # count, not len(golden_rules).
    score = 0
    for rule in golden_rules:
        text, expected = rule
        segments = tokenize_func(text)
        if segments == expected:
            score += 1
    percent_score = (score / total_rules) * 100.0

    return percent_score

if __name__ == "__main__":
    import time
    libraries = (
        blingfire_tokenize,
        nltk_tokenize,
        pysbd_tokenize,
        spacy_tokenize,
        spacy_dep_tokenize,
        stanza_tokenize,
        syntok_tokenize)
    # Run each library over the rule set 100 times and report the average time.
    for tokenize_func in libraries:
        t = time.time()
        for i in range(100):
            percent_score = benchmark(GOLDEN_EN_RULES, tokenize_func)

        time_taken = time.time() - t
        print()
        print(tokenize_func.__name__)
        print(f'GRS score: {percent_score:0.2f}%')
        print(f'Speed(Avg over 100 runs): {time_taken*1000/100:>10.2f} ms')
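
For reference, benchmark() above only relies on each entry of GOLDEN_EN_RULES being a (text, expected sentence list) pair whose gold list is compared verbatim against a segmenter's output. A hedged sketch with a made-up rule, written as if appended inside mark_benchmark.py (library results are printed, not asserted):

toy_rule = ("Dr. Smith arrived. He sat down.", ["Dr. Smith arrived.", "He sat down."])
text, expected = toy_rule
for fn in (pysbd_tokenize, nltk_tokenize, syntok_tokenize):
    # True only when the library reproduces the expected list exactly.
    print(fn.__name__, fn(text) == expected)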
