-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnaivebase.py
More file actions
103 lines (94 loc) · 3.47 KB
/
naivebase.py
File metadata and controls
103 lines (94 loc) · 3.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 24 14:52:00 2018
@author: Prem chand
"""
import pandas as pd
import collections
import nltk.classify.util
import nltk.metrics
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
#import re
import itertools
from nltk.classify import NaiveBayesClassifier
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
import matplotlib.pyplot as plt
stop_words=set(stopwords.words('english'))
my_set=set(line.strip() for line in open('stopwords.txt'))
def labeltoint(label):
if label == 'pos':
return 1
else:
return 0
#remove symbols,Punctuation and break tags
import re
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
def cleanSentences(string):
string = string.lower().replace("<br />", " ")
string.replace('[^\w\s]','')
return re.sub(strip_special_chars, "", string.lower())
def bigrams(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
words=words.split(" ")
words = [w for w in words if not w in stop_words]
words = [w for w in words if not w in my_set]
porter = PorterStemmer()
words = [porter.stem(word) for word in words]
bigram_finder = BigramCollocationFinder.from_words(words)
bigrams = bigram_finder.nbest(score_fn, n)
return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
classifier=None
def naiveBaseClassifier(featx):
posdata=pd.read_csv(r'posReview1.csv', sep=',')
negdata=pd.read_csv(r'negReview1.csv', sep=',')
poslist=posdata["Review"].tolist()
neglist=negdata["Review"].tolist()
neglist=neglist[0:12248]+neglist[12250:]
finalpos=[]
finalneg=[]
for item in poslist:
item1=cleanSentences(item)
finalpos.append(item1)
for item in neglist:
item1=cleanSentences(item)
finalneg.append(item1)
negfeats = [(featx(review),'neg') for review in finalneg]
posfeats = [(featx(review), 'pos') for review in finalpos]
negcutoff = int(len(negfeats)*0.5)
poscutoff = int(len(posfeats)*0.5)
traindata = negfeats[:negcutoff] + posfeats[:poscutoff]
testdata = negfeats[negcutoff:] + posfeats[poscutoff:]
classifier = NaiveBayesClassifier.train(traindata)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
y_true, y_score = [], []
for i, (feats, label) in enumerate(testdata):
refsets[label].add(i)
observed = classifier.classify(feats)
y_true.append(labeltoint(label))
y_score.append(labeltoint(observed))
testsets[observed].add(i)
print('accuracy:', nltk.classify.util.accuracy(classifier, testdata))
# Precision-Recall AUC
precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=1)
pr_auc = auc(recall, precision)
print("Precision-Recall AUC: %.2f" % pr_auc)
# ROC AUC
fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=1)
roc_auc = auc(fpr, tpr)
print ("ROC AUC: %.2f" % roc_auc)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
classifier.show_most_informative_features()
naiveBaseClassifier(bigrams)