-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtidy.py
More file actions
49 lines (40 loc) · 1.31 KB
/
tidy.py
File metadata and controls
49 lines (40 loc) · 1.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import pandas
from collections import Counter
import numpy as np
import csv, ast
# Calculate term frequencies, step 1: gather the raw words per class.
# row[4] is parsed as a Python literal (expected to be an iterable of words)
# and row[2] as a number; rows with a value <= 2.7 feed bad_list, all other
# rows feed good_list.
# NOTE(review): row[2] is presumably the score -- confirm against the CSV.
bad_list = []
good_list = []

# Everything a malformed row can raise: a missing column (IndexError), a
# non-numeric row[2] (ValueError), a literal that is not iterable (TypeError),
# plus the errors ast.literal_eval documents for bad input.
_ROW_ERRORS = (
    IndexError,
    ValueError,
    TypeError,
    SyntaxError,
    MemoryError,
    RecursionError,
)

# newline='' is what the csv module asks for, so that newlines embedded in
# quoted fields are handled by the csv parser itself.
with open('data_diana.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    count = 0  # number of rows skipped because they could not be parsed
    for row in reader:
        try:
            words = ast.literal_eval(row[4])
            # float() runs before anything is appended, so a row with a bad
            # number contributes no words at all.
            target = bad_list if float(row[2]) <= 2.7 else good_list
            target.extend(words)
        except _ROW_ERRORS:
            # Best effort: skip the row (e.g. a header line, if present) and
            # keep going.  Unlike a bare ``except:`` this no longer swallows
            # KeyboardInterrupt, SystemExit or unrelated programming errors.
            count += 1
print(count)
# Group by score: turn each bag of words into a {word: proportion} dictionary.
n1 = len(good_list)  # total word occurrences collected in good_list
n2 = len(bad_list)   # total word occurrences collected in bad_list
good_count = Counter(good_list)
bad_count = Counter(bad_list)
print(type(bad_count))
# Normalise every count by the total number of occurrences in its own class,
# so both dictionaries hold real proportions (each sums to 1) and are directly
# comparable.  Dividing by the number of *distinct* words -- and by a different
# denominator for each class -- made the good/bad ratio depend on vocabulary
# size rather than on how often a word is actually used.
# An empty class yields an empty Counter, so no division by zero can occur.
good_freq = {item: num / n1 for item, num in good_count.most_common()}
bad_freq = {item: num / n2 for item, num in bad_count.most_common()}
# Score every word that occurs in BOTH classes with
#     ln(bad proportion / good proportion) * bad proportion
# Words that occur only in the bad class are never reached here (only the keys
# of good_freq are visited); separating those out is still to be done.
diff = {
    word: np.log(bad_freq[word] / good_share) * bad_freq[word]
    for word, good_share in good_freq.items()
    if word in bad_freq
}

# Rank from the highest score to the lowest and print just the words.
tfidf = sorted(diff.items(), key=lambda pair: -pair[1])
for word, _score in tfidf:
    print(word)