-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript v6.py
More file actions
110 lines (98 loc) · 4.1 KB
/
script v6.py
File metadata and controls
110 lines (98 loc) · 4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import json
import numpy as np
from scipy.stats import linregress
import pandas as pd
from sklearn.linear_model import LinearRegression
from itertools import combinations
def model_stats(final_set):
    """Fit a multivariate linear regression on ``final_set`` and summarize it.

    This function reads the module-level names ``normdata``, ``refcode``,
    ``entry`` and ``y``, which the driver loop sets for the structure that is
    currently being processed.

    Args:
        final_set: List of predictor names (keys of ``entry``) to regress on.

    Returns:
        A three-element list ``[accuracy, coeffs, max_corrs]`` where
        ``accuracy`` is the score (R^2) of the fit on the data it was fitted
        to, ``coeffs`` maps each predictor name to its fitted coefficient, and
        ``max_corrs`` is the largest absolute off-diagonal entry of the
        predictors' correlation matrix (0.0 when there is no such entry).
    """
    df = pd.DataFrame(normdata[refcode], columns=final_set)
    corr = df.corr()

    # Rows are samples and columns are predictors after the transpose.
    X = np.transpose([entry[p] for p in final_set])
    reg = LinearRegression()
    fit = reg.fit(X, y)
    accuracy = float(reg.score(X, y))

    coeffs = {name: fit.coef_[i] for i, name in enumerate(final_set)}

    # Compute the off-diagonal maximum on a plain array copy. Writing
    # ``corr[a][b] = 0`` is chained assignment, which does not update ``corr``
    # under pandas Copy-on-Write and would leave the diagonal 1.0 in place.
    abs_corr = np.abs(np.asarray(corr, dtype=float))
    np.fill_diagonal(abs_corr, 0.0)
    # NaN correlations never won the original max() comparison; ignore them.
    abs_corr[np.isnan(abs_corr)] = 0.0
    max_corrs = float(abs_corr.max()) if abs_corr.size else 0.0

    results = [accuracy, coeffs, max_corrs]
    return results
# Acceptance thresholds for a candidate model: the minimum R^2 it must reach
# and the maximum absolute correlation allowed between two of its predictors.
cutoff_r2 = 0.80
cutoff_corr = 0.50

# Load the normalized data.
with open('normdata.json', 'r') as f:
    normdata = json.load(f)

descriptor = 'dn2'
predictors = ['dm2', 'cn2_x', 'cn2_y', 'cn2_z', 'm2n2_angle']
coefficients = {}

# NOTE: model_stats() reads `normdata`, `refcode`, `entry` and `y` as module
# globals, so this loop must stay at module level and keep those names.
for refcode, entry in normdata.items():
    print('Now processing', refcode)
    y = entry[descriptor]

    # Single-variate screening: drop predictors whose fitted slope against
    # the descriptor is below 0.1 in magnitude, always keeping at least one.
    inputs = list(predictors)
    for p in predictors:
        slope = linregress(entry[p], y).slope
        if abs(slope) < 0.1 and len(inputs) > 1:
            inputs.remove(p)

    # Correlation matrix of the surviving predictors.
    corr = pd.DataFrame(entry, columns=inputs).corr()

    # Predictors with no strongly correlated partner are always kept
    # (off_limits); the rest (mutable) are tried in every combination.
    # Comparing names (a != b) skips the diagonal without also skipping
    # distinct predictors whose correlation is exactly +/-1.
    off_limits = []
    mutable = []
    for a in inputs:
        has_partner = any(
            a != b and abs(corr.at[a, b]) > 0.6 for b in inputs
        )
        if has_partner:
            mutable.append(a)
        else:
            off_limits.append(a)

    # Build the list of predictor sets to fit.
    if mutable and off_limits:
        # Keep every uncorrelated predictor and add each subset of the
        # correlated ones, including the empty subset and the full set.
        candidates = [
            off_limits + list(combo)
            for size in range(len(mutable) + 1)
            for combo in combinations(mutable, size)
        ]
    elif mutable:
        # Every predictor has a strongly correlated partner.
        # NOTE(review): this range stops before len(mutable), so the model
        # using all predictors at once is never tried here, unlike the branch
        # above - confirm that this is intended.
        candidates = [
            list(combo)
            for size in range(1, len(mutable))
            for combo in combinations(mutable, size)
        ]
    else:
        # No strongly correlated predictors: the only candidate is the full
        # set of uncorrelated predictors.
        candidates = [off_limits]

    R2list = []
    coeffList = []
    corrMaxList = []
    for finals in candidates:
        stats = model_stats(finals)
        R2list.append(stats[0])
        coeffList.append(stats[1])
        corrMaxList.append(stats[2])

    # Keep every model that satisfies both cutoffs.
    passing = [
        (r2, corr_max, coeffs)
        for r2, corr_max, coeffs in zip(R2list, corrMaxList, coeffList)
        if r2 >= cutoff_r2 and corr_max <= cutoff_corr
    ]
    if not passing:
        # Nothing passed the cutoffs: fall back to the first model with the
        # highest R^2.
        max_index = R2list.index(max(R2list))
        passing = [
            (R2list[max_index], corrMaxList[max_index], coeffList[max_index])
        ]

    coefficients[refcode] = {
        'r2': [model[0] for model in passing],
        'corr_max': [model[1] for model in passing],
        'predictors': [model[2].copy() for model in passing],
    }

with open('coefficients6.txt', 'w') as outfile:
    json.dump(coefficients, outfile, indent=4)