forked from udacity/cd0583-diagnose-and-fix
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain.py
More file actions
78 lines (61 loc) · 2.59 KB
/
train.py
File metadata and controls
78 lines (61 loc) · 2.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Silence library deprecation/user warnings so the training log stays readable.
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Standard library
import io
import json
import zipfile

# Third-party
import pandas as pd
import requests
from evidently.model_profile import Profile
from evidently.model_profile.sections import DataDriftProfileSection
from evidently.pipeline.column_mapping import ColumnMapping
import mlflow
import mlflow.sklearn  # NOTE(review): not used in this file; kept in case other code relies on it
from mlflow.tracking import MlflowClient
# --- Load the UCI Bike Sharing daily dataset ---------------------------------
# Download the zipped archive and read day.csv straight from memory, indexing
# the frame by the 'dteday' date column so date-string slicing works below.
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip"
archive_bytes = requests.get(DATA_URL).content
with zipfile.ZipFile(io.BytesIO(archive_bytes)) as archive:
    raw_data = pd.read_csv(
        archive.open("day.csv"),
        header=0,
        sep=',',
        parse_dates=['dteday'],
        index_col='dteday',
    )

# --- Column mapping for the Evidently profile --------------------------------
# Declare which columns Evidently should treat as numeric vs. categorical.
data_columns = ColumnMapping()
data_columns.numerical_features = ['weathersit', 'temp', 'atemp', 'hum', 'windspeed']
data_columns.categorical_features = ['holiday', 'workingday']
# Evaluate data drift with an Evidently Profile.
def eval_drift(reference, production, column_mapping):
    """Score per-feature drift between a reference and a production DataFrame.

    Runs an Evidently DataDrift profile over the two frames and pulls out the
    drift score for every numerical and categorical feature declared in
    *column_mapping*.

    Returns a list of (feature_name, drift_score) tuples, in the order the
    features appear in the mapping (numerical first, then categorical).
    """
    profile = Profile(sections=[DataDriftProfileSection()])
    profile.calculate(reference, production, column_mapping=column_mapping)
    parsed = json.loads(profile.json())
    # NOTE(review): this path matches the legacy evidently Profile JSON layout —
    # confirm it still holds for the pinned evidently version.
    metric_by_feature = parsed['data_drift']['data']['metrics']
    features = column_mapping.numerical_features + column_mapping.categorical_features
    return [(name, metric_by_feature[name]['drift_score']) for name in features]
#set reference dates
# Baseline window (first four weeks of 2011) that every monthly batch below is
# compared against when computing drift.
reference_dates = ('2011-01-01 00:00:00','2011-01-28 23:00:00')
#set experiment batches dates
# One (start, end) window per monthly batch; each window becomes its own
# MLflow run in the loop at the bottom of the file.
experiment_batches = [
    ('2011-02-01 00:00:00','2011-02-28 23:00:00'),
    ('2011-03-01 00:00:00','2011-03-31 23:00:00'),
    ('2011-04-01 00:00:00','2011-04-30 23:00:00'),
    ('2011-05-01 00:00:00','2011-05-31 23:00:00'),
    ('2011-06-01 00:00:00','2011-06-30 23:00:00'),
    ('2011-07-01 00:00:00','2011-07-31 23:00:00'),
]
#log into MLflow
client = MlflowClient()
#set experiment
mlflow.set_experiment('Data Drift Evaluation with Evidently')
# One MLflow run per monthly batch: log the batch window bounds as params and
# each feature's drift score (computed against the reference window) as a metric.
for batch_start, batch_end in experiment_batches:
    with mlflow.start_run() as run:  # a run_name='...' kwarg could label runs
        # Log the batch window as run parameters.
        mlflow.log_param("begin", batch_start)
        mlflow.log_param("end", batch_end)
        # Score drift of this batch versus the fixed reference window.
        drift_scores = eval_drift(
            raw_data.loc[reference_dates[0]:reference_dates[1]],
            raw_data.loc[batch_start:batch_end],
            column_mapping=data_columns,
        )
        for feature_name, score in drift_scores:
            mlflow.log_metric(feature_name, round(score, 3))
        print(run.info)