forked from R3dFruitRollUp/High-Dim-TS-Medium
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_cleaning.py
More file actions
35 lines (24 loc) · 800 Bytes
/
data_cleaning.py
File metadata and controls
35 lines (24 loc) · 800 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
def cleaner(file):
    """
    Load a CSV file and drop the columns that carry no information.

    A column is removed when every value in it is NaN, or when every
    value in it is 0. Columns that contain only some NaNs or some zeros
    are kept. Rows are never removed.

    Parameters
    ==========
    file: csv file (path or file-like object) with features and labels
        to clean. It must contain a 'timestamp' column, which becomes
        the index of the result.

    Returns
    ==========
    dataset: cleaned DataFrame, indexed by 'timestamp', without the
        all-NaN and all-zero columns.
    """
    df_all = pd.read_csv(file, index_col='timestamp')

    # remove cols with all nans
    df_not_null = df_all.dropna(axis=1, how='all')

    # remove cols with all 0s; NaN compares as != 0, so a column that
    # mixes zeros and NaNs counts as non-zero and is kept
    has_non_zero = (df_not_null != 0).any(axis=0)
    dataset = df_not_null.loc[:, has_non_zero]

    return dataset