-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreateCSVfromSeparatedPheno.py
More file actions
58 lines (48 loc) · 2.52 KB
/
createCSVfromSeparatedPheno.py
File metadata and controls
58 lines (48 loc) · 2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import csv
import sys
import argparse
import os
def parseArgs():
    """Parse and validate the command-line arguments.

    Options:
        -i / --input_dir: directory holding the per-phenotype CSV files
            (default ``separated_by_phenotype``).
        -o / --output: path of the CSV file to write
            (default ``ml_data.csv``).

    Returns:
        The ``argparse.Namespace`` with ``input_dir`` and ``output`` set.

    Raises:
        SystemExit: with a non-zero status when ``input_dir`` is not an
            existing directory (or when argparse itself rejects the
            command line).
    """
    parser = argparse.ArgumentParser(description='Parse ADSP files that were previously separated by phenotype.')
    parser.add_argument('-i', '--input_dir', default='separated_by_phenotype', type=str, required=False, help='(input) Input directory with 3 files (more allowed, but will not be read): cases.csv, controls.csv, and mci.csv')
    parser.add_argument('-o', '--output', type=str, default="ml_data.csv", required=False, help='(output) Path to output CSV with formatted ADNI data')
    args = parser.parse_args()
    if not os.path.isdir(args.input_dir):
        # parser.error() prints the usage line plus the message to stderr and
        # exits with status 2, so calling scripts can detect the failure
        # (a bare sys.exit() would report success).
        parser.error("Must input a valid directory path with the -i option")
    return args
def main():
    """Merge the per-phenotype CSV files into one labelled CSV.

    Reads ``cases.csv``, ``controls.csv`` and ``mci.csv`` (in that order)
    from the input directory given on the command line. For each file it
    keeps only the selected columns that the file actually contains and adds
    a ``Class`` column whose value depends on the source file ('AD',
    'Control' or 'MCI'). An ``Age`` of '90+' is rewritten as '90', and any
    row holding the literal string 'NA' in a kept column is skipped.

    The header row is written once, using the columns found in the first
    file. The output file is overwritten on every run.

    Raises:
        FileNotFoundError: if one of the three input files is missing.
    """
    args = parseArgs()
    selected_headers = ['SUBJID', 'Sex', 'Age', 'APOE_reported', 'Race']
    # Source file name -> label stored in the 'Class' column. Insertion order
    # is the processing order.
    class_by_file = {
        'cases.csv': 'AD',
        'controls.csv': 'Control',
        'mci.csv': 'MCI',
    }
    header_written = False
    # Open the output once, in write mode: appending would leave a second
    # header row in the middle of the file whenever the script is rerun
    # against an existing output.
    with open(args.output, 'w', newline='') as outfile:
        for file_name, class_label in class_by_file.items():
            input_file = os.path.join(args.input_dir, file_name)
            with open(input_file, 'r', newline='') as infile:
                reader = csv.DictReader(infile)
                # fieldnames is None for an empty file; treat it as having
                # no columns instead of failing on the membership test.
                available = reader.fieldnames or []
                kept = [header for header in selected_headers if header in available]
                writer = csv.DictWriter(outfile, fieldnames=kept + ['Class'])
                if not header_written:
                    writer.writeheader()
                    header_written = True
                for row in reader:
                    new_row = {header: row[header] for header in kept}
                    new_row['Class'] = class_label
                    # 'Age' may be absent from this file, so use .get()
                    # rather than indexing.
                    if new_row.get('Age') == '90+':
                        new_row['Age'] = '90'
                    if not any(value == 'NA' for value in new_row.values()):
                        writer.writerow(new_row)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
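# NOTE: main() already skips rows with the literal 'NA' in any kept column; other missing-value markers (e.g. empty fields) are not filtered and must be removed afterwards.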