nt_context_evaluation/parseWindow.py at main · jmillerlab/nt_context_evaluation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#! /usr/bin/env python3
'''
This program reads the output of run_nt.py. Each prediction array written by run_nt.py is a snapshot of a single window. Here, we combine all of those
snapshots into one list per position, holding every prediction made at the nth position of an input sequence. This makes it easier to query predictions at the
beginning or end of the sequence given to SegmentNT.
'''

import sys
import argparse


def parseArgs():
    '''
    Build the command-line parser and parse sys.argv.

    Both options are required strings:
      -i / --input   path stored on the result as ``input``
      -o / --output  path stored on the result as ``output``

    Returns the argparse namespace produced by ``parse_args()``.
    '''
    cli = argparse.ArgumentParser(
        description='Parse output from run_nt.py to make it easier to find predictions at any position in the input sequence for exons and introns'
    )
    # (flags, help text) for every option; all share the same type/required settings.
    options = (
        (('-i', '--input'), '(input) Required input file from run_nt.py output.'),
        (('-o', '--output'), '(output) Required output text file'),
    )
    for flags, helpText in options:
        cli.add_argument(*flags, type=str, required=True, help=helpText)
    return cli.parse_args()

def reformatRunNTResults(args):
    intron_pred=[]
    exon_pred=[] #list of lists with each position in the list being an array of all predictions at that context level
    inArray=False
    intron=False
    exon=False
    pos=0
    lastRow=False
    lastlastRow=False

    #Read each line of the run_nt.py output
    with open(args.input) as inputF: ###open(output_from run_nt.py)
        for line in inputF:
            line=line.strip()
            if not inArray:
                if not inArray and not line.startswith("Probabilities,"):
                    continue
            if line.startswith("Probabilities, "):
                line=line[15:] #remove "Probabilities, " from the line
                pos=0
                if line.startswith('intron: [['):
                    inArray=True
                    intron=True
                    line=line[10:] #remove 'intron: [[' from the line
                elif line.startswith('exon: [['):
                    inArray=True
                    line=line[8:] #remove 'exon: [[' from the line
                    exon=True
            elif line.startswith('['):
                line=line[1:]
                pos=0
            if line.endswith("]]"):
                lastlastRow=True
                line=line[0:-2] #remove the bracket from the end of the line
            elif line.endswith("]"):
                line=line[0:-1] #remove the bracket from the end of the line
                lastRow=True

            predictions=map(float,line.split()) #get predictions and store as floats. Ensures that all values are floats.

            for num in predictions:
                if intron:
                    if len(intron_pred)==pos:
                        intron_pred.append([])
                    intron_pred[pos].append(num)
                elif exon:
                    if len(exon_pred)==pos:
                        exon_pred.append([])
                    exon_pred[pos].append(num)
                pos+=1
            if lastRow:
                pos=0
                lastRow=False
            if lastlastRow:
                inArray=False
                intron=False
                exon=False
                lastlastRow=False

    with open(args.output,'w') as output:
        output.write(str(len(intron_pred)) +" " +str(len(exon_pred)) +"\n")

        output.write("Intron\n")
        for array in intron_pred:
            output.write(str(array) +"\n")

        output.write("\nExon\n")
        for array in exon_pred:
            output.write(str(array)+"\n")

def main():
    '''Script entry point: parse the command line, then reformat the run_nt.py output.'''
    reformatRunNTResults(parseArgs())

# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()