-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgbk_to_faa_batch.py
More file actions
executable file
·157 lines (128 loc) · 4.4 KB
/
gbk_to_faa_batch.py
File metadata and controls
executable file
·157 lines (128 loc) · 4.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
"""
BATCH writes out all protein fasta sequences from a Genbank file
Input:
- path to input directory containing Genbank files
Output:
- protein fasta (.faa)
"""
# TODO:
# - [x] add ability to read gzipped genbank_files
# - [ ] add regex to filter input files to .gbk or .gbff
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from Bio import SeqIO
from pathlib import Path
from typing import TextIO
import argparse
import gzip
import sys
# =================================================================
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"-i",
"--input_dir",
dest="input_dir",
type=Path,
metavar="FILE",
required=True,
help="Path to target directory containing all input Genbank files for conversion (.gbk or .gbff) [Required]",
)
parser.add_argument(
"-o",
"--outdir",
dest="outdir",
type=Path,
metavar="DIR",
required=False,
help="Path output directory for faa files [Default: same dir as input dir]",
)
parser.add_argument(
"-c",
"--cpu",
dest="cpu",
type=int,
default=None,
metavar="N",
required=False,
help="No. of CPUs to use for parallelism [Default: max available]",
)
parser.add_argument(
"--no_desc",
dest="no_desc",
action="store_true",
help="Write out fasta file with just the accession ID in the header, but no description",
)
args = parser.parse_args()
# create default outdir from input dir
if args.outdir is None:
input_parentdir = Path(args.input_dir)
args.outdir = input_parentdir
return args
# =============================================================
def open_gz(file: Path) -> TextIO:
"""Utility function: open file, even if it is gzipped"""
if file.suffix == ".gz":
return gzip.open(file, "rt")
else:
return open(file, "r")
# =================================================================
def gbk_to_faa(
genbank_file: Path,
outdir: Path,
args: argparse.Namespace,
) -> None:
"""Reads Genbank file and writes to file
Args:
genbank_file (Path): path to genbank_file
Returns:
None: writes output .faa file that shares the same name as the input genbank_file
"""
############# make output file ###################
outfaa = Path(outdir) / f"{genbank_file.stem}.faa"
if outfaa.exists():
outfaa.unlink()
outfaa.parent.mkdir(parents=True, exist_ok=True)
############# read genbank and write faa ###################
with (
open_gz(file=genbank_file) as infile,
open(file=outfaa, mode="w") as outfile,
):
for rec in SeqIO.parse(infile, "genbank"):
for feat in rec.features:
if feat.type != "CDS":
continue
# extract faa info from qualifiers dict[str, list[str]]
faa_id = feat.qualifiers.get("locus_tag", [0])[0]
faa_seq = feat.qualifiers.get("translation", [0])[0]
# WARN: key for descriptions can vary between genbanks
# faa_desc = feat.qualifiers.get("product", [0])[0]
faa_desc = feat.qualifiers.get("annotation", [0])[0]
if args.no_desc:
outfile.write(f">{faa_id}\n{faa_seq}\n")
else:
outfile.write(f">{faa_id} {faa_desc}\n{faa_seq}\n")
# =================================================================
def main() -> None:
# parse args
args = parse_args()
# get all input genbank files
gbk_files = sorted([f for f in Path(args.input_dir).glob("*.gb*") if f.is_file()])
############### PARALLEL PROCESSING ###################
# gbk_to_faa(genbank_file=args.input, outdir=args.outdir, args=args)
# make partial func
partial_gbk_to_faa = partial(
gbk_to_faa,
outdir=args.outdir,
args=args,
)
# ProcessPoolExecutor
with ProcessPoolExecutor(max_workers=args.cpu) as exe:
list(exe.map(partial_gbk_to_faa, gbk_files))
# =================================================================
if __name__ == "__main__":
sys.exit(main())