RAD_Tools/VCF2phy.py at master · CoBiG2/RAD_Tools · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
#!/usr/bin/python3
# Copyright 2015 Diogo N Silva <[email protected]>
# This file is part of VCP2phy.
# VCP2phy is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# VCP2phy is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with VCP2phy. If not, see <http://www.gnu.org/licenses/>.

import argparse
from collections import defaultdict, OrderedDict

parser = argparse.ArgumentParser(description="Converter of VCF into phylip "
											 "format")
parser.add_argument("-vcf", dest="vcf_file", help="VCF file")
parser.add_argument("-loci", dest="loci_file", help="If provided, the output "
					"file will contain the full sequences contained in the "
					" loci file.")
parser.add_argument("-o", dest="output_file", help="Name of output file")

arg = parser.parse_args()

iupac = {"AG": "R", "CT": "Y", "CG": "S", "AT": "W", "GT": "K", "AC": "M",
         "CGT": "B", "AGT": "D", "ACT": "H", "ACG": "V", "ACGT": "N", "AA": "A",
		 "CC": "C", "GG": "G", "TT":"T"}

def parse_vcf(vcf_file):
	"""
	Parses the VCF file and returns a dictionary with the loci (chromossomes)
	as keys and a list of positions as values.
	"""

	loci = defaultdict(list)

	vcf_handle = open(vcf_file)

	for line in vcf_handle:

		if line.startswith("##"):
			pass

		elif line.startswith("#CHROM"):
			taxa_list = line.strip().split()[9:]

		elif line.strip() != "":
			fields = line.strip().split()
			loci[int(fields[0].split("_")[-1])].append(int(fields[1]) - 1)

	vcf_handle.close()

	return loci, taxa_list


def convert_genotype(genotype_string, variants):
	"""
	Converts a genotype string from the vcf ("0|0") into an actual genotype
	("A"), provided the variants for that particular site
	"""

	# Remove separators
	genotype_string =  genotype_string.replace("|", "").replace("/", "").upper()

	if genotype_string == "..":
		return "N"

	# Convert to actual bases
	actual_genotype = "".join(sorted([variants[int(x)] for x in
									  genotype_string]))

	return iupac[actual_genotype]


def parse_vcf_variants(vcf_file):
	"""
	Parses the VCF but retains the variation information. This is done when
	converting a VCF directly to phylip without using a .loci file. The
	resulting phylip file will contain only variable sites, though.
	"""

	vcf_handle = open(vcf_file)
	# Will store the sequence data for each taxon in the VCF


	for line in vcf_handle:

		if line.startswith("##"):
			pass

		elif line.startswith("#CHROM"):
			# List of taxa that will retain their order in the VCF
			taxa_list = line.strip().split()[9:]
			# Will store the sequence data for each taxon in the VCF
			seq_data = dict((tx,[]) for tx in taxa_list)

		# Skip empty lines and read the VCF data
		elif line.strip() != "":
			fields = line.strip().split()
			# e.g. ("A","T")
			variants = [fields[3]] + fields[4].split(",")
			# e.g. ["0|0", ".|.", "1|1"]
			genotypes = [x.split(":")[0] for x in fields[9:]]

			# Add information to seq_data for each data point
			for p,gen in enumerate(genotypes):

				# get the genotype
				genotype = convert_genotype(gen, variants)

				# Get taxon
				tx = taxa_list[p]

				seq_data[tx].append(genotype)

	return seq_data


def mask_alignment(aln, var_positions):
	"""
	Parses an alignment in an OrderedDict format {taxon: seq} and returns a
	similar alignment OrderedDict with the variable positions not present in the
	var_positions list masked
	"""

	masked_dic = OrderedDict((tx, []) for tx in aln)

	for p, column in enumerate(zip(*aln.values())):

		gapless_column = len(set([x for x in column if x.lower() not in ["-", "n"]]))

		# If this represents a real variable position or a column with no
		# variation (excluding missing data), save sequences as they
		# are
		if p in var_positions or (column not in var_positions and
							      gapless_column == 1):

			# This appends the existing nucleotide of each taxon to its
			# corresponding taxon in masked_dic
			list(map(lambda char, tx: masked_dic[tx].append(char),
			                column, list(aln.keys())))

		elif p not in var_positions and gapless_column > 1:

			# This appends an "n" to each taxon in masked_dic
			list(map(lambda tx: masked_dic[tx].append("n"), list(aln.keys())))

	masked_dic = OrderedDict((tx, "".join(seq)) for tx, seq in masked_dic.items())

	return masked_dic


def parse_loci(loci_file, vcf_loci, taxa_list):
	"""
	Writes a phylip file from a loci file, according to the positions in the
	VCF. Variable positions that are not present in the VCF are soft masked
	"""

	fh = open(loci_file)

	current_aln = OrderedDict()

	# Keeps track of the sequence lenght for each loci. For the partitions file.
	seq_lens = []

	# Using list values instead of strings makes the append process much faster
	total_aln = dict((x, []) for x in taxa_list)

	for line in fh:

		if line.startswith("//"):

			# Get locus number
			locus = int(line.split("|")[1]) + 1

			print("\rParsing alignment {}".format(locus), end="")

			if locus in vcf_loci:

				if current_aln:
					masked_aln = mask_alignment(current_aln, vcf_loci[locus])
					current_len = len(list(masked_aln.values())[0])

					# Append masking result to total alignment
					for tx in taxa_list:
						try:
							total_aln[tx].append(masked_aln[tx])
						except KeyError:
							total_aln[tx].append("n" * current_len)

					seq_lens.append(current_len)

			current_aln = OrderedDict()

		# Loci present in VCF. Begin alignment parsing routine
		elif line.strip() != "":
		# Get alignment

			fields = line.strip().split()
			# Removes ">" from taxon name
			taxon = fields[0]
			current_aln[taxon] = fields[1]

	fh.close()

	# Finalize alignment object by converting list values into strings
	return dict((tx, "".join(seq)) for tx, seq in total_aln.items()), seq_lens


def write_to_phy(aln_dict, seq_lens, output_file):
	"""
	Writes a alignment dictionary {taxon: seq} to a phylip file
	"""

	# Write main phylip file
	fh = open(output_file + ".phy", "w")

	fh.write("{} {}\n".format(len(aln_dict),
	                          len(list(aln_dict.values())[0])))

	for tx, seq in aln_dict.items():
		fh.write("{}\t{}\n".format(tx, seq))

	fh.close()

	# Write partitions file
	fh = open(output_file + ".partitions", "w")
	locus_n = 1
	c = 1

	for l in seq_lens:
		fh.write("DNA, p{}={}-{}\n".format(locus_n, c, c + l - 1))
		c += l
		locus_n += 1

	fh.close()


def write_vcf_variants(seq_data, output_file):
	"""
	Writes the variable sites of the VCF file directly to phylip format
	"""

	fh = open(output_file + ".phy", "w")

	# Get number taxa
	ntaxa = len(seq_data)

	# Get alignment length
	nlen = len(list(seq_data.values())[0])

	# Write header
	fh.write("{}\t{}\n".format(ntaxa, nlen))

	# Write alignment
	for tx, seq_list in seq_data.items():
		fh.write("{}\t{}\n".format(tx, "".join(seq_list)))

	fh.close()


def main():

	# Arguments
	vcf_file = arg.vcf_file
	loci_file = arg.loci_file
	output_file = arg.output_file

	if not loci_file:
		print("Loci file not provided. Converting only variable sites in VCF.")
		seq_data = parse_vcf_variants(vcf_file)
		write_vcf_variants(seq_data, output_file)
		return

	# Parse VCF
	loci_storage, taxa_list = parse_vcf(vcf_file)

	# Parse loci
	aln_obj, seq_lens = parse_loci(loci_file, loci_storage, taxa_list)

	# Write phylip file
	write_to_phy(aln_obj, seq_lens, output_file)

if __name__ == "__main__":
	main()