11#!/usr/bin/env python
22
33# Caleb Lareau, Broad Institute
4- # Finished: 2 June 2018
4+ # Finished: 16 June 2018
55# This program will demultiplex barcoded Tn5-based
66# scATAC from v2.1 scheme
77
1111import regex
1212import sys
1313import gzip
14+ from barcodeHelp import * # local python script
15+
1416from optparse import OptionParser
1517from multiprocessing import Pool , freeze_support
1618from itertools import repeat
2830
2931opts .add_option ("-a" , "--fastq1" , help = "<Read1> Accepts fastq or fastq.gz" )
3032opts .add_option ("-b" , "--fastq2" , help = "<Read2> Accepts fastq or fastq.gz" )
33+
3134opts .add_option ("-n" , "--nreads" , default = 5000000 , help = "Number of reads in each split output file" )
3235opts .add_option ("-c" , "--ncores" , default = 4 , help = "Number of cores for parallel processing." )
3336
3639opts .add_option ("-l" , "--nextera" , default = "TCGTCGGCAGCGTC" , help = "Nextera Adaptor Sequence" )
3740opts .add_option ("-m" , "--me" , default = "AGATGTGTATAAGAGACAG" , help = "ME Sequence" )
3841
39- opts .add_option ("-o" , "--out" , help = "Output sample convention" )
42+ opts .add_option ("-x" , "--nmismatches" , default = 1 , help = "Number of mismatches" )
43+ opts .add_option ("-o" , "--output" , help = "Output sample convention" )
44+
4045options , arguments = opts .parse_args ()
4146
4247print (options )
4954##### INPUTS #####
5055a = options .fastq1
5156b = options .fastq2
52- outname = options .out
53- o = options .out
57+ outname = options .output
58+ o = options .output
5459
5560cpu = int (options .ncores )
5661n = int (options .nreads )
5964c2 = options .constant2
6065nxt = options .nextera
6166me = options .me
67+ n_mismatch = int (options .nmismatches )
6268
6369# Infer the length from the adaptors
6470c1_len = len (c1 )
7884
7985# Define global variables
8086dumb = "N" * 7 + "_" + "N" * 7 + "_" + "N" * 7 + "_" + "N" * 6
87+ dumb2 = "N" * 27
8188
8289# Define barcodes
8390barcodes = ["GGACGAC" ,"GCAGTGT" ,"GAGAGGT" ,"GAACCGT" ,"GGTTAGT" ,"GCCTTTG" ,"GATAGAC" ,"GTGGTAG" ,"GTAATAC" ,"CGAGGTC" ,"CATCAGT" ,"CCAAGCT" ,"CCTTAGG" ,"CACGGAC" ,"CAGGCGG" ,"CCGAACC" ,"CACTTCT" ,"CTGGCAT" ,"CGATTAC" ,"TCGTTCT" ,"TGCTACT" ,"TTCCTCT" ,"TACTTTC" ,"TGAATCC" ,"TAGTACC" ,"TTATCAT" ,"TGATTGT" ,"TGGCAAC" ,"TGTTTAG" ,"AGTTTCT" ,"ATGGTGT" ,"ATTGCCT" ,"ACTCAAT" ,"AGACCAT" ,"AGCGAAT" ,"ACCTACC" ,"AGATAGG" ,"AAGGTTC" ,"AGGCATG" ,"GTGGCGC" ,"GGTCGTA" ,"GTGTCCA" ,"GAGGACA" ,"GTCCTTC" ,"GAGCGTG" ,"GATCACC" ,"GTTGATG" ,"CATACGC" ,"CTGCGCC" ,"CGTAGCC" ,"CGCGGCG" ,"CATCTTA" ,"CCAGTCA" ,"CGTTTGA" ,"CCACTTG" ,"CTAACTC" ,"CGAGTGG" ,"TCCTGGC" ,"TGACCGC" ,"TAAGGTA" ,"TCGCGCA" ,"TCATACA" ,"TAAGAGG" ,"TGGAAGG" ,"TCCGCTC" ,"TAACGCC" ,"TGCGTTG" ,"TCGGATG" ,"AGCCGCC" ,"ACACGCG" ,"ACTACGA" ,"AATGGCC" ,"ATGTTCC" ,"ACGTTGG" ,"AGACTTC" ,"ATATAAC" ,"ATAGTTG" ,"GCACAGC" ,"GACAATA" ,"GAATCAA" ,"GCTCCAA" ,"GCGTAGA" ,"GGAAGTT" ,"GGAGCCT" ,"GAATATG" ,"GGTTCAC" ,"CTAGAGC" ,"CGTGATA" ,"CGCCTAA" ,"CGATGCA" ,"CTTGCGA" ,"CCATAAT" ,"CCTATGT" ,"CGCGCTT" ,"CCGCGAT" ,"CGGCCAG" ,"TTGAGGC" ,"TTTCCTA" ,"TCAGCAA" ,"TCCTTAA" ,"TGGACCA" ,"TAGTGTT" ,"TATACTT" ,"TGTCGCT" ,"TACGCAT" ,"TTGTAAG" ,"TGTAGTG" ,"AGTAAGC" ,"ATGAATA" ,"AACGTAA" ,"AATTCCA" ,"AATGATT" ,"AAGTTAT" ,"ACAGCTT" ,"AGCTGAG" ,"ACAGTAC" ,"GGCAGGC" ,"GCGCACG" ,"GAGCTAA" ,"GGTAACA" ,"GCTAATT" ,"GTCGGTT" ,"GGTGTTT" ,"GCGACTC" ,"CTTACCG" ,"CTATTCG" ,"CTAAGAA" ,"CACGCCA" ,"CGGAGGA" ,"CTTGTCC" ,"CTCATTT" ,"CGGATCT" ,"CAGAATT" ,"CGCAATC" ,"TGCGAGC" ,"TTAAGCG" ,"TCTTGTA" ,"TACCGAA" ,"TTCTGCA" ,"TCCAGTT" ,"TGGCCTT" ,"TCGGCGT" ,"TCTGAAC" ,"TCGACAG" ,"AAGCAGC" ,"ATTCACG" ,"AAGTGCG" ,"ATAGGCA" ,"ATTCGTT" ,"ACGTATT" ,"ACCGGCT" ,"AATTGGT" ,"ATTATTC" ,"AACGGTG" ,"GAGTTGC" ,"GGCGGAA" ,"GTTAGGA" ,"GTGCATT" ,"GCCTCGT" ,"GCTTTAT" ,"GTGTGTC" ,"GGCGTCC" ,"CTCTTGC" ,"CGGCTGC" ,"CGGTACG" ,"CGTACAA" ,"CACATGA" ,"CCGGTTT" ,"CGACACT" ,"CCTCCTT" ,"CATGTAT" ,"CTTCATC" ,"CAGAGAG" ,"TATGTGC" ,"TCAAGAC" ,"TTGGTTA" ,"TGGTGAA" ,"TTACAGA" ,"TGAGATT" ,"TTTGGTC" ,"TTGGACT" ,"TTCGTAC" ,"TGAGGAG" ,"ACCATGC" ,"AGAGACC" ,"AGCAACG" ,"ACGAGAA" ,"AACCACA" ,"AACTCTT" ,"ATGAGCT" ,"AGGACGT" ,"AGGATAC" ]
8491tn5 = ["AAAGAA" ,"AACAGC" ,"AACGTG" ,"AAGCCA" ,"AAGTAT" ,"AATTGG" ,"ACAAGG" ,"ACCCAA" ,"ACCTTC" ,"ACGGAC" ,"ACTGCA" ,"AGACCC" ,"AGATGT" ,"AGCACG" ,"AGGTTA" ,"AGTAAA" ,"AGTCTG" ,"ATACTT" ,"ATAGCG" ,"ATATAC" ,"ATCCGG" ,"ATGAAG" ,"ATTAGT" ,"CAACCG" ,"CAAGTC" ,"CACCAC" ,"CACTGT" ,"CAGACT" ,"CAGGAG" ,"CATAGA" ,"CCACGC" ,"CCGATG" ,"CCGTAA" ,"CCTCTA" ,"CGAAAG" ,"CGAGCA" ,"CGCATA" ,"CGGCGT" ,"CGGTCC" ,"CGTTAT" ,"CTAGGT" ,"CTATTA" ,"CTCAAT" ,"CTGTGG" ,"CTTACG" ,"CTTGAA" ,"GAAATA" ,"GAAGGG" ,"GACTCG" ,"GAGCTT" ,"GAGGCC" ,"GAGTGA" ,"GATCAA" ,"GCCAGA" ,"GCCGTT" ,"GCGAAT" ,"GCGCGG" ,"GCTCCC" ,"GCTGAG" ,"GCTTGT" ,"GGACGA" ,"GGATTG" ,"GGCCAT" ,"GGGATC" ,"GGTAGG" ,"GGTGCT" ,"GTACAG" ,"GTCCTA" ,"GTCGGC" ,"GTGGTG" ,"GTTAAC" ,"GTTTCA" ,"TAAGCT" ,"TAATAG" ,"TACCGA" ,"TAGAGG" ,"TATTTC" ,"TCAGTG" ,"TCATCA" ,"TCCAAG" ,"TCGCCT" ,"TCGGGA" ,"TCTAGC" ,"TGAATT" ,"TGAGAC" ,"TGCGGT" ,"TGCTAA" ,"TGGCAG" ,"TGTGTA" ,"TGTTCG" ,"TTAAGA" ,"TTCGCA" ,"TTCTTG" ,"TTGCTC" ,"TTGGAT" ,"TTTGGG" ]
8592
8693#------------------------------
8794
88- def prove_barcode (bc ):
89- '''
90- Function that takes a putative barcode and returns the nearest valid one
91- '''
92-
93- if (bc in barcodes ):
94- return (bc )
95- else :
96- eo = process .extractOne (bc , barcodes )
97- if (eo [1 ] >= 71 ): # 71 comes from 5/7... the score is the score homology
98- return (eo [0 ])
99- else :
100- return ("NNNNNNN" )
101-
102- def prove_tn5 (bc ):
103- '''
104- Function that takes a putative barcode and returns the nearest valid one
105- '''
106-
107- if (bc in tn5 ):
108- return (bc )
109- else :
110- eo = process .extractOne (bc , tn5 )
111- if (eo [1 ] >= 66 ): # 66 comes from 4/6... the score is the score homology
112- return (eo [0 ])
113- else :
114- return ("NNNNNN" )
115-
116-
117- def formatRead (title , sequence , quality ):
118- """
119- Takes three components of fastq file and stiches them together in a string
120- """
121- return ("@%s\n %s\n +\n %s\n " % (title , sequence , quality ))
122-
12395def extractbarcode_v2_tn5 (sequence1 ):
12496 '''
12597 Function to extract barcodes
12698 '''
127- # Parse out sequence features and split based on constant sequences
128- bc1 = prove_barcode (sequence1 [0 :7 ])
12999
130100 # Parse out barcodes if we can ID the constants
131101 try :
@@ -136,14 +106,15 @@ def extractbarcode_v2_tn5(sequence1):
136106 me_hit = find_near_matches (me , sequence1 [55 :], max_l_dist = 2 )
137107
138108 # Now grab the barcodes
139- bc2 = prove_barcode (sequence1 [c1_hit [0 ][1 ]+ 7 :c2_hit [0 ][0 ]+ 23 ])
140- bc3 = prove_barcode (sequence1 [c2_hit [0 ][1 ]+ 23 :nxt_hit [0 ][0 ]+ 33 ])
141- bc_tn5 = prove_tn5 (sequence1 [nxt_hit [0 ][1 ]+ 33 :me_hit [0 ][0 ]+ 55 ])
109+ bc1 , mm1 = prove_barcode (sequence1 [0 :7 ], barcodes , n_mismatch )
110+ bc2 , mm2 = prove_barcode (sequence1 [c1_hit [0 ][1 ]+ 7 :c2_hit [0 ][0 ]+ 23 ], barcodes , n_mismatch )
111+ bc3 , mm3 = prove_barcode (sequence1 [c2_hit [0 ][1 ]+ 23 :nxt_hit [0 ][0 ]+ 33 ], barcodes , n_mismatch )
112+ bc_tn5 , mm4 = prove_barcode (sequence1 [nxt_hit [0 ][1 ]+ 33 :me_hit [0 ][0 ]+ 55 ], tn5 , n_mismatch )
142113 seq = sequence1 [me_hit [0 ][1 ]+ 55 :]
143114
144- return (bc1 + "_" + bc2 + "_" + bc3 + "_" + bc_tn5 , seq )
115+ return (bc1 + "_" + bc2 + "_" + bc3 + "_" + bc_tn5 , seq , str ( mm1 ) + "," + str ( mm2 ) + "," + str ( mm3 ) + "," + str ( mm4 ) )
145116 except :
146- return (dumb , sequence1 )
117+ return (dumb , sequence1 , "0,0,0,0" )
147118
148119
149120def debarcode_multiplexed (duo ):
@@ -156,6 +127,7 @@ def debarcode_multiplexed(duo):
156127 # parameters to return
157128 fq1 = ""
158129 fq2 = ""
130+ mm_quant = ""
159131
160132 nbc1 = 0
161133 nbc2 = 0
@@ -170,7 +142,7 @@ def debarcode_multiplexed(duo):
170142 title2 = listRead2 [0 ]; sequence2 = listRead2 [1 ]; quality2 = listRead2 [2 ]
171143
172144 # Return the barcode with underscores + the biological sequence learned
173- barcode , sequence1 = extractbarcode_tn5 (sequence1 )
145+ barcode , sequence1 , mm = extractbarcode_v2_tn5 (sequence1 )
174146 quality1 = quality1 [- 1 * len (sequence1 ):]
175147
176148 four = barcode .split ("_" )
@@ -190,14 +162,16 @@ def debarcode_multiplexed(duo):
190162 npass = 1
191163 fq1 = formatRead ("" .join (four ) + "_" + title1 , sequence1 , quality1 )
192164 fq2 = formatRead ("" .join (four ) + "_" + title2 , sequence2 , quality2 )
193- return ([[fq1 , fq2 ], [nbc1 , nbc2 , nbc3 , ntn5 , npass , nfail ]])
165+ mm_quant = mm_quant + "" .join (four ) + "," + mm + "\n "
166+ return ([[fq1 , fq2 ], [nbc1 , nbc2 , nbc3 , ntn5 , npass , nfail ], [mm_quant ]])
194167
195168
196169# Define variables to keep track of things that fail
197170nbc1 = 0
198171nbc2 = 0
199172nbc3 = 0
200173ntn5 = 0
174+
201175npass = 0
202176nfail = 0
203177
@@ -211,7 +185,7 @@ def debarcode_multiplexed(duo):
211185 # iterate over batches of length n
212186 for i , batch1 in enumerate (it1 ):
213187 batch2 = it2 .__next__ ()
214- output = o + "-parse " + str (i + 1 ).zfill (3 )
188+ output = o + "-c " + str (i + 1 ).zfill (3 )
215189
216190 # parallel process the barcode processing and accounting of failures.
217191 pool = Pool (processes = cpu )
@@ -221,6 +195,8 @@ def debarcode_multiplexed(duo):
221195 # Aggregate output
222196 fqs = list (map ('' .join , zip (* [item .pop (0 ) for item in pm ])))
223197 counts = list (map (sum , zip (* [item .pop (0 ) for item in pm ])))
198+ mm_values = list (map ('' .join , zip (* [item .pop (0 ) for item in pm ])))
199+
224200 nbc1 = nbc1 + counts [0 ]
225201 nbc2 = nbc2 + counts [1 ]
226202 nbc3 = nbc3 + counts [2 ]
@@ -232,9 +208,10 @@ def debarcode_multiplexed(duo):
232208 # Export one chunk in parallel
233209 filename1 = output + '_1.fastq.gz'
234210 filename2 = output + '_2.fastq.gz'
211+ filenameMM = output + '_mismatches.csv.gz'
235212
236- pool = Pool (processes = 2 )
237- toke = pool .starmap (chunkWriterGzip , [(filename1 , fqs [0 ]), (filename2 , fqs [1 ])])
213+ pool = Pool (processes = 3 )
214+ toke = pool .starmap (chunk_writer_gzip , [(filename1 , fqs [0 ]), (filename2 , fqs [1 ]), ( filenameMM , mm_values )])
238215 pool .close ()
239216
240217with open (o + "-debarcode" + '.sumstats.log' , 'w' ) as logfile :
0 commit comments