-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathga_path2seq.pl
More file actions
executable file
·75 lines (56 loc) · 1.46 KB
/
ga_path2seq.pl
File metadata and controls
executable file
·75 lines (56 loc) · 1.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/perl -w
use strict;
# eord(SEED, STR, ENC) - scramble or unscramble a string with a seeded XOR stream.
#
#   SEED  value passed to srand() before any stream bytes are drawn
#   STR   the string to transform, processed one character at a time
#   ENC   true  => scramble: 4 random bytes are added before and after the
#                  XORed text, so the result is length(STR) + 8 characters
#         false => unscramble: nothing is added, every character of STR is
#                  XORed, and the first and last 4 characters are dropped
#
# Unscrambling reverses scrambling with the same SEED: the padding bytes sit at
# the same stream positions that produced them, so they XOR away and the real
# text lines up with the stream values it was scrambled with.
#
# One value is drawn from the caller's random stream up front and used to
# re-seed the generator on the way out, so the fixed SEED does not stay in
# effect for the rest of the program.
sub eord {
    my ($seed, $str, $enc) = @_;

    my $resume_seed = int(rand(1 << 30));
    srand($seed);

    # Four stream bytes, drawn in order, used as padding on either side.
    my $pad = sub { join "", map { chr(int(rand(256))) } 1 .. 4 };

    my $out = $enc ? $pad->() : "";
    $out .= join "", map { chr(ord($_) ^ int(rand(256))) } split("", $str);
    $out .= $pad->() if $enc;

    srand($resume_seed);

    return $out if $enc;
    return substr($out, 4, -4);
}
# Load a GAF file, process the paths, and turn each into a DNA sequence string.
# Each node name is scrambled with eord() and then written one bit per base
# (0 => C, 1 => G); the "A"-prefixed variants are currently commented out.
# Paths that start with < are skipped for now (reverse complement is a TODO).
# Turn GAF paths into a FASTA sequence
# Convert each distinct GAF path on input into one FASTA record.
#
# Input : lines from the files named in @ARGV, or STDIN, split on tabs; the
#         path is taken from the sixth field ($F[5]).
# Output: STDOUT receives ">N\nSEQ\n" for every accepted path, with N counting
#         up from 0; STDERR receives the raw path string of each accepted path.
#
# A path is accepted when it does not begin with "<" and has not been seen
# before. Every step of the path (an orientation character followed by a node
# name) contributes eord(1, name, 1) written out as 8 bases per character,
# most significant bit first, with C for a 0 bit and G for a 1 bit.
my $count = 0;
my %seen  = ();

LINE:
while (my $line = <>) {
    my @F = split(/\t/, $line);

    # Blank or truncated lines have no sixth field. Skip them instead of
    # warning on undef and emitting an empty FASTA record.
    next LINE unless defined $F[5];
    my $path = $F[5];

    next LINE if $path =~ /^</;    # TODO: reverse complement

    # Remove duplicates
    next LINE if $seen{$path};
    $seen{$path} = 1;

    print STDERR $path, "\n";

    # NOTE(review): a path containing no < or > steps produces an empty
    # sequence record here - confirm whether such paths can occur in the input.
    my $seq = "";
    foreach my $step ($path =~ m/[<>][^<>]*/g) {
        # Drop the leading orientation character; the rest is the node name.
        my ($name) = $step =~ m/.(.*)/;

        foreach my $char (split("", eord(1, $name, 1))) {
            # Low 8 bits of the character, MSB first: 0 => C, 1 => G.
            (my $bases = sprintf("%08b", ord($char) & 0xFF)) =~ tr/01/CG/;
            $seq .= $bases;
        }
    }

    print ">$count\n$seq\n";
    $count++;
}
# Assemble (phrap on FASTA)
# Map FASTA back to contigs
# Count depth of contig to determine real routes from incorrect routes