Mercurial > repos > tiagoantao > barcode_stacks
comparison barcode_sort.pl @ 0:5dc02b8a2a57 draft default tip
planemo upload commit 70654da77972b29181d3b388035836b6fa84d59d-dirty
author | tiagoantao |
---|---|
date | Fri, 08 Apr 2016 11:38:00 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5dc02b8a2a57 |
---|---|
1 #!/usr/bin/perl -w | |
2 use strict; use warnings; | |
3 | |
4 my $cut_site = $ARGV[7]; #let the user specify the cut site keyword (e.g. "TGCA" for the SbfI 6 bp cutter) | |
5 | |
6 my $b = $ARGV[6]; #let the user specify barcode length | |
7 my $n = 2; # set number of nucleotides to expect before the barcode (bestRAD libraries run at U. Oregon have 2bp here) | |
8 my $c = $b + $n; # set position for start of enzyme cutsite, which occurs after the initial nucleotides plus the barcode | |
9 my $e = 6; # set length of remaining enzyme cutsite sequence (e.g. 6 for SbfI) -- for other than SbfI, need to change the actual sequence below!!! | |
10 my $e = length($cut_site); #calculate the length of the cut site | |
11 # note this is the length of what's left after enzyme digestion, NOT the full length of the enzyme recognition site | |
12 # the program expects all correct forward reads to follow the pattern: $n initial nucleotides, then $b nucleotides of barcode, then $e nucleotides of the cutsite | |
13 | |
14 my $is_resilient; | |
15 | |
16 open (IN, $ARGV[0]); # read file with barcodes | |
17 my %counts = (); # make a hash of barcodes that will be searched | |
18 while(<IN>) { # counts of each barcode can be tracked with this hash, with a few more lines of code below | |
19 chomp($_); | |
20 $counts{$_} = 0; | |
21 } | |
22 close IN; | |
23 | |
24 open (IN1, $ARGV[1]); # read fastq file of raw forward reads | |
25 open (IN2, $ARGV[2]); # read fastq file of raw reverse reads -- these must have pairs in identical order | |
26 open (OUT1, ">$ARGV[3]"); # create fastq outfile for flipped forward reads (cutsite end) | |
27 open (OUT2, ">$ARGV[4]"); # create fastq outfile for flipped reverse reads (randomly sheared end) | |
28 my $forward; my $reverse; my $barcode; # establish string variables for all parts of fastq files | |
29 my $name1; my $name2; my $third1; my $third2; my $qual1; my $qual2; | |
30 | |
31 $is_resilient = $ARGV[5]; # true if is resilient to errors - Brian's additions | |
32 | |
33 | |
34 sub strDiffMaxDelta { | |
35 | |
36 my ( $s1, $s2, $maxDelta ) = @_; | |
37 my $diffCount = () = ( $s1 ^ $s2 ) =~ /[^\x00]/g; | |
38 return $diffCount <= $maxDelta; | |
39 | |
40 } | |
41 | |
42 if ($is_resilient eq "true") { | |
43 | |
44 my $max_errors = $ARGV[8]; | |
45 | |
46 while($name1 = <IN1>) { # start reading through the paired fastq input files | |
47 $name2 = <IN2>; # read all parts of a single read pair (4 lines in each of the 2 fastq files) | |
48 $forward = <IN1>; | |
49 $reverse = <IN2>; | |
50 $third1 = <IN1>; $third2 = <IN2>; $qual1 = <IN1>; $qual2 = <IN2>; | |
51 my $which = 0; my $for; my $rev; # establish variables used below | |
52 | |
53 if(strDiffMaxDelta(substr($forward, $c, $e), $cut_site, $max_errors)) { # check for SbfI cutsite in the correct place in forward read | |
54 if(strDiffMaxDelta(substr($reverse, $c, $e), $cut_site, $max_errors)) { # check for SbfI cutsite in the correct place in reverse read | |
55 $for = substr($forward, $n, $b); # this is where a barcode should be if it's in the forward read | |
56 $rev = substr($reverse, $n, $b); # this is where a barcode should be if it's in the reverse read | |
57 if(exists $counts{$for} && (exists $counts{$rev}) == 0) { # if a correct barcode and cutsite are in forward but not reverse read... | |
58 $which = 1; $barcode = $for; $counts{$for}++; # which = 1 means the pair is correctly oriented | |
59 } | |
60 elsif((exists $counts{$for}) == 0 && exists $counts{$rev}) { # if a correct barcode and cutsite are only in the reverse read... | |
61 $which = 2; $barcode = $rev; $counts{$rev}++; # which = 2 means the pair needs to be flipped | |
62 } | |
63 } | |
64 else { # the cutsite is only found in the forward read | |
65 #$barcode = substr($forward, $n, $b); | |
66 #if(exists $counts{$barcode}) {$which = 1; $counts{$barcode}++; } # if a correct barcode is in the forward read, the pair is correctly oriented | |
67 $which = 1; | |
68 | |
69 } | |
70 } | |
71 elsif(strDiffMaxDelta(substr($reverse, $c, $e), $cut_site, $max_errors)){ # cutsite not in forward read but is in reverse read | |
72 #$barcode = substr($reverse, $n, $b); | |
73 #if(exists $counts{$barcode}) {$which = 2; $counts{$barcode}++; } # pair needs to be flipped | |
74 $which = 2; | |
75 } # if a cutsite and correct barcode has not been found in either read, which = 0 at this point | |
76 if($which == 1) { # if the pair is correctly oriented, print out fastq format for the pair | |
77 my $temp1 = substr($forward, $n); # trim initial nucleotides off read and quality scores... | |
78 my $temp2 = substr($qual1, $n); # so that output keeps barcode and cutsite but not other nucleotides... | |
79 print OUT1 "$name1$temp1$third1$temp2"; # and is ready to go into process_radtags | |
80 print OUT2 "$name2$reverse$third2$qual2"; | |
81 } | |
82 elsif($which == 2) { # if the pair needs to be flipped, print out fastq format for the flipped pair | |
83 my $temp1 = substr($reverse, $n); | |
84 my $temp2 = substr($qual2, $n); | |
85 print OUT1 "$name2$temp1$third2$temp2"; | |
86 print OUT2 "$name1$forward$third1$qual1"; | |
87 } # if which == 0, nothing is printed out for the pair | |
88 } | |
89 close IN1; | |
90 close IN2; | |
91 close OUT1; | |
92 close OUT2; | |
93 | |
94 foreach my $key (sort keys %counts) { | |
95 print "$key" . "\t" . "$counts{$key}\n"; | |
96 } | |
97 | |
98 } | |
99 | |
100 | |
101 else{ #Paul's code | |
102 | |
103 while($name1 = <IN1>) { # start reading through the paired fastq input files | |
104 $name2 = <IN2>; # read all parts of a single read pair (4 lines in each of the 2 fastq files) | |
105 $forward = <IN1>; | |
106 $reverse = <IN2>; | |
107 $third1 = <IN1>; $third2 = <IN2>; $qual1 = <IN1>; $qual2 = <IN2>; | |
108 | |
109 my $which = 0; my $for; my $rev; # establish variables used below | |
110 if(substr($forward, $c, $e) eq "TGCAGG") { # check for SbfI cutsite in the correct place in forward read | |
111 if(substr($reverse, $c, $e) eq "TGCAGG") { # check for SbfI cutsite in the correct place in reverse read | |
112 $for = substr($forward, $n, $b); # this is where a barcode should be if it's in the forward read | |
113 $rev = substr($reverse, $n, $b); # this is where a barcode should be if it's in the reverse read | |
114 if(exists $counts{$for} && (exists $counts{$rev}) == 0) { # if a correct barcode and cutsite are in forward but not reverse read... | |
115 $which = 1; $barcode = $for; $counts{$for}++; # which = 1 means the pair is correctly oriented | |
116 } | |
117 elsif((exists $counts{$for}) == 0 && exists $counts{$rev}) { # if a correct barcode and cutsite are only in the reverse read... | |
118 $which = 2; $barcode = $rev; $counts{$rev}++; # which = 2 means the pair needs to be flipped | |
119 } | |
120 } | |
121 else { # the cutsite is only found in the forward read | |
122 $barcode = substr($forward, $n, $b); | |
123 if(exists $counts{$barcode}) {$which = 1; $counts{$barcode}++; } # if a correct barcode is in the forward read, the pair is correctly oriented | |
124 } | |
125 } | |
126 elsif(substr($reverse, $c, $e) eq "TGCAGG") { # cutsite not in forward read but is in reverse read | |
127 $barcode = substr($reverse, $n, $b); | |
128 if(exists $counts{$barcode}) {$which = 2; $counts{$barcode}++; } # pair needs to be flipped | |
129 } # if a cutsite and correct barcode has not been found in either read, which = 0 at this point | |
130 if($which == 1) { # if the pair is correctly oriented, print out fastq format for the pair | |
131 my $temp1 = substr($forward, $n); # trim initial nucleotides off read and quality scores... | |
132 my $temp2 = substr($qual1, $n); # so that output keeps barcode and cutsite but not other nucleotides... | |
133 print OUT1 "$name1$temp1$third1$temp2"; # and is ready to go into process_radtags | |
134 print OUT2 "$name2$reverse$third2$qual2"; | |
135 } | |
136 elsif($which == 2) { # if the pair needs to be flipped, print out fastq format for the flipped pair | |
137 my $temp1 = substr($reverse, $n); | |
138 my $temp2 = substr($qual2, $n); | |
139 print OUT1 "$name2$temp1$third2$temp2"; | |
140 print OUT2 "$name1$forward$third1$qual1"; | |
141 } # if which == 0, nothing is printed out for the pair | |
142 | |
143 | |
144 } | |
145 close IN1; | |
146 close IN2; | |
147 close OUT1; | |
148 close OUT2; | |
149 | |
150 foreach my $key (sort keys %counts) { | |
151 print "$key" . "\t" . "$counts{$key}\n"; | |
152 } | |
153 } |