Mercurial > repos > romaingred > gred_rnaseq
comparison gred_rnaseq-2369cfe4f737/alignTophat.pl @ 2:825c3b7313b6 draft
Uploaded
author | romaingred |
---|---|
date | Thu, 16 Nov 2017 10:14:55 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:f55d249522a0 | 2:825c3b7313b6 |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use warnings; | |
4 use Getopt::Long; | |
5 | |
6 my $max_procs = 8; | |
7 | |
8 my ($ma, $fastq, $fastqN, $mis,$findSens, $contaminants, $directional, $build_contaminant, $ref, $build_ref, $annotation, $dirBam, $dirBam_cont,$dirBed, $dirBedgraph, $outText, $outSimple, $random_w); | |
9 | |
10 GetOptions ( | |
11 "direction:s" => \$directional, | |
12 "random_w:s" => \$random_w, | |
13 "find:s" => \$findSens, | |
14 "fastq=s" => \$fastq, | |
15 "fastq_n=s" => \$fastqN, | |
16 "ma=i" => \$ma, | |
17 "mis=i" => \$mis, | |
18 "contaminants=s" => \$contaminants, | |
19 "anno=s" => \$annotation, | |
20 "build_contaminant" => \$build_contaminant, | |
21 "ref=s" => \$ref, | |
22 "build_ref" => \$build_ref, | |
23 "dirbamCont=s" => \$dirBam_cont, | |
24 "dirbam=s" => \$dirBam, | |
25 "dirbedgraph=s" => \$dirBedgraph, | |
26 "text=s" => \$outText, | |
27 ); | |
28 | |
29 $directional = lc($directional); | |
30 $findSens = lc($findSens); | |
31 $random_w = lc($random_w); | |
32 | |
33 if ($build_ref) | |
34 { | |
35 `(bowtie2-build $ref $ref) 2> /dev/null `; | |
36 } | |
37 | |
38 if ($build_contaminant) | |
39 { | |
40 `(bowtie2-build $contaminants $contaminants) 2> /dev/null `; | |
41 } | |
42 | |
43 #"outsimple=s" => \$outSimple | |
44 #"annotation=s" => \$annotation, | |
45 | |
46 #$annotation = "/data/visiteur/annotation/ara/TAIR10_all_sorted.bed"; | |
47 #$annotation = "/data/home/pogorelcnikr/SG/humanGenes.bed"; | |
48 | |
49 # mapping against contaminants databases in: contaminants, fastq, # of mismatches, output_dir out: tophat dir | |
50 tophat2($fastq, $contaminants, $mis, 1, 'dir_contaminant'); | |
51 `bamToFastq -i dir_contaminant/unmapped.bam -fq clean.fastq`; | |
52 | |
53 # mapping against contaminants databases in: $ref, fastq, # of mismatches, output_dir out: tophat dir | |
54 tophat2('clean.fastq', $ref, $mis, $max_procs, 'dir_reference'); | |
55 | |
56 my $repTot = 0; | |
57 if ($ma == 0) | |
58 { | |
59 open (my $rT, 'dir_reference/align_summary.txt'); | |
60 my @repT = <$rT>; | |
61 $repTot = $1 if $repT[2] =~ /\s+Mapped\s+:\s+(\d+).*/; | |
62 close $rT; | |
63 } | |
64 else | |
65 { | |
66 $repTot = $ma; | |
67 } | |
68 | |
69 # Creation of bedgraphs | |
70 my $dir_bedgraph = $fastqN.'_bedgraph/'; | |
71 findUnique('dir_reference/accepted_hits.bam', $dir_bedgraph, $repTot); | |
72 | |
73 # count in: unique_mapped.bam out: raw_count_simple antisens_count | |
74 if ($random_w eq 'true') | |
75 { | |
76 reference('rand_sorted.bam', $repTot); | |
77 `mv rand_sorted.bam dir_reference/random_sorted.bam`; | |
78 } | |
79 else | |
80 { | |
81 reference('ac_sorted.bam', $repTot); | |
82 `mv ac_sorted.bam dir_reference/unique_sorted.bam`; | |
83 } | |
84 | |
85 zip('dir_reference', $dirBam); | |
86 zip('dir_contaminant', $dirBam_cont); | |
87 zip( $dir_bedgraph , $dirBedgraph); | |
88 | |
89 ################################# | |
90 # inputs: | |
91 # fastq $_[0] | |
92 # bowtie2_index $_[1] | |
93 # mismatches # $_[2] | |
94 # output directory $_[3] | |
95 # max_procs $_[4] | |
96 ################################# | |
97 sub tophat2 | |
98 { | |
99 my ($fastq, $index, $mismtaches, $proc, $out_repertory) = @_; | |
100 $out_repertory = $out_repertory.'/' unless $out_repertory =~ /\/$/; | |
101 # print STDERR "/data/visiteur/tophat-2.0.12.Linux_x86_64/tophat2 -p $proc -N $mismtaches --b2-very-sensitive -o $out_repertory $index $fastq\n"; | |
102 `tophat2 -p $proc -N $mismtaches --b2-very-sensitive -o $out_repertory $index $fastq 2> /dev/stdin`; | |
103 } | |
104 ############################################################################################ | |
105 | |
106 ################################# | |
107 # inputs: | |
108 # bam $_[0] | |
109 # output directory $_[1] | |
110 ################################# | |
111 sub findUnique | |
112 { | |
113 my ($bam, $out_bedgraph, $sc) = @_; | |
114 | |
115 mkdir $out_bedgraph; | |
116 | |
117 my $bedg_uni = $out_bedgraph.'/'.$fastqN.'_unique.bedgraph' ; | |
118 my $bedg_all_multi = $out_bedgraph.'/'.$fastqN.'_all_multi.bedgraph' ; | |
119 my $bedg_random = $out_bedgraph.'/'.$fastqN.'_random_multi.bedgraph' ; | |
120 | |
121 my $bedg_uni_p = $out_bedgraph.'/'.$fastqN.'_unique_plusStrand.bedgraph' ; | |
122 my $bedg_all_multi_p = $out_bedgraph.'/'.$fastqN.'_all_multi_plusStrand.bedgraph' ; | |
123 my $bedg_random_p = $out_bedgraph.'/'.$fastqN.'_random_multi_plusStrand.bedgraph' ; | |
124 | |
125 my $bedg_uni_m = $out_bedgraph.'/'.$fastqN.'_unique_minusStrand.bedgraph' ; | |
126 my $bedg_all_multi_m = $out_bedgraph.'/'.$fastqN.'_all_multi_minusStrand.bedgraph' ; | |
127 my $bedg_random_m = $out_bedgraph.'/'.$fastqN.'_random_multi_minusStrand.bedgraph' ; | |
128 | |
129 my $sca = 1000000/$sc; | |
130 | |
131 #unique file | |
132 `samtools view -H $bam > ac.sam 2> /dev/null && samtools view -@ $max_procs $bam 2> /dev/null |grep NH:i:1\$ >> ac.sam && samtools view -@ $max_procs -Shb ac.sam > ac.bam 2> /dev/null`; | |
133 `samtools sort -@ $max_procs ac.bam -o ac_sorted.bam 2> /dev/null`; | |
134 `genomeCoverageBed -scale $sca -split -bg -ibam ac_sorted.bam > $bedg_uni 2> /dev/null`; | |
135 if ($directional eq 'true') | |
136 { | |
137 `genomeCoverageBed -strand + -scale $sca -split -bg -ibam ac_sorted.bam > $bedg_uni_p 2> /dev/null`; | |
138 `genomeCoverageBed -strand - -scale $sca -split -bg -ibam ac_sorted.bam > $bedg_uni_m 2> /dev/null`; | |
139 } | |
140 #all multi | |
141 `samtools sort -@ $max_procs $bam -o accepted_hits_sorted.bam 2> /dev/null`; | |
142 `genomeCoverageBed -scale $sca -split -bg -ibam accepted_hits_sorted.bam > $bedg_all_multi 2> /dev/null`; | |
143 if ($directional eq 'true') | |
144 { | |
145 `genomeCoverageBed -strand + -scale $sca -split -bg -ibam accepted_hits_sorted.bam > $bedg_all_multi_p 2> /dev/null`; | |
146 `genomeCoverageBed -strand - -scale $sca -split -bg -ibam accepted_hits_sorted.bam > $bedg_all_multi_m 2> /dev/null`; | |
147 } | |
148 #random multi | |
149 `samtools view -@ $max_procs -hb -F256 $bam > rand.bam 2> /dev/null`; | |
150 `samtools sort -@ $max_procs rand.bam -o rand_sorted.bam 2> /dev/null`; | |
151 `genomeCoverageBed -scale $sca -split -bg -ibam rand_sorted.bam > $bedg_random 2> /dev/null`; | |
152 if ($directional eq 'true') | |
153 { | |
154 `genomeCoverageBed -strand + -scale $sca -split -bg -ibam rand_sorted.bam > $bedg_random_p 2> /dev/null`; | |
155 `genomeCoverageBed -strand - -scale $sca -split -bg -ibam rand_sorted.bam > $bedg_random_m 2> /dev/null`; | |
156 } | |
157 } | |
158 ############################################################################################ | |
159 | |
160 ################################# | |
161 # inputs: | |
162 # bam $_[0] | |
163 # output directory $_[1] | |
164 ################################# | |
165 sub reference | |
166 { | |
167 my $bam = shift; | |
168 my $mapnum = shift; | |
169 | |
170 my @tabRef = (); | |
171 | |
172 open (my $anno, $annotation) || die "cannot open annotation file!\n"; | |
173 while(<$anno>) | |
174 { | |
175 chomp $_; | |
176 my @split = split /\t/, $_; | |
177 my $size = $split[2] - $split[1]; | |
178 push @tabRef, [$_,0,0,$size]; | |
179 } | |
180 close $anno; | |
181 | |
182 `bedtools bamtobed -i $bam > ac.bed`; | |
183 | |
184 # my $mapnum = 0; | |
185 # open (my $bed, 'ac.bed') || die "cannot open ac.bed\n"; | |
186 # while(<$bed>){ $mapnum++; } | |
187 # close $bed; | |
188 | |
189 if ($directional eq 'true') | |
190 { | |
191 #Same strand | |
192 `bedtools intersect -s -wao -b ac.bed -a $annotation> intersectStranded.bed`; | |
193 #Different strand | |
194 `bedtools intersect -S -wao -b ac.bed -a $annotation> intersectOppositeStrand.bed`; | |
195 | |
196 open (my $strand, 'intersectStranded.bed') || die "cannot open intersectStranded.bed\n"; | |
197 my $i = 0; | |
198 my $firstLine = <$strand>; chomp $firstLine; | |
199 my @split = split /\t/, $firstLine; | |
200 my $prevId = $split[3]; | |
201 if ($split[16] != 0 && $split[12] - $split[11] == $split[16]) | |
202 { | |
203 $tabRef[$i]->[1] = 1; | |
204 } | |
205 while(<$strand>) | |
206 { | |
207 chomp $_; | |
208 @split = split /\t/, $_; | |
209 if ($split[3] ne $prevId) | |
210 { | |
211 $i++; | |
212 } | |
213 if ($split[16] != 0 && $split[12] - $split[11] == $split[16]) | |
214 { | |
215 $tabRef[$i]->[1] += 1; | |
216 } | |
217 $prevId = $split[3]; | |
218 } | |
219 close $strand; | |
220 | |
221 open ($strand, 'intersectOppositeStrand.bed') || die "cannot open intersectOppositeStrand.bed\n"; | |
222 $i = 0; | |
223 $firstLine = <$strand>; chomp $firstLine; | |
224 @split = split /\t/, $firstLine; | |
225 $prevId = $split[3]; | |
226 if ($split[16] != 0 && $split[12] - $split[11] == $split[16]) | |
227 { | |
228 $tabRef[$i]->[2] = 1; | |
229 } | |
230 while(<$strand>) | |
231 { | |
232 chomp $_; | |
233 @split = split /\t/, $_; | |
234 if ($split[3] ne $prevId) | |
235 { | |
236 $i++; | |
237 } | |
238 if ($split[16] != 0 && $split[12] - $split[11] == $split[16]) | |
239 { | |
240 $tabRef[$i]->[2] += 1; | |
241 } | |
242 $prevId = $split[3]; | |
243 } | |
244 close $strand; | |
245 } | |
246 | |
247 else | |
248 { | |
249 `bedtools intersect -wao -b ac.bed -a $annotation> intersect.bed`; | |
250 | |
251 open (my $strand, 'intersect.bed') || die "cannot open intersect.bed\n"; | |
252 my $i = 0; | |
253 my $firstLine = <$strand>; chomp $firstLine; | |
254 my @split = split /\t/, $firstLine; | |
255 my $prevId = $split[3]; | |
256 if ($split[16] != 0 && $split[12] - $split[11] == $split[16]) | |
257 { | |
258 $tabRef[$i]->[1] = 1; | |
259 } | |
260 while(<$strand>) | |
261 { | |
262 chomp $_; | |
263 @split = split /\t/, $_; | |
264 if ($split[3] ne $prevId) | |
265 { | |
266 $i++; | |
267 } | |
268 if ($split[16] != 0 && $split[12] - $split[11] == $split[16]) | |
269 { | |
270 $tabRef[$i]->[1] += 1; | |
271 } | |
272 $prevId = $split[3]; | |
273 } | |
274 close $strand; | |
275 } | |
276 | |
277 my $totplus = 0; my $totminus = 0; | |
278 foreach my $outPut (@tabRef) | |
279 { | |
280 $totplus += $outPut->[1]; | |
281 $totminus += $outPut->[2]; | |
282 } | |
283 | |
284 open (my $out, ">".$outText) || die "cannot open raw_count.txt\n"; | |
285 print $out "Chr\tstart\tend\tID\tscore\tstrand\ttype\tD1\tD2\tD3\treads\trpm\treads on opposite strand\trpm\n"; | |
286 my ($Sr, $Rr) = (0,0); | |
287 foreach my $outPut (@tabRef) | |
288 { | |
289 if ($totplus > $totminus || $findSens eq 'false') | |
290 { | |
291 $Sr = rpm($outPut->[1],$mapnum); | |
292 $Rr = rpm($outPut->[2],$mapnum); | |
293 print $out $outPut->[0]."\t".$outPut->[1]."\t".$Sr."\t".$outPut->[2]."\t".$Rr."\n"; | |
294 } | |
295 else | |
296 { | |
297 $Rr = rpm($outPut->[1],$mapnum); | |
298 $Sr = rpm($outPut->[2],$mapnum); | |
299 print $out $outPut->[0]."\t".$outPut->[2]."\t".$Sr."\t".$outPut->[1]."\t".$Rr."\n"; | |
300 } | |
301 } | |
302 close $out; | |
303 } | |
304 ############################################################################################ | |
305 | |
306 ################################# | |
307 # inputs: | |
308 # bam $_[0] | |
309 # output directory $_[1] | |
310 ################################# | |
311 sub rpm | |
312 { | |
313 my $raw_count = shift; | |
314 my $mapped_number = shift; | |
315 my $RPM = 0; | |
316 $RPM = ($raw_count * 1000000) / ($mapped_number) if $mapped_number != 0; | |
317 return $RPM; | |
318 } | |
319 ############################################################################################ | |
320 | |
321 ################################# | |
322 # inputs: | |
323 # bam $_[0] | |
324 # output directory $_[1] | |
325 ################################# | |
326 sub zip | |
327 { | |
328 my ($in_dir, $out_tar) = @_; | |
329 `tar -cvf tmp.tar $in_dir/`; | |
330 `mv tmp.tar $out_tar`; | |
331 rmdir $in_dir; | |
332 } | |
333 ############################################################################################ | |
334 |