2
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Getopt::Long;
|
|
5
|
|
6 my $max_procs = 8;
|
|
7
|
|
8 my ($ma, $fastq, $fastqN, $mis,$findSens, $contaminants, $directional, $build_contaminant, $ref, $build_ref, $annotation, $dirBam, $dirBam_cont,$dirBed, $dirBedgraph, $outText, $outSimple, $random_w);
|
|
9
|
|
10 GetOptions (
|
|
11 "direction:s" => \$directional,
|
|
12 "random_w:s" => \$random_w,
|
|
13 "find:s" => \$findSens,
|
|
14 "fastq=s" => \$fastq,
|
|
15 "fastq_n=s" => \$fastqN,
|
|
16 "ma=i" => \$ma,
|
|
17 "mis=i" => \$mis,
|
|
18 "contaminants=s" => \$contaminants,
|
|
19 "anno=s" => \$annotation,
|
|
20 "build_contaminant" => \$build_contaminant,
|
|
21 "ref=s" => \$ref,
|
|
22 "build_ref" => \$build_ref,
|
|
23 "dirbamCont=s" => \$dirBam_cont,
|
|
24 "dirbam=s" => \$dirBam,
|
|
25 "dirbedgraph=s" => \$dirBedgraph,
|
|
26 "text=s" => \$outText,
|
|
27 );
|
|
28
|
|
29 $directional = lc($directional);
|
|
30 $findSens = lc($findSens);
|
|
31 $random_w = lc($random_w);
|
|
32
|
|
33 if ($build_ref)
|
|
34 {
|
|
35 `(bowtie2-build $ref $ref) 2> /dev/null `;
|
|
36 }
|
|
37
|
|
38 if ($build_contaminant)
|
|
39 {
|
|
40 `(bowtie2-build $contaminants $contaminants) 2> /dev/null `;
|
|
41 }
|
|
42
|
|
43 #"outsimple=s" => \$outSimple
|
|
44 #"annotation=s" => \$annotation,
|
|
45
|
|
46 #$annotation = "/data/visiteur/annotation/ara/TAIR10_all_sorted.bed";
|
|
47 #$annotation = "/data/home/pogorelcnikr/SG/humanGenes.bed";
|
|
48
|
|
49 # mapping against contaminants databases in: contaminants, fastq, # of mismatches, output_dir out: tophat dir
|
|
50 tophat2($fastq, $contaminants, $mis, 1, 'dir_contaminant');
|
|
51 `bamToFastq -i dir_contaminant/unmapped.bam -fq clean.fastq`;
|
|
52
|
|
53 # mapping against contaminants databases in: $ref, fastq, # of mismatches, output_dir out: tophat dir
|
|
54 tophat2('clean.fastq', $ref, $mis, $max_procs, 'dir_reference');
|
|
55
|
|
56 my $repTot = 0;
|
|
57 if ($ma == 0)
|
|
58 {
|
|
59 open (my $rT, 'dir_reference/align_summary.txt');
|
|
60 my @repT = <$rT>;
|
|
61 $repTot = $1 if $repT[2] =~ /\s+Mapped\s+:\s+(\d+).*/;
|
|
62 close $rT;
|
|
63 }
|
|
64 else
|
|
65 {
|
|
66 $repTot = $ma;
|
|
67 }
|
|
68
|
|
69 # Creation of bedgraphs
|
|
70 my $dir_bedgraph = $fastqN.'_bedgraph/';
|
|
71 findUnique('dir_reference/accepted_hits.bam', $dir_bedgraph, $repTot);
|
|
72
|
|
73 # count in: unique_mapped.bam out: raw_count_simple antisens_count
|
|
74 if ($random_w eq 'true')
|
|
75 {
|
|
76 reference('rand_sorted.bam', $repTot);
|
|
77 `mv rand_sorted.bam dir_reference/random_sorted.bam`;
|
|
78 }
|
|
79 else
|
|
80 {
|
|
81 reference('ac_sorted.bam', $repTot);
|
|
82 `mv ac_sorted.bam dir_reference/unique_sorted.bam`;
|
|
83 }
|
|
84
|
|
85 zip('dir_reference', $dirBam);
|
|
86 zip('dir_contaminant', $dirBam_cont);
|
|
87 zip( $dir_bedgraph , $dirBedgraph);
|
|
88
|
|
89 #################################
|
|
90 # inputs:
|
|
91 # fastq $_[0]
|
|
92 # bowtie2_index $_[1]
|
|
93 # mismatches # $_[2]
|
|
94 # output directory $_[3]
|
|
95 # max_procs $_[4]
|
|
96 #################################
|
|
97 sub tophat2
|
|
98 {
|
|
99 my ($fastq, $index, $mismtaches, $proc, $out_repertory) = @_;
|
|
100 $out_repertory = $out_repertory.'/' unless $out_repertory =~ /\/$/;
|
|
101 # print STDERR "/data/visiteur/tophat-2.0.12.Linux_x86_64/tophat2 -p $proc -N $mismtaches --b2-very-sensitive -o $out_repertory $index $fastq\n";
|
|
102 `tophat2 -p $proc -N $mismtaches --b2-very-sensitive -o $out_repertory $index $fastq 2> /dev/stdin`;
|
|
103 }
|
|
104 ############################################################################################
|
|
105
|
|
106 #################################
|
|
107 # inputs:
|
|
108 # bam $_[0]
|
|
109 # output directory $_[1]
|
|
110 #################################
|
|
111 sub findUnique
|
|
112 {
|
|
113 my ($bam, $out_bedgraph, $sc) = @_;
|
|
114
|
|
115 mkdir $out_bedgraph;
|
|
116
|
|
117 my $bedg_uni = $out_bedgraph.'/'.$fastqN.'_unique.bedgraph' ;
|
|
118 my $bedg_all_multi = $out_bedgraph.'/'.$fastqN.'_all_multi.bedgraph' ;
|
|
119 my $bedg_random = $out_bedgraph.'/'.$fastqN.'_random_multi.bedgraph' ;
|
|
120
|
|
121 my $bedg_uni_p = $out_bedgraph.'/'.$fastqN.'_unique_plusStrand.bedgraph' ;
|
|
122 my $bedg_all_multi_p = $out_bedgraph.'/'.$fastqN.'_all_multi_plusStrand.bedgraph' ;
|
|
123 my $bedg_random_p = $out_bedgraph.'/'.$fastqN.'_random_multi_plusStrand.bedgraph' ;
|
|
124
|
|
125 my $bedg_uni_m = $out_bedgraph.'/'.$fastqN.'_unique_minusStrand.bedgraph' ;
|
|
126 my $bedg_all_multi_m = $out_bedgraph.'/'.$fastqN.'_all_multi_minusStrand.bedgraph' ;
|
|
127 my $bedg_random_m = $out_bedgraph.'/'.$fastqN.'_random_multi_minusStrand.bedgraph' ;
|
|
128
|
|
129 my $sca = 1000000/$sc;
|
|
130
|
|
131 #unique file
|
|
132 `samtools view -H $bam > ac.sam 2> /dev/null && samtools view -@ $max_procs $bam 2> /dev/null |grep NH:i:1\$ >> ac.sam && samtools view -@ $max_procs -Shb ac.sam > ac.bam 2> /dev/null`;
|
|
133 `samtools sort -@ $max_procs ac.bam -o ac_sorted.bam 2> /dev/null`;
|
|
134 `genomeCoverageBed -scale $sca -split -bg -ibam ac_sorted.bam > $bedg_uni 2> /dev/null`;
|
|
135 if ($directional eq 'true')
|
|
136 {
|
|
137 `genomeCoverageBed -strand + -scale $sca -split -bg -ibam ac_sorted.bam > $bedg_uni_p 2> /dev/null`;
|
|
138 `genomeCoverageBed -strand - -scale $sca -split -bg -ibam ac_sorted.bam > $bedg_uni_m 2> /dev/null`;
|
|
139 }
|
|
140 #all multi
|
|
141 `samtools sort -@ $max_procs $bam -o accepted_hits_sorted.bam 2> /dev/null`;
|
|
142 `genomeCoverageBed -scale $sca -split -bg -ibam accepted_hits_sorted.bam > $bedg_all_multi 2> /dev/null`;
|
|
143 if ($directional eq 'true')
|
|
144 {
|
|
145 `genomeCoverageBed -strand + -scale $sca -split -bg -ibam accepted_hits_sorted.bam > $bedg_all_multi_p 2> /dev/null`;
|
|
146 `genomeCoverageBed -strand - -scale $sca -split -bg -ibam accepted_hits_sorted.bam > $bedg_all_multi_m 2> /dev/null`;
|
|
147 }
|
|
148 #random multi
|
|
149 `samtools view -@ $max_procs -hb -F256 $bam > rand.bam 2> /dev/null`;
|
|
150 `samtools sort -@ $max_procs rand.bam -o rand_sorted.bam 2> /dev/null`;
|
|
151 `genomeCoverageBed -scale $sca -split -bg -ibam rand_sorted.bam > $bedg_random 2> /dev/null`;
|
|
152 if ($directional eq 'true')
|
|
153 {
|
|
154 `genomeCoverageBed -strand + -scale $sca -split -bg -ibam rand_sorted.bam > $bedg_random_p 2> /dev/null`;
|
|
155 `genomeCoverageBed -strand - -scale $sca -split -bg -ibam rand_sorted.bam > $bedg_random_m 2> /dev/null`;
|
|
156 }
|
|
157 }
|
|
158 ############################################################################################
|
|
159
|
|
160 #################################
|
|
161 # inputs:
|
|
162 # bam $_[0]
|
|
163 # output directory $_[1]
|
|
164 #################################
|
|
165 sub reference
|
|
166 {
|
|
167 my $bam = shift;
|
|
168 my $mapnum = shift;
|
|
169
|
|
170 my @tabRef = ();
|
|
171
|
|
172 open (my $anno, $annotation) || die "cannot open annotation file!\n";
|
|
173 while(<$anno>)
|
|
174 {
|
|
175 chomp $_;
|
|
176 my @split = split /\t/, $_;
|
|
177 my $size = $split[2] - $split[1];
|
|
178 push @tabRef, [$_,0,0,$size];
|
|
179 }
|
|
180 close $anno;
|
|
181
|
|
182 `bedtools bamtobed -i $bam > ac.bed`;
|
|
183
|
|
184 # my $mapnum = 0;
|
|
185 # open (my $bed, 'ac.bed') || die "cannot open ac.bed\n";
|
|
186 # while(<$bed>){ $mapnum++; }
|
|
187 # close $bed;
|
|
188
|
|
189 if ($directional eq 'true')
|
|
190 {
|
|
191 #Same strand
|
|
192 `bedtools intersect -s -wao -b ac.bed -a $annotation> intersectStranded.bed`;
|
|
193 #Different strand
|
|
194 `bedtools intersect -S -wao -b ac.bed -a $annotation> intersectOppositeStrand.bed`;
|
|
195
|
|
196 open (my $strand, 'intersectStranded.bed') || die "cannot open intersectStranded.bed\n";
|
|
197 my $i = 0;
|
|
198 my $firstLine = <$strand>; chomp $firstLine;
|
|
199 my @split = split /\t/, $firstLine;
|
|
200 my $prevId = $split[3];
|
|
201 if ($split[16] != 0 && $split[12] - $split[11] == $split[16])
|
|
202 {
|
|
203 $tabRef[$i]->[1] = 1;
|
|
204 }
|
|
205 while(<$strand>)
|
|
206 {
|
|
207 chomp $_;
|
|
208 @split = split /\t/, $_;
|
|
209 if ($split[3] ne $prevId)
|
|
210 {
|
|
211 $i++;
|
|
212 }
|
|
213 if ($split[16] != 0 && $split[12] - $split[11] == $split[16])
|
|
214 {
|
|
215 $tabRef[$i]->[1] += 1;
|
|
216 }
|
|
217 $prevId = $split[3];
|
|
218 }
|
|
219 close $strand;
|
|
220
|
|
221 open ($strand, 'intersectOppositeStrand.bed') || die "cannot open intersectOppositeStrand.bed\n";
|
|
222 $i = 0;
|
|
223 $firstLine = <$strand>; chomp $firstLine;
|
|
224 @split = split /\t/, $firstLine;
|
|
225 $prevId = $split[3];
|
|
226 if ($split[16] != 0 && $split[12] - $split[11] == $split[16])
|
|
227 {
|
|
228 $tabRef[$i]->[2] = 1;
|
|
229 }
|
|
230 while(<$strand>)
|
|
231 {
|
|
232 chomp $_;
|
|
233 @split = split /\t/, $_;
|
|
234 if ($split[3] ne $prevId)
|
|
235 {
|
|
236 $i++;
|
|
237 }
|
|
238 if ($split[16] != 0 && $split[12] - $split[11] == $split[16])
|
|
239 {
|
|
240 $tabRef[$i]->[2] += 1;
|
|
241 }
|
|
242 $prevId = $split[3];
|
|
243 }
|
|
244 close $strand;
|
|
245 }
|
|
246
|
|
247 else
|
|
248 {
|
|
249 `bedtools intersect -wao -b ac.bed -a $annotation> intersect.bed`;
|
|
250
|
|
251 open (my $strand, 'intersect.bed') || die "cannot open intersect.bed\n";
|
|
252 my $i = 0;
|
|
253 my $firstLine = <$strand>; chomp $firstLine;
|
|
254 my @split = split /\t/, $firstLine;
|
|
255 my $prevId = $split[3];
|
|
256 if ($split[16] != 0 && $split[12] - $split[11] == $split[16])
|
|
257 {
|
|
258 $tabRef[$i]->[1] = 1;
|
|
259 }
|
|
260 while(<$strand>)
|
|
261 {
|
|
262 chomp $_;
|
|
263 @split = split /\t/, $_;
|
|
264 if ($split[3] ne $prevId)
|
|
265 {
|
|
266 $i++;
|
|
267 }
|
|
268 if ($split[16] != 0 && $split[12] - $split[11] == $split[16])
|
|
269 {
|
|
270 $tabRef[$i]->[1] += 1;
|
|
271 }
|
|
272 $prevId = $split[3];
|
|
273 }
|
|
274 close $strand;
|
|
275 }
|
|
276
|
|
277 my $totplus = 0; my $totminus = 0;
|
|
278 foreach my $outPut (@tabRef)
|
|
279 {
|
|
280 $totplus += $outPut->[1];
|
|
281 $totminus += $outPut->[2];
|
|
282 }
|
|
283
|
|
284 open (my $out, ">".$outText) || die "cannot open raw_count.txt\n";
|
|
285 print $out "Chr\tstart\tend\tID\tscore\tstrand\ttype\tD1\tD2\tD3\treads\trpm\treads on opposite strand\trpm\n";
|
|
286 my ($Sr, $Rr) = (0,0);
|
|
287 foreach my $outPut (@tabRef)
|
|
288 {
|
|
289 if ($totplus > $totminus || $findSens eq 'false')
|
|
290 {
|
|
291 $Sr = rpm($outPut->[1],$mapnum);
|
|
292 $Rr = rpm($outPut->[2],$mapnum);
|
|
293 print $out $outPut->[0]."\t".$outPut->[1]."\t".$Sr."\t".$outPut->[2]."\t".$Rr."\n";
|
|
294 }
|
|
295 else
|
|
296 {
|
|
297 $Rr = rpm($outPut->[1],$mapnum);
|
|
298 $Sr = rpm($outPut->[2],$mapnum);
|
|
299 print $out $outPut->[0]."\t".$outPut->[2]."\t".$Sr."\t".$outPut->[1]."\t".$Rr."\n";
|
|
300 }
|
|
301 }
|
|
302 close $out;
|
|
303 }
|
|
304 ############################################################################################
|
|
305
|
|
306 #################################
|
|
307 # inputs:
|
|
308 # bam $_[0]
|
|
309 # output directory $_[1]
|
|
310 #################################
|
|
311 sub rpm
|
|
312 {
|
|
313 my $raw_count = shift;
|
|
314 my $mapped_number = shift;
|
|
315 my $RPM = 0;
|
|
316 $RPM = ($raw_count * 1000000) / ($mapped_number) if $mapped_number != 0;
|
|
317 return $RPM;
|
|
318 }
|
|
319 ############################################################################################
|
|
320
|
|
321 #################################
|
|
322 # inputs:
|
|
323 # bam $_[0]
|
|
324 # output directory $_[1]
|
|
325 #################################
|
|
326 sub zip
|
|
327 {
|
|
328 my ($in_dir, $out_tar) = @_;
|
|
329 `tar -cvf tmp.tar $in_dir/`;
|
|
330 `mv tmp.tar $out_tar`;
|
|
331 rmdir $in_dir;
|
|
332 }
|
|
333 ############################################################################################
|
|
334
|