Mercurial > repos > romaingred > pirna_pipeline
comparison bin/align.pm @ 0:198009598544 draft
Uploaded
author | romaingred |
---|---|
date | Wed, 11 Oct 2017 09:57:58 -0400 |
parents | |
children | 42bc59c7db3a |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:198009598544 |
---|---|
1 package align; | |
2 | |
3 use strict; | |
4 use warnings; | |
5 use File::Basename; | |
6 use String::Random; | |
7 | |
8 use FindBin; | |
9 use lib $FindBin::Bin; | |
10 use Rcall qw ( histogram ); | |
11 | |
12 use Exporter; | |
13 our @ISA = qw( Exporter ); | |
14 our @EXPORT = qw( &BWA_call &to_build &get_unique &sam_sorted_bam &get_hash_alignment &sam_to_bam_bg &sam_count &sam_count_mis &rpms_rpkm &get_fastq_seq &extract_sam ); | |
15 | |
16 sub to_build | |
17 { | |
18 my $toBuildHashP = shift; my $log = shift; | |
19 | |
20 while ( my ( $k, $v ) = each %{ $toBuildHashP } ) | |
21 { | |
22 build_index ( $k, $log ) if $v == 1; | |
23 } | |
24 } | |
25 | |
26 sub build_index | |
27 { | |
28 my $to_index = shift; | |
29 my $log = shift; | |
30 my $index_log = $to_index.'_index.err'; | |
31 `bwa index $to_index 2> $index_log`; | |
32 print $log "Creating index for $to_index\n"; | |
33 } | |
34 | |
35 sub get_unique | |
36 { | |
37 my ( $sam, $s_uni, $prefix, $details, $report ) = @_; | |
38 | |
39 my $fout = $prefix.'all.fastq'; | |
40 my $funi = $prefix.'unique.fastq'; | |
41 my $frej = $prefix.'rejected.fastq'; | |
42 | |
43 my $repartition = $prefix.'distribution.txt'; | |
44 my $png_rep = $prefix.'distribution.png'; | |
45 my ( %duplicates, %genome_hits) ; | |
46 | |
47 #alignement to the first reference | |
48 my @return = sam_parse( $sam, $fout, $funi, $frej, $s_uni, \%duplicates, \%genome_hits, $report ); | |
49 my $ref_fai = $return[4]; | |
50 my $mappers = $return[5]; | |
51 my $mappers_uni = $return[6]; | |
52 my $size_mappedHashR = $return[7]; | |
53 | |
54 if ( $details == 1 ) | |
55 { | |
56 #print number of duplicates and hits number | |
57 my ($pourcentage, $total) =(0,0); | |
58 | |
59 $total += $_ foreach values %{$size_mappedHashR}; | |
60 open (my $rep, '>'.$repartition) || die "cannot create $repartition $!\n"; | |
61 print $rep "size\tnumber\tpercentage\n"; | |
62 foreach my $k (sort{$a cmp $b} keys (%{$size_mappedHashR})) | |
63 { | |
64 $pourcentage = 0; | |
65 $pourcentage = $size_mappedHashR->{$k} / $total * 100 unless $total ==0; | |
66 | |
67 print $rep "$k\t$size_mappedHashR->{$k}\t"; | |
68 printf $rep "%.2f\n",$pourcentage; | |
69 } | |
70 | |
71 histogram($size_mappedHashR, $png_rep, $total); | |
72 | |
73 | |
74 my $dup = $prefix.'dup_mapnum.txt'; | |
75 my $dup_u = $prefix .'dup_unique.txt'; | |
76 my $dup_r = $prefix .'dup_nonmapp.txt'; | |
77 open(my $tab,">".$dup) || die "cannot open output txt file\n"; | |
78 open(my $tab_r,">".$dup_r) || die "cannot open output txt file\n"; | |
79 open(my $tab_u,">".$dup_u) || die "cannot open output txt file\n"; | |
80 print $tab "sequence\tcount\tmapnum\n"; | |
81 print $tab_u "sequence\tcount\n"; | |
82 print $tab_r "sequence\tcount\n"; | |
83 foreach my $k (sort {$duplicates{$b} <=> $duplicates{$a}}keys %duplicates) | |
84 { | |
85 $duplicates{$k} = 0 unless exists($duplicates{$k}); | |
86 $genome_hits{$k} = 0 unless exists($genome_hits{$k}); | |
87 if ($genome_hits{$k} != 0) { print $tab $k."\t".$duplicates{$k}."\t".$genome_hits{$k}."\n"; } | |
88 else {print $tab_r $k."\t".$duplicates{$k}."\n";} | |
89 if ($genome_hits{$k} == 1) { print $tab_u $k."\t".$duplicates{$k}."\n"; } | |
90 } | |
91 close $dup; close $dup_r; close $dup_u; | |
92 } | |
93 return ( $ref_fai, $mappers, $mappers_uni ); | |
94 } | |
95 | |
96 sub sam_parse | |
97 { | |
98 my ( $sam, $fastq_accepted, $fastq_accepted_unique, $fastq_rejected, $sam_unique, $duplicate_hashR, $best_hit_number_hashR, $report ) = @_ ; | |
99 my ($reads, $mappers, $mappersUnique, @garbage, %size_num, %size_num_spe, %number, %numberSens, %numberReverse, %unique_number, %numberNM, %numberM, %size); | |
100 $mappers = $mappersUnique = $reads = 0; | |
101 | |
102 open my $fic, '<', $sam || die "cannot open $sam $!\n"; | |
103 open my $accepted, '>', $fastq_accepted || die "cannot create $fastq_accepted $! \n"; | |
104 open my $unique, '>', $fastq_accepted_unique || die "cannot create $fastq_accepted_unique $! \n"; | |
105 open my $rejected, '>', $fastq_rejected || die "cannot create $fastq_rejected $! \n"; | |
106 open my $sam_uni, '>', $sam_unique || die "cannot create $sam_unique $! \n"; | |
107 my $sequence = ''; | |
108 while(<$fic>) | |
109 { | |
110 chomp $_; | |
111 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | |
112 { | |
113 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) | |
114 { | |
115 $size{$1} = $2; | |
116 $unique_number{$1} = 0; | |
117 $number{$1} = 0; | |
118 $numberNM{$1} = 0; | |
119 $numberM{$1} = 0; | |
120 } | |
121 print $sam_uni $_."\n"; | |
122 next; | |
123 } | |
124 $reads++; | |
125 my @line = split (/\t/,$_); | |
126 $sequence = $line[9]; | |
127 if ($line[1] & 16) | |
128 { | |
129 $sequence =reverse($sequence); | |
130 $sequence =~ tr/atgcuATGCU/tacgaTACGA/; | |
131 } | |
132 if ($line[1] == 16 || $line[1] == 0) | |
133 { | |
134 my $len = length($sequence); | |
135 $size_num{$len} ++; | |
136 $size_num_spe{$line[2]}{$len}++; | |
137 $mappers ++; | |
138 | |
139 ${$best_hit_number_hashR}{$sequence} = $1 if ($line[13] =~ /X0:i:(\d*)/ || $line[14] =~/X0:i:(\d*)/ ); | |
140 ${$duplicate_hashR}{$sequence}++; | |
141 $number{$line[2]}++; | |
142 $numberSens{$line[2]}++ if $line[1] == 0 ; | |
143 $numberReverse{$line[2]}++ if $line[1] == 16 ; | |
144 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; | |
145 | |
146 if ($line[11] eq "XT:A:U") | |
147 { | |
148 $unique_number{$line[2]}++; | |
149 $mappersUnique++; | |
150 print $unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; | |
151 print $sam_uni $_."\n"; | |
152 } | |
153 if ($_ =~ /.*XM:i:(\d+).*/) | |
154 { | |
155 if ($1 == 0){$numberNM{$line[2]}++;}else{$numberM{$line[2]}++;} | |
156 } | |
157 } | |
158 else | |
159 { | |
160 ${$best_hit_number_hashR}{$sequence} = 0; | |
161 ${$duplicate_hashR}{$sequence}++; | |
162 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; | |
163 } | |
164 } | |
165 close $fic; close $accepted; close $unique; close $rejected; close $sam_uni; | |
166 | |
167 print $report "Parsing $sam file\n"; | |
168 print $report "\treads: $reads\n"; | |
169 print $report "\tmappers: $mappers\n"; | |
170 print $report "\tunique mappers: $mappersUnique\n"; | |
171 print $report "-----------------------------\n"; | |
172 return (\%number, \%unique_number, \%numberSens, \%numberReverse, \%size, $mappers, $mappersUnique, \%size_num, \%size_num_spe, \%numberNM, \%numberM ); | |
173 } | |
174 | |
175 sub get_hash_alignment | |
176 { | |
177 my ($index, $mismatches, $accept, $reject, $outA, $outR, $fastq, $number_of_cpus, $name, $sam, $report, $fai_f) = @_ ; | |
178 my ($reads, $mappers, $unmapped) = (0,0,0); | |
179 my $accep_unique; | |
180 BWA_call ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ); | |
181 | |
182 open my $fic, '<', $sam || die "cannot open $sam $!\n"; | |
183 open my $accepted, '>', $outA || die "cannot open $outA\n" if $accept == 1; | |
184 open my $rejected, '>', $outR || die "cannot open $outR\n" if $reject == 1; | |
185 open my $fai, '>', $fai_f || die "cannot open $fai_f\n" if $fai_f; | |
186 #if ($name eq "snRNAs") { | |
187 # open ( $accep_unique, ">".$1."-unique.fastq") if $outR =~ /(.*)\.fastq/; | |
188 #} | |
189 my $sequence = ''; | |
190 while(<$fic>) | |
191 { | |
192 chomp $_; | |
193 if( $_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | |
194 { | |
195 if ($fai_f && $_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) | |
196 { | |
197 print $fai $1."\t".$2."\n"; | |
198 } | |
199 next; | |
200 } | |
201 $reads++; | |
202 my @line = split (/\t/,$_); | |
203 $sequence = $line[9]; | |
204 if ($line[1] & 16) | |
205 { | |
206 $sequence =reverse($sequence); | |
207 $sequence =~ tr/atgcuATGCU/tacgaTACGA/; | |
208 } | |
209 if ($line[1] & 16 || $line[1] == 0) | |
210 { | |
211 $mappers ++; | |
212 if ($accept == 1 ) | |
213 { | |
214 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; | |
215 # print $accep_unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if ($name eq "snRNAs" && $line[11] eq "XT:A:U"); | |
216 } | |
217 } | |
218 else | |
219 { | |
220 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if $reject == 1; | |
221 $unmapped++; | |
222 } | |
223 } | |
224 # close $accep_unique if ($name eq "bonafide_reads"); | |
225 close $fic; | |
226 close $accepted if $accept == 1; | |
227 close $rejected if $reject ==1; | |
228 close $fai if $fai_f; | |
229 print $report "\treads: $reads\n"; | |
230 print $report "\tmappers: $mappers\n"; | |
231 print $report "\tunmapped: $unmapped\n"; | |
232 print $report "-----------------------------\n"; | |
233 return ($mappers, $unmapped); | |
234 } | |
235 | |
236 sub sam_count | |
237 { | |
238 my $sam = shift; | |
239 my ( %number, %size ); | |
240 | |
241 open my $fic, '<', $sam || die "cannot open $sam file $!\n"; | |
242 while(<$fic>) | |
243 { | |
244 chomp $_; | |
245 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | |
246 { | |
247 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) | |
248 { | |
249 $size{$1} = $2; | |
250 $number{$1} = 0; | |
251 } | |
252 } | |
253 else | |
254 { | |
255 my @line = split (/\t/,$_); | |
256 if ( $line[1] & 16 || $line[1] == 0 ) | |
257 { | |
258 $number{$line[2]}++; | |
259 } | |
260 } | |
261 } | |
262 close $fic; | |
263 return ( \%number, \%size ); | |
264 } | |
265 | |
266 sub sam_count_mis | |
267 { | |
268 | |
269 my $sam = shift; | |
270 my ( %number, %numberNM, %numberM, %size); | |
271 | |
272 open my $fic, '<', $sam || die "cannot open $sam file $!\n"; | |
273 while(<$fic>) | |
274 { | |
275 chomp $_; | |
276 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | |
277 { | |
278 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) | |
279 { | |
280 $size{$1} = $2; | |
281 $number{$1} = 0; | |
282 $numberNM{$1} = 0; | |
283 $numberM{$1} = 0; | |
284 } | |
285 } | |
286 else | |
287 { | |
288 my @line = split (/\t/,$_); | |
289 if ( $line[1] & 16 || $line[1] == 0 ) | |
290 { | |
291 $number{ $line[2] }++; | |
292 if ($_ =~ /.*XM:i:(\d+).*/) | |
293 { | |
294 if ( $1 == 0 ){ $numberNM{$line[2]}++; } else { $numberM{$line[2]}++; } | |
295 } | |
296 } | |
297 } | |
298 } | |
299 return (\%number, \%size, \%numberNM, \%numberM ); | |
300 } | |
301 | |
302 sub sam_to_bam_bg | |
303 { | |
304 my ( $sam, $scale, $number_of_cpus ) = @_; | |
305 my ( $bam_sorted, $bedgraphM, $bedgraphP ) = ( '', '', '' ); | |
306 if ( $sam =~ /(.*?).sam$/ ) | |
307 { | |
308 $bam_sorted = $1.'_sorted.bam'; | |
309 $bedgraphP= $1.'_plus.bedgraph'; | |
310 $bedgraphM = $1.'_minus.bedgraph'; | |
311 } | |
312 `samtools view -Shb --threads $number_of_cpus $sam | samtools sort -O BAM --threads $number_of_cpus /dev/stdin > $bam_sorted`; | |
313 `bedtools genomecov -scale $scale -strand + -bg -ibam $bam_sorted > $bedgraphP`; | |
314 `bedtools genomecov -scale $scale -strand - -bg -ibam $bam_sorted > $bedgraphM`; | |
315 } | |
316 | |
317 sub sam_sorted_bam | |
318 { | |
319 my ( $sam, $number_of_cpus ) = @_; | |
320 my $bam_sorted =''; | |
321 if ( $sam =~ /(.*?).sam$/ ) | |
322 { | |
323 $bam_sorted = $1.'_sorted.bam'; | |
324 } | |
325 `samtools view -Shb --threads $number_of_cpus $sam | samtools sort -O BAM --threads $number_of_cpus /dev/stdin > $bam_sorted`; | |
326 } | |
327 | |
328 sub BWA_call | |
329 { | |
330 my ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ) = @_; | |
331 my ( $aln_err, $samse_err, $seq_num ) = ( $sam.'_aln.err', $sam.'_samse.err', 0 ); | |
332 print $report "-----------------------------\n"; | |
333 print $report "bwa aln -t $number_of_cpus -n $mismatches $index $fastq 2> $aln_err | bwa samse $index /dev/stdin $fastq 2> $samse_err > $sam\n"; | |
334 `bwa aln -t $number_of_cpus -n $mismatches $index $fastq 2> $aln_err | bwa samse $index /dev/stdin $fastq 2> $samse_err > $sam `; | |
335 } | |
336 | |
337 sub rpms_rpkm | |
338 { | |
339 my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_; | |
340 open(my $out, ">".$out_file) || die "cannot open normalized file $! \n"; | |
341 print $out "ID\treads counts\tRPKM"; | |
342 print $out "\tper million of piRNAs" if ($piRNA_number != 0); | |
343 print $out "\tper million of miRNAs" if ($miRNA_number != 0); | |
344 print $out "\tper million of bonafide reads" if ($bonafide_number != 0); | |
345 print $out "\n"; | |
346 foreach my $k ( sort keys %{$counthashR} ) | |
347 { | |
348 my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0); | |
349 | |
350 $rpkm = ( $counthashR->{$k} * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ; | |
351 print $out $k."\t".$counthashR->{$k}."\t"; printf $out "%.2f",$rpkm; | |
352 | |
353 if ($piRNA_number != 0 ) | |
354 { | |
355 $pirna = ( $counthashR->{$k} * 1000000) / $piRNA_number; | |
356 printf $out "\t%.2f",$pirna; | |
357 } | |
358 if ($miRNA_number != 0 ) | |
359 { | |
360 $mirna = ( $counthashR->{$k} * 1000000) / $miRNA_number; | |
361 printf $out "\t%.2f",$mirna; | |
362 } | |
363 if ($bonafide_number != 0 ) | |
364 { | |
365 $bonafide = ( $counthashR->{$k} * 1000000) / $bonafide_number; | |
366 printf $out "\t%.2f",$bonafide; | |
367 } | |
368 print $out "\n"; | |
369 } | |
370 close $out; | |
371 } | |
372 | |
373 sub extract_sam | |
374 { | |
375 my ( $hashRef, $sam_in, $sam_out, $sam_uni_out, $fastq_out, $fastq_uni_out ) = @_; | |
376 | |
377 open my $s_in, '<', $sam_in || die "cannot open $sam_in file $!\n"; | |
378 | |
379 open my $f_out, '>', $fastq_out || die "cannot create $fastq_out $!\n"; | |
380 open my $f_uni_out, '>', $fastq_uni_out || die "cannot create $fastq_uni_out $!\n"; | |
381 | |
382 open my $s_out, '>', $sam_out || die "cannot create $sam_out file $!\n" if defined ($hashRef); | |
383 open my $s_uni_out, '>', $sam_uni_out || die "cannot create $sam_uni_out file $!\n"; | |
384 | |
385 my $sequence = ''; | |
386 while(<$s_in>) | |
387 { | |
388 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | |
389 { | |
390 print $s_out $_ if defined ($hashRef); | |
391 print $s_uni_out $_; | |
392 next; | |
393 } | |
394 my @line = split (/\t/,$_); | |
395 $sequence = $line[0]; | |
396 if ( (! defined ($hashRef) )|| ( exists $hashRef->{$sequence} && $hashRef->{$sequence} == 1 ) ) | |
397 { | |
398 my $arn = $line[9]; | |
399 if ($line[1] & 16) | |
400 { | |
401 $arn =reverse($arn); | |
402 $arn =~ tr/atgcuATGCU/tacgaTACGA/; | |
403 } | |
404 #&& $line[11] eq "XT:A:U" ) | |
405 if ( ( $line[1] == 16 || $line[1] == 0 ) ) | |
406 { | |
407 print $f_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ; | |
408 print $s_out $_ if defined ($hashRef); | |
409 if ( $line[11] eq "XT:A:U" ) | |
410 { | |
411 print $f_uni_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ; | |
412 print $s_uni_out $_ ; | |
413 } | |
414 } | |
415 } | |
416 } | |
417 close $s_in; close $s_out if defined ($hashRef); | |
418 close $s_uni_out; close $f_out; close $f_uni_out; | |
419 } | |
420 | |
421 sub get_fastq_seq{ | |
422 my $fastq = shift; | |
423 my %hash; my $cmp = 0; | |
424 | |
425 open my $fic, '<', $fastq || die "cannot open input file $! \n"; | |
426 while(<$fic>) | |
427 { | |
428 chomp $_; | |
429 $cmp++; | |
430 if ($cmp % 4 == 1) | |
431 { | |
432 die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ ); | |
433 if ($_ =~ /^\@(.*)\s.*/) { $hash{$1} = 1;} | |
434 elsif ($_ =~ /^\@(.*)/) { $hash{$1} = 1;} | |
435 } | |
436 elsif ($cmp % 4 == 3 ) | |
437 { | |
438 die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/; | |
439 } | |
440 } | |
441 close $fic; | |
442 return \%hash; | |
443 } | |
444 | |
445 1; |