comparison bin/align.pm @ 0:198009598544 draft

Uploaded
author romaingred
date Wed, 11 Oct 2017 09:57:58 -0400
parents
children 42bc59c7db3a
comparison
equal deleted inserted replaced
-1:000000000000 0:198009598544
1 package align;
2
3 use strict;
4 use warnings;
5 use File::Basename;
6 use String::Random;
7
8 use FindBin;
9 use lib $FindBin::Bin;
10 use Rcall qw ( histogram );
11
12 use Exporter;
13 our @ISA = qw( Exporter );
14 our @EXPORT = qw( &BWA_call &to_build &get_unique &sam_sorted_bam &get_hash_alignment &sam_to_bam_bg &sam_count &sam_count_mis &rpms_rpkm &get_fastq_seq &extract_sam );
15
16 sub to_build
17 {
18 my $toBuildHashP = shift; my $log = shift;
19
20 while ( my ( $k, $v ) = each %{ $toBuildHashP } )
21 {
22 build_index ( $k, $log ) if $v == 1;
23 }
24 }
25
26 sub build_index
27 {
28 my $to_index = shift;
29 my $log = shift;
30 my $index_log = $to_index.'_index.err';
31 `bwa index $to_index 2> $index_log`;
32 print $log "Creating index for $to_index\n";
33 }
34
35 sub get_unique
36 {
37 my ( $sam, $s_uni, $prefix, $details, $report ) = @_;
38
39 my $fout = $prefix.'all.fastq';
40 my $funi = $prefix.'unique.fastq';
41 my $frej = $prefix.'rejected.fastq';
42
43 my $repartition = $prefix.'distribution.txt';
44 my $png_rep = $prefix.'distribution.png';
45 my ( %duplicates, %genome_hits) ;
46
47 #alignement to the first reference
48 my @return = sam_parse( $sam, $fout, $funi, $frej, $s_uni, \%duplicates, \%genome_hits, $report );
49 my $ref_fai = $return[4];
50 my $mappers = $return[5];
51 my $mappers_uni = $return[6];
52 my $size_mappedHashR = $return[7];
53
54 if ( $details == 1 )
55 {
56 #print number of duplicates and hits number
57 my ($pourcentage, $total) =(0,0);
58
59 $total += $_ foreach values %{$size_mappedHashR};
60 open (my $rep, '>'.$repartition) || die "cannot create $repartition $!\n";
61 print $rep "size\tnumber\tpercentage\n";
62 foreach my $k (sort{$a cmp $b} keys (%{$size_mappedHashR}))
63 {
64 $pourcentage = 0;
65 $pourcentage = $size_mappedHashR->{$k} / $total * 100 unless $total ==0;
66
67 print $rep "$k\t$size_mappedHashR->{$k}\t";
68 printf $rep "%.2f\n",$pourcentage;
69 }
70
71 histogram($size_mappedHashR, $png_rep, $total);
72
73
74 my $dup = $prefix.'dup_mapnum.txt';
75 my $dup_u = $prefix .'dup_unique.txt';
76 my $dup_r = $prefix .'dup_nonmapp.txt';
77 open(my $tab,">".$dup) || die "cannot open output txt file\n";
78 open(my $tab_r,">".$dup_r) || die "cannot open output txt file\n";
79 open(my $tab_u,">".$dup_u) || die "cannot open output txt file\n";
80 print $tab "sequence\tcount\tmapnum\n";
81 print $tab_u "sequence\tcount\n";
82 print $tab_r "sequence\tcount\n";
83 foreach my $k (sort {$duplicates{$b} <=> $duplicates{$a}}keys %duplicates)
84 {
85 $duplicates{$k} = 0 unless exists($duplicates{$k});
86 $genome_hits{$k} = 0 unless exists($genome_hits{$k});
87 if ($genome_hits{$k} != 0) { print $tab $k."\t".$duplicates{$k}."\t".$genome_hits{$k}."\n"; }
88 else {print $tab_r $k."\t".$duplicates{$k}."\n";}
89 if ($genome_hits{$k} == 1) { print $tab_u $k."\t".$duplicates{$k}."\n"; }
90 }
91 close $dup; close $dup_r; close $dup_u;
92 }
93 return ( $ref_fai, $mappers, $mappers_uni );
94 }
95
96 sub sam_parse
97 {
98 my ( $sam, $fastq_accepted, $fastq_accepted_unique, $fastq_rejected, $sam_unique, $duplicate_hashR, $best_hit_number_hashR, $report ) = @_ ;
99 my ($reads, $mappers, $mappersUnique, @garbage, %size_num, %size_num_spe, %number, %numberSens, %numberReverse, %unique_number, %numberNM, %numberM, %size);
100 $mappers = $mappersUnique = $reads = 0;
101
102 open my $fic, '<', $sam || die "cannot open $sam $!\n";
103 open my $accepted, '>', $fastq_accepted || die "cannot create $fastq_accepted $! \n";
104 open my $unique, '>', $fastq_accepted_unique || die "cannot create $fastq_accepted_unique $! \n";
105 open my $rejected, '>', $fastq_rejected || die "cannot create $fastq_rejected $! \n";
106 open my $sam_uni, '>', $sam_unique || die "cannot create $sam_unique $! \n";
107 my $sequence = '';
108 while(<$fic>)
109 {
110 chomp $_;
111 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
112 {
113 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
114 {
115 $size{$1} = $2;
116 $unique_number{$1} = 0;
117 $number{$1} = 0;
118 $numberNM{$1} = 0;
119 $numberM{$1} = 0;
120 }
121 print $sam_uni $_."\n";
122 next;
123 }
124 $reads++;
125 my @line = split (/\t/,$_);
126 $sequence = $line[9];
127 if ($line[1] & 16)
128 {
129 $sequence =reverse($sequence);
130 $sequence =~ tr/atgcuATGCU/tacgaTACGA/;
131 }
132 if ($line[1] == 16 || $line[1] == 0)
133 {
134 my $len = length($sequence);
135 $size_num{$len} ++;
136 $size_num_spe{$line[2]}{$len}++;
137 $mappers ++;
138
139 ${$best_hit_number_hashR}{$sequence} = $1 if ($line[13] =~ /X0:i:(\d*)/ || $line[14] =~/X0:i:(\d*)/ );
140 ${$duplicate_hashR}{$sequence}++;
141 $number{$line[2]}++;
142 $numberSens{$line[2]}++ if $line[1] == 0 ;
143 $numberReverse{$line[2]}++ if $line[1] == 16 ;
144 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
145
146 if ($line[11] eq "XT:A:U")
147 {
148 $unique_number{$line[2]}++;
149 $mappersUnique++;
150 print $unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
151 print $sam_uni $_."\n";
152 }
153 if ($_ =~ /.*XM:i:(\d+).*/)
154 {
155 if ($1 == 0){$numberNM{$line[2]}++;}else{$numberM{$line[2]}++;}
156 }
157 }
158 else
159 {
160 ${$best_hit_number_hashR}{$sequence} = 0;
161 ${$duplicate_hashR}{$sequence}++;
162 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
163 }
164 }
165 close $fic; close $accepted; close $unique; close $rejected; close $sam_uni;
166
167 print $report "Parsing $sam file\n";
168 print $report "\treads: $reads\n";
169 print $report "\tmappers: $mappers\n";
170 print $report "\tunique mappers: $mappersUnique\n";
171 print $report "-----------------------------\n";
172 return (\%number, \%unique_number, \%numberSens, \%numberReverse, \%size, $mappers, $mappersUnique, \%size_num, \%size_num_spe, \%numberNM, \%numberM );
173 }
174
175 sub get_hash_alignment
176 {
177 my ($index, $mismatches, $accept, $reject, $outA, $outR, $fastq, $number_of_cpus, $name, $sam, $report, $fai_f) = @_ ;
178 my ($reads, $mappers, $unmapped) = (0,0,0);
179 my $accep_unique;
180 BWA_call ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report );
181
182 open my $fic, '<', $sam || die "cannot open $sam $!\n";
183 open my $accepted, '>', $outA || die "cannot open $outA\n" if $accept == 1;
184 open my $rejected, '>', $outR || die "cannot open $outR\n" if $reject == 1;
185 open my $fai, '>', $fai_f || die "cannot open $fai_f\n" if $fai_f;
186 #if ($name eq "snRNAs") {
187 # open ( $accep_unique, ">".$1."-unique.fastq") if $outR =~ /(.*)\.fastq/;
188 #}
189 my $sequence = '';
190 while(<$fic>)
191 {
192 chomp $_;
193 if( $_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
194 {
195 if ($fai_f && $_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
196 {
197 print $fai $1."\t".$2."\n";
198 }
199 next;
200 }
201 $reads++;
202 my @line = split (/\t/,$_);
203 $sequence = $line[9];
204 if ($line[1] & 16)
205 {
206 $sequence =reverse($sequence);
207 $sequence =~ tr/atgcuATGCU/tacgaTACGA/;
208 }
209 if ($line[1] & 16 || $line[1] == 0)
210 {
211 $mappers ++;
212 if ($accept == 1 )
213 {
214 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
215 # print $accep_unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if ($name eq "snRNAs" && $line[11] eq "XT:A:U");
216 }
217 }
218 else
219 {
220 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if $reject == 1;
221 $unmapped++;
222 }
223 }
224 # close $accep_unique if ($name eq "bonafide_reads");
225 close $fic;
226 close $accepted if $accept == 1;
227 close $rejected if $reject ==1;
228 close $fai if $fai_f;
229 print $report "\treads: $reads\n";
230 print $report "\tmappers: $mappers\n";
231 print $report "\tunmapped: $unmapped\n";
232 print $report "-----------------------------\n";
233 return ($mappers, $unmapped);
234 }
235
236 sub sam_count
237 {
238 my $sam = shift;
239 my ( %number, %size );
240
241 open my $fic, '<', $sam || die "cannot open $sam file $!\n";
242 while(<$fic>)
243 {
244 chomp $_;
245 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
246 {
247 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
248 {
249 $size{$1} = $2;
250 $number{$1} = 0;
251 }
252 }
253 else
254 {
255 my @line = split (/\t/,$_);
256 if ( $line[1] & 16 || $line[1] == 0 )
257 {
258 $number{$line[2]}++;
259 }
260 }
261 }
262 close $fic;
263 return ( \%number, \%size );
264 }
265
266 sub sam_count_mis
267 {
268
269 my $sam = shift;
270 my ( %number, %numberNM, %numberM, %size);
271
272 open my $fic, '<', $sam || die "cannot open $sam file $!\n";
273 while(<$fic>)
274 {
275 chomp $_;
276 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
277 {
278 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
279 {
280 $size{$1} = $2;
281 $number{$1} = 0;
282 $numberNM{$1} = 0;
283 $numberM{$1} = 0;
284 }
285 }
286 else
287 {
288 my @line = split (/\t/,$_);
289 if ( $line[1] & 16 || $line[1] == 0 )
290 {
291 $number{ $line[2] }++;
292 if ($_ =~ /.*XM:i:(\d+).*/)
293 {
294 if ( $1 == 0 ){ $numberNM{$line[2]}++; } else { $numberM{$line[2]}++; }
295 }
296 }
297 }
298 }
299 return (\%number, \%size, \%numberNM, \%numberM );
300 }
301
302 sub sam_to_bam_bg
303 {
304 my ( $sam, $scale, $number_of_cpus ) = @_;
305 my ( $bam_sorted, $bedgraphM, $bedgraphP ) = ( '', '', '' );
306 if ( $sam =~ /(.*?).sam$/ )
307 {
308 $bam_sorted = $1.'_sorted.bam';
309 $bedgraphP= $1.'_plus.bedgraph';
310 $bedgraphM = $1.'_minus.bedgraph';
311 }
312 `samtools view -Shb --threads $number_of_cpus $sam | samtools sort -O BAM --threads $number_of_cpus /dev/stdin > $bam_sorted`;
313 `bedtools genomecov -scale $scale -strand + -bg -ibam $bam_sorted > $bedgraphP`;
314 `bedtools genomecov -scale $scale -strand - -bg -ibam $bam_sorted > $bedgraphM`;
315 }
316
317 sub sam_sorted_bam
318 {
319 my ( $sam, $number_of_cpus ) = @_;
320 my $bam_sorted ='';
321 if ( $sam =~ /(.*?).sam$/ )
322 {
323 $bam_sorted = $1.'_sorted.bam';
324 }
325 `samtools view -Shb --threads $number_of_cpus $sam | samtools sort -O BAM --threads $number_of_cpus /dev/stdin > $bam_sorted`;
326 }
327
328 sub BWA_call
329 {
330 my ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ) = @_;
331 my ( $aln_err, $samse_err, $seq_num ) = ( $sam.'_aln.err', $sam.'_samse.err', 0 );
332 print $report "-----------------------------\n";
333 print $report "bwa aln -t $number_of_cpus -n $mismatches $index $fastq 2> $aln_err | bwa samse $index /dev/stdin $fastq 2> $samse_err > $sam\n";
334 `bwa aln -t $number_of_cpus -n $mismatches $index $fastq 2> $aln_err | bwa samse $index /dev/stdin $fastq 2> $samse_err > $sam `;
335 }
336
337 sub rpms_rpkm
338 {
339 my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_;
340 open(my $out, ">".$out_file) || die "cannot open normalized file $! \n";
341 print $out "ID\treads counts\tRPKM";
342 print $out "\tper million of piRNAs" if ($piRNA_number != 0);
343 print $out "\tper million of miRNAs" if ($miRNA_number != 0);
344 print $out "\tper million of bonafide reads" if ($bonafide_number != 0);
345 print $out "\n";
346 foreach my $k ( sort keys %{$counthashR} )
347 {
348 my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0);
349
350 $rpkm = ( $counthashR->{$k} * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ;
351 print $out $k."\t".$counthashR->{$k}."\t"; printf $out "%.2f",$rpkm;
352
353 if ($piRNA_number != 0 )
354 {
355 $pirna = ( $counthashR->{$k} * 1000000) / $piRNA_number;
356 printf $out "\t%.2f",$pirna;
357 }
358 if ($miRNA_number != 0 )
359 {
360 $mirna = ( $counthashR->{$k} * 1000000) / $miRNA_number;
361 printf $out "\t%.2f",$mirna;
362 }
363 if ($bonafide_number != 0 )
364 {
365 $bonafide = ( $counthashR->{$k} * 1000000) / $bonafide_number;
366 printf $out "\t%.2f",$bonafide;
367 }
368 print $out "\n";
369 }
370 close $out;
371 }
372
373 sub extract_sam
374 {
375 my ( $hashRef, $sam_in, $sam_out, $sam_uni_out, $fastq_out, $fastq_uni_out ) = @_;
376
377 open my $s_in, '<', $sam_in || die "cannot open $sam_in file $!\n";
378
379 open my $f_out, '>', $fastq_out || die "cannot create $fastq_out $!\n";
380 open my $f_uni_out, '>', $fastq_uni_out || die "cannot create $fastq_uni_out $!\n";
381
382 open my $s_out, '>', $sam_out || die "cannot create $sam_out file $!\n" if defined ($hashRef);
383 open my $s_uni_out, '>', $sam_uni_out || die "cannot create $sam_uni_out file $!\n";
384
385 my $sequence = '';
386 while(<$s_in>)
387 {
388 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
389 {
390 print $s_out $_ if defined ($hashRef);
391 print $s_uni_out $_;
392 next;
393 }
394 my @line = split (/\t/,$_);
395 $sequence = $line[0];
396 if ( (! defined ($hashRef) )|| ( exists $hashRef->{$sequence} && $hashRef->{$sequence} == 1 ) )
397 {
398 my $arn = $line[9];
399 if ($line[1] & 16)
400 {
401 $arn =reverse($arn);
402 $arn =~ tr/atgcuATGCU/tacgaTACGA/;
403 }
404 #&& $line[11] eq "XT:A:U" )
405 if ( ( $line[1] == 16 || $line[1] == 0 ) )
406 {
407 print $f_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ;
408 print $s_out $_ if defined ($hashRef);
409 if ( $line[11] eq "XT:A:U" )
410 {
411 print $f_uni_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ;
412 print $s_uni_out $_ ;
413 }
414 }
415 }
416 }
417 close $s_in; close $s_out if defined ($hashRef);
418 close $s_uni_out; close $f_out; close $f_uni_out;
419 }
420
421 sub get_fastq_seq{
422 my $fastq = shift;
423 my %hash; my $cmp = 0;
424
425 open my $fic, '<', $fastq || die "cannot open input file $! \n";
426 while(<$fic>)
427 {
428 chomp $_;
429 $cmp++;
430 if ($cmp % 4 == 1)
431 {
432 die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ );
433 if ($_ =~ /^\@(.*)\s.*/) { $hash{$1} = 1;}
434 elsif ($_ =~ /^\@(.*)/) { $hash{$1} = 1;}
435 }
436 elsif ($cmp % 4 == 3 )
437 {
438 die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/;
439 }
440 }
441 close $fic;
442 return \%hash;
443 }
444
445 1;