Previous changeset 50:d9059a181872 (2017-12-01) Next changeset 52:945d903dad5a (2017-12-01) |
Commit message:
Uploaded |
added:
align.pm |
b |
diff -r d9059a181872 -r 6bd3e935d32b align.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/align.pm Fri Dec 01 09:33:22 2017 -0500 |
[ |
b'@@ -0,0 +1,456 @@\n+package align;\n+\n+use strict;\n+use warnings;\n+use File::Basename;\n+use String::Random;\n+\n+use FindBin;\n+use lib $FindBin::Bin;\n+use Rcall qw ( histogram );\n+\n+use Exporter;\n+our @ISA = qw( Exporter );\n+our @EXPORT = qw( &BWA_call &to_build &get_unique &sam_sorted_bam &get_hash_alignment &sam_to_bam_bg &sam_count &sam_count_mis &rpms_rpkm &get_fastq_seq &extract_sam );\n+\n+sub to_build\n+{\n+ my ( $toBuildTabP, $log, $newdir ) = @_;\n+\n+ foreach my $pairs ( @{ $toBuildTabP } ) \n+ {\n+ if ( $pairs->[0] == 1 ) \n+ { \n+ my $sym = $newdir.basename(${$pairs->[1]}).\'_symlink.fa\';\n+ symlink( ${$pairs->[1]}, $sym );\n+ ${$pairs->[1]} = $sym;\n+ build_index ( ${$pairs->[1]}, $log );\n+ } \n+ }\n+}\n+\n+sub build_index\n+{\n+\tmy $to_index = shift;\n+\tmy $log = shift;\n+\tmy $index_log = $to_index.\'_index.err\';\n+\t`bwa index $to_index 2> $index_log`;\n+\tprint $log "Creating index for $to_index\\n";\n+}\n+\n+sub get_unique\n+{\n+\tmy ( $sam, $s_uni, $out_prefix, $col_prefix, $details, $report ) = @_;\n+\n+\tmy $fout = $col_prefix.\'_all_mappers.fastq\'; \n+\tmy $funi = $col_prefix.\'_unique_mappers.fastq\';\n+\tmy $frej = $col_prefix.\'_unmapped.fastq\';\n+\t\n+ my $repartition = $out_prefix.\'distribution.txt\';\n+\tmy $png_rep = $out_prefix.\'distribution.png\';\n+\tmy ( %duplicates, %genome_hits) ;\n+\n+\t#alignement to the first reference\n+\tmy @return = sam_parse( $sam, $fout, $funi, $frej, $s_uni, \\%duplicates, \\%genome_hits, $report );\n+\tmy $ref_fai = $return[4];\n+\tmy $mappers = $return[5];\n+\tmy $mappers_uni = $return[6];\n+\tmy $size_mappedHashR = $return[7];\n+\n+\tif ( $details == 1 )\n+\t{\n+\t\t#print number of duplicates and hits number\n+\t\tmy ($pourcentage, $total) =(0,0);\n+\n+\t\t$total += $_ foreach values %{$size_mappedHashR};\n+\t\topen (my $rep, \'>\'.$repartition) || die "cannot create $repartition $!\\n";\n+\t\tprint $rep "size\\tnumber\\tpercentage\\n";\n+\t\tforeach my $k (sort{$a cmp $b} keys (%{$size_mappedHashR}))\n+\t\t{\n+\t\t\t$pourcentage = 0;\n+\t\t\t$pourcentage = $size_mappedHashR->{$k} / $total * 100 unless $total ==0;\n+\n+\t\t\tprint $rep "$k\\t$size_mappedHashR->{$k}\\t";\n+\t\t\tprintf $rep "%.2f\\n",$pourcentage;\n+\t\t}\n+\n+\t\thistogram($size_mappedHashR, $png_rep, $total);\n+\n+\n+\t\tmy $dup = $out_prefix.\'dup_mapnum.txt\';\n+\t\tmy $dup_u = $out_prefix .\'dup_unique.txt\';\n+\t\tmy $dup_r = $out_prefix .\'dup_nonmapp.txt\';\n+\t\topen(my $tab,">".$dup) || die "cannot open output txt file\\n";\n+\t\topen(my $tab_r,">".$dup_r) || die "cannot open output txt file\\n";\n+\t\topen(my $tab_u,">".$dup_u) || die "cannot open output txt file\\n";\n+\t\tprint $tab "sequence\\tcount\\tmapnum\\n";\n+\t\tprint $tab_u "sequence\\tcount\\n";\n+\t\tprint $tab_r "sequence\\tcount\\n";\n+\t\tforeach my $k (sort {$duplicates{$b} <=> $duplicates{$a}}keys %duplicates)\n+\t\t{\n+\t\t\t$duplicates{$k} = 0 unless exists($duplicates{$k});\n+\t\t\t$genome_hits{$k} = 0 unless exists($genome_hits{$k});\n+\t\t\tif ($genome_hits{$k} != 0) { print $tab $k."\\t".$duplicates{$k}."\\t".$genome_hits{$k}."\\n"; }\n+\t\t\telse {print $tab_r $k."\\t".$duplicates{$k}."\\n";}\n+\t\t\tif ($genome_hits{$k} == 1) { print $tab_u $k."\\t".$duplicates{$k}."\\n"; }\n+\t\t}\n+\tclose $dup; close $dup_r; close $dup_u;\n+\t}\n+\treturn ( $ref_fai, $mappers, $mappers_uni );\n+}\n+\n+sub sam_parse\n+{\n+\tmy ( $sam, $fastq_accepted, $fastq_accepted_unique, $fastq_rejected, $sam_unique, $duplicate_hashR, $best_hit_number_hashR, $report ) = @_ ;\n+\tmy ($reads, $mappers, $mappersUnique, @garbage, %size_num, %size_num_spe, %number, %numberSens, %numberReverse, %unique_number, %numberNM, %numberM, %size);\n+\t$mappers = $mappersUnique = $reads = 0;\n+\n+\topen my $fic, \'<\', $sam || die "cannot open $sam $!\\n";\n+\topen my $accepted, \'>\', $fastq_accepted || die "cannot create $fastq_accepted $! \\n";\n+\topen my $unique, \'>\', $fastq_accepted_unique || die "cannot create $fastq_accepted_unique $! \\n";\n+\topen my $rejected, \'>\', $fastq_rejected || die "cannot create $fastq_rejected $! \\n";\n+\topen my $sam_uni, \'>\', $sam_unique || die "cannot create $sam_unique $! \\n";\n+\tmy $sequence = \'\';\n+\twhile(<$fic>)'..b'rt -O BAM --threads $number_of_cpus /dev/stdin 2> $sort_err > $bam_sorted`;\n+}\n+\n+sub BWA_call\n+{\n+\tmy ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ) = @_;\n+\tmy ( $aln_err, $samse_err, $seq_num ) = ( $sam.\'_aln.err\', $sam.\'_samse.err\', 0 );\n+\tprint $report "-----------------------------\\n";\n+\tprint $report "bwa aln -t $number_of_cpus -n $mismatches $index $fastq 2> $aln_err | bwa samse $index /dev/stdin $fastq 2> $samse_err > $sam\\n";\n+\t`bwa aln -t $number_of_cpus -n $mismatches $index $fastq 2> $aln_err | bwa samse $index /dev/stdin $fastq 2> $samse_err > $sam `;\n+}\n+\n+sub rpms_rpkm\n+{\n+ my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_;\n+ open(my $out, ">".$out_file) || die "cannot open normalized file $! \\n";\n+ print $out "ID\\treads counts\\tRPKM";\n+ print $out "\\tper million of piRNAs" if ($piRNA_number != 0);\n+ print $out "\\tper million of miRNAs" if ($miRNA_number != 0);\n+ print $out "\\tper million of bonafide reads" if ($bonafide_number != 0);\n+ print $out "\\n";\n+ foreach my $k ( sort keys %{$counthashR} )\n+ {\n+ my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0);\n+ \n+ $rpkm = ( $counthashR->{$k} * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ;\n+ print $out $k."\\t".$counthashR->{$k}."\\t"; printf $out "%.2f",$rpkm;\n+ \n+ if ($piRNA_number != 0 )\n+ {\n+ $pirna = ( $counthashR->{$k} * 1000000) / $piRNA_number;\n+ printf $out "\\t%.2f",$pirna;\n+ }\n+ if ($miRNA_number != 0 )\n+ {\n+ $mirna = ( $counthashR->{$k} * 1000000) / $miRNA_number;\n+ printf $out "\\t%.2f",$mirna;\n+ }\n+ if ($bonafide_number != 0 )\n+ {\n+ $bonafide = ( $counthashR->{$k} * 1000000) / $bonafide_number;\n+ printf $out "\\t%.2f",$bonafide;\n+ }\n+ print $out "\\n";\n+ \t}\n+ close $out;\n+}\n+\n+sub extract_sam\n+{\n+ my ( $hashRef, $sam_in, $sam_out, $sam_uni_out, $fastq_out, $fastq_uni_out ) = @_;\n+ \n+\topen my $s_in, \'<\', $sam_in || die "cannot open $sam_in file $!\\n";\n+\n+ open my $f_out, \'>\', $fastq_out || die "cannot create $fastq_out $!\\n"; \n+ open my $f_uni_out, \'>\', $fastq_uni_out || die "cannot create $fastq_uni_out $!\\n"; \n+ \n+ open my $s_out, \'>\', $sam_out || die "cannot create $sam_out file $!\\n" if defined ($hashRef);\n+ open my $s_uni_out, \'>\', $sam_uni_out || die "cannot create $sam_uni_out file $!\\n";\n+\n+ my $sequence = \'\';\n+ while(<$s_in>)\n+ {\n+ if ($_ =~ /^\\@[A-Za-z][A-Za-z](\\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\\@CO\\t.*/ )\n+ {\n+ print $s_out $_ if defined ($hashRef);\n+ print $s_uni_out $_;\n+ next;\n+ }\n+ my @line = split (/\\t/,$_);\n+ $sequence = $line[0];\n+ if ( (! defined ($hashRef) )|| ( exists $hashRef->{$sequence} && $hashRef->{$sequence} == 1 ) )\n+ {\n+ my $arn = $line[9];\n+ if ($line[1] & 16)\n+ {\n+ $arn =reverse($arn);\n+ $arn =~ tr/atgcuATGCU/tacgaTACGA/;\n+ }\n+\n+ if ( ( $line[1] == 16 || $line[1] == 0 ) )\n+\t\t\t{\n+ \tprint $f_out "\\@".$line[0]."\\n".$arn."\\n+\\n".$line[10]."\\n" ;\n+ \tprint $s_out $_ if defined ($hashRef);\n+ \tif ( $line[11] eq "XT:A:U" )\n+ \t{\n+ \t\tprint $f_uni_out "\\@".$line[0]."\\n".$arn."\\n+\\n".$line[10]."\\n" ;\n+ \t\tprint $s_uni_out $_ ;\n+ \t}\n+ }\n+ } \n+ }\n+ close $s_in; close $s_out if defined ($hashRef); \n+ close $s_uni_out; close $f_out; close $f_uni_out;\n+}\n+\n+sub get_fastq_seq{\n+ my $fastq = shift;\n+ my %hash; my $cmp = 0; \n+\n+ open my $fic, \'<\', $fastq || die "cannot open input file $! \\n";\n+ while(<$fic>)\n+ {\n+ chomp $_;\n+ $cmp++;\n+ if ($cmp % 4 == 1)\n+ {\n+ die "file do not contain a @ at line $cmp\\n" unless ($_ =~ /^\\@/ );\n+ if ($_ =~ /^\\@(.*)\\s.*/) { $hash{$1} = 1;}\n+ elsif ($_ =~ /^\\@(.*)/) { $hash{$1} = 1;}\n+ }\n+ elsif ($cmp % 4 == 3 )\n+ {\n+ die "file do not contain a + at line $cmp\\n" unless $_ =~ /^\\+/;\n+ }\n+ }\n+ close $fic;\n+ return \\%hash;\n+}\n+\n+1;\n' |