Galaxy |

Changeset 27:c8090766c3f5 (2018-05-20)

Previous changeset 26:b3be854a4273 (2018-05-20) Next changeset 28:8d8357dee6b3 (2018-05-20)

Commit message:
Uploaded

modified:
bin/align.pm

diff -r b3be854a4273 -r c8090766c3f5 bin/align.pm
--- a/bin/align.pm Sun May 20 16:55:31 2018 -0400
+++ b/bin/align.pm Sun May 20 17:46:23 2018 -0400

[

b'@@ -1,539 +1,552 @@\n-package align;\n-\n-use strict;\n-use warnings;\n-use File::Basename;\n-use String::Random;\n-\n-use FindBin;\n-use lib $FindBin::Bin;\n-use Rcall qw ( histogram );\n-\n-use Exporter;\n-our @ISA = qw( Exporter );\n-our @EXPORT = qw( &rpms_rpkm_te &BWA_call &to_build &get_unique &sam_sorted_bam &get_hash_alignment &sam_to_bam_bg &sam_count &sam_count_mis &rpms_rpkm &get_fastq_seq &extract_sam );\n-\n-sub to_build\n-{\n- my ( $toBuildTabP, $log, $newdir ) = @_;\n-\n- foreach my $pairs ( @{ $toBuildTabP } ) \n- {\n- if ( $pairs->[0] == 1 ) \n- { \n- my $sym = $newdir.basename(${$pairs->[1]}).\'_symlink.fa\';\n- symlink( ${$pairs->[1]}, $sym );\n- ${$pairs->[1]} = $sym;\n- build_index ( ${$pairs->[1]}, $log );\n- } \n- }\n-}\n-\n-sub build_index\n-{\n-\tmy $to_index = shift;\n-\tmy $log = shift;\n-\tmy $index_log = $to_index.\'_index.err\';\n-\t`bwa index $to_index 2> $index_log`;\n-\tprint $log "Creating index for $to_index\\n";\n-}\n-\n-sub get_unique\n-{\n-\tmy ( $sam, $s_uni, $out_prefix, $col_prefix, $details, $report ) = @_;\n-\n-\tmy $fout = $col_prefix.\'_all_mappers.fastq\'; \n-\tmy $funi = $col_prefix.\'_unique_mappers.fastq\';\n-\tmy $frej = $col_prefix.\'_unmapped.fastq\';\n-\t\n- my $repartition = $out_prefix.\'distribution.txt\';\n-\tmy $png_rep = $out_prefix.\'distribution.png\';\n-\tmy ( %duplicates, %genome_hits) ;\n-\n-\t#alignement to the first reference\n-\tmy @return = sam_parse( $sam, $fout, $funi, $frej, $s_uni, \\%duplicates, \\%genome_hits, $report );\n-\tmy $ref_fai = $return[4];\n-\tmy $mappers = $return[5];\n-\tmy $mappers_uni = $return[6];\n-\tmy $size_mappedHashR = $return[7];\n-\n-\tif ( $details == 1 )\n-\t{\n-\t\t#print number of duplicates and hits number\n-\t\tmy ($pourcentage, $total) =(0,0);\n-\n-\t\t$total += $_ foreach values %{$size_mappedHashR};\n-\t\topen (my $rep, \'>\'.$repartition) || die "cannot create $repartition $!\\n";\n-\t\tprint $rep "size\\tnumber\\tpercentage\\n";\n-\t\tforeach my $k (sort{$a cmp $b} keys (%{$size_mappedHashR}))\n-\t\t{\n-\t\t\t$pourcentage = 0;\n-\t\t\t$pourcentage = $size_mappedHashR->{$k} / $total * 100 unless $total ==0;\n-\n-\t\t\tprint $rep "$k\\t$size_mappedHashR->{$k}\\t";\n-\t\t\tprintf $rep "%.2f\\n",$pourcentage;\n-\t\t}\n-\n-\t\thistogram($size_mappedHashR, $png_rep, $total);\n-\n-\n-\t\tmy $dup = $out_prefix.\'dup_mapnum.txt\';\n-\t\tmy $dup_u = $out_prefix .\'dup_unique.txt\';\n-\t\tmy $dup_r = $out_prefix .\'dup_nonmapp.txt\';\n-\t\topen(my $tab,">".$dup) || die "cannot open output txt file\\n";\n-\t\topen(my $tab_r,">".$dup_r) || die "cannot open output txt file\\n";\n-\t\topen(my $tab_u,">".$dup_u) || die "cannot open output txt file\\n";\n-\t\tprint $tab "sequence\\tcount\\tmapnum\\n";\n-\t\tprint $tab_u "sequence\\tcount\\n";\n-\t\tprint $tab_r "sequence\\tcount\\n";\n-\t\tforeach my $k (sort {$duplicates{$b} <=> $duplicates{$a}}keys %duplicates)\n-\t\t{\n-\t\t\t$duplicates{$k} = 0 unless exists($duplicates{$k});\n-\t\t\t$genome_hits{$k} = 0 unless exists($genome_hits{$k});\n-\t\t\tif ($genome_hits{$k} != 0) { print $tab $k."\\t".$duplicates{$k}."\\t".$genome_hits{$k}."\\n"; }\n-\t\t\telse {print $tab_r $k."\\t".$duplicates{$k}."\\n";}\n-\t\t\tif ($genome_hits{$k} == 1) { print $tab_u $k."\\t".$duplicates{$k}."\\n"; }\n-\t\t}\n-\tclose $dup; close $dup_r; close $dup_u;\n-\t}\n-\treturn ( $ref_fai, $mappers, $mappers_uni );\n-}\n-\n-sub sam_parse\n-{\n-\tmy ( $sam, $fastq_accepted, $fastq_accepted_unique, $fastq_rejected, $sam_unique, $duplicate_hashR, $best_hit_number_hashR, $report ) = @_ ;\n-\tmy ($reads, $mappers, $mappersUnique, @garbage, %size_num, %size_num_spe, %number, %numberSens, %numberReverse, %unique_number, %numberNM, %numberM, %size);\n-\t$mappers = $mappersUnique = $reads = 0;\n-\n-\topen my $fic, \'<\', $sam || die "cannot open $sam $!\\n";\n-\topen my $accepted, \'>\', $fastq_accepted || die "cannot create $fastq_accepted $! \\n";\n-\topen my $unique, \'>\', $fastq_accepted_unique || die "cannot create $fastq_accepted_unique $! \\n";\n-\topen my $rejected, \'>\', $fastq_rejected || die "cannot create $fastq_rejected $! \\n";\n-\topen my $sam_uni, \'>\', $sam_unique || die "cannot create $sam_unique $! \\n";\n-\tmy $sequence = \'\';'..b'astq, $sam, $mismatches, $number_of_cpus, $report ) = @_;\r\n+\tmy ( $aln_err, $samse_err, $seq_num ) = ( $sam.\'_aln.err\', $sam.\'_samse.err\', 0 );\r\n+\tprint $report "-----------------------------\\n";\r\n+\tprint $report "bwa aln -t $number_of_cpus -n $mismatches $index $fastq 2> $aln_err | bwa samse $index /dev/stdin $fastq 2> $samse_err > $sam\\n";\r\n+\t`bwa aln -t $number_of_cpus -n $mismatches $index $fastq 2> $aln_err | bwa samse $index /dev/stdin $fastq 2> $samse_err > $sam `;\r\n+}\r\n+\r\n+sub rpms_rpkm\r\n+{\r\n+ my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_;\r\n+ open(my $out, ">".$out_file) || die "cannot open normalized file $! \\n";\r\n+ print $out "ID\\treads counts\\tRPKM";\r\n+ print $out "\\tper million of piRNAs" if ($piRNA_number != 0);\r\n+ print $out "\\tper million of miRNAs" if ($miRNA_number != 0);\r\n+ print $out "\\tper million of bonafide reads" if ($bonafide_number != 0);\r\n+ print $out "\\n";\r\n+ foreach my $k ( sort keys %{$counthashR} )\r\n+ {\r\n+ my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0);\r\n+ \r\n+ $rpkm = ( $counthashR->{$k} * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ;\r\n+ print $out $k."\\t".$counthashR->{$k}."\\t"; printf $out "%.2f",$rpkm;\r\n+ \r\n+ if ($piRNA_number != 0 )\r\n+ {\r\n+ $pirna = ( $counthashR->{$k} * 1000000) / $piRNA_number;\r\n+ printf $out "\\t%.2f",$pirna;\r\n+ }\r\n+ if ($miRNA_number != 0 )\r\n+ {\r\n+ $mirna = ( $counthashR->{$k} * 1000000) / $miRNA_number;\r\n+ printf $out "\\t%.2f",$mirna;\r\n+ }\r\n+ if ($bonafide_number != 0 )\r\n+ {\r\n+ $bonafide = ( $counthashR->{$k} * 1000000) / $bonafide_number;\r\n+ printf $out "\\t%.2f",$bonafide;\r\n+ }\r\n+ print $out "\\n";\r\n+ \t}\r\n+ close $out;\r\n+}\r\n+\r\n+sub extract_sam\r\n+{\r\n+ my ( $hashRef, $sam_in, $sam_out, $sam_uni_out, $fastq_out, $fastq_uni_out ) = @_;\r\n+ \r\n+\topen my $s_in, \'<\', $sam_in || die "cannot open $sam_in file $!\\n";\r\n+\r\n+ open my $f_out, \'>\', $fastq_out || die "cannot create $fastq_out $!\\n"; \r\n+ open my $f_uni_out, \'>\', $fastq_uni_out || die "cannot create $fastq_uni_out $!\\n"; \r\n+ \r\n+ open my $s_out, \'>\', $sam_out || die "cannot create $sam_out file $!\\n" if defined ($hashRef);\r\n+ open my $s_uni_out, \'>\', $sam_uni_out || die "cannot create $sam_uni_out file $!\\n";\r\n+\r\n+ my $sequence = \'\';\r\n+ while(<$s_in>)\r\n+ {\r\n+ if ($_ =~ /^\\@[A-Za-z][A-Za-z](\\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\\@CO\\t.*/ )\r\n+ {\r\n+ print $s_out $_ if defined ($hashRef);\r\n+ print $s_uni_out $_;\r\n+ next;\r\n+ }\r\n+ my @line = split (/\\t/,$_);\r\n+ $sequence = $line[0];\r\n+ if ( (! defined ($hashRef) )|| ( exists $hashRef->{$sequence} && $hashRef->{$sequence} == 1 ) )\r\n+ {\r\n+ my $arn = $line[9];\r\n+ if ($line[1] & 16)\r\n+ {\r\n+ $arn =reverse($arn);\r\n+ $arn =~ tr/atgcuATGCU/tacgaTACGA/;\r\n+ }\r\n+\r\n+ if ( ( $line[1] == 16 || $line[1] == 0 ) )\r\n+\t\t\t{\r\n+ \tprint $f_out "\\@".$line[0]."\\n".$arn."\\n+\\n".$line[10]."\\n" ;\r\n+ \tprint $s_out $_ if defined ($hashRef);\r\n+ \tif ( $line[11] eq "XT:A:U" )\r\n+ \t{\r\n+ \t\tprint $f_uni_out "\\@".$line[0]."\\n".$arn."\\n+\\n".$line[10]."\\n" ;\r\n+ \t\tprint $s_uni_out $_ ;\r\n+ \t}\r\n+ }\r\n+ } \r\n+ }\r\n+ close $s_in; close $s_out if defined ($hashRef); \r\n+ close $s_uni_out; close $f_out; close $f_uni_out;\r\n+}\r\n+\r\n+sub get_fastq_seq\r\n+{\r\n+ my $fastq = shift;\r\n+ my %hash; my $cmp = 0; \r\n+\r\n+ open my $fic, \'<\', $fastq || die "cannot open input file $! \\n";\r\n+ while(<$fic>)\r\n+ {\r\n+ chomp $_;\r\n+ $cmp++;\r\n+ if ($cmp % 4 == 1)\r\n+ {\r\n+ die "file do not contain a @ at line $cmp\\n" unless ($_ =~ /^\\@/ );\r\n+ if ($_ =~ /^\\@(.*)\\s.*/) { $hash{$1} = 1;}\r\n+ elsif ($_ =~ /^\\@(.*)/) { $hash{$1} = 1;}\r\n+ }\r\n+ elsif ($cmp % 4 == 3 )\r\n+ {\r\n+ die "file do not contain a + at line $cmp\\n" unless $_ =~ /^\\+/;\r\n+ }\r\n+ }\r\n+ close $fic;\r\n+ return \\%hash;\r\n+}\r\n+\r\n+1;\r\n'