Mercurial > repos > romaingred > pirna_pipeline
diff bin/ppp.pm @ 17:8031792a6e2c draft
Uploaded
author | romaingred |
---|---|
date | Tue, 17 Oct 2017 09:34:09 -0400 |
parents | 198009598544 |
children |
line wrap: on
line diff
--- a/bin/ppp.pm Mon Oct 16 04:11:32 2017 -0400 +++ b/bin/ppp.pm Tue Oct 17 09:34:09 2017 -0400 @@ -14,19 +14,19 @@ sub ping_pong_partners { my ( $TE_fai, $sam, $dir, $max ) = @_; - my ( $hashRef, $dupRef ) = count_mapped ( $TE_fai, $sam ); + my ( $hashRef, $dupRef, $hasPpp ) = count_mapped ( $TE_fai, $sam ); my ( %num_per_overlap_size, $overlap_number, $reverseR, $begRev, $endRev, $sensR, $begSens, $endSens, $snum, $rnum, $overlap ); - my ( $SP, $AP, $SN, $AN, $txt); + my ( $SP, $AP, $SN, $AN, $txt ); my $flag = 0; my @distri_overlap = (); my @overlaps_names = (); - open my $ppp_f, '>', $dir."ppp.txt" || die "cannot create ppp.txt $!\n"; + open my $ppp_f, '>', $dir."ppp.txt" || die "cannot create ppp.txt $!\n"; foreach my $k ( sort keys %{$hashRef} ) { my $v = $hashRef->{$k}; my $TE_dir = $dir.$k.'/'; - %num_per_overlap_size = (); $overlap_number = 0; + %num_per_overlap_size = (); $overlap_number = 0; $flag = 0; for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ ) { @@ -41,33 +41,22 @@ { $sensR = ${$v->[0]}[$j]; $begSens = $sensR->[0]; - $endSens = $begSens + length($sensR->[1]) - 1; if ( $begSens <= $endRev && $endSens > $endRev ) { $flag = 1; - mkdir $TE_dir; - open $SP, '>>', $TE_dir."sensPPP.txt" || die "cannot create sensPPP\n"; - open $AP, '>>', $TE_dir."antisensPPP.txt" || die "cannot create antisensPPP\n"; - open $SN, '>>', $TE_dir."sens.txt" || die "cannot create sens\n"; - open $AN, '>>', $TE_dir."antisens.txt" || die "cannot create antisens\n"; - open $txt, '>', $TE_dir.'overlap_size.txt' || die "cannot open repartition\n"; - + open $txt, '>', $TE_dir.'overlap_size.txt' || die "cannot open repartition\n"; + $overlap = $endRev - $begSens + 1; $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]}; $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]}; if ( $overlap == 10 ) { - print $SP ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n"; - print $AP ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n"; - } - else - { - print $SN ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n"; - print $AN ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n"; + $hasPpp->{ $sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3] } = 1; + $hasPpp->{ $reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3] } = 1; } next if $overlap > $max; if ( $snum < $rnum ) @@ -83,21 +72,56 @@ } } } - if ( $max != 0 ) { - my @overlaps = (); - push @overlaps_names, $k; + my @overlaps = (); + push @overlaps_names, $k; for my $i (1..$max) - { + { $num_per_overlap_size{$i} = 0 unless exists( $num_per_overlap_size{$i} ); push @overlaps, $num_per_overlap_size{$i}; } push @distri_overlap, \@overlaps; - } + } if ( $flag == 1 ) { + open $AP, '>', $TE_dir."antisensPPP.txt" || die "cannot create antisensPPP\n"; + open $AN, '>', $TE_dir."antisens.txt" || die "cannot create antisens\n"; + for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ ) + { + $reverseR = ${$v->[1]}[$i] ; + my $revR = reverse($reverseR->[1]); + $revR =~ tr/atgcuATGCU/tacgaTACGA/; + $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]}; + if ( $hasPpp->{ $reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3] } == 1 ) + { + print $AP ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n"; + } + else + { + print $AN ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n"; + } + } + close $AP; close $AN; + + open $SP, '>', $TE_dir."sensPPP.txt" || die "cannot create sensPPP\n"; + open $SN, '>', $TE_dir."sens.txt" || die "cannot create sens\n"; + for ( my $j = 0; $j <= $#{$v->[0]}; $j++ ) + { + $sensR = ${$v->[0]}[$j]; + $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]}; + if ( $hasPpp->{ $sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3] } == 1 ) + { + print $SP ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n"; + } + else + { + print $SN ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n"; + } + } + close $SP; close $SN; + my $histo_png = $TE_dir.'histogram.png'; histogram( \%num_per_overlap_size, $histo_png, $overlap_number ); print $txt "size\tnumber\tpercentage of the total overlap number\n"; @@ -105,12 +129,10 @@ { my $percentage = 0; $percentage = $num_per_overlap_size{$k} * 100 / $overlap_number unless $overlap_number == 0; - print $txt "$k\t$num_per_overlap_size{$k}\t"; printf $txt "%.2f\n",$percentage; - + print $txt "$k\t$num_per_overlap_size{$k}\t"; printf $txt "%.2f\n",$percentage; } close $txt; } - } foreach my $tabP ( @distri_overlap ) @@ -125,13 +147,13 @@ $prob = 1 - &Math::CDF::pnorm( $zsc ) if $zsc ne 'NA'; print $ppp_f (join ("\t", $name, $sum, $ten, $mean, $std, $zsc, $prob ),"\n" ); } - close $ppp_f; -} + close $ppp_f; +} sub count_mapped { my ( $fai, $in_file ) = @_; - my ( %mapped, %dup ); + my ( %mapped, %dup, %has_ppp ); open my $f, '<', $fai || die "cannot open $fai $! \n"; while(<$f>) @@ -139,7 +161,7 @@ if ($_ =~ /(.*)\t(\d+)\n/) { $mapped{$1} = []; - $mapped{$1}->[0] = []; $mapped{$1}->[1] = []; + $mapped{$1}->[0] = []; $mapped{$1}->[1] = []; } } close $f; @@ -148,22 +170,30 @@ while(<$infile>) { unless ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) - { + { my @line = split (/\t/,$_); if ($line[1] == 0) { - push @{$mapped{$line[2]}->[0]} , [$line[3], $line[9], $line[1], $line[2]] unless exists ($dup{$line[3].$line[9].$line[1].$line[2]}); + unless ( exists ($dup{$line[3].$line[9].$line[1].$line[2]}) ) + { + push @{$mapped{$line[2]}->[0]} , [$line[3], $line[9], $line[1], $line[2]]; + $has_ppp {$line[3].$line[9].$line[1].$line[2]} = 0; + } $dup{$line[3].$line[9].$line[1].$line[2]}+=1; } elsif ($line[1] == 16) { - push @{$mapped{$line[2]}->[1]} , [$line[3], $line[9], $line[1], $line[2]] unless exists ($dup{$line[3].$line[9].$line[1].$line[2]}); - $dup{$line[3].$line[9].$line[1].$line[2]}+=1; + unless ( exists ($dup{$line[3].$line[9].$line[1].$line[2]}) ) + { + push @{$mapped{$line[2]}->[1]} , [$line[3], $line[9], $line[1], $line[2]]; + $has_ppp{$line[3].$line[9].$line[1].$line[2]} = 0; + } + $dup{$line[3].$line[9].$line[1].$line[2]}+=1 } } } close $infile; - return (\%mapped, \%dup ); + return (\%mapped, \%dup, \%has_ppp ); } sub sum @@ -185,7 +215,7 @@ sub standard_deviation { my ($arrayref, $mean) = @_; - return sqrt ( mean ( [map $_**2 , @$arrayref ]) - ($mean**2)); + return sqrt ( mean ( [map $_**2 , @$arrayref ]) - ($mean**2)); } sub z_significance @@ -197,3 +227,4 @@ } 1; +