changeset 17:8031792a6e2c draft

Uploaded
author romaingred
date Tue, 17 Oct 2017 09:34:09 -0400
parents 2be918bf0efe
children fcaea740c23a
files bin/ppp.pm
diffstat 1 files changed, 68 insertions(+), 37 deletions(-) [+]
line wrap: on
line diff
--- a/bin/ppp.pm	Mon Oct 16 04:11:32 2017 -0400
+++ b/bin/ppp.pm	Tue Oct 17 09:34:09 2017 -0400
@@ -14,19 +14,19 @@
 sub ping_pong_partners
 {
   my ( $TE_fai, $sam, $dir, $max ) = @_;
-  my ( $hashRef, $dupRef ) = count_mapped ( $TE_fai, $sam );
+  my ( $hashRef, $dupRef, $hasPpp ) = count_mapped ( $TE_fai, $sam );
   my ( %num_per_overlap_size, $overlap_number, $reverseR, $begRev, $endRev, $sensR, $begSens, $endSens, $snum, $rnum, $overlap );
-  my ( $SP, $AP, $SN, $AN, $txt); 
+  my ( $SP, $AP, $SN, $AN, $txt );
   my $flag = 0;
   my @distri_overlap = (); my @overlaps_names = ();
 
-  open my $ppp_f, '>', $dir."ppp.txt" || die "cannot create ppp.txt $!\n";   
+  open my $ppp_f, '>', $dir."ppp.txt" || die "cannot create ppp.txt $!\n";
   foreach my $k ( sort keys %{$hashRef} )
   {
     my $v = $hashRef->{$k};
     my $TE_dir = $dir.$k.'/';
 
-    %num_per_overlap_size = (); $overlap_number = 0;      
+    %num_per_overlap_size = (); $overlap_number = 0;
     $flag = 0;
     for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ )
     {
@@ -41,33 +41,22 @@
       {
         $sensR = ${$v->[0]}[$j];
         $begSens = $sensR->[0];
-
         $endSens = $begSens + length($sensR->[1]) - 1;
 
         if ( $begSens <= $endRev && $endSens > $endRev )
         {
           $flag = 1;
-          
           mkdir $TE_dir;
-          open  $SP, '>>', $TE_dir."sensPPP.txt" || die "cannot create sensPPP\n";
-          open  $AP, '>>', $TE_dir."antisensPPP.txt" || die "cannot create antisensPPP\n";      
-          open  $SN, '>>', $TE_dir."sens.txt"  || die "cannot create sens\n";
-          open  $AN, '>>', $TE_dir."antisens.txt"  || die "cannot create antisens\n";
-          open  $txt, '>', $TE_dir.'overlap_size.txt' || die "cannot open repartition\n";      
-          
+          open  $txt, '>', $TE_dir.'overlap_size.txt' || die "cannot open repartition\n";
+
           $overlap = $endRev - $begSens + 1;
           $snum =  $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]};
           $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]};
 
           if ( $overlap == 10 )
           {
-           print $SP ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
-           print $AP ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
-          }
-          else
-          {
-            print $SN ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
-            print $AN ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
+            $hasPpp->{ $sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3] } = 1;
+            $hasPpp->{ $reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3] } = 1;
           }
           next if $overlap > $max;
           if ( $snum < $rnum )
@@ -83,21 +72,56 @@
         }
       }
     }
-
     if ( $max != 0 )
     {
-      my @overlaps = (); 
-      push @overlaps_names, $k;  
+      my @overlaps = ();
+      push @overlaps_names, $k;
       for my $i (1..$max)
-      { 
+      {
         $num_per_overlap_size{$i} = 0 unless exists( $num_per_overlap_size{$i} );
         push @overlaps, $num_per_overlap_size{$i};
       }
       push @distri_overlap, \@overlaps;
-    } 
+    }
 
     if ( $flag == 1 )
     {
+      open  $AP, '>', $TE_dir."antisensPPP.txt" || die "cannot create antisensPPP\n";
+      open  $AN, '>', $TE_dir."antisens.txt"  || die "cannot create antisens\n";
+      for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ )
+      {
+        $reverseR = ${$v->[1]}[$i] ;
+        my $revR = reverse($reverseR->[1]);
+        $revR =~ tr/atgcuATGCU/tacgaTACGA/;
+        $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]};
+        if ( $hasPpp->{ $reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3] } == 1 )
+        {
+          print $AP ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
+        }
+        else
+        {
+          print $AN ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
+        }
+      }
+      close $AP; close $AN;
+
+      open  $SP, '>', $TE_dir."sensPPP.txt" || die "cannot create sensPPP\n";
+      open  $SN, '>', $TE_dir."sens.txt"  || die "cannot create sens\n";
+      for ( my $j = 0; $j <= $#{$v->[0]}; $j++ )
+      {
+        $sensR = ${$v->[0]}[$j];
+        $snum =  $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]};
+        if ( $hasPpp->{ $sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3] } == 1 )
+        {
+          print $SP ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
+        }
+        else
+        {
+          print $SN ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
+        }
+      }
+      close $SP; close $SN;
+
       my $histo_png = $TE_dir.'histogram.png';
       histogram( \%num_per_overlap_size, $histo_png, $overlap_number );
       print $txt "size\tnumber\tpercentage of the total overlap number\n";
@@ -105,12 +129,10 @@
       {
         my $percentage = 0;
         $percentage = $num_per_overlap_size{$k} * 100 / $overlap_number unless $overlap_number == 0;
-        print $txt "$k\t$num_per_overlap_size{$k}\t"; printf $txt "%.2f\n",$percentage;   
-          
+        print $txt "$k\t$num_per_overlap_size{$k}\t"; printf $txt "%.2f\n",$percentage;
       }
       close $txt;
     }
-
   }
 
   foreach my $tabP (  @distri_overlap )
@@ -125,13 +147,13 @@
     $prob =  1 - &Math::CDF::pnorm( $zsc ) if $zsc ne 'NA';
     print $ppp_f (join ("\t", $name, $sum, $ten, $mean, $std, $zsc, $prob ),"\n" );
   }
-  close $ppp_f; 
-} 
+  close $ppp_f;
+}
 
 sub count_mapped
 {
   my ( $fai, $in_file ) = @_;
-  my ( %mapped, %dup );
+  my ( %mapped, %dup, %has_ppp );
 
   open my $f, '<', $fai || die "cannot open $fai $! \n";
   while(<$f>)
@@ -139,7 +161,7 @@
     if ($_ =~ /(.*)\t(\d+)\n/)
     {
       $mapped{$1} = [];
-      $mapped{$1}->[0] = []; $mapped{$1}->[1] = [];        
+      $mapped{$1}->[0] = []; $mapped{$1}->[1] = [];
     }
   }
   close $f;
@@ -148,22 +170,30 @@
   while(<$infile>)
   {
     unless ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
-    {    
+    {
       my @line = split (/\t/,$_);
       if ($line[1] == 0)
       {
-        push @{$mapped{$line[2]}->[0]} , [$line[3], $line[9], $line[1],  $line[2]] unless exists ($dup{$line[3].$line[9].$line[1].$line[2]});
+        unless ( exists ($dup{$line[3].$line[9].$line[1].$line[2]}) )
+        {
+          push @{$mapped{$line[2]}->[0]} , [$line[3], $line[9], $line[1],  $line[2]];
+          $has_ppp {$line[3].$line[9].$line[1].$line[2]} = 0;
+        }
         $dup{$line[3].$line[9].$line[1].$line[2]}+=1;
       }
       elsif ($line[1] == 16)
       {
-        push @{$mapped{$line[2]}->[1]} , [$line[3], $line[9], $line[1],  $line[2]] unless exists ($dup{$line[3].$line[9].$line[1].$line[2]});
-        $dup{$line[3].$line[9].$line[1].$line[2]}+=1;
+        unless ( exists ($dup{$line[3].$line[9].$line[1].$line[2]}) )
+        {
+          push @{$mapped{$line[2]}->[1]} , [$line[3], $line[9], $line[1],  $line[2]];
+          $has_ppp{$line[3].$line[9].$line[1].$line[2]} = 0;
+        }
+        $dup{$line[3].$line[9].$line[1].$line[2]}+=1
       }
     }
   }
   close $infile;
-  return (\%mapped, \%dup );
+  return (\%mapped, \%dup, \%has_ppp );
 }
 
 sub sum
@@ -185,7 +215,7 @@
 sub standard_deviation
 {
   my ($arrayref, $mean) =  @_;
-  return sqrt ( mean ( [map $_**2 , @$arrayref ]) - ($mean**2));  
+  return sqrt ( mean ( [map $_**2 , @$arrayref ]) - ($mean**2));
 }
 
 sub z_significance
@@ -197,3 +227,4 @@
 }
 
 1;
+