Mercurial > repos > romaingred > srnapipe
comparison bin/ppp.pm @ 4:bf3c8bf8b819 draft
Deleted selected files
| author | romaingred |
|---|---|
| date | Wed, 13 Dec 2017 10:24:17 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 3:e9c9cee888cf | 4:bf3c8bf8b819 |
|---|---|
| 1 package ppp; | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 use FindBin; | |
| 6 use lib $FindBin::Bin; | |
| 7 use Rcall qw ( histogram ); | |
| 8 use Math::CDF; | |
| 9 | |
| 10 use Exporter; | |
| 11 our @ISA = qw( Exporter ); | |
| 12 our @EXPORT_OK = qw( &ping_pong_partners ); | |
| 13 | |
| 14 sub ping_pong_partners | |
| 15 { | |
| 16 my ( $TE_fai, $sam, $dir, $max ) = @_; | |
| 17 my ( $hashRef, $dupRef, $hasPpp ) = count_mapped ( $TE_fai, $sam ); | |
| 18 my ( %num_per_overlap_size, $overlap_number, $reverseR, $begRev, $endRev, $sensR, $begSens, $endSens, $snum, $rnum, $overlap ); | |
| 19 my ( $SP, $AP, $SN, $AN, $txt ); | |
| 20 my $flag = 0; | |
| 21 my @distri_overlap = (); my @overlaps_names = (); | |
| 22 | |
| 23 open my $ppp_f, '>', $dir."ppp.txt" || die "cannot create ppp.txt $!\n"; | |
| 24 foreach my $k ( sort keys %{$hashRef} ) | |
| 25 { | |
| 26 my $v = $hashRef->{$k}; | |
| 27 my $TE_dir = $dir.$k.'/'; | |
| 28 | |
| 29 %num_per_overlap_size = (); $overlap_number = 0; | |
| 30 $flag = 0; | |
| 31 for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ ) | |
| 32 { | |
| 33 $reverseR = ${$v->[1]}[$i] ; | |
| 34 $begRev = $reverseR->[0]; | |
| 35 $endRev = $begRev + length($reverseR->[1]) - 1; | |
| 36 | |
| 37 my $revR = reverse($reverseR->[1]); | |
| 38 $revR =~ tr/atgcuATGCU/tacgaTACGA/; | |
| 39 | |
| 40 for ( my $j = 0; $j <= $#{$v->[0]}; $j++ ) | |
| 41 { | |
| 42 $sensR = ${$v->[0]}[$j]; | |
| 43 $begSens = $sensR->[0]; | |
| 44 $endSens = $begSens + length($sensR->[1]) - 1; | |
| 45 | |
| 46 if ( $begSens <= $endRev && $endSens > $endRev ) | |
| 47 { | |
| 48 $flag = 1; | |
| 49 mkdir $TE_dir; | |
| 50 open $txt, '>', $TE_dir.'overlap_size.txt' || die "cannot open repartition\n"; | |
| 51 | |
| 52 $overlap = $endRev - $begSens + 1; | |
| 53 $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]}; | |
| 54 $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]}; | |
| 55 | |
| 56 if ( $overlap == 10 ) | |
| 57 { | |
| 58 $hasPpp->{ $sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3] } = 1; | |
| 59 $hasPpp->{ $reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3] } = 1; | |
| 60 } | |
| 61 next if $overlap > $max; | |
| 62 if ( $snum < $rnum ) | |
| 63 { | |
| 64 $num_per_overlap_size{$overlap} += $snum; | |
| 65 $overlap_number += $snum; | |
| 66 } | |
| 67 else | |
| 68 { | |
| 69 $num_per_overlap_size{$overlap} += $rnum ; | |
| 70 $overlap_number += $rnum ; | |
| 71 } | |
| 72 } | |
| 73 } | |
| 74 } | |
| 75 if ( $max != 0 ) | |
| 76 { | |
| 77 my @overlaps = (); | |
| 78 push @overlaps_names, $k; | |
| 79 for my $i (1..$max) | |
| 80 { | |
| 81 $num_per_overlap_size{$i} = 0 unless exists( $num_per_overlap_size{$i} ); | |
| 82 push @overlaps, $num_per_overlap_size{$i}; | |
| 83 } | |
| 84 push @distri_overlap, \@overlaps; | |
| 85 } | |
| 86 | |
| 87 if ( $flag == 1 ) | |
| 88 { | |
| 89 open $AP, '>', $TE_dir."antisensPPP.txt" || die "cannot create antisensPPP\n"; | |
| 90 open $AN, '>', $TE_dir."antisens.txt" || die "cannot create antisens\n"; | |
| 91 for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ ) | |
| 92 { | |
| 93 $reverseR = ${$v->[1]}[$i] ; | |
| 94 my $revR = reverse($reverseR->[1]); | |
| 95 $revR =~ tr/atgcuATGCU/tacgaTACGA/; | |
| 96 $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]}; | |
| 97 if ( $hasPpp->{ $reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3] } == 1 ) | |
| 98 { | |
| 99 print $AP ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n"; | |
| 100 } | |
| 101 else | |
| 102 { | |
| 103 print $AN ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n"; | |
| 104 } | |
| 105 } | |
| 106 close $AP; close $AN; | |
| 107 | |
| 108 open $SP, '>', $TE_dir."sensPPP.txt" || die "cannot create sensPPP\n"; | |
| 109 open $SN, '>', $TE_dir."sens.txt" || die "cannot create sens\n"; | |
| 110 for ( my $j = 0; $j <= $#{$v->[0]}; $j++ ) | |
| 111 { | |
| 112 $sensR = ${$v->[0]}[$j]; | |
| 113 $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]}; | |
| 114 if ( $hasPpp->{ $sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3] } == 1 ) | |
| 115 { | |
| 116 print $SP ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n"; | |
| 117 } | |
| 118 else | |
| 119 { | |
| 120 print $SN ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n"; | |
| 121 } | |
| 122 } | |
| 123 close $SP; close $SN; | |
| 124 | |
| 125 my $histo_png = $TE_dir.'histogram.png'; | |
| 126 histogram( \%num_per_overlap_size, $histo_png, $overlap_number ); | |
| 127 print $txt "size\tnumber\tpercentage of the total overlap number\n"; | |
| 128 foreach my $k ( sort {$a <=> $b} keys %num_per_overlap_size ) | |
| 129 { | |
| 130 my $percentage = 0; | |
| 131 $percentage = $num_per_overlap_size{$k} * 100 / $overlap_number unless $overlap_number == 0; | |
| 132 print $txt "$k\t$num_per_overlap_size{$k}\t"; printf $txt "%.2f\n",$percentage; | |
| 133 } | |
| 134 close $txt; | |
| 135 } | |
| 136 } | |
| 137 | |
| 138 foreach my $tabP ( @distri_overlap ) | |
| 139 { | |
| 140 my $sum = sum($tabP); | |
| 141 my $ten = $tabP->[9]; | |
| 142 my $mean = mean($tabP); | |
| 143 my $std = standard_deviation($tabP, $mean); | |
| 144 my $zsc = z_significance($ten, $mean, $std); | |
| 145 my $name = shift @overlaps_names; | |
| 146 my $prob = 'NA'; | |
| 147 $prob = 1 - &Math::CDF::pnorm( $zsc ) if $zsc ne 'NA'; | |
| 148 print $ppp_f (join ("\t", $name, $sum, $ten, $mean, $std, $zsc, $prob ),"\n" ); | |
| 149 } | |
| 150 close $ppp_f; | |
| 151 } | |
| 152 | |
| 153 sub count_mapped | |
| 154 { | |
| 155 my ( $fai, $in_file ) = @_; | |
| 156 my ( %mapped, %dup, %has_ppp ); | |
| 157 | |
| 158 open my $f, '<', $fai || die "cannot open $fai $! \n"; | |
| 159 while(<$f>) | |
| 160 { | |
| 161 if ($_ =~ /(.*)\t(\d+)\n/) | |
| 162 { | |
| 163 $mapped{$1} = []; | |
| 164 $mapped{$1}->[0] = []; $mapped{$1}->[1] = []; | |
| 165 } | |
| 166 } | |
| 167 close $f; | |
| 168 | |
| 169 open my $infile, "samtools view $in_file |"|| die "cannot open input file $! \n"; | |
| 170 while(<$infile>) | |
| 171 { | |
| 172 unless ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | |
| 173 { | |
| 174 my @line = split (/\t/,$_); | |
| 175 if ($line[1] == 0) | |
| 176 { | |
| 177 unless ( exists ($dup{$line[3].$line[9].$line[1].$line[2]}) ) | |
| 178 { | |
| 179 push @{$mapped{$line[2]}->[0]} , [$line[3], $line[9], $line[1], $line[2]]; | |
| 180 $has_ppp {$line[3].$line[9].$line[1].$line[2]} = 0; | |
| 181 } | |
| 182 $dup{$line[3].$line[9].$line[1].$line[2]}+=1; | |
| 183 } | |
| 184 elsif ($line[1] == 16) | |
| 185 { | |
| 186 unless ( exists ($dup{$line[3].$line[9].$line[1].$line[2]}) ) | |
| 187 { | |
| 188 push @{$mapped{$line[2]}->[1]} , [$line[3], $line[9], $line[1], $line[2]]; | |
| 189 $has_ppp{$line[3].$line[9].$line[1].$line[2]} = 0; | |
| 190 } | |
| 191 $dup{$line[3].$line[9].$line[1].$line[2]}+=1 | |
| 192 } | |
| 193 } | |
| 194 } | |
| 195 close $infile; | |
| 196 return (\%mapped, \%dup, \%has_ppp ); | |
| 197 } | |
| 198 | |
| 199 sub sum | |
| 200 { | |
| 201 my $arrayref = shift; | |
| 202 my $result = 0; | |
| 203 foreach (@$arrayref) {$result += $_} | |
| 204 return $result; | |
| 205 } | |
| 206 | |
| 207 sub mean | |
| 208 { | |
| 209 my $arrayref = shift; | |
| 210 my $result; | |
| 211 foreach (@$arrayref) {$result += $_} | |
| 212 return $result / scalar(@$arrayref); | |
| 213 } | |
| 214 | |
| 215 sub standard_deviation | |
| 216 { | |
| 217 my ($arrayref, $mean) = @_; | |
| 218 return sqrt ( mean ( [map $_**2 , @$arrayref ]) - ($mean**2)); | |
| 219 } | |
| 220 | |
| 221 sub z_significance | |
| 222 { | |
| 223 my ($ten, $mean, $std) = @_; | |
| 224 my $z = 'NA'; | |
| 225 $z = (($ten - $mean) / $std) if $std != 0; | |
| 226 return $z; | |
| 227 } | |
| 228 | |
| 229 1; | |
| 230 |
