comparison bin/ppp.pm @ 0:198009598544 draft

Uploaded
author romaingred
date Wed, 11 Oct 2017 09:57:58 -0400
parents
children 8031792a6e2c
comparison
equal deleted inserted replaced
-1:000000000000 0:198009598544
1 package ppp;
2
3 use strict;
4 use warnings;
5 use FindBin;
6 use lib $FindBin::Bin;
7 use Rcall qw ( histogram );
8 use Math::CDF;
9
10 use Exporter;
11 our @ISA = qw( Exporter );
12 our @EXPORT_OK = qw( &ping_pong_partners );
13
14 sub ping_pong_partners
15 {
16 my ( $TE_fai, $sam, $dir, $max ) = @_;
17 my ( $hashRef, $dupRef ) = count_mapped ( $TE_fai, $sam );
18 my ( %num_per_overlap_size, $overlap_number, $reverseR, $begRev, $endRev, $sensR, $begSens, $endSens, $snum, $rnum, $overlap );
19 my ( $SP, $AP, $SN, $AN, $txt);
20 my $flag = 0;
21 my @distri_overlap = (); my @overlaps_names = ();
22
23 open my $ppp_f, '>', $dir."ppp.txt" || die "cannot create ppp.txt $!\n";
24 foreach my $k ( sort keys %{$hashRef} )
25 {
26 my $v = $hashRef->{$k};
27 my $TE_dir = $dir.$k.'/';
28
29 %num_per_overlap_size = (); $overlap_number = 0;
30 $flag = 0;
31 for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ )
32 {
33 $reverseR = ${$v->[1]}[$i] ;
34 $begRev = $reverseR->[0];
35 $endRev = $begRev + length($reverseR->[1]) - 1;
36
37 my $revR = reverse($reverseR->[1]);
38 $revR =~ tr/atgcuATGCU/tacgaTACGA/;
39
40 for ( my $j = 0; $j <= $#{$v->[0]}; $j++ )
41 {
42 $sensR = ${$v->[0]}[$j];
43 $begSens = $sensR->[0];
44
45 $endSens = $begSens + length($sensR->[1]) - 1;
46
47 if ( $begSens <= $endRev && $endSens > $endRev )
48 {
49 $flag = 1;
50
51 mkdir $TE_dir;
52 open $SP, '>>', $TE_dir."sensPPP.txt" || die "cannot create sensPPP\n";
53 open $AP, '>>', $TE_dir."antisensPPP.txt" || die "cannot create antisensPPP\n";
54 open $SN, '>>', $TE_dir."sens.txt" || die "cannot create sens\n";
55 open $AN, '>>', $TE_dir."antisens.txt" || die "cannot create antisens\n";
56 open $txt, '>', $TE_dir.'overlap_size.txt' || die "cannot open repartition\n";
57
58 $overlap = $endRev - $begSens + 1;
59 $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]};
60 $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]};
61
62 if ( $overlap == 10 )
63 {
64 print $SP ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
65 print $AP ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
66 }
67 else
68 {
69 print $SN ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
70 print $AN ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
71 }
72 next if $overlap > $max;
73 if ( $snum < $rnum )
74 {
75 $num_per_overlap_size{$overlap} += $snum;
76 $overlap_number += $snum;
77 }
78 else
79 {
80 $num_per_overlap_size{$overlap} += $rnum ;
81 $overlap_number += $rnum ;
82 }
83 }
84 }
85 }
86
87 if ( $max != 0 )
88 {
89 my @overlaps = ();
90 push @overlaps_names, $k;
91 for my $i (1..$max)
92 {
93 $num_per_overlap_size{$i} = 0 unless exists( $num_per_overlap_size{$i} );
94 push @overlaps, $num_per_overlap_size{$i};
95 }
96 push @distri_overlap, \@overlaps;
97 }
98
99 if ( $flag == 1 )
100 {
101 my $histo_png = $TE_dir.'histogram.png';
102 histogram( \%num_per_overlap_size, $histo_png, $overlap_number );
103 print $txt "size\tnumber\tpercentage of the total overlap number\n";
104 foreach my $k ( sort {$a <=> $b} keys %num_per_overlap_size )
105 {
106 my $percentage = 0;
107 $percentage = $num_per_overlap_size{$k} * 100 / $overlap_number unless $overlap_number == 0;
108 print $txt "$k\t$num_per_overlap_size{$k}\t"; printf $txt "%.2f\n",$percentage;
109
110 }
111 close $txt;
112 }
113
114 }
115
116 foreach my $tabP ( @distri_overlap )
117 {
118 my $sum = sum($tabP);
119 my $ten = $tabP->[9];
120 my $mean = mean($tabP);
121 my $std = standard_deviation($tabP, $mean);
122 my $zsc = z_significance($ten, $mean, $std);
123 my $name = shift @overlaps_names;
124 my $prob = 'NA';
125 $prob = 1 - &Math::CDF::pnorm( $zsc ) if $zsc ne 'NA';
126 print $ppp_f (join ("\t", $name, $sum, $ten, $mean, $std, $zsc, $prob ),"\n" );
127 }
128 close $ppp_f;
129 }
130
131 sub count_mapped
132 {
133 my ( $fai, $in_file ) = @_;
134 my ( %mapped, %dup );
135
136 open my $f, '<', $fai || die "cannot open $fai $! \n";
137 while(<$f>)
138 {
139 if ($_ =~ /(.*)\t(\d+)\n/)
140 {
141 $mapped{$1} = [];
142 $mapped{$1}->[0] = []; $mapped{$1}->[1] = [];
143 }
144 }
145 close $f;
146
147 open my $infile, "samtools view $in_file |"|| die "cannot open input file $! \n";
148 while(<$infile>)
149 {
150 unless ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
151 {
152 my @line = split (/\t/,$_);
153 if ($line[1] == 0)
154 {
155 push @{$mapped{$line[2]}->[0]} , [$line[3], $line[9], $line[1], $line[2]] unless exists ($dup{$line[3].$line[9].$line[1].$line[2]});
156 $dup{$line[3].$line[9].$line[1].$line[2]}+=1;
157 }
158 elsif ($line[1] == 16)
159 {
160 push @{$mapped{$line[2]}->[1]} , [$line[3], $line[9], $line[1], $line[2]] unless exists ($dup{$line[3].$line[9].$line[1].$line[2]});
161 $dup{$line[3].$line[9].$line[1].$line[2]}+=1;
162 }
163 }
164 }
165 close $infile;
166 return (\%mapped, \%dup );
167 }
168
169 sub sum
170 {
171 my $arrayref = shift;
172 my $result = 0;
173 foreach (@$arrayref) {$result += $_}
174 return $result;
175 }
176
177 sub mean
178 {
179 my $arrayref = shift;
180 my $result;
181 foreach (@$arrayref) {$result += $_}
182 return $result / scalar(@$arrayref);
183 }
184
185 sub standard_deviation
186 {
187 my ($arrayref, $mean) = @_;
188 return sqrt ( mean ( [map $_**2 , @$arrayref ]) - ($mean**2));
189 }
190
191 sub z_significance
192 {
193 my ($ten, $mean, $std) = @_;
194 my $z = 'NA';
195 $z = (($ten - $mean) / $std) if $std != 0;
196 return $z;
197 }
198
199 1;