0
|
1 package ppp;
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5 use FindBin;
|
|
6 use lib $FindBin::Bin;
|
|
7 use Rcall qw ( histogram );
|
|
8 use Math::CDF;
|
|
9
|
|
10 use Exporter;
|
|
11 our @ISA = qw( Exporter );
|
|
12 our @EXPORT_OK = qw( &ping_pong_partners );
|
|
13
|
|
14 sub ping_pong_partners
|
|
15 {
|
|
16 my ( $TE_fai, $sam, $dir, $max ) = @_;
|
|
17 my ( $hashRef, $dupRef ) = count_mapped ( $TE_fai, $sam );
|
|
18 my ( %num_per_overlap_size, $overlap_number, $reverseR, $begRev, $endRev, $sensR, $begSens, $endSens, $snum, $rnum, $overlap );
|
|
19 my ( $SP, $AP, $SN, $AN, $txt);
|
|
20 my $flag = 0;
|
|
21 my @distri_overlap = (); my @overlaps_names = ();
|
|
22
|
|
23 open my $ppp_f, '>', $dir."ppp.txt" || die "cannot create ppp.txt $!\n";
|
|
24 foreach my $k ( sort keys %{$hashRef} )
|
|
25 {
|
|
26 my $v = $hashRef->{$k};
|
|
27 my $TE_dir = $dir.$k.'/';
|
|
28
|
|
29 %num_per_overlap_size = (); $overlap_number = 0;
|
|
30 $flag = 0;
|
|
31 for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ )
|
|
32 {
|
|
33 $reverseR = ${$v->[1]}[$i] ;
|
|
34 $begRev = $reverseR->[0];
|
|
35 $endRev = $begRev + length($reverseR->[1]) - 1;
|
|
36
|
|
37 my $revR = reverse($reverseR->[1]);
|
|
38 $revR =~ tr/atgcuATGCU/tacgaTACGA/;
|
|
39
|
|
40 for ( my $j = 0; $j <= $#{$v->[0]}; $j++ )
|
|
41 {
|
|
42 $sensR = ${$v->[0]}[$j];
|
|
43 $begSens = $sensR->[0];
|
|
44
|
|
45 $endSens = $begSens + length($sensR->[1]) - 1;
|
|
46
|
|
47 if ( $begSens <= $endRev && $endSens > $endRev )
|
|
48 {
|
|
49 $flag = 1;
|
|
50
|
|
51 mkdir $TE_dir;
|
|
52 open $SP, '>>', $TE_dir."sensPPP.txt" || die "cannot create sensPPP\n";
|
|
53 open $AP, '>>', $TE_dir."antisensPPP.txt" || die "cannot create antisensPPP\n";
|
|
54 open $SN, '>>', $TE_dir."sens.txt" || die "cannot create sens\n";
|
|
55 open $AN, '>>', $TE_dir."antisens.txt" || die "cannot create antisens\n";
|
|
56 open $txt, '>', $TE_dir.'overlap_size.txt' || die "cannot open repartition\n";
|
|
57
|
|
58 $overlap = $endRev - $begSens + 1;
|
|
59 $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]};
|
|
60 $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]};
|
|
61
|
|
62 if ( $overlap == 10 )
|
|
63 {
|
|
64 print $SP ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
|
|
65 print $AP ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
|
|
66 }
|
|
67 else
|
|
68 {
|
|
69 print $SN ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
|
|
70 print $AN ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
|
|
71 }
|
|
72 next if $overlap > $max;
|
|
73 if ( $snum < $rnum )
|
|
74 {
|
|
75 $num_per_overlap_size{$overlap} += $snum;
|
|
76 $overlap_number += $snum;
|
|
77 }
|
|
78 else
|
|
79 {
|
|
80 $num_per_overlap_size{$overlap} += $rnum ;
|
|
81 $overlap_number += $rnum ;
|
|
82 }
|
|
83 }
|
|
84 }
|
|
85 }
|
|
86
|
|
87 if ( $max != 0 )
|
|
88 {
|
|
89 my @overlaps = ();
|
|
90 push @overlaps_names, $k;
|
|
91 for my $i (1..$max)
|
|
92 {
|
|
93 $num_per_overlap_size{$i} = 0 unless exists( $num_per_overlap_size{$i} );
|
|
94 push @overlaps, $num_per_overlap_size{$i};
|
|
95 }
|
|
96 push @distri_overlap, \@overlaps;
|
|
97 }
|
|
98
|
|
99 if ( $flag == 1 )
|
|
100 {
|
|
101 my $histo_png = $TE_dir.'histogram.png';
|
|
102 histogram( \%num_per_overlap_size, $histo_png, $overlap_number );
|
|
103 print $txt "size\tnumber\tpercentage of the total overlap number\n";
|
|
104 foreach my $k ( sort {$a <=> $b} keys %num_per_overlap_size )
|
|
105 {
|
|
106 my $percentage = 0;
|
|
107 $percentage = $num_per_overlap_size{$k} * 100 / $overlap_number unless $overlap_number == 0;
|
|
108 print $txt "$k\t$num_per_overlap_size{$k}\t"; printf $txt "%.2f\n",$percentage;
|
|
109
|
|
110 }
|
|
111 close $txt;
|
|
112 }
|
|
113
|
|
114 }
|
|
115
|
|
116 foreach my $tabP ( @distri_overlap )
|
|
117 {
|
|
118 my $sum = sum($tabP);
|
|
119 my $ten = $tabP->[9];
|
|
120 my $mean = mean($tabP);
|
|
121 my $std = standard_deviation($tabP, $mean);
|
|
122 my $zsc = z_significance($ten, $mean, $std);
|
|
123 my $name = shift @overlaps_names;
|
|
124 my $prob = 'NA';
|
|
125 $prob = 1 - &Math::CDF::pnorm( $zsc ) if $zsc ne 'NA';
|
|
126 print $ppp_f (join ("\t", $name, $sum, $ten, $mean, $std, $zsc, $prob ),"\n" );
|
|
127 }
|
|
128 close $ppp_f;
|
|
129 }
|
|
130
|
|
131 sub count_mapped
|
|
132 {
|
|
133 my ( $fai, $in_file ) = @_;
|
|
134 my ( %mapped, %dup );
|
|
135
|
|
136 open my $f, '<', $fai || die "cannot open $fai $! \n";
|
|
137 while(<$f>)
|
|
138 {
|
|
139 if ($_ =~ /(.*)\t(\d+)\n/)
|
|
140 {
|
|
141 $mapped{$1} = [];
|
|
142 $mapped{$1}->[0] = []; $mapped{$1}->[1] = [];
|
|
143 }
|
|
144 }
|
|
145 close $f;
|
|
146
|
|
147 open my $infile, "samtools view $in_file |"|| die "cannot open input file $! \n";
|
|
148 while(<$infile>)
|
|
149 {
|
|
150 unless ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
|
|
151 {
|
|
152 my @line = split (/\t/,$_);
|
|
153 if ($line[1] == 0)
|
|
154 {
|
|
155 push @{$mapped{$line[2]}->[0]} , [$line[3], $line[9], $line[1], $line[2]] unless exists ($dup{$line[3].$line[9].$line[1].$line[2]});
|
|
156 $dup{$line[3].$line[9].$line[1].$line[2]}+=1;
|
|
157 }
|
|
158 elsif ($line[1] == 16)
|
|
159 {
|
|
160 push @{$mapped{$line[2]}->[1]} , [$line[3], $line[9], $line[1], $line[2]] unless exists ($dup{$line[3].$line[9].$line[1].$line[2]});
|
|
161 $dup{$line[3].$line[9].$line[1].$line[2]}+=1;
|
|
162 }
|
|
163 }
|
|
164 }
|
|
165 close $infile;
|
|
166 return (\%mapped, \%dup );
|
|
167 }
|
|
168
|
|
169 sub sum
|
|
170 {
|
|
171 my $arrayref = shift;
|
|
172 my $result = 0;
|
|
173 foreach (@$arrayref) {$result += $_}
|
|
174 return $result;
|
|
175 }
|
|
176
|
|
177 sub mean
|
|
178 {
|
|
179 my $arrayref = shift;
|
|
180 my $result;
|
|
181 foreach (@$arrayref) {$result += $_}
|
|
182 return $result / scalar(@$arrayref);
|
|
183 }
|
|
184
|
|
185 sub standard_deviation
|
|
186 {
|
|
187 my ($arrayref, $mean) = @_;
|
|
188 return sqrt ( mean ( [map $_**2 , @$arrayref ]) - ($mean**2));
|
|
189 }
|
|
190
|
|
191 sub z_significance
|
|
192 {
|
|
193 my ($ten, $mean, $std) = @_;
|
|
194 my $z = 'NA';
|
|
195 $z = (($ten - $mean) / $std) if $std != 0;
|
|
196 return $z;
|
|
197 }
|
|
198
|
|
199 1;
|