comparison bin/ppp.pm @ 17:8031792a6e2c draft

Uploaded
author romaingred
date Tue, 17 Oct 2017 09:34:09 -0400
parents 198009598544
children
comparison
equal deleted inserted replaced
16:2be918bf0efe 17:8031792a6e2c
12 our @EXPORT_OK = qw( &ping_pong_partners ); 12 our @EXPORT_OK = qw( &ping_pong_partners );
13 13
14 sub ping_pong_partners 14 sub ping_pong_partners
15 { 15 {
16 my ( $TE_fai, $sam, $dir, $max ) = @_; 16 my ( $TE_fai, $sam, $dir, $max ) = @_;
17 my ( $hashRef, $dupRef ) = count_mapped ( $TE_fai, $sam ); 17 my ( $hashRef, $dupRef, $hasPpp ) = count_mapped ( $TE_fai, $sam );
18 my ( %num_per_overlap_size, $overlap_number, $reverseR, $begRev, $endRev, $sensR, $begSens, $endSens, $snum, $rnum, $overlap ); 18 my ( %num_per_overlap_size, $overlap_number, $reverseR, $begRev, $endRev, $sensR, $begSens, $endSens, $snum, $rnum, $overlap );
19 my ( $SP, $AP, $SN, $AN, $txt); 19 my ( $SP, $AP, $SN, $AN, $txt );
20 my $flag = 0; 20 my $flag = 0;
21 my @distri_overlap = (); my @overlaps_names = (); 21 my @distri_overlap = (); my @overlaps_names = ();
22 22
23 open my $ppp_f, '>', $dir."ppp.txt" || die "cannot create ppp.txt $!\n"; 23 open my $ppp_f, '>', $dir."ppp.txt" || die "cannot create ppp.txt $!\n";
24 foreach my $k ( sort keys %{$hashRef} ) 24 foreach my $k ( sort keys %{$hashRef} )
25 { 25 {
26 my $v = $hashRef->{$k}; 26 my $v = $hashRef->{$k};
27 my $TE_dir = $dir.$k.'/'; 27 my $TE_dir = $dir.$k.'/';
28 28
29 %num_per_overlap_size = (); $overlap_number = 0; 29 %num_per_overlap_size = (); $overlap_number = 0;
30 $flag = 0; 30 $flag = 0;
31 for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ ) 31 for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ )
32 { 32 {
33 $reverseR = ${$v->[1]}[$i] ; 33 $reverseR = ${$v->[1]}[$i] ;
34 $begRev = $reverseR->[0]; 34 $begRev = $reverseR->[0];
39 39
40 for ( my $j = 0; $j <= $#{$v->[0]}; $j++ ) 40 for ( my $j = 0; $j <= $#{$v->[0]}; $j++ )
41 { 41 {
42 $sensR = ${$v->[0]}[$j]; 42 $sensR = ${$v->[0]}[$j];
43 $begSens = $sensR->[0]; 43 $begSens = $sensR->[0];
44
45 $endSens = $begSens + length($sensR->[1]) - 1; 44 $endSens = $begSens + length($sensR->[1]) - 1;
46 45
47 if ( $begSens <= $endRev && $endSens > $endRev ) 46 if ( $begSens <= $endRev && $endSens > $endRev )
48 { 47 {
49 $flag = 1; 48 $flag = 1;
50
51 mkdir $TE_dir; 49 mkdir $TE_dir;
52 open $SP, '>>', $TE_dir."sensPPP.txt" || die "cannot create sensPPP\n"; 50 open $txt, '>', $TE_dir.'overlap_size.txt' || die "cannot open repartition\n";
53 open $AP, '>>', $TE_dir."antisensPPP.txt" || die "cannot create antisensPPP\n"; 51
54 open $SN, '>>', $TE_dir."sens.txt" || die "cannot create sens\n";
55 open $AN, '>>', $TE_dir."antisens.txt" || die "cannot create antisens\n";
56 open $txt, '>', $TE_dir.'overlap_size.txt' || die "cannot open repartition\n";
57
58 $overlap = $endRev - $begSens + 1; 52 $overlap = $endRev - $begSens + 1;
59 $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]}; 53 $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]};
60 $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]}; 54 $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]};
61 55
62 if ( $overlap == 10 ) 56 if ( $overlap == 10 )
63 { 57 {
64 print $SP ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n"; 58 $hasPpp->{ $sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3] } = 1;
65 print $AP ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n"; 59 $hasPpp->{ $reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3] } = 1;
66 }
67 else
68 {
69 print $SN ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
70 print $AN ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
71 } 60 }
72 next if $overlap > $max; 61 next if $overlap > $max;
73 if ( $snum < $rnum ) 62 if ( $snum < $rnum )
74 { 63 {
75 $num_per_overlap_size{$overlap} += $snum; 64 $num_per_overlap_size{$overlap} += $snum;
81 $overlap_number += $rnum ; 70 $overlap_number += $rnum ;
82 } 71 }
83 } 72 }
84 } 73 }
85 } 74 }
86
87 if ( $max != 0 ) 75 if ( $max != 0 )
88 { 76 {
89 my @overlaps = (); 77 my @overlaps = ();
90 push @overlaps_names, $k; 78 push @overlaps_names, $k;
91 for my $i (1..$max) 79 for my $i (1..$max)
92 { 80 {
93 $num_per_overlap_size{$i} = 0 unless exists( $num_per_overlap_size{$i} ); 81 $num_per_overlap_size{$i} = 0 unless exists( $num_per_overlap_size{$i} );
94 push @overlaps, $num_per_overlap_size{$i}; 82 push @overlaps, $num_per_overlap_size{$i};
95 } 83 }
96 push @distri_overlap, \@overlaps; 84 push @distri_overlap, \@overlaps;
97 } 85 }
98 86
99 if ( $flag == 1 ) 87 if ( $flag == 1 )
100 { 88 {
89 open $AP, '>', $TE_dir."antisensPPP.txt" || die "cannot create antisensPPP\n";
90 open $AN, '>', $TE_dir."antisens.txt" || die "cannot create antisens\n";
91 for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ )
92 {
93 $reverseR = ${$v->[1]}[$i] ;
94 my $revR = reverse($reverseR->[1]);
95 $revR =~ tr/atgcuATGCU/tacgaTACGA/;
96 $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]};
97 if ( $hasPpp->{ $reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3] } == 1 )
98 {
99 print $AP ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
100 }
101 else
102 {
103 print $AN ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
104 }
105 }
106 close $AP; close $AN;
107
108 open $SP, '>', $TE_dir."sensPPP.txt" || die "cannot create sensPPP\n";
109 open $SN, '>', $TE_dir."sens.txt" || die "cannot create sens\n";
110 for ( my $j = 0; $j <= $#{$v->[0]}; $j++ )
111 {
112 $sensR = ${$v->[0]}[$j];
113 $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]};
114 if ( $hasPpp->{ $sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3] } == 1 )
115 {
116 print $SP ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
117 }
118 else
119 {
120 print $SN ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
121 }
122 }
123 close $SP; close $SN;
124
101 my $histo_png = $TE_dir.'histogram.png'; 125 my $histo_png = $TE_dir.'histogram.png';
102 histogram( \%num_per_overlap_size, $histo_png, $overlap_number ); 126 histogram( \%num_per_overlap_size, $histo_png, $overlap_number );
103 print $txt "size\tnumber\tpercentage of the total overlap number\n"; 127 print $txt "size\tnumber\tpercentage of the total overlap number\n";
104 foreach my $k ( sort {$a <=> $b} keys %num_per_overlap_size ) 128 foreach my $k ( sort {$a <=> $b} keys %num_per_overlap_size )
105 { 129 {
106 my $percentage = 0; 130 my $percentage = 0;
107 $percentage = $num_per_overlap_size{$k} * 100 / $overlap_number unless $overlap_number == 0; 131 $percentage = $num_per_overlap_size{$k} * 100 / $overlap_number unless $overlap_number == 0;
108 print $txt "$k\t$num_per_overlap_size{$k}\t"; printf $txt "%.2f\n",$percentage; 132 print $txt "$k\t$num_per_overlap_size{$k}\t"; printf $txt "%.2f\n",$percentage;
109
110 } 133 }
111 close $txt; 134 close $txt;
112 } 135 }
113
114 } 136 }
115 137
116 foreach my $tabP ( @distri_overlap ) 138 foreach my $tabP ( @distri_overlap )
117 { 139 {
118 my $sum = sum($tabP); 140 my $sum = sum($tabP);
123 my $name = shift @overlaps_names; 145 my $name = shift @overlaps_names;
124 my $prob = 'NA'; 146 my $prob = 'NA';
125 $prob = 1 - &Math::CDF::pnorm( $zsc ) if $zsc ne 'NA'; 147 $prob = 1 - &Math::CDF::pnorm( $zsc ) if $zsc ne 'NA';
126 print $ppp_f (join ("\t", $name, $sum, $ten, $mean, $std, $zsc, $prob ),"\n" ); 148 print $ppp_f (join ("\t", $name, $sum, $ten, $mean, $std, $zsc, $prob ),"\n" );
127 } 149 }
128 close $ppp_f; 150 close $ppp_f;
129 } 151 }
130 152
131 sub count_mapped 153 sub count_mapped
132 { 154 {
133 my ( $fai, $in_file ) = @_; 155 my ( $fai, $in_file ) = @_;
134 my ( %mapped, %dup ); 156 my ( %mapped, %dup, %has_ppp );
135 157
136 open my $f, '<', $fai || die "cannot open $fai $! \n"; 158 open my $f, '<', $fai || die "cannot open $fai $! \n";
137 while(<$f>) 159 while(<$f>)
138 { 160 {
139 if ($_ =~ /(.*)\t(\d+)\n/) 161 if ($_ =~ /(.*)\t(\d+)\n/)
140 { 162 {
141 $mapped{$1} = []; 163 $mapped{$1} = [];
142 $mapped{$1}->[0] = []; $mapped{$1}->[1] = []; 164 $mapped{$1}->[0] = []; $mapped{$1}->[1] = [];
143 } 165 }
144 } 166 }
145 close $f; 167 close $f;
146 168
147 open my $infile, "samtools view $in_file |"|| die "cannot open input file $! \n"; 169 open my $infile, "samtools view $in_file |"|| die "cannot open input file $! \n";
148 while(<$infile>) 170 while(<$infile>)
149 { 171 {
150 unless ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) 172 unless ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
151 { 173 {
152 my @line = split (/\t/,$_); 174 my @line = split (/\t/,$_);
153 if ($line[1] == 0) 175 if ($line[1] == 0)
154 { 176 {
155 push @{$mapped{$line[2]}->[0]} , [$line[3], $line[9], $line[1], $line[2]] unless exists ($dup{$line[3].$line[9].$line[1].$line[2]}); 177 unless ( exists ($dup{$line[3].$line[9].$line[1].$line[2]}) )
178 {
179 push @{$mapped{$line[2]}->[0]} , [$line[3], $line[9], $line[1], $line[2]];
180 $has_ppp {$line[3].$line[9].$line[1].$line[2]} = 0;
181 }
156 $dup{$line[3].$line[9].$line[1].$line[2]}+=1; 182 $dup{$line[3].$line[9].$line[1].$line[2]}+=1;
157 } 183 }
158 elsif ($line[1] == 16) 184 elsif ($line[1] == 16)
159 { 185 {
160 push @{$mapped{$line[2]}->[1]} , [$line[3], $line[9], $line[1], $line[2]] unless exists ($dup{$line[3].$line[9].$line[1].$line[2]}); 186 unless ( exists ($dup{$line[3].$line[9].$line[1].$line[2]}) )
161 $dup{$line[3].$line[9].$line[1].$line[2]}+=1; 187 {
188 push @{$mapped{$line[2]}->[1]} , [$line[3], $line[9], $line[1], $line[2]];
189 $has_ppp{$line[3].$line[9].$line[1].$line[2]} = 0;
190 }
191 $dup{$line[3].$line[9].$line[1].$line[2]}+=1
162 } 192 }
163 } 193 }
164 } 194 }
165 close $infile; 195 close $infile;
166 return (\%mapped, \%dup ); 196 return (\%mapped, \%dup, \%has_ppp );
167 } 197 }
168 198
169 sub sum 199 sub sum
170 { 200 {
171 my $arrayref = shift; 201 my $arrayref = shift;
183 } 213 }
184 214
185 sub standard_deviation 215 sub standard_deviation
186 { 216 {
187 my ($arrayref, $mean) = @_; 217 my ($arrayref, $mean) = @_;
188 return sqrt ( mean ( [map $_**2 , @$arrayref ]) - ($mean**2)); 218 return sqrt ( mean ( [map $_**2 , @$arrayref ]) - ($mean**2));
189 } 219 }
190 220
191 sub z_significance 221 sub z_significance
192 { 222 {
193 my ($ten, $mean, $std) = @_; 223 my ($ten, $mean, $std) = @_;
195 $z = (($ten - $mean) / $std) if $std != 0; 225 $z = (($ten - $mean) / $std) if $std != 0;
196 return $z; 226 return $z;
197 } 227 }
198 228
199 1; 229 1;
230