Mercurial > repos > romaingred > pirna_pipeline
comparison bin/ppp.pm @ 17:8031792a6e2c draft
Uploaded
author | romaingred |
---|---|
date | Tue, 17 Oct 2017 09:34:09 -0400 |
parents | 198009598544 |
children |
comparison
equal
deleted
inserted
replaced
16:2be918bf0efe | 17:8031792a6e2c |
---|---|
12 our @EXPORT_OK = qw( &ping_pong_partners ); | 12 our @EXPORT_OK = qw( &ping_pong_partners ); |
13 | 13 |
14 sub ping_pong_partners | 14 sub ping_pong_partners |
15 { | 15 { |
16 my ( $TE_fai, $sam, $dir, $max ) = @_; | 16 my ( $TE_fai, $sam, $dir, $max ) = @_; |
17 my ( $hashRef, $dupRef ) = count_mapped ( $TE_fai, $sam ); | 17 my ( $hashRef, $dupRef, $hasPpp ) = count_mapped ( $TE_fai, $sam ); |
18 my ( %num_per_overlap_size, $overlap_number, $reverseR, $begRev, $endRev, $sensR, $begSens, $endSens, $snum, $rnum, $overlap ); | 18 my ( %num_per_overlap_size, $overlap_number, $reverseR, $begRev, $endRev, $sensR, $begSens, $endSens, $snum, $rnum, $overlap ); |
19 my ( $SP, $AP, $SN, $AN, $txt); | 19 my ( $SP, $AP, $SN, $AN, $txt ); |
20 my $flag = 0; | 20 my $flag = 0; |
21 my @distri_overlap = (); my @overlaps_names = (); | 21 my @distri_overlap = (); my @overlaps_names = (); |
22 | 22 |
23 open my $ppp_f, '>', $dir."ppp.txt" || die "cannot create ppp.txt $!\n"; | 23 open my $ppp_f, '>', $dir."ppp.txt" || die "cannot create ppp.txt $!\n"; |
24 foreach my $k ( sort keys %{$hashRef} ) | 24 foreach my $k ( sort keys %{$hashRef} ) |
25 { | 25 { |
26 my $v = $hashRef->{$k}; | 26 my $v = $hashRef->{$k}; |
27 my $TE_dir = $dir.$k.'/'; | 27 my $TE_dir = $dir.$k.'/'; |
28 | 28 |
29 %num_per_overlap_size = (); $overlap_number = 0; | 29 %num_per_overlap_size = (); $overlap_number = 0; |
30 $flag = 0; | 30 $flag = 0; |
31 for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ ) | 31 for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ ) |
32 { | 32 { |
33 $reverseR = ${$v->[1]}[$i] ; | 33 $reverseR = ${$v->[1]}[$i] ; |
34 $begRev = $reverseR->[0]; | 34 $begRev = $reverseR->[0]; |
39 | 39 |
40 for ( my $j = 0; $j <= $#{$v->[0]}; $j++ ) | 40 for ( my $j = 0; $j <= $#{$v->[0]}; $j++ ) |
41 { | 41 { |
42 $sensR = ${$v->[0]}[$j]; | 42 $sensR = ${$v->[0]}[$j]; |
43 $begSens = $sensR->[0]; | 43 $begSens = $sensR->[0]; |
44 | |
45 $endSens = $begSens + length($sensR->[1]) - 1; | 44 $endSens = $begSens + length($sensR->[1]) - 1; |
46 | 45 |
47 if ( $begSens <= $endRev && $endSens > $endRev ) | 46 if ( $begSens <= $endRev && $endSens > $endRev ) |
48 { | 47 { |
49 $flag = 1; | 48 $flag = 1; |
50 | |
51 mkdir $TE_dir; | 49 mkdir $TE_dir; |
52 open $SP, '>>', $TE_dir."sensPPP.txt" || die "cannot create sensPPP\n"; | 50 open $txt, '>', $TE_dir.'overlap_size.txt' || die "cannot open repartition\n"; |
53 open $AP, '>>', $TE_dir."antisensPPP.txt" || die "cannot create antisensPPP\n"; | 51 |
54 open $SN, '>>', $TE_dir."sens.txt" || die "cannot create sens\n"; | |
55 open $AN, '>>', $TE_dir."antisens.txt" || die "cannot create antisens\n"; | |
56 open $txt, '>', $TE_dir.'overlap_size.txt' || die "cannot open repartition\n"; | |
57 | |
58 $overlap = $endRev - $begSens + 1; | 52 $overlap = $endRev - $begSens + 1; |
59 $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]}; | 53 $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]}; |
60 $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]}; | 54 $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]}; |
61 | 55 |
62 if ( $overlap == 10 ) | 56 if ( $overlap == 10 ) |
63 { | 57 { |
64 print $SP ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n"; | 58 $hasPpp->{ $sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3] } = 1; |
65 print $AP ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n"; | 59 $hasPpp->{ $reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3] } = 1; |
66 } | |
67 else | |
68 { | |
69 print $SN ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n"; | |
70 print $AN ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n"; | |
71 } | 60 } |
72 next if $overlap > $max; | 61 next if $overlap > $max; |
73 if ( $snum < $rnum ) | 62 if ( $snum < $rnum ) |
74 { | 63 { |
75 $num_per_overlap_size{$overlap} += $snum; | 64 $num_per_overlap_size{$overlap} += $snum; |
81 $overlap_number += $rnum ; | 70 $overlap_number += $rnum ; |
82 } | 71 } |
83 } | 72 } |
84 } | 73 } |
85 } | 74 } |
86 | |
87 if ( $max != 0 ) | 75 if ( $max != 0 ) |
88 { | 76 { |
89 my @overlaps = (); | 77 my @overlaps = (); |
90 push @overlaps_names, $k; | 78 push @overlaps_names, $k; |
91 for my $i (1..$max) | 79 for my $i (1..$max) |
92 { | 80 { |
93 $num_per_overlap_size{$i} = 0 unless exists( $num_per_overlap_size{$i} ); | 81 $num_per_overlap_size{$i} = 0 unless exists( $num_per_overlap_size{$i} ); |
94 push @overlaps, $num_per_overlap_size{$i}; | 82 push @overlaps, $num_per_overlap_size{$i}; |
95 } | 83 } |
96 push @distri_overlap, \@overlaps; | 84 push @distri_overlap, \@overlaps; |
97 } | 85 } |
98 | 86 |
99 if ( $flag == 1 ) | 87 if ( $flag == 1 ) |
100 { | 88 { |
89 open $AP, '>', $TE_dir."antisensPPP.txt" || die "cannot create antisensPPP\n"; | |
90 open $AN, '>', $TE_dir."antisens.txt" || die "cannot create antisens\n"; | |
91 for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ ) | |
92 { | |
93 $reverseR = ${$v->[1]}[$i] ; | |
94 my $revR = reverse($reverseR->[1]); | |
95 $revR =~ tr/atgcuATGCU/tacgaTACGA/; | |
96 $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]}; | |
97 if ( $hasPpp->{ $reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3] } == 1 ) | |
98 { | |
99 print $AP ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n"; | |
100 } | |
101 else | |
102 { | |
103 print $AN ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n"; | |
104 } | |
105 } | |
106 close $AP; close $AN; | |
107 | |
108 open $SP, '>', $TE_dir."sensPPP.txt" || die "cannot create sensPPP\n"; | |
109 open $SN, '>', $TE_dir."sens.txt" || die "cannot create sens\n"; | |
110 for ( my $j = 0; $j <= $#{$v->[0]}; $j++ ) | |
111 { | |
112 $sensR = ${$v->[0]}[$j]; | |
113 $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]}; | |
114 if ( $hasPpp->{ $sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3] } == 1 ) | |
115 { | |
116 print $SP ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n"; | |
117 } | |
118 else | |
119 { | |
120 print $SN ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n"; | |
121 } | |
122 } | |
123 close $SP; close $SN; | |
124 | |
101 my $histo_png = $TE_dir.'histogram.png'; | 125 my $histo_png = $TE_dir.'histogram.png'; |
102 histogram( \%num_per_overlap_size, $histo_png, $overlap_number ); | 126 histogram( \%num_per_overlap_size, $histo_png, $overlap_number ); |
103 print $txt "size\tnumber\tpercentage of the total overlap number\n"; | 127 print $txt "size\tnumber\tpercentage of the total overlap number\n"; |
104 foreach my $k ( sort {$a <=> $b} keys %num_per_overlap_size ) | 128 foreach my $k ( sort {$a <=> $b} keys %num_per_overlap_size ) |
105 { | 129 { |
106 my $percentage = 0; | 130 my $percentage = 0; |
107 $percentage = $num_per_overlap_size{$k} * 100 / $overlap_number unless $overlap_number == 0; | 131 $percentage = $num_per_overlap_size{$k} * 100 / $overlap_number unless $overlap_number == 0; |
108 print $txt "$k\t$num_per_overlap_size{$k}\t"; printf $txt "%.2f\n",$percentage; | 132 print $txt "$k\t$num_per_overlap_size{$k}\t"; printf $txt "%.2f\n",$percentage; |
109 | |
110 } | 133 } |
111 close $txt; | 134 close $txt; |
112 } | 135 } |
113 | |
114 } | 136 } |
115 | 137 |
116 foreach my $tabP ( @distri_overlap ) | 138 foreach my $tabP ( @distri_overlap ) |
117 { | 139 { |
118 my $sum = sum($tabP); | 140 my $sum = sum($tabP); |
123 my $name = shift @overlaps_names; | 145 my $name = shift @overlaps_names; |
124 my $prob = 'NA'; | 146 my $prob = 'NA'; |
125 $prob = 1 - &Math::CDF::pnorm( $zsc ) if $zsc ne 'NA'; | 147 $prob = 1 - &Math::CDF::pnorm( $zsc ) if $zsc ne 'NA'; |
126 print $ppp_f (join ("\t", $name, $sum, $ten, $mean, $std, $zsc, $prob ),"\n" ); | 148 print $ppp_f (join ("\t", $name, $sum, $ten, $mean, $std, $zsc, $prob ),"\n" ); |
127 } | 149 } |
128 close $ppp_f; | 150 close $ppp_f; |
129 } | 151 } |
130 | 152 |
131 sub count_mapped | 153 sub count_mapped |
132 { | 154 { |
133 my ( $fai, $in_file ) = @_; | 155 my ( $fai, $in_file ) = @_; |
134 my ( %mapped, %dup ); | 156 my ( %mapped, %dup, %has_ppp ); |
135 | 157 |
136 open my $f, '<', $fai || die "cannot open $fai $! \n"; | 158 open my $f, '<', $fai || die "cannot open $fai $! \n"; |
137 while(<$f>) | 159 while(<$f>) |
138 { | 160 { |
139 if ($_ =~ /(.*)\t(\d+)\n/) | 161 if ($_ =~ /(.*)\t(\d+)\n/) |
140 { | 162 { |
141 $mapped{$1} = []; | 163 $mapped{$1} = []; |
142 $mapped{$1}->[0] = []; $mapped{$1}->[1] = []; | 164 $mapped{$1}->[0] = []; $mapped{$1}->[1] = []; |
143 } | 165 } |
144 } | 166 } |
145 close $f; | 167 close $f; |
146 | 168 |
147 open my $infile, "samtools view $in_file |"|| die "cannot open input file $! \n"; | 169 open my $infile, "samtools view $in_file |"|| die "cannot open input file $! \n"; |
148 while(<$infile>) | 170 while(<$infile>) |
149 { | 171 { |
150 unless ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | 172 unless ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) |
151 { | 173 { |
152 my @line = split (/\t/,$_); | 174 my @line = split (/\t/,$_); |
153 if ($line[1] == 0) | 175 if ($line[1] == 0) |
154 { | 176 { |
155 push @{$mapped{$line[2]}->[0]} , [$line[3], $line[9], $line[1], $line[2]] unless exists ($dup{$line[3].$line[9].$line[1].$line[2]}); | 177 unless ( exists ($dup{$line[3].$line[9].$line[1].$line[2]}) ) |
178 { | |
179 push @{$mapped{$line[2]}->[0]} , [$line[3], $line[9], $line[1], $line[2]]; | |
180 $has_ppp {$line[3].$line[9].$line[1].$line[2]} = 0; | |
181 } | |
156 $dup{$line[3].$line[9].$line[1].$line[2]}+=1; | 182 $dup{$line[3].$line[9].$line[1].$line[2]}+=1; |
157 } | 183 } |
158 elsif ($line[1] == 16) | 184 elsif ($line[1] == 16) |
159 { | 185 { |
160 push @{$mapped{$line[2]}->[1]} , [$line[3], $line[9], $line[1], $line[2]] unless exists ($dup{$line[3].$line[9].$line[1].$line[2]}); | 186 unless ( exists ($dup{$line[3].$line[9].$line[1].$line[2]}) ) |
161 $dup{$line[3].$line[9].$line[1].$line[2]}+=1; | 187 { |
188 push @{$mapped{$line[2]}->[1]} , [$line[3], $line[9], $line[1], $line[2]]; | |
189 $has_ppp{$line[3].$line[9].$line[1].$line[2]} = 0; | |
190 } | |
191 $dup{$line[3].$line[9].$line[1].$line[2]}+=1 | |
162 } | 192 } |
163 } | 193 } |
164 } | 194 } |
165 close $infile; | 195 close $infile; |
166 return (\%mapped, \%dup ); | 196 return (\%mapped, \%dup, \%has_ppp ); |
167 } | 197 } |
168 | 198 |
169 sub sum | 199 sub sum |
170 { | 200 { |
171 my $arrayref = shift; | 201 my $arrayref = shift; |
183 } | 213 } |
184 | 214 |
185 sub standard_deviation | 215 sub standard_deviation |
186 { | 216 { |
187 my ($arrayref, $mean) = @_; | 217 my ($arrayref, $mean) = @_; |
188 return sqrt ( mean ( [map $_**2 , @$arrayref ]) - ($mean**2)); | 218 return sqrt ( mean ( [map $_**2 , @$arrayref ]) - ($mean**2)); |
189 } | 219 } |
190 | 220 |
191 sub z_significance | 221 sub z_significance |
192 { | 222 { |
193 my ($ten, $mean, $std) = @_; | 223 my ($ten, $mean, $std) = @_; |
195 $z = (($ten - $mean) / $std) if $std != 0; | 225 $z = (($ten - $mean) / $std) if $std != 0; |
196 return $z; | 226 return $z; |
197 } | 227 } |
198 | 228 |
199 1; | 229 1; |
230 |