Mercurial > repos > dcouvin > nuclescore
comparison nucleScore.pl @ 0:82dce1eb9074 draft default tip
Uploaded
author | dcouvin |
---|---|
date | Fri, 03 Sep 2021 22:36:56 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:82dce1eb9074 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 use Bio::SeqIO; | |
6 #use Shannon::Entropy qw/entropy/; | |
7 use File::Basename; | |
8 #use Bio::Species; | |
9 | |
10 #use FindBin; | |
11 #use lib "$FindBin::RealBin/../perl5"; | |
12 | |
13 #my $input = $ARGV[0]; | |
14 # TODO: work out how to build a Perl list from the input | |
15 #my @liste = split(/,/, $input); | |
16 #my $recap_total_seq = $ARGV[1]; | |
17 | |
18 #my ($input, $recap_total_seq) = @ARGV; | |
19 | |
20 #my $start = time(); | |
21 | |
22 #my $file = ""; #= $ARGV[0]; | |
23 #my $recap_total_seq = "nucleScore_result.xls"; | |
24 | |
25 #open (RECAP,'>', $recap_total_seq) or die "could not open $!"; | |
# Emit the header row of the tab-separated report on STDOUT; the per-file
# rows printed later follow the same column order.
my @reportColumns = (
    "File",
    "A percent", "T percent", "C percent", "G percent",
    "GC percent", "AT/GC ratio", "NucleScore",
    "ATG", "TGA", "TAG", "TAA",
    "Genome size (bp)",
);
print join("\t", @reportColumns) . "\n";
27 #close(RECAP); | |
28 | |
29 | |
30 #FASTA files | |
31 #if(@ARGV){ | |
32 | |
33 #for (my $i = 0; $i <= $#ARGV; $i++) { | |
34 #if ($ARGV[$i]=~/-output/i or $ARGV[$i]=~/-o/i) { | |
35 # $recap_total_seq = $ARGV[$i+1]; | |
36 #} | |
37 #} | |
38 | |
39 | |
40 #open (RECAP,'>>', $recap_total_seq) or die "could not open $!"; | |
41 | |
42 # TODO: rewrite the for loop to iterate over the input list | |
# Analyse every FASTA file named on the command line and print one
# tab-separated result row per file, in the same column order as the
# header row: file name, A/T/C/G percent, GC percent, AT/GC ratio,
# NucleScore, ATG/TGA/TAG/TAA counts and total sequence length.
for my $file (@ARGV) {

    # Concatenate all records of the file into a single string. Records are
    # joined without a separator, so a motif that spans the end of one record
    # and the start of the next is counted too.
    my $seqIO = Bio::SeqIO->new(-format => 'Fasta', -file => $file);
    my $globalSeq = "";
    while (my $seq = $seqIO->next_seq()) {
        my $seqNuc = $seq->seq;
        $globalSeq .= $seqNuc if defined $seqNuc;
    }

    my $gcpercent = gc_percent($globalSeq);
    my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($file);
    my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent)
        = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length);

    my $atgcRatio = atgc_ratio($ade, $thy, $gua, $cyt);

    # The score is built from the variance of the five base percentages.
    my $variance = shift_data_variance(
        $aPercent, $tPercent, $gPercent, $cPercent, $nPercent
    );
    my $nucleScore = nucle_score($variance, $gcpercent, $atgcRatio, $length);

    # Count overlapping occurrences of the start/stop codons. Scanning with
    # index() avoids building a list of every trinucleotide of the genome and
    # always yields a defined number (0 when the motif is absent). The
    # sequence is upper-cased first so lower-case (soft-masked) input is
    # counted the same way as by the other, case-insensitive counters.
    my $upperSeq = uc $globalSeq;
    my %motifCount;
    for my $motif (qw(ATG TGA TAG TAA)) {
        my $count = 0;
        my $pos   = index($upperSeq, $motif);
        while ($pos >= 0) {
            $count++;
            $pos = index($upperSeq, $motif, $pos + 1);
        }
        $motifCount{$motif} = $count;
    }

    # Report only the file name, not the full path.
    my $label = basename($file);

    print join(
        "\t",
        $label,
        $aPercent, $tPercent, $cPercent, $gPercent,
        $gcpercent, $atgcRatio, $nucleScore,
        $motifCount{'ATG'}, $motifCount{'TGA'},
        $motifCount{'TAG'}, $motifCount{'TAA'},
        $length,
    ) . "\n";
}
117 #close (RECAP) or die "close file error : $!"; | |
118 #} | |
119 | |
120 #my $end = time(); | |
121 | |
122 #my $total = $end - $start; | |
123 | |
124 #print "***** Total time (in seconds) is: $total *****\n"; | |
125 | |
126 #------------------------------------------------------------------------------ | |
127 # number nucleotid and length | |
# Count nucleotides in a FASTA file.
#
# Parameters:
#   $fastaFile - path of the FASTA file to read.
# Returns:
#   ($ade, $thy, $gua, $cyt, $n, $length): the case-insensitive counts of
#   A, T, G, C and N, followed by the total number of sequence characters.
#   $length counts every non-whitespace character found on non-header lines,
#   including symbols other than A/T/G/C/N.
# Dies when the file cannot be opened or closed.
sub number_nuc_length_seq {
    my ($fastaFile) = @_;
    my ($ade, $thy, $gua, $cyt, $n, $length) = (0, 0, 0, 0, 0, 0);

    # A lexical handle is used instead of a global bareword handle so that
    # the handle cannot be clobbered by, or clobber, other code.
    open(my $fastaHandle, "<", $fastaFile)
        or die "Could not open $fastaFile: $!";

    while (my $line = <$fastaHandle>) {
        next if $line =~ /^>/;    # FASTA header line, not sequence data

        # Remove the line terminator and any other whitespace. chomp alone
        # would leave the "\r" of CRLF files, which would then be counted as
        # a sequence character.
        $line =~ s/\s+//g;

        $length += length($line);

        # tr/// with an empty replacement list only counts the characters.
        $ade += ($line =~ tr/Aa//);
        $thy += ($line =~ tr/Tt//);
        $gua += ($line =~ tr/Gg//);
        $cyt += ($line =~ tr/Cc//);
        $n   += ($line =~ tr/Nn//);
    }

    close($fastaHandle) or die "Error close file :$!";
    return ($ade, $thy, $gua, $cyt, $n, $length);
}
157 | |
158 #------------------------------------------------------------------------------ | |
159 # compute percentage of nucleotid | |
# Convert nucleotide counts into percentages of the total length.
#
# Parameters:
#   $ade, $thy, $gua, $cyt, $n - counts of A, T, G, C and N.
#   $length                    - total number of sequence characters.
# Returns:
#   ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent).
#   All five values are 0 when $length is 0 (e.g. an empty file), instead of
#   dying with a division by zero.
sub nucleotid_percent {
    my ($ade, $thy, $gua, $cyt, $n, $length) = @_;

    # No sequence data: every percentage is reported as 0.
    return (0, 0, 0, 0, 0) unless $length;

    my ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent)
        = map { $_ / $length * 100 } ($ade, $thy, $gua, $cyt, $n);

    return ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent);
}
172 | |
173 #------------------------------------------------------------------------------ | |
174 # compute GC pourcent | |
# Compute the GC percentage of a sequence string.
#
# Parameters:
#   $seq - the sequence; G and C are counted case-insensitively.
# Returns:
#   (number of G + number of C) / length($seq) * 100, or 0 when the sequence
#   is undefined or empty.
sub gc_percent {
    my ($seq) = @_;

    return 0 if !defined($seq) || length($seq) == 0;

    # tr/// with an empty replacement list only counts matching characters;
    # this avoids splitting the whole sequence into a per-character list.
    my $gcCount = ($seq =~ tr/GCgc//);

    return ($gcCount / length($seq)) * 100;
}
196 #------------------------------------------------------------------------------ | |
197 # compute ATGC ratio | |
# Compute the AT/GC ratio from nucleotide counts.
#
# Parameters:
#   $ade, $thy, $gua, $cyt - counts of A, T, G and C.
# Returns:
#   ($ade + $thy) / ($gua + $cyt). When there is no G and no C the ratio is
#   undefined; 0 is returned as a placeholder instead of dying with a
#   division by zero.
sub atgc_ratio {
    my ($ade, $thy, $gua, $cyt) = @_;

    my $gcCount = $gua + $cyt;
    return 0 if $gcCount == 0;

    return ($ade + $thy) / $gcCount;
}
204 #------------------------------------------------------------------------------ | |
205 # variance | |
# Population variance of a list of numbers, computed with shifted data.
#
# Every value is shifted by the first element ($K) before the sums are
# accumulated; the result is (sum(d^2) - sum(d)^2 / n) / n, where d = x - K.
#
# Parameters:
#   @data - the numbers.
# Returns:
#   The variance with divisor n (not n - 1), or 0.0 when fewer than two
#   values are given.
sub shift_data_variance {
    my (@data) = @_;

    return 0.0 if @data < 2;

    my $K = $data[0];

    # Each accumulator is initialised explicitly: assigning a single scalar
    # to a list of variables would only initialise the first one.
    my ($n, $Ex, $Ex2) = (0, 0.0, 0.0);

    for my $x (@data) {
        my $shifted = $x - $K;
        $n   += 1;
        $Ex  += $shifted;
        $Ex2 += $shifted * $shifted;
    }

    return ($Ex2 - ($Ex * $Ex) / $n) / $n;
}
225 #------------------------------------------------------------------------------ | |
226 # nucle score | |
227 #sub nucle_score { | |
228 # my ($variance, $gcPercent, $atgcRatio, $length) = @_; | |
229 # | |
230 # return (($variance * $gcPercent * $atgcRatio) / $length); | |
231 #} | |
# Compute the NucleScore:
#   log2( ($variance * $gcPercent * $atgcRatio ** 3) / sqrt($length) )
#
# Parameters:
#   $variance  - variance of the nucleotide percentages.
#   $gcPercent - GC percentage.
#   $atgcRatio - AT/GC ratio (only this factor is raised to the power 3).
#   $length    - total sequence length.
# Returns:
#   The score. When the score is undefined ($length is not positive, or the
#   value passed to the logarithm is not strictly positive), 0 is returned as
#   a placeholder instead of dying, so that a batch run over several files
#   is not aborted by one degenerate input.
sub nucle_score {
    my ($variance, $gcPercent, $atgcRatio, $length) = @_;

    # sqrt(0) would lead to a division by zero.
    return 0 unless $length > 0;

    my $scaled = ($variance * $gcPercent * $atgcRatio ** (3)) / sqrt($length);

    # log() dies for zero and negative arguments.
    return 0 unless $scaled > 0;

    return log2($scaled);
}
236 | |
237 #------------------------------------------------------------------------------ | |
# Base-2 logarithm of a number, derived from the natural logarithm.
#
# Parameters:
#   the number whose logarithm is wanted (first positional argument).
# Returns:
#   log(value) / log(2).
sub log2 {
    my ($value) = @_;

    my $naturalLog = log($value);
    return $naturalLog / log(2);
}