0
|
1 <tool id="nuclescoretool" name="nuclescore" version="0.1.0">
|
|
2 <description>nuclescore</description>
|
|
3
|
|
4
|
|
5 <command detect_errors="aggressive"><![CDATA[
|
|
6
|
|
7 #import re
|
|
8 ## Creates symlinks for each input file based on the Galaxy 'element_identifier'
|
|
9 ## Used so that a human-readable name appears in the output table (instead of 'dataset_xyz.dat')
|
|
10 #set $named_input_files = ''
|
|
11 #for $input_file in $input_files
|
|
12 ## Add single quotes around each input file identifier
|
|
13 #set $_input_file = "'{}'".format($input_file.element_identifier)
|
|
14 ln -s '${input_file}' ${_input_file} &&
|
|
15 #set $named_input_files = $named_input_files + ' ' + $_input_file
|
|
16 #end for
|
|
17
|
|
18 perl '$__tool_directory__/nucleScore.pl' $_input_file $output
|
|
19
|
|
20
|
|
21
|
|
22
|
|
23 ]]></command>
|
|
24 <!-- ./nuclescore.sh ${named_input_files} > "$output" -->
|
|
25
|
|
26 <inputs>
|
|
27 <param format="fasta" name="input_files" type="data" label="Genome fasta file : " multiple="true" display="checkboxes"/>
|
|
28 </inputs>
|
|
29
|
|
30 <outputs>
|
|
31 <data format="tabular" name="output" />
|
|
32 </outputs>
|
|
33
|
|
34 <help>
|
|
35 No documentation
|
|
36 </help>
|
|
37
|
|
38 </tool>
|
|
39
|
|
40
|
|
41 -------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
42
|
|
43 #!/usr/bin/perl
|
|
44
|
|
45 use strict;
|
|
46 use warnings;
|
|
47 use Bio::SeqIO;
|
|
48 use Shannon::Entropy qw/entropy/;
|
|
49 use File::Basename;
|
|
50 #use Bio::Species;
|
|
51
|
|
52 #use FindBin;
|
|
53 #use lib "$FindBin::RealBin/../perl5";
|
|
54
|
|
55 my $input = $ARGV[0];
|
|
56 #chercher comment faire une liste perl pour input
|
|
57 my @liste = split(/,/, $input);
|
|
58 my $recap_total_seq = $ARGV[1];
|
|
59
|
|
60 #my ($input, $recap_total_seq) = @ARGV;
|
|
61
|
|
62 my $start = time();
|
|
63
|
|
64 #my $file = ""; #= $ARGV[0];
|
|
65 #my $recap_total_seq = "nucleScore_result.xls";
|
|
66
|
|
67 open (RECAP,'>', $recap_total_seq) or die "could not open $!";
|
|
68 print RECAP "File\tA percent\tT percent\tC percent\tG percent\tGC percent\tAT/GC ratio\tNucleScore\tShannon Entropy\tAAA\tAAT\n";
|
|
69 close(RECAP);
|
|
70
|
|
71
|
|
72 #FASTA files
|
|
73 #if(@ARGV){
|
|
74
|
|
75 #for (my $i = 0; $i <= $#ARGV; $i++) {
|
|
76 #if ($ARGV[$i]=~/-output/i or $ARGV[$i]=~/-o/i) {
|
|
77 # $recap_total_seq = $ARGV[$i+1];
|
|
78 #}
|
|
79 #}
|
|
80
|
|
81
|
|
82 open (RECAP,'>>', $recap_total_seq) or die "could not open $!";
|
|
83
|
|
84 #refaire le for pour la liste input
|
|
85 #for my $arg (@ARGV){
|
|
86 for my $arg (@liste){
|
|
87 # if ($arg =~ m/.fasta/ or $arg =~ m/.fna/ or $arg =~ m/.fa/){
|
|
88
|
|
89 #print "Traitement du fichier de sequence: $arg\n";
|
|
90 print "Traitement du fichier de sequence: $arg\n";
|
|
91 #my $file = $arg;
|
|
92 my $file = $arg;
|
|
93
|
|
94
|
|
95 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$file);
|
|
96 my $globalSeq = "";
|
|
97 while (my $seq = $seqIO->next_seq()) {
|
|
98 my $seqID = $seq->id;
|
|
99 my $seqNuc = $seq->seq;
|
|
100 $globalSeq .= $seqNuc;
|
|
101 #push @arrayID, $seqID;
|
|
102 #$hSeq{$seqID} = $seqNuc;
|
|
103 #my @seqArray = split //, $seqNuc;
|
|
104 }
|
|
105
|
|
106 my $gcpercent = gc_percent($globalSeq);
|
|
107 my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($file);
|
|
108 my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length);
|
|
109
|
|
110 my $atgcRatio = atgc_ratio($ade, $thy, $gua, $cyt);
|
|
111
|
|
112 my @percentList = ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent);
|
|
113
|
|
114 my $variance = shift_data_variance(@percentList);
|
|
115 my $nucleScore = nucle_score($variance, $gcpercent, $atgcRatio, $length);
|
|
116 my $entropy = entropy($globalSeq);
|
|
117
|
|
118 print "The sequence length for $file is: $length\n";
|
|
119 print "A percent: $aPercent\n";
|
|
120 print "T percent: $tPercent\n";
|
|
121 print "G percent: $gPercent\n";
|
|
122 print "C percent: $cPercent\n";
|
|
123 print "N percent: $nPercent\n";
|
|
124
|
|
125 print "GC percent: $gcpercent\n";
|
|
126
|
|
127 print "AT/GC ratio: $atgcRatio\n";
|
|
128
|
|
129 print "NucleScore: $nucleScore\n";
|
|
130
|
|
131 print "Shannon Entropy: $entropy\n\n";
|
|
132
|
|
133 print "3 digits:\n";
|
|
134 my @trinucs=($globalSeq=~/(?=(.{3}))/g);
|
|
135 my %tri_count=();
|
|
136 $tri_count{$_}++ for @trinucs;
|
|
137 print $_,":",$tri_count{$_},"\n" for sort keys(%tri_count);
|
|
138 print "\n2 digits:\n";
|
|
139 my @trinucs2=($globalSeq=~/(?=(.{2}))/g);
|
|
140 my %tri_count2=();
|
|
141 $tri_count2{$_}++ for @trinucs2;
|
|
142 print $_,":",$tri_count2{$_},"\n" for sort keys(%tri_count2);
|
|
143
|
|
144 my $aaa = $tri_count{'AAA'};
|
|
145 my $aat = $tri_count{'AAT'};
|
|
146
|
|
147 print "--------------------------------------\n\n";
|
|
148
|
|
149 my $label = basename($file);
|
|
150
|
|
151
|
|
152 #Summary file
|
|
153 #print RECAP "$file\t$aPercent\t$tPercent\t$cPercent\t$gPercent\t$gcpercent\t$atgcRatio\t$nucleScore\t$entropy\t$aaa\t$aat\n";
|
|
154 print RECAP "$label\t$aPercent\t$tPercent\t$cPercent\t$gPercent\t$gcpercent\t$atgcRatio\t$nucleScore\t$entropy\t$aaa\t$aat\n";
|
|
155 #}
|
|
156 }
|
|
157 close (RECAP) or die "close file error : $!";
|
|
158 #}
|
|
159
|
|
160 my $end = time();
|
|
161
|
|
162 my $total = $end - $start;
|
|
163
|
|
164 print "***** Total time (in seconds) is: $total *****\n";
|
|
165
|
|
166 #------------------------------------------------------------------------------
|
|
167 # number nucleotid and length
|
|
168 sub number_nuc_length_seq {
|
|
169 my ($fastaFile) = @_;
|
|
170 my $ade = 0;
|
|
171 my $thy = 0;
|
|
172 my $gua = 0;
|
|
173 my $cyt = 0;
|
|
174 my $n = 0;
|
|
175 my $length = 0;
|
|
176
|
|
177 open (FASTA, "<", $fastaFile) or die "Could not open $!";
|
|
178 while (<FASTA>) {
|
|
179 chomp;
|
|
180 if ($_ !~ />/) {
|
|
181 my @seq = split //, $_;
|
|
182
|
|
183 for my $nuc (@seq) {
|
|
184 $length +=1 ;
|
|
185 if ($nuc =~ /a/i) {$ade+=1;}
|
|
186 elsif ($nuc =~ /t/i) {$thy+=1;}
|
|
187 elsif ($nuc =~ /g/i) {$gua+=1;}
|
|
188 elsif ($nuc =~ /c/i) {$cyt+=1;}
|
|
189 elsif ($nuc =~ /n/i) {$n+=1;}
|
|
190 }
|
|
191 }
|
|
192 }
|
|
193 close(FASTA) or die "Error close file :$!";
|
|
194 return ($ade, $thy, $gua, $cyt, $n, $length);
|
|
195
|
|
196 }
|
|
197
|
|
198 #------------------------------------------------------------------------------
|
|
199 # compute percentage of nucleotid
|
|
200 sub nucleotid_percent {
|
|
201 my($ade, $thy, $gua, $cyt, $n, $length) = @_;
|
|
202
|
|
203 my $adePercent = $ade / $length * 100;
|
|
204 my $thyPercent = $thy / $length * 100;
|
|
205 my $guaPercent = $gua / $length * 100;
|
|
206 my $cytPercent = $cyt / $length * 100;
|
|
207 my $nPercent = $n / $length * 100;
|
|
208
|
|
209 return ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent);
|
|
210
|
|
211 }
|
|
212
|
|
213 #------------------------------------------------------------------------------
|
|
214 # compute GC pourcent
|
|
215 sub gc_percent {
|
|
216 my ($seq) = @_;
|
|
217
|
|
218 my @charSeq = split(//, uc($seq));
|
|
219 my %hashFlank = ();
|
|
220
|
|
221 foreach my $v (@charSeq) {
|
|
222 $hashFlank{$v} += 1;
|
|
223 }
|
|
224
|
|
225 if (! $hashFlank{'G'}) { $hashFlank{'G'} = 0;}
|
|
226 if (! $hashFlank{'C'}) { $hashFlank{'C'} = 0;}
|
|
227
|
|
228 if(length($seq) == 0) {
|
|
229 return 0;
|
|
230 }
|
|
231 else {
|
|
232 return (($hashFlank{'G'} + $hashFlank{'C'}) / (length($seq))) * 100;
|
|
233 }
|
|
234
|
|
235 }
|
|
236 #------------------------------------------------------------------------------
|
|
237 # compute ATGC ratio
|
|
238 sub atgc_ratio {
|
|
239 my ($ade, $thy, $gua, $cyt) = @_;
|
|
240
|
|
241 return (($ade + $thy) / ($gua + $cyt));
|
|
242
|
|
243 }
|
|
244 #------------------------------------------------------------------------------
|
|
245 # variance
|
|
246 sub shift_data_variance {
|
|
247 my (@data) = @_;
|
|
248
|
|
249 if ($#data + 1 < 2) { return 0.0; }
|
|
250
|
|
251 my $K = $data[0];
|
|
252 my ($n, $Ex, $Ex2) = 0.0;
|
|
253
|
|
254 for my $x (@data) {
|
|
255 $n = $n + 1;
|
|
256 $Ex += $x - $K;
|
|
257 $Ex2 += ($x - $K) * ($x - $K);
|
|
258 }
|
|
259
|
|
260 my $variance = ($Ex2 - ($Ex * $Ex) / $n) / ($n); ## ($n - 1)
|
|
261
|
|
262 return $variance;
|
|
263
|
|
264 }
|
|
265 #------------------------------------------------------------------------------
|
|
266 # nucle score
|
|
267 #sub nucle_score {
|
|
268 # my ($variance, $gcPercent, $atgcRatio, $length) = @_;
|
|
269 #
|
|
270 # return (($variance * $gcPercent * $atgcRatio) / $length);
|
|
271 #}
|
|
272 sub nucle_score {
|
|
273 my ($variance, $gcPercent, $atgcRatio, $length) = @_;
|
|
274 return log2(($variance * $gcPercent * $atgcRatio) / sqrt($length));
|
|
275 }
|
|
276
|
|
277 #------------------------------------------------------------------------------
|
|
278 sub log2 {
|
|
279 my $n = shift;
|
|
280 return (log($n) / log(2));
|
|
281 }
|