comparison nuclescore.sample @ 0:82dce1eb9074 draft default tip

Uploaded
author dcouvin
date Fri, 03 Sep 2021 22:36:56 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:82dce1eb9074
1 <tool id="nuclescoretool" name="nuclescore" version="0.1.0">
2 <description>nuclescore</description>
3
4
5 <command detect_errors="aggressive"><![CDATA[
6
7 #import re
8 ## Creates symlinks for each input file based on the Galaxy 'element_identifier'
9 ## Used so that a human-readable name appears in the output table (instead of 'dataset_xyz.dat')
10 #set $named_input_files = ''
11 #for $input_file in $input_files
12 ## Add single quotes around each input file identifier
13 #set $_input_file = "'{}'".format($input_file.element_identifier)
14 ln -s '${input_file}' ${_input_file} &&
15 #set $named_input_files = $named_input_files + ' ' + $_input_file
16 #end for
17
18 perl '$__tool_directory__/nucleScore.pl' $_input_file $output
19
20
21
22
23 ]]></command>
24 <!-- ./nuclescore.sh ${named_input_files} > "$output" -->
25
26 <inputs>
27 <param format="fasta" name="input_files" type="data" label="Genome fasta file : " multiple="true" display="checkboxes"/>
28 </inputs>
29
30 <outputs>
31 <data format="tabular" name="output" />
32 </outputs>
33
34 <help>
35 No documentation
36 </help>
37
38 </tool>
39
40
41 -------------------------------------------------------------------------------------------------------------------------------------------------------------------
42
43 #!/usr/bin/perl
44
45 use strict;
46 use warnings;
47 use Bio::SeqIO;
48 use Shannon::Entropy qw/entropy/;
49 use File::Basename;
50 #use Bio::Species;
51
52 #use FindBin;
53 #use lib "$FindBin::RealBin/../perl5";
54
55 my $input = $ARGV[0];
56 #chercher comment faire une liste perl pour input
57 my @liste = split(/,/, $input);
58 my $recap_total_seq = $ARGV[1];
59
60 #my ($input, $recap_total_seq) = @ARGV;
61
62 my $start = time();
63
64 #my $file = ""; #= $ARGV[0];
65 #my $recap_total_seq = "nucleScore_result.xls";
66
67 open (RECAP,'>', $recap_total_seq) or die "could not open $!";
68 print RECAP "File\tA percent\tT percent\tC percent\tG percent\tGC percent\tAT/GC ratio\tNucleScore\tShannon Entropy\tAAA\tAAT\n";
69 close(RECAP);
70
71
72 #FASTA files
73 #if(@ARGV){
74
75 #for (my $i = 0; $i <= $#ARGV; $i++) {
76 #if ($ARGV[$i]=~/-output/i or $ARGV[$i]=~/-o/i) {
77 # $recap_total_seq = $ARGV[$i+1];
78 #}
79 #}
80
81
82 open (RECAP,'>>', $recap_total_seq) or die "could not open $!";
83
84 #refaire le for pour la liste input
85 #for my $arg (@ARGV){
86 for my $arg (@liste){
87 # if ($arg =~ m/.fasta/ or $arg =~ m/.fna/ or $arg =~ m/.fa/){
88
89 #print "Traitement du fichier de sequence: $arg\n";
90 print "Traitement du fichier de sequence: $arg\n";
91 #my $file = $arg;
92 my $file = $arg;
93
94
95 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$file);
96 my $globalSeq = "";
97 while (my $seq = $seqIO->next_seq()) {
98 my $seqID = $seq->id;
99 my $seqNuc = $seq->seq;
100 $globalSeq .= $seqNuc;
101 #push @arrayID, $seqID;
102 #$hSeq{$seqID} = $seqNuc;
103 #my @seqArray = split //, $seqNuc;
104 }
105
106 my $gcpercent = gc_percent($globalSeq);
107 my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($file);
108 my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length);
109
110 my $atgcRatio = atgc_ratio($ade, $thy, $gua, $cyt);
111
112 my @percentList = ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent);
113
114 my $variance = shift_data_variance(@percentList);
115 my $nucleScore = nucle_score($variance, $gcpercent, $atgcRatio, $length);
116 my $entropy = entropy($globalSeq);
117
118 print "The sequence length for $file is: $length\n";
119 print "A percent: $aPercent\n";
120 print "T percent: $tPercent\n";
121 print "G percent: $gPercent\n";
122 print "C percent: $cPercent\n";
123 print "N percent: $nPercent\n";
124
125 print "GC percent: $gcpercent\n";
126
127 print "AT/GC ratio: $atgcRatio\n";
128
129 print "NucleScore: $nucleScore\n";
130
131 print "Shannon Entropy: $entropy\n\n";
132
133 print "3 digits:\n";
134 my @trinucs=($globalSeq=~/(?=(.{3}))/g);
135 my %tri_count=();
136 $tri_count{$_}++ for @trinucs;
137 print $_,":",$tri_count{$_},"\n" for sort keys(%tri_count);
138 print "\n2 digits:\n";
139 my @trinucs2=($globalSeq=~/(?=(.{2}))/g);
140 my %tri_count2=();
141 $tri_count2{$_}++ for @trinucs2;
142 print $_,":",$tri_count2{$_},"\n" for sort keys(%tri_count2);
143
144 my $aaa = $tri_count{'AAA'};
145 my $aat = $tri_count{'AAT'};
146
147 print "--------------------------------------\n\n";
148
149 my $label = basename($file);
150
151
152 #Summary file
153 #print RECAP "$file\t$aPercent\t$tPercent\t$cPercent\t$gPercent\t$gcpercent\t$atgcRatio\t$nucleScore\t$entropy\t$aaa\t$aat\n";
154 print RECAP "$label\t$aPercent\t$tPercent\t$cPercent\t$gPercent\t$gcpercent\t$atgcRatio\t$nucleScore\t$entropy\t$aaa\t$aat\n";
155 #}
156 }
157 close (RECAP) or die "close file error : $!";
158 #}
159
160 my $end = time();
161
162 my $total = $end - $start;
163
164 print "***** Total time (in seconds) is: $total *****\n";
165
166 #------------------------------------------------------------------------------
167 # number nucleotid and length
168 sub number_nuc_length_seq {
169 my ($fastaFile) = @_;
170 my $ade = 0;
171 my $thy = 0;
172 my $gua = 0;
173 my $cyt = 0;
174 my $n = 0;
175 my $length = 0;
176
177 open (FASTA, "<", $fastaFile) or die "Could not open $!";
178 while (<FASTA>) {
179 chomp;
180 if ($_ !~ />/) {
181 my @seq = split //, $_;
182
183 for my $nuc (@seq) {
184 $length +=1 ;
185 if ($nuc =~ /a/i) {$ade+=1;}
186 elsif ($nuc =~ /t/i) {$thy+=1;}
187 elsif ($nuc =~ /g/i) {$gua+=1;}
188 elsif ($nuc =~ /c/i) {$cyt+=1;}
189 elsif ($nuc =~ /n/i) {$n+=1;}
190 }
191 }
192 }
193 close(FASTA) or die "Error close file :$!";
194 return ($ade, $thy, $gua, $cyt, $n, $length);
195
196 }
197
198 #------------------------------------------------------------------------------
199 # compute percentage of nucleotid
200 sub nucleotid_percent {
201 my($ade, $thy, $gua, $cyt, $n, $length) = @_;
202
203 my $adePercent = $ade / $length * 100;
204 my $thyPercent = $thy / $length * 100;
205 my $guaPercent = $gua / $length * 100;
206 my $cytPercent = $cyt / $length * 100;
207 my $nPercent = $n / $length * 100;
208
209 return ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent);
210
211 }
212
213 #------------------------------------------------------------------------------
214 # compute GC pourcent
215 sub gc_percent {
216 my ($seq) = @_;
217
218 my @charSeq = split(//, uc($seq));
219 my %hashFlank = ();
220
221 foreach my $v (@charSeq) {
222 $hashFlank{$v} += 1;
223 }
224
225 if (! $hashFlank{'G'}) { $hashFlank{'G'} = 0;}
226 if (! $hashFlank{'C'}) { $hashFlank{'C'} = 0;}
227
228 if(length($seq) == 0) {
229 return 0;
230 }
231 else {
232 return (($hashFlank{'G'} + $hashFlank{'C'}) / (length($seq))) * 100;
233 }
234
235 }
236 #------------------------------------------------------------------------------
237 # compute ATGC ratio
238 sub atgc_ratio {
239 my ($ade, $thy, $gua, $cyt) = @_;
240
241 return (($ade + $thy) / ($gua + $cyt));
242
243 }
244 #------------------------------------------------------------------------------
245 # variance
246 sub shift_data_variance {
247 my (@data) = @_;
248
249 if ($#data + 1 < 2) { return 0.0; }
250
251 my $K = $data[0];
252 my ($n, $Ex, $Ex2) = 0.0;
253
254 for my $x (@data) {
255 $n = $n + 1;
256 $Ex += $x - $K;
257 $Ex2 += ($x - $K) * ($x - $K);
258 }
259
260 my $variance = ($Ex2 - ($Ex * $Ex) / $n) / ($n); ## ($n - 1)
261
262 return $variance;
263
264 }
265 #------------------------------------------------------------------------------
266 # nucle score
267 #sub nucle_score {
268 # my ($variance, $gcPercent, $atgcRatio, $length) = @_;
269 #
270 # return (($variance * $gcPercent * $atgcRatio) / $length);
271 #}
272 sub nucle_score {
273 my ($variance, $gcPercent, $atgcRatio, $length) = @_;
274 return log2(($variance * $gcPercent * $atgcRatio) / sqrt($length));
275 }
276
277 #------------------------------------------------------------------------------
278 sub log2 {
279 my $n = shift;
280 return (log($n) / log(2));
281 }