Mercurial > repos > dcouvin > nuclescore
changeset 0:82dce1eb9074 draft default tip
Uploaded
author | dcouvin |
---|---|
date | Fri, 03 Sep 2021 22:36:56 +0000 |
parents | |
children | |
files | nucleScore.pl nuclescore.sample nuclescore.sh nuclescore.xml output.xls |
diffstat | 5 files changed, 584 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nucleScore.pl Fri Sep 03 22:36:56 2021 +0000 @@ -0,0 +1,241 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Bio::SeqIO; +#use Shannon::Entropy qw/entropy/; +use File::Basename; +#use Bio::Species; + +#use FindBin; +#use lib "$FindBin::RealBin/../perl5"; + +#my $input = $ARGV[0]; +#chercher comment faire une liste perl pour input +#my @liste = split(/,/, $input); +#my $recap_total_seq = $ARGV[1]; + +#my ($input, $recap_total_seq) = @ARGV; + +#my $start = time(); + +#my $file = ""; #= $ARGV[0]; +#my $recap_total_seq = "nucleScore_result.xls"; + +#open (RECAP,'>', $recap_total_seq) or die "could not open $!"; + print "File\tA percent\tT percent\tC percent\tG percent\tGC percent\tAT/GC ratio\tNucleScore\tATG\tTGA\tTAG\tTAA\tGenome size (bp)\n"; +#close(RECAP); + + +#FASTA files +#if(@ARGV){ + + #for (my $i = 0; $i <= $#ARGV; $i++) { + #if ($ARGV[$i]=~/-output/i or $ARGV[$i]=~/-o/i) { + # $recap_total_seq = $ARGV[$i+1]; + #} + #} + + + #open (RECAP,'>>', $recap_total_seq) or die "could not open $!"; + +#refaire le for pour la liste input +for my $arg (@ARGV){ +#for my $arg (@liste){ +# if ($arg =~ m/.fasta/ or $arg =~ m/.fna/ or $arg =~ m/.fa/){ + + #print "Traitement du fichier de sequence: $arg\n"; + #print "Traitement du fichier de sequence: $arg\n"; + #my $file = $arg; + my $file = $arg; + + + my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$file); + my $globalSeq = ""; + while (my $seq = $seqIO->next_seq()) { + my $seqID = $seq->id; + my $seqNuc = $seq->seq; + $globalSeq .= $seqNuc; + #push @arrayID, $seqID; + #$hSeq{$seqID} = $seqNuc; + #my @seqArray = split //, $seqNuc; + } + + my $gcpercent = gc_percent($globalSeq); + my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($file); + my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length); + + my $atgcRatio = atgc_ratio($ade, $thy, $gua, $cyt); + + my @percentList = ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent); + + my $variance = shift_data_variance(@percentList); + my $nucleScore = nucle_score($variance, $gcpercent, $atgcRatio, $length); + #my $entropy = entropy($globalSeq); + + #print "The sequence length for $file is: $length\n"; + #print "A percent: $aPercent\n"; + #print "T percent: $tPercent\n"; + #print "G percent: $gPercent\n"; + #print "C percent: $cPercent\n"; + #print "N percent: $nPercent\n"; + + #print "GC percent: $gcpercent\n"; + + #print "AT/GC ratio: $atgcRatio\n"; + + #print "NucleScore: $nucleScore\n"; + + #print "Shannon Entropy: $entropy\n\n"; + + #print "3 digits:\n"; + my @trinucs=($globalSeq=~/(?=(.{3}))/g); + my %tri_count=(); + $tri_count{$_}++ for @trinucs; + #print $_,":",$tri_count{$_},"\n" for sort keys(%tri_count); + #print "\n2 digits:\n"; + my @trinucs2=($globalSeq=~/(?=(.{2}))/g); + my %tri_count2=(); + $tri_count2{$_}++ for @trinucs2; + #print $_,":",$tri_count2{$_},"\n" for sort keys(%tri_count2); + + my $atg = $tri_count{'ATG'}; + my $tga = $tri_count{'TGA'}; + my $tag = $tri_count{'TAG'}; + my $taa = $tri_count{'TAA'}; + + #print "--------------------------------------\n\n"; + + my $label = basename($file); + + + #Summary file + #print RECAP "$file\t$aPercent\t$tPercent\t$cPercent\t$gPercent\t$gcpercent\t$atgcRatio\t$nucleScore\t$entropy\t$aaa\t$aat\n"; + print "$label\t$aPercent\t$tPercent\t$cPercent\t$gPercent\t$gcpercent\t$atgcRatio\t$nucleScore\t$atg\t$tga\t$tag\t$taa\t$length\n"; + #} + } + #close (RECAP) or die "close file error : $!"; +#} + +#my $end = time(); + +#my $total = $end - $start; + +#print "***** Total time (in seconds) is: $total *****\n"; + +#------------------------------------------------------------------------------ +# number nucleotid and length +sub number_nuc_length_seq { + my ($fastaFile) = @_; + my $ade = 0; + my $thy = 0; + my $gua = 0; + my $cyt = 0; + my $n = 0; + my $length = 0; + + open (FASTA, "<", $fastaFile) or die "Could not open $!"; + while (<FASTA>) { + chomp; + if ($_ !~ />/) { + my @seq = split //, $_; + + for my $nuc (@seq) { + $length +=1 ; + if ($nuc =~ /a/i) {$ade+=1;} + elsif ($nuc =~ /t/i) {$thy+=1;} + elsif ($nuc =~ /g/i) {$gua+=1;} + elsif ($nuc =~ /c/i) {$cyt+=1;} + elsif ($nuc =~ /n/i) {$n+=1;} + } + } + } + close(FASTA) or die "Error close file :$!"; + return ($ade, $thy, $gua, $cyt, $n, $length); + +} + +#------------------------------------------------------------------------------ +# compute percentage of nucleotid +sub nucleotid_percent { + my($ade, $thy, $gua, $cyt, $n, $length) = @_; + + my $adePercent = $ade / $length * 100; + my $thyPercent = $thy / $length * 100; + my $guaPercent = $gua / $length * 100; + my $cytPercent = $cyt / $length * 100; + my $nPercent = $n / $length * 100; + + return ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent); + +} + +#------------------------------------------------------------------------------ +# compute GC pourcent +sub gc_percent { + my ($seq) = @_; + + my @charSeq = split(//, uc($seq)); + my %hashFlank = (); + + foreach my $v (@charSeq) { + $hashFlank{$v} += 1; + } + + if (! $hashFlank{'G'}) { $hashFlank{'G'} = 0;} + if (! $hashFlank{'C'}) { $hashFlank{'C'} = 0;} + + if(length($seq) == 0) { + return 0; + } + else { + return (($hashFlank{'G'} + $hashFlank{'C'}) / (length($seq))) * 100; + } + +} +#------------------------------------------------------------------------------ +# compute ATGC ratio +sub atgc_ratio { + my ($ade, $thy, $gua, $cyt) = @_; + + return (($ade + $thy) / ($gua + $cyt)); + +} +#------------------------------------------------------------------------------ +# variance +sub shift_data_variance { + my (@data) = @_; + + if ($#data + 1 < 2) { return 0.0; } + + my $K = $data[0]; + my ($n, $Ex, $Ex2) = 0.0; + + for my $x (@data) { + $n = $n + 1; + $Ex += $x - $K; + $Ex2 += ($x - $K) * ($x - $K); + } + + my $variance = ($Ex2 - ($Ex * $Ex) / $n) / ($n); ## ($n - 1) + + return $variance; + +} +#------------------------------------------------------------------------------ +# nucle score +#sub nucle_score { +# my ($variance, $gcPercent, $atgcRatio, $length) = @_; +# +# return (($variance * $gcPercent * $atgcRatio) / $length); +#} +sub nucle_score { + my ($variance, $gcPercent, $atgcRatio, $length) = @_; + return log2(($variance * $gcPercent * $atgcRatio ** (3)) / sqrt($length)); +} + +#------------------------------------------------------------------------------ +sub log2 { + my $n = shift; + return (log($n) / log(2)); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nuclescore.sample Fri Sep 03 22:36:56 2021 +0000 @@ -0,0 +1,281 @@ +<tool id="nuclescoretool" name="nuclescore" version="0.1.0"> + <description>nuclescore</description> + + +<command detect_errors="aggressive"><![CDATA[ + +#import re + ## Creates symlinks for each input file based on the Galaxy 'element_identifier' + ## Used so that a human-readable name appears in the output table (instead of 'dataset_xyz.dat') + #set $named_input_files = '' + #for $input_file in $input_files + ## Add single quotes around each input file identifier + #set $_input_file = "'{}'".format($input_file.element_identifier) + ln -s '${input_file}' ${_input_file} && + #set $named_input_files = $named_input_files + ' ' + $_input_file + #end for + + perl '$__tool_directory__/nucleScore.pl' $_input_file $output + + + + +]]></command> + <!-- ./nuclescore.sh ${named_input_files} > "$output" --> + +<inputs> + <param format="fasta" name="input_files" type="data" label="Genome fasta file : " multiple="true" display="checkboxes"/> +</inputs> + + <outputs> + <data format="tabular" name="output" /> + </outputs> + +<help> +No documentation + </help> + +</tool> + + +------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +#!/usr/bin/perl + +use strict; +use warnings; +use Bio::SeqIO; +use Shannon::Entropy qw/entropy/; +use File::Basename; +#use Bio::Species; + +#use FindBin; +#use lib "$FindBin::RealBin/../perl5"; + +my $input = $ARGV[0]; +#chercher comment faire une liste perl pour input +my @liste = split(/,/, $input); +my $recap_total_seq = $ARGV[1]; + +#my ($input, $recap_total_seq) = @ARGV; + +my $start = time(); + +#my $file = ""; #= $ARGV[0]; +#my $recap_total_seq = "nucleScore_result.xls"; + +open (RECAP,'>', $recap_total_seq) or die "could not open $!"; + print RECAP "File\tA percent\tT percent\tC percent\tG percent\tGC percent\tAT/GC ratio\tNucleScore\tShannon Entropy\tAAA\tAAT\n"; +close(RECAP); + + +#FASTA files +#if(@ARGV){ + + #for (my $i = 0; $i <= $#ARGV; $i++) { + #if ($ARGV[$i]=~/-output/i or $ARGV[$i]=~/-o/i) { + # $recap_total_seq = $ARGV[$i+1]; + #} + #} + + + open (RECAP,'>>', $recap_total_seq) or die "could not open $!"; + +#refaire le for pour la liste input +#for my $arg (@ARGV){ +for my $arg (@liste){ +# if ($arg =~ m/.fasta/ or $arg =~ m/.fna/ or $arg =~ m/.fa/){ + + #print "Traitement du fichier de sequence: $arg\n"; + print "Traitement du fichier de sequence: $arg\n"; + #my $file = $arg; + my $file = $arg; + + + my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$file); + my $globalSeq = ""; + while (my $seq = $seqIO->next_seq()) { + my $seqID = $seq->id; + my $seqNuc = $seq->seq; + $globalSeq .= $seqNuc; + #push @arrayID, $seqID; + #$hSeq{$seqID} = $seqNuc; + #my @seqArray = split //, $seqNuc; + } + + my $gcpercent = gc_percent($globalSeq); + my ($ade, $thy, $gua, $cyt, $n, $length) = number_nuc_length_seq($file); + my ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent) = nucleotid_percent($ade, $thy, $gua, $cyt, $n, $length); + + my $atgcRatio = atgc_ratio($ade, $thy, $gua, $cyt); + + my @percentList = ($aPercent, $tPercent, $gPercent, $cPercent, $nPercent); + + my $variance = shift_data_variance(@percentList); + my $nucleScore = nucle_score($variance, $gcpercent, $atgcRatio, $length); + my $entropy = entropy($globalSeq); + + print "The sequence length for $file is: $length\n"; + print "A percent: $aPercent\n"; + print "T percent: $tPercent\n"; + print "G percent: $gPercent\n"; + print "C percent: $cPercent\n"; + print "N percent: $nPercent\n"; + + print "GC percent: $gcpercent\n"; + + print "AT/GC ratio: $atgcRatio\n"; + + print "NucleScore: $nucleScore\n"; + + print "Shannon Entropy: $entropy\n\n"; + + print "3 digits:\n"; + my @trinucs=($globalSeq=~/(?=(.{3}))/g); + my %tri_count=(); + $tri_count{$_}++ for @trinucs; + print $_,":",$tri_count{$_},"\n" for sort keys(%tri_count); + print "\n2 digits:\n"; + my @trinucs2=($globalSeq=~/(?=(.{2}))/g); + my %tri_count2=(); + $tri_count2{$_}++ for @trinucs2; + print $_,":",$tri_count2{$_},"\n" for sort keys(%tri_count2); + + my $aaa = $tri_count{'AAA'}; + my $aat = $tri_count{'AAT'}; + + print "--------------------------------------\n\n"; + + my $label = basename($file); + + + #Summary file + #print RECAP "$file\t$aPercent\t$tPercent\t$cPercent\t$gPercent\t$gcpercent\t$atgcRatio\t$nucleScore\t$entropy\t$aaa\t$aat\n"; + print RECAP "$label\t$aPercent\t$tPercent\t$cPercent\t$gPercent\t$gcpercent\t$atgcRatio\t$nucleScore\t$entropy\t$aaa\t$aat\n"; + #} + } + close (RECAP) or die "close file error : $!"; +#} + +my $end = time(); + +my $total = $end - $start; + +print "***** Total time (in seconds) is: $total *****\n"; + +#------------------------------------------------------------------------------ +# number nucleotid and length +sub number_nuc_length_seq { + my ($fastaFile) = @_; + my $ade = 0; + my $thy = 0; + my $gua = 0; + my $cyt = 0; + my $n = 0; + my $length = 0; + + open (FASTA, "<", $fastaFile) or die "Could not open $!"; + while (<FASTA>) { + chomp; + if ($_ !~ />/) { + my @seq = split //, $_; + + for my $nuc (@seq) { + $length +=1 ; + if ($nuc =~ /a/i) {$ade+=1;} + elsif ($nuc =~ /t/i) {$thy+=1;} + elsif ($nuc =~ /g/i) {$gua+=1;} + elsif ($nuc =~ /c/i) {$cyt+=1;} + elsif ($nuc =~ /n/i) {$n+=1;} + } + } + } + close(FASTA) or die "Error close file :$!"; + return ($ade, $thy, $gua, $cyt, $n, $length); + +} + +#------------------------------------------------------------------------------ +# compute percentage of nucleotid +sub nucleotid_percent { + my($ade, $thy, $gua, $cyt, $n, $length) = @_; + + my $adePercent = $ade / $length * 100; + my $thyPercent = $thy / $length * 100; + my $guaPercent = $gua / $length * 100; + my $cytPercent = $cyt / $length * 100; + my $nPercent = $n / $length * 100; + + return ($adePercent, $thyPercent, $guaPercent, $cytPercent, $nPercent); + +} + +#------------------------------------------------------------------------------ +# compute GC pourcent +sub gc_percent { + my ($seq) = @_; + + my @charSeq = split(//, uc($seq)); + my %hashFlank = (); + + foreach my $v (@charSeq) { + $hashFlank{$v} += 1; + } + + if (! $hashFlank{'G'}) { $hashFlank{'G'} = 0;} + if (! $hashFlank{'C'}) { $hashFlank{'C'} = 0;} + + if(length($seq) == 0) { + return 0; + } + else { + return (($hashFlank{'G'} + $hashFlank{'C'}) / (length($seq))) * 100; + } + +} +#------------------------------------------------------------------------------ +# compute ATGC ratio +sub atgc_ratio { + my ($ade, $thy, $gua, $cyt) = @_; + + return (($ade + $thy) / ($gua + $cyt)); + +} +#------------------------------------------------------------------------------ +# variance +sub shift_data_variance { + my (@data) = @_; + + if ($#data + 1 < 2) { return 0.0; } + + my $K = $data[0]; + my ($n, $Ex, $Ex2) = 0.0; + + for my $x (@data) { + $n = $n + 1; + $Ex += $x - $K; + $Ex2 += ($x - $K) * ($x - $K); + } + + my $variance = ($Ex2 - ($Ex * $Ex) / $n) / ($n); ## ($n - 1) + + return $variance; + +} +#------------------------------------------------------------------------------ +# nucle score +#sub nucle_score { +# my ($variance, $gcPercent, $atgcRatio, $length) = @_; +# +# return (($variance * $gcPercent * $atgcRatio) / $length); +#} +sub nucle_score { + my ($variance, $gcPercent, $atgcRatio, $length) = @_; + return log2(($variance * $gcPercent * $atgcRatio) / sqrt($length)); +} + +#------------------------------------------------------------------------------ +sub log2 { + my $n = shift; + return (log($n) / log(2)); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nuclescore.sh Fri Sep 03 22:36:56 2021 +0000 @@ -0,0 +1,13 @@ +#!/bin/bash + +input=$1 +output=$2 + + +directory=`dirname $0` + +perl $directory/nucleScore.pl $input_files $output + +#perl $directory/nucleScore.pl `echo $input | tr ',' ' '` $output + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nuclescore.xml Fri Sep 03 22:36:56 2021 +0000 @@ -0,0 +1,47 @@ +<tool id="nuclescore" name="NucleScore tool" version="0.1.0"> + <description>allows to get information for classification of genome assemblies</description> + +<requirements> + <requirement type="package" version="1.7.2">perl-bioperl</requirement> +</requirements> + +<command detect_errors="aggressive"><![CDATA[ + +#import re + ## Creates symlinks for each input file based on the Galaxy 'element_identifier' + ## Used so that a human-readable name appears in the output table (instead of 'dataset_xyz.dat') + #set $named_input_files = '' + #for $input_file in $input_files + ## Add single quotes around each input file identifier + #set $_input_file = "'{}'".format($input_file.element_identifier) + ln -s '${input_file}' ${_input_file} && + #set $named_input_files = $named_input_files + ' ' + $_input_file + #end for + + + perl '$__tool_directory__/nucleScore.pl' $named_input_files > "$output" + + + +]]></command> + <!-- perl '$__tool_directory__/nucleScore.pl' $_input_file > "$output" --> + <!-- ./nuclescore.sh ${named_input_files} > "$output" --> + +<inputs> + <param format="fasta" name="input_files" type="data" label="Genome fasta file : " multiple="true" display="checkboxes"/> +</inputs> + + <outputs> + <data format="tabular" name="output" /> + </outputs> + +<help><![CDATA[ +nucleScore.pl is a Perl script allowing to get information for classification of genome assemblies basing on nucleotides data (GC percent, AT/GC ratio, ...) + +This script belongs to the getSequenceInfo supplementary tools + +- GitHub: https://github.com/karubiotools/getSequenceInfo/tree/master/supplementary_tools +]]> +</help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/output.xls Fri Sep 03 22:36:56 2021 +0000 @@ -0,0 +1,2 @@ +File A percent T percent C percent G percent GC percent AT/GC ratio NucleScore Shannon Entropy AAA AAT +exemple_sequence_AFRI2a.fasta 17.2865811379154 17.2254003843317 32.9175761234243 32.5704423543286 65.4880184777529 0.526996881635245 1.30199532029328 1.92961990163878 22633 24339