Mercurial > repos > dcouvin > countdifferences
changeset 0:a9fb5e4abb2d draft default tip
Uploaded
author | dcouvin |
---|---|
date | Mon, 20 Sep 2021 21:51:24 +0000 |
parents | |
children | |
files | countDifferences.pl countDifferences.xml input.fasta |
diffstat | 3 files changed, 173 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countDifferences.pl Mon Sep 20 21:51:24 2021 +0000 @@ -0,0 +1,115 @@ +#!/usr/bin/perl -w +use strict; +use warnings; +#use DateTime; +#use Date::Calc qw(:all); +use Bio::SeqIO; +#use String::Similarity; +#use StringOO::Similarity; +#use lib qw(./StringOO); +#use Similarity::similarity; + +#use local::lib "$FindBin::Bin/.."; ### points to ~/mydir1 and local::lib finds lib +#use lib "$FindBin::StringOO/../lib"; +#package Similarity.pm; +#use String::Approx 'adistr'; +#package String::Similarity; +#package String::Similarity qw(similarity); +#use String::Similarity; + +################################################################## +## Perl program to count variants from aligned multi-FASTA file +################################################################## + +my $file = $ARGV[0]; +my %hSeq = (); +my @arrayID = (); +my $resultFile = $ARGV[1]; #"resultVariants.xls"; +my $resultSimilarity = $ARGV[2]; #"resultSimilarity.xls"; +my $chain = ""; + + my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$file); + + while (my $seq = $seqIO->next_seq()) { + my $seqID = $seq->id; + my $seqNuc = $seq->seq; + push @arrayID, $seqID; + $hSeq{$seqID} = $seqNuc; + #my @seqArray = split //, $seqNuc; + } + + open(RES, ">", $resultFile) or die "error open file $!:"; + open(SIM, ">", $resultSimilarity) or die "error open file $!:"; + + print RES "\t"; + print SIM "\t"; + foreach my $id (@arrayID) { + print RES "$id\t"; + print SIM "$id\t"; + } + print RES "\n"; + print SIM "\n"; + + for(my $i = 0; $i <= $#arrayID; $i++){ + print RES "$arrayID[$i]\t"; + print SIM "$arrayID[$i]\t"; + for(my $j = 0; $j <= $#arrayID; $j++){ #for(my $j = $i+1; $j <= $#arrayID; $j++){ + my $snpVariants = compare($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]}); + #my $similarity = adistr($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]}); + my $similarity = ident($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]}); # replace similarity by ident + my $roundSim = sprintf('%.3f', $similarity); + print RES "$snpVariants\t"; + print SIM "$roundSim\t"; + #print RES ("SNPs between $arrayID[$i] and $arrayID[$j] = $snpVariants\t"); + } + print RES "\n"; + print SIM "\n"; + } + +close(RES) or die "error when closing file $!:"; +close(SIM) or die "error when closing file $!:"; + + #foreach my $id (@arrayID) { + # my @seqArray = split //, $hSeq{$id}; + # + #} + + +#------------------------------------------------------------------------------ +# compare two aligned sequences +sub compare { + my ($seq1,$seq2) = @_; + #$seq1 = uc $seq1; + #$seq2 = uc $seq2; + my @seqArray1 = split //, $seq1; + my @seqArray2 = split //, $seq2; + my $variants = 0; + + for(my $i = 0; $i <= $#seqArray1; $i++){ + if( $seqArray1[$i] ne $seqArray2[$i] ){ + $variants++; + } + } + + return $variants; +} + +# calculate percent identity between two aligned sequences +sub ident { + my ($seq1,$seq2) = @_; + #$seq1 = uc $seq1; + #$seq2 = uc $seq2; + my @seqArray1 = split //, $seq1; + my @seqArray2 = split //, $seq2; + my $sim = 0; + + for(my $i = 0; $i <= $#seqArray1; $i++){ + if( $seqArray1[$i] eq $seqArray2[$i] ){ + $sim++; + } + } + + return ($sim / ($#seqArray1 + 1)); +} + +#------------------------------------------------------------------------------
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countDifferences.xml Mon Sep 20 21:51:24 2021 +0000 @@ -0,0 +1,50 @@ +<tool id="countdifferences" name="countDifferences tool" version="1.0.0"> + <description>allows to compare sequences from a multi-Fasta alignment file by calculating differences (in bp) and percentage identity</description> + +<requirements> + <requirement type="package" version="1.7.2">perl-bioperl</requirement> + <!--<requirement type="package" version="3.27">perl-string-approx</requirement>--> +</requirements> + +<command detect_errors="aggressive"><![CDATA[ + +#import re + ## Creates symlinks for each input file based on the Galaxy 'element_identifier' + ## Used so that a human-readable name appears in the output table (instead of 'dataset_xyz.dat') + ## Add single quotes around each input file identifier + #set $_input_file = "'{}'".format($input.element_identifier) + ln -s '${input}' ${_input_file} && + + + perl '$__tool_directory__/countDifferences.pl' $_input_file $output1 $output2 + + + +]]></command> + <!-- perl '$__tool_directory__/nucleScore.pl' $_input_file > "$output" --> + <!-- ./nuclescore.sh ${named_input_files} > "$output" --> + +<inputs> + <param format="fasta" name="input" type="data" label="Multi-FASTA file: "/> + <!--<param name="char" type="text" area="false" value="N" label="Character to be removed from Multi-FASTA file:" help="Users can directly write the character to be removed without quotes" />--> +</inputs> + + <outputs> + <data format="tabular" name="output1" label="bp_differences.tsv "/> + <data format="tabular" name="output2" label="percentage_identity.tsv "/> + </outputs> + +<help><![CDATA[ +countDifferences.pl is a Perl script allowing to calculate differences (changes in bp) and sequences percentage identity from an aligned multi-Fasta file. + +Two resulting tables are produced: +(i) a table containing a matrix with bp differences for each sequence; +(ii) a table containing a matrix with percentage identity for each sequence + +This script belongs to the getSequenceInfo supplementary tools. + +- GitHub: https://github.com/karubiotools/getSequenceInfo/tree/master/supplementary_tools +]]> +</help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/input.fasta Mon Sep 20 21:51:24 2021 +0000 @@ -0,0 +1,8 @@ +>sequence1 +atgcatgcatgcacgatcgatcgat--gca-tgcac +>sequence2 +aaacatgcatgcacgatcgatcgatgtatg---cac +>sequence3 +atgcatgcatgcactatcgatcgat-gcata--aac +>sequence4 +atgcatgcacgcatgatcgatcga-tgca--tgcac