Mercurial > repos > dcouvin > countdifferences

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/countDifferences.pl	Mon Sep 20 21:51:24 2021 +0000
@@ -0,0 +1,115 @@
+#!/usr/bin/perl -w
+use strict;
+use warnings;
+#use DateTime;
+#use Date::Calc qw(:all);
+use Bio::SeqIO;
+#use String::Similarity;
+#use StringOO::Similarity;
+#use lib qw(./StringOO);
+#use Similarity::similarity;
+
+#use local::lib "$FindBin::Bin/..";  ### points to ~/mydir1 and local::lib finds lib
+#use lib "$FindBin::StringOO/../lib";
+#package Similarity.pm;
+#use String::Approx 'adistr';
+#package String::Similarity;
+#package String::Similarity qw(similarity);
+#use String::Similarity;
+
+##################################################################
+## Perl program to count variants from aligned multi-FASTA file
+##################################################################
+
+my $file = $ARGV[0];
+my %hSeq = ();
+my @arrayID = ();
+my $resultFile = $ARGV[1]; #"resultVariants.xls";
+my $resultSimilarity = $ARGV[2]; #"resultSimilarity.xls";
+my $chain = "";
+
+	my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$file);
+
+	while (my $seq = $seqIO->next_seq()) {
+		my $seqID = $seq->id;
+		my $seqNuc = $seq->seq;
+		push @arrayID, $seqID;
+		$hSeq{$seqID} = $seqNuc;
+		#my @seqArray = split //, $seqNuc;
+	}
+
+	open(RES, ">", $resultFile) or die "error open file $!:";
+	open(SIM, ">", $resultSimilarity) or die "error open file $!:";
+
+	print RES "\t";
+	print SIM "\t";
+	foreach my $id (@arrayID) {
+		print RES "$id\t";
+		print SIM "$id\t";
+	}
+	print RES "\n";
+	print SIM "\n";
+
+	for(my $i = 0; $i <= $#arrayID; $i++){
+		print RES "$arrayID[$i]\t";
+		print SIM "$arrayID[$i]\t";
+		for(my $j = 0; $j <= $#arrayID; $j++){	#for(my $j = $i+1; $j <= $#arrayID; $j++){
+			my $snpVariants = compare($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]});
+			#my $similarity = adistr($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]});
+                        my $similarity = ident($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]}); # replace similarity by ident
+			my $roundSim = sprintf('%.3f', $similarity);
+			print RES "$snpVariants\t";
+			print SIM "$roundSim\t";
+			#print RES ("SNPs between  $arrayID[$i] and $arrayID[$j] = $snpVariants\t");
+		}
+		print RES "\n";
+		print SIM "\n";
+	}
+
+close(RES) or die "error when closing file $!:";
+close(SIM) or die "error when closing file $!:";
+
+	#foreach my $id (@arrayID) {
+	#	my @seqArray = split //, $hSeq{$id};
+	#
+	#}
+
+
+#------------------------------------------------------------------------------
+# compare two aligned sequences
+sub compare {
+	my ($seq1,$seq2) = @_;
+	#$seq1 = uc $seq1;
+	#$seq2 = uc $seq2;
+	my @seqArray1 = split //, $seq1;
+	my @seqArray2 = split //, $seq2;
+	my $variants = 0;
+
+	for(my $i = 0; $i <= $#seqArray1; $i++){
+		if( $seqArray1[$i] ne $seqArray2[$i] ){
+			$variants++;
+		}
+	}
+
+	return $variants;
+}
+
+# calculate percent identity between two aligned sequences
+sub ident {
+        my ($seq1,$seq2) = @_;
+        #$seq1 = uc $seq1;
+        #$seq2 = uc $seq2;
+        my @seqArray1 = split //, $seq1;
+        my @seqArray2 = split //, $seq2;
+        my $sim = 0;
+
+        for(my $i = 0; $i <= $#seqArray1; $i++){
+                if( $seqArray1[$i] eq $seqArray2[$i] ){
+                        $sim++;
+                }
+        }
+
+        return ($sim / ($#seqArray1 + 1));
+}
+
+#------------------------------------------------------------------------------
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/countDifferences.xml	Mon Sep 20 21:51:24 2021 +0000
@@ -0,0 +1,50 @@
+<tool id="countdifferences" name="countDifferences tool" version="1.0.0">
+  <description>allows to compare sequences from a multi-Fasta alignment file by calculating differences (in bp) and percentage identity</description>
+
+<requirements>
+  <requirement type="package" version="1.7.2">perl-bioperl</requirement>
+  <!--<requirement type="package" version="3.27">perl-string-approx</requirement>-->
+</requirements>
+
+<command detect_errors="aggressive"><![CDATA[
+
+#import re
+        ## Creates symlinks for each input file based on the Galaxy 'element_identifier'
+        ## Used so that a human-readable name appears in the output table (instead of 'dataset_xyz.dat')
+            ## Add single quotes around each input file identifier
+            #set $_input_file = "'{}'".format($input.element_identifier)
+            ln -s '${input}' ${_input_file} &&
+
+
+        perl '$__tool_directory__/countDifferences.pl' $_input_file $output1 $output2
+
+
+
+]]></command>
+ <!-- perl '$__tool_directory__/nucleScore.pl' $_input_file > "$output"  -->
+ <!-- ./nuclescore.sh ${named_input_files} > "$output" -->
+
+<inputs>
+  <param format="fasta" name="input" type="data" label="Multi-FASTA file: "/>
+  <!--<param name="char" type="text" area="false" value="N" label="Character to be removed from Multi-FASTA file:" help="Users can directly write the character to be removed without quotes" />-->
+</inputs>
+
+ <outputs>
+    <data format="tabular" name="output1" label="bp_differences.tsv "/>
+    <data format="tabular" name="output2" label="percentage_identity.tsv "/>
+ </outputs>
+
+<help><![CDATA[
+countDifferences.pl is a Perl script allowing to calculate differences (changes in bp) and sequences percentage identity from an aligned multi-Fasta file.
+
+Two resulting tables are produced:
+(i) a table containing a matrix with bp differences for each sequence;
+(ii) a table containing a matrix with percentage identity for each sequence
+
+This script belongs to the getSequenceInfo supplementary tools.
+
+- GitHub: https://github.com/karubiotools/getSequenceInfo/tree/master/supplementary_tools
+]]>
+</help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/input.fasta	Mon Sep 20 21:51:24 2021 +0000
@@ -0,0 +1,8 @@
+>sequence1
+atgcatgcatgcacgatcgatcgat--gca-tgcac
+>sequence2
+aaacatgcatgcacgatcgatcgatgtatg---cac
+>sequence3
+atgcatgcatgcactatcgatcgat-gcata--aac
+>sequence4
+atgcatgcacgcatgatcgatcga-tgca--tgcac