comparison countDifferences.pl @ 0:a9fb5e4abb2d draft default tip

Uploaded
author dcouvin
date Mon, 20 Sep 2021 21:51:24 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a9fb5e4abb2d
1 #!/usr/bin/perl -w
2 use strict;
3 use warnings;
4 #use DateTime;
5 #use Date::Calc qw(:all);
6 use Bio::SeqIO;
7 #use String::Similarity;
8 #use StringOO::Similarity;
9 #use lib qw(./StringOO);
10 #use Similarity::similarity;
11
12 #use local::lib "$FindBin::Bin/.."; ### points to ~/mydir1 and local::lib finds lib
13 #use lib "$FindBin::StringOO/../lib";
14 #package Similarity.pm;
15 #use String::Approx 'adistr';
16 #package String::Similarity;
17 #package String::Similarity qw(similarity);
18 #use String::Similarity;
19
20 ##################################################################
21 ## Perl program to count variants from aligned multi-FASTA file
22 ##################################################################
23
24 my $file = $ARGV[0];
25 my %hSeq = ();
26 my @arrayID = ();
27 my $resultFile = $ARGV[1]; #"resultVariants.xls";
28 my $resultSimilarity = $ARGV[2]; #"resultSimilarity.xls";
29 my $chain = "";
30
31 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$file);
32
33 while (my $seq = $seqIO->next_seq()) {
34 my $seqID = $seq->id;
35 my $seqNuc = $seq->seq;
36 push @arrayID, $seqID;
37 $hSeq{$seqID} = $seqNuc;
38 #my @seqArray = split //, $seqNuc;
39 }
40
41 open(RES, ">", $resultFile) or die "error open file $!:";
42 open(SIM, ">", $resultSimilarity) or die "error open file $!:";
43
44 print RES "\t";
45 print SIM "\t";
46 foreach my $id (@arrayID) {
47 print RES "$id\t";
48 print SIM "$id\t";
49 }
50 print RES "\n";
51 print SIM "\n";
52
53 for(my $i = 0; $i <= $#arrayID; $i++){
54 print RES "$arrayID[$i]\t";
55 print SIM "$arrayID[$i]\t";
56 for(my $j = 0; $j <= $#arrayID; $j++){ #for(my $j = $i+1; $j <= $#arrayID; $j++){
57 my $snpVariants = compare($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]});
58 #my $similarity = adistr($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]});
59 my $similarity = ident($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]}); # replace similarity by ident
60 my $roundSim = sprintf('%.3f', $similarity);
61 print RES "$snpVariants\t";
62 print SIM "$roundSim\t";
63 #print RES ("SNPs between $arrayID[$i] and $arrayID[$j] = $snpVariants\t");
64 }
65 print RES "\n";
66 print SIM "\n";
67 }
68
69 close(RES) or die "error when closing file $!:";
70 close(SIM) or die "error when closing file $!:";
71
72 #foreach my $id (@arrayID) {
73 # my @seqArray = split //, $hSeq{$id};
74 #
75 #}
76
77
78 #------------------------------------------------------------------------------
79 # compare two aligned sequences
80 sub compare {
81 my ($seq1,$seq2) = @_;
82 #$seq1 = uc $seq1;
83 #$seq2 = uc $seq2;
84 my @seqArray1 = split //, $seq1;
85 my @seqArray2 = split //, $seq2;
86 my $variants = 0;
87
88 for(my $i = 0; $i <= $#seqArray1; $i++){
89 if( $seqArray1[$i] ne $seqArray2[$i] ){
90 $variants++;
91 }
92 }
93
94 return $variants;
95 }
96
97 # calculate percent identity between two aligned sequences
98 sub ident {
99 my ($seq1,$seq2) = @_;
100 #$seq1 = uc $seq1;
101 #$seq2 = uc $seq2;
102 my @seqArray1 = split //, $seq1;
103 my @seqArray2 = split //, $seq2;
104 my $sim = 0;
105
106 for(my $i = 0; $i <= $#seqArray1; $i++){
107 if( $seqArray1[$i] eq $seqArray2[$i] ){
108 $sim++;
109 }
110 }
111
112 return ($sim / ($#seqArray1 + 1));
113 }
114
115 #------------------------------------------------------------------------------