0
|
1 #!/usr/bin/perl -w
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 #use DateTime;
|
|
5 #use Date::Calc qw(:all);
|
|
6 use Bio::SeqIO;
|
|
7 #use String::Similarity;
|
|
8 #use StringOO::Similarity;
|
|
9 #use lib qw(./StringOO);
|
|
10 #use Similarity::similarity;
|
|
11
|
|
12 #use local::lib "$FindBin::Bin/.."; ### points to ~/mydir1 and local::lib finds lib
|
|
13 #use lib "$FindBin::StringOO/../lib";
|
|
14 #package Similarity.pm;
|
|
15 #use String::Approx 'adistr';
|
|
16 #package String::Similarity;
|
|
17 #package String::Similarity qw(similarity);
|
|
18 #use String::Similarity;
|
|
19
|
|
20 ##################################################################
|
|
21 ## Perl program to count variants from aligned multi-FASTA file
|
|
22 ##################################################################
|
|
23
|
|
24 my $file = $ARGV[0];
|
|
25 my %hSeq = ();
|
|
26 my @arrayID = ();
|
|
27 my $resultFile = $ARGV[1]; #"resultVariants.xls";
|
|
28 my $resultSimilarity = $ARGV[2]; #"resultSimilarity.xls";
|
|
29 my $chain = "";
|
|
30
|
|
31 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$file);
|
|
32
|
|
33 while (my $seq = $seqIO->next_seq()) {
|
|
34 my $seqID = $seq->id;
|
|
35 my $seqNuc = $seq->seq;
|
|
36 push @arrayID, $seqID;
|
|
37 $hSeq{$seqID} = $seqNuc;
|
|
38 #my @seqArray = split //, $seqNuc;
|
|
39 }
|
|
40
|
|
41 open(RES, ">", $resultFile) or die "error open file $!:";
|
|
42 open(SIM, ">", $resultSimilarity) or die "error open file $!:";
|
|
43
|
|
44 print RES "\t";
|
|
45 print SIM "\t";
|
|
46 foreach my $id (@arrayID) {
|
|
47 print RES "$id\t";
|
|
48 print SIM "$id\t";
|
|
49 }
|
|
50 print RES "\n";
|
|
51 print SIM "\n";
|
|
52
|
|
53 for(my $i = 0; $i <= $#arrayID; $i++){
|
|
54 print RES "$arrayID[$i]\t";
|
|
55 print SIM "$arrayID[$i]\t";
|
|
56 for(my $j = 0; $j <= $#arrayID; $j++){ #for(my $j = $i+1; $j <= $#arrayID; $j++){
|
|
57 my $snpVariants = compare($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]});
|
|
58 #my $similarity = adistr($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]});
|
|
59 my $similarity = ident($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]}); # replace similarity by ident
|
|
60 my $roundSim = sprintf('%.3f', $similarity);
|
|
61 print RES "$snpVariants\t";
|
|
62 print SIM "$roundSim\t";
|
|
63 #print RES ("SNPs between $arrayID[$i] and $arrayID[$j] = $snpVariants\t");
|
|
64 }
|
|
65 print RES "\n";
|
|
66 print SIM "\n";
|
|
67 }
|
|
68
|
|
69 close(RES) or die "error when closing file $!:";
|
|
70 close(SIM) or die "error when closing file $!:";
|
|
71
|
|
72 #foreach my $id (@arrayID) {
|
|
73 # my @seqArray = split //, $hSeq{$id};
|
|
74 #
|
|
75 #}
|
|
76
|
|
77
|
|
78 #------------------------------------------------------------------------------
|
|
79 # compare two aligned sequences
|
|
80 sub compare {
|
|
81 my ($seq1,$seq2) = @_;
|
|
82 #$seq1 = uc $seq1;
|
|
83 #$seq2 = uc $seq2;
|
|
84 my @seqArray1 = split //, $seq1;
|
|
85 my @seqArray2 = split //, $seq2;
|
|
86 my $variants = 0;
|
|
87
|
|
88 for(my $i = 0; $i <= $#seqArray1; $i++){
|
|
89 if( $seqArray1[$i] ne $seqArray2[$i] ){
|
|
90 $variants++;
|
|
91 }
|
|
92 }
|
|
93
|
|
94 return $variants;
|
|
95 }
|
|
96
|
|
97 # calculate percent identity between two aligned sequences
|
|
98 sub ident {
|
|
99 my ($seq1,$seq2) = @_;
|
|
100 #$seq1 = uc $seq1;
|
|
101 #$seq2 = uc $seq2;
|
|
102 my @seqArray1 = split //, $seq1;
|
|
103 my @seqArray2 = split //, $seq2;
|
|
104 my $sim = 0;
|
|
105
|
|
106 for(my $i = 0; $i <= $#seqArray1; $i++){
|
|
107 if( $seqArray1[$i] eq $seqArray2[$i] ){
|
|
108 $sim++;
|
|
109 }
|
|
110 }
|
|
111
|
|
112 return ($sim / ($#seqArray1 + 1));
|
|
113 }
|
|
114
|
|
115 #------------------------------------------------------------------------------
|