annotate countDifferences.pl @ 0:a9fb5e4abb2d draft default tip

Uploaded
author dcouvin
date Mon, 20 Sep 2021 21:51:24 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
1 #!/usr/bin/perl -w
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
2 use strict;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
3 use warnings;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
4 #use DateTime;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
5 #use Date::Calc qw(:all);
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
6 use Bio::SeqIO;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
7 #use String::Similarity;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
8 #use StringOO::Similarity;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
9 #use lib qw(./StringOO);
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
10 #use Similarity::similarity;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
11
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
12 #use local::lib "$FindBin::Bin/.."; ### points to ~/mydir1 and local::lib finds lib
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
13 #use lib "$FindBin::StringOO/../lib";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
14 #package Similarity.pm;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
15 #use String::Approx 'adistr';
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
16 #package String::Similarity;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
17 #package String::Similarity qw(similarity);
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
18 #use String::Similarity;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
19
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
20 ##################################################################
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
21 ## Perl program to count variants from aligned multi-FASTA file
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
22 ##################################################################
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
23
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
24 my $file = $ARGV[0];
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
25 my %hSeq = ();
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
26 my @arrayID = ();
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
27 my $resultFile = $ARGV[1]; #"resultVariants.xls";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
28 my $resultSimilarity = $ARGV[2]; #"resultSimilarity.xls";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
29 my $chain = "";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
30
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
31 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$file);
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
32
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
33 while (my $seq = $seqIO->next_seq()) {
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
34 my $seqID = $seq->id;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
35 my $seqNuc = $seq->seq;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
36 push @arrayID, $seqID;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
37 $hSeq{$seqID} = $seqNuc;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
38 #my @seqArray = split //, $seqNuc;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
39 }
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
40
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
41 open(RES, ">", $resultFile) or die "error open file $!:";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
42 open(SIM, ">", $resultSimilarity) or die "error open file $!:";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
43
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
44 print RES "\t";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
45 print SIM "\t";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
46 foreach my $id (@arrayID) {
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
47 print RES "$id\t";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
48 print SIM "$id\t";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
49 }
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
50 print RES "\n";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
51 print SIM "\n";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
52
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
53 for(my $i = 0; $i <= $#arrayID; $i++){
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
54 print RES "$arrayID[$i]\t";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
55 print SIM "$arrayID[$i]\t";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
56 for(my $j = 0; $j <= $#arrayID; $j++){ #for(my $j = $i+1; $j <= $#arrayID; $j++){
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
57 my $snpVariants = compare($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]});
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
58 #my $similarity = adistr($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]});
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
59 my $similarity = ident($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]}); # replace similarity by ident
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
60 my $roundSim = sprintf('%.3f', $similarity);
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
61 print RES "$snpVariants\t";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
62 print SIM "$roundSim\t";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
63 #print RES ("SNPs between $arrayID[$i] and $arrayID[$j] = $snpVariants\t");
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
64 }
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
65 print RES "\n";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
66 print SIM "\n";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
67 }
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
68
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
69 close(RES) or die "error when closing file $!:";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
70 close(SIM) or die "error when closing file $!:";
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
71
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
72 #foreach my $id (@arrayID) {
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
73 # my @seqArray = split //, $hSeq{$id};
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
74 #
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
75 #}
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
76
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
77
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
78 #------------------------------------------------------------------------------
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
79 # compare two aligned sequences
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
80 sub compare {
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
81 my ($seq1,$seq2) = @_;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
82 #$seq1 = uc $seq1;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
83 #$seq2 = uc $seq2;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
84 my @seqArray1 = split //, $seq1;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
85 my @seqArray2 = split //, $seq2;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
86 my $variants = 0;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
87
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
88 for(my $i = 0; $i <= $#seqArray1; $i++){
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
89 if( $seqArray1[$i] ne $seqArray2[$i] ){
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
90 $variants++;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
91 }
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
92 }
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
93
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
94 return $variants;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
95 }
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
96
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
97 # calculate percent identity between two aligned sequences
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
98 sub ident {
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
99 my ($seq1,$seq2) = @_;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
100 #$seq1 = uc $seq1;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
101 #$seq2 = uc $seq2;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
102 my @seqArray1 = split //, $seq1;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
103 my @seqArray2 = split //, $seq2;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
104 my $sim = 0;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
105
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
106 for(my $i = 0; $i <= $#seqArray1; $i++){
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
107 if( $seqArray1[$i] eq $seqArray2[$i] ){
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
108 $sim++;
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
109 }
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
110 }
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
111
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
112 return ($sim / ($#seqArray1 + 1));
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
113 }
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
114
a9fb5e4abb2d Uploaded
dcouvin
parents:
diff changeset
115 #------------------------------------------------------------------------------