Mercurial > repos > dcouvin > countdifferences
comparison countDifferences.pl @ 0:a9fb5e4abb2d draft default tip
Uploaded
author | dcouvin |
---|---|
date | Mon, 20 Sep 2021 21:51:24 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a9fb5e4abb2d |
---|---|
1 #!/usr/bin/perl -w | |
2 use strict; | |
3 use warnings; | |
4 #use DateTime; | |
5 #use Date::Calc qw(:all); | |
6 use Bio::SeqIO; | |
7 #use String::Similarity; | |
8 #use StringOO::Similarity; | |
9 #use lib qw(./StringOO); | |
10 #use Similarity::similarity; | |
11 | |
12 #use local::lib "$FindBin::Bin/.."; ### points to ~/mydir1 and local::lib finds lib | |
13 #use lib "$FindBin::StringOO/../lib"; | |
14 #package Similarity.pm; | |
15 #use String::Approx 'adistr'; | |
16 #package String::Similarity; | |
17 #package String::Similarity qw(similarity); | |
18 #use String::Similarity; | |
19 | |
20 ################################################################## | |
21 ## Perl program to count variants from aligned multi-FASTA file | |
22 ################################################################## | |
23 | |
24 my $file = $ARGV[0]; | |
25 my %hSeq = (); | |
26 my @arrayID = (); | |
27 my $resultFile = $ARGV[1]; #"resultVariants.xls"; | |
28 my $resultSimilarity = $ARGV[2]; #"resultSimilarity.xls"; | |
29 my $chain = ""; | |
30 | |
31 my $seqIO = Bio::SeqIO->new(-format=>'Fasta', -file=>$file); | |
32 | |
33 while (my $seq = $seqIO->next_seq()) { | |
34 my $seqID = $seq->id; | |
35 my $seqNuc = $seq->seq; | |
36 push @arrayID, $seqID; | |
37 $hSeq{$seqID} = $seqNuc; | |
38 #my @seqArray = split //, $seqNuc; | |
39 } | |
40 | |
41 open(RES, ">", $resultFile) or die "error open file $!:"; | |
42 open(SIM, ">", $resultSimilarity) or die "error open file $!:"; | |
43 | |
44 print RES "\t"; | |
45 print SIM "\t"; | |
46 foreach my $id (@arrayID) { | |
47 print RES "$id\t"; | |
48 print SIM "$id\t"; | |
49 } | |
50 print RES "\n"; | |
51 print SIM "\n"; | |
52 | |
53 for(my $i = 0; $i <= $#arrayID; $i++){ | |
54 print RES "$arrayID[$i]\t"; | |
55 print SIM "$arrayID[$i]\t"; | |
56 for(my $j = 0; $j <= $#arrayID; $j++){ #for(my $j = $i+1; $j <= $#arrayID; $j++){ | |
57 my $snpVariants = compare($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]}); | |
58 #my $similarity = adistr($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]}); | |
59 my $similarity = ident($hSeq{$arrayID[$i]}, $hSeq{$arrayID[$j]}); # replace similarity by ident | |
60 my $roundSim = sprintf('%.3f', $similarity); | |
61 print RES "$snpVariants\t"; | |
62 print SIM "$roundSim\t"; | |
63 #print RES ("SNPs between $arrayID[$i] and $arrayID[$j] = $snpVariants\t"); | |
64 } | |
65 print RES "\n"; | |
66 print SIM "\n"; | |
67 } | |
68 | |
69 close(RES) or die "error when closing file $!:"; | |
70 close(SIM) or die "error when closing file $!:"; | |
71 | |
72 #foreach my $id (@arrayID) { | |
73 # my @seqArray = split //, $hSeq{$id}; | |
74 # | |
75 #} | |
76 | |
77 | |
78 #------------------------------------------------------------------------------ | |
79 # compare two aligned sequences | |
80 sub compare { | |
81 my ($seq1,$seq2) = @_; | |
82 #$seq1 = uc $seq1; | |
83 #$seq2 = uc $seq2; | |
84 my @seqArray1 = split //, $seq1; | |
85 my @seqArray2 = split //, $seq2; | |
86 my $variants = 0; | |
87 | |
88 for(my $i = 0; $i <= $#seqArray1; $i++){ | |
89 if( $seqArray1[$i] ne $seqArray2[$i] ){ | |
90 $variants++; | |
91 } | |
92 } | |
93 | |
94 return $variants; | |
95 } | |
96 | |
97 # calculate percent identity between two aligned sequences | |
98 sub ident { | |
99 my ($seq1,$seq2) = @_; | |
100 #$seq1 = uc $seq1; | |
101 #$seq2 = uc $seq2; | |
102 my @seqArray1 = split //, $seq1; | |
103 my @seqArray2 = split //, $seq2; | |
104 my $sim = 0; | |
105 | |
106 for(my $i = 0; $i <= $#seqArray1; $i++){ | |
107 if( $seqArray1[$i] eq $seqArray2[$i] ){ | |
108 $sim++; | |
109 } | |
110 } | |
111 | |
112 return ($sim / ($#seqArray1 + 1)); | |
113 } | |
114 | |
115 #------------------------------------------------------------------------------ |