|
0
|
1 #!/usr/bin/perl -w
|
|
|
2 # Author: Thomas Thiel
|
|
|
3 # Program name: misa.pl
|
|
|
4
|
|
|
5 ###_______________________________________________________________________________
|
|
|
6 ###
|
|
|
7 ###Program name:p3_ misa_parameter.pl
|
|
|
8 ###Author: Thomas Thiel
|
|
|
9 ###Release date: 14/12/01 (version 1.0)
|
|
|
10 ###
|
|
|
11 ###_______________________________________________________________________________
|
|
|
12 ###
|
|
|
13 ## _______________________________________________________________________________
|
|
|
14 ##
|
|
|
15 ## DESCRIPTION: Tool for the identification and localization of
|
|
|
16 ## (I) perfect microsatellites as well as
|
|
|
17 ## (II) compound microsatellites (two individual microsatellites,
|
|
|
18 ## disrupted by a certain number of bases)
|
|
|
19 ##
|
|
|
20 ## SYNTAX: misa.pl <FASTA file>
|
|
|
21 ##
|
|
|
22 ## <FASTAfile> Single file in FASTA format containing the sequence(s).
|
|
|
23 ##
|
|
|
24 ## In order to specify the search criteria, an additional file containing
|
|
|
25 ## the microsatellite search parameters is required named "misa.ini", which
|
|
|
26 ## has the following structure:
|
|
|
27 ## (a) Following a text string beginning with 'def', pairs of numbers are
|
|
|
28 ## expected, whereas the first number defines the unit size and the
|
|
|
29 ## second number the lower threshold of repeats for that specific unit.
|
|
|
30 ## (b) Following a text string beginning with 'int' a single number defines
|
|
|
31 ## the maximal number of bases between two adjacent microsatellites in
|
|
|
32 ## order to specify the compound microsatellite type.
|
|
|
33 ## Example:
|
|
|
34 ## definition(unit_size,min_repeats): 1-10 2-6 3-5 4-5 5-5 6-5
|
|
|
35 ## interruptions(max_difference_for_2_SSRs): 100
|
|
|
36 ##
|
|
|
37 ## EXAMPLE: misa.pl seqs.fasta
|
|
|
38 ## Modified by Leshi Chen for primer design
|
|
|
39 ## _______________________________________________________________________________
|
|
|
40 ##
|
|
|
41
|
|
|
42
|
|
|
43 #§§§§§ DECLARATION §§§§§#
|
|
|
44
|
|
|
45 # Check for arguments. If none display syntax #
|
|
|
46
|
|
|
47
|
|
|
48 if (@ARGV == 0)
|
|
|
49 {
|
|
|
50 open (IN,"<$0");
|
|
|
51 while (<IN>) {if (/^\#\# (.*)/) {$message .= "$1\n"}};
|
|
|
52 close (IN);
|
|
|
53 die $message;
|
|
|
54 };
|
|
|
55
|
|
|
56 # Check if help is required #
|
|
|
57
|
|
|
58 if ($ARGV[0] =~ /-help/i)
|
|
|
59 {
|
|
|
60 open (IN,"<$0");
|
|
|
61 while (<IN>) {if (/^\#\#\#(.*)/) {$message .= "$1\n"}};
|
|
|
62 close (IN);
|
|
|
63 die $message;
|
|
|
64 };
|
|
|
65
|
|
|
66 # Open FASTA file #
|
|
|
67
|
|
|
68 open (IN,"<$ARGV[0]") || die ("\nError: FASTA file doesn't exist !\n\n");
|
|
|
69 #open (OUT,">$ARGV[0].misa"); updated by Leshi chen for galaxy integration
|
|
|
70 open (OUT,">$ARGV[1]");
|
|
|
71 print OUT "ID\tSSR nr.\tSSR type\tSSR\tsize\tstart\tend\n";
|
|
|
72
|
|
|
73 # Reading arguments updated by Leshi chen to get local path otherwise will create error #
|
|
|
74 #use Cwd 'abs_path';
|
|
|
75 #use Cwd 'getcwd';
|
|
|
76 #print getcwd()&"misa.ini";
|
|
|
77 #print OUT abs_path($0);
|
|
|
78 #open (SPECS,"\/root\/galaxy_dist\/tools\/pfr_2010\/"."misa.ini") || die ("\nError: Specifications file doesn't exist ! \n\n misa.ini not found ! \n\n");
|
|
|
79 my $arg_def= $ARGV[2]||'';
|
|
|
80 my $arg_interuption= $ARGV[3]||'';
|
|
|
81 #my $tmb = '';
|
|
|
82 #my $_ = '';
|
|
|
83 my %typrep;
|
|
|
84 my $amb = 0;
|
|
|
85
|
|
|
86 %typrep = $arg_def =~/(\d+)-(\d+)/gi;
|
|
|
87 #print "1:" , $arg_def , "\n";
|
|
|
88 #print "hh: ", %typrep , "\n";
|
|
|
89 #print $arg_def , "\n";
|
|
|
90 #print $arg_interuption ,"\n";
|
|
|
91 #print $arg_def =~/(\d+)/gi , "\n";
|
|
|
92 #%typrep = $arg_def =~/(\d+)/gi;
|
|
|
93 print %typrep , "\n";
|
|
|
94 $amb = $arg_interuption;
|
|
|
95 print $amb , "\n";
|
|
|
96 #while (<SPECS>)#
|
|
|
97 # {#
|
|
|
98 # %typrep = $1 =~ /(\d+)/gi if (/^def\S*\s+(.*)/i);#
|
|
|
99 # if (/^int\S*\s+(\d+)/i) {$amb = $1}#
|
|
|
100 # };#
|
|
|
101 my @typ = sort { $a <=> $b } keys %typrep;
|
|
|
102 print @typ . "\n";
|
|
|
103 #die (%typrep , "--" , @typ , "--" , $amb);
|
|
|
104 #§§§§§ CORE §§§§§#
|
|
|
105
|
|
|
106 $/ = ">";
|
|
|
107 my $max_repeats = 1; #count repeats
|
|
|
108 my $min_repeats = 1000; #count repeats
|
|
|
109 my (%count_motif,%count_class); #count
|
|
|
110 my ($number_sequences,$size_sequences,%ssr_containing_seqs); #stores number and size of all sequences examined
|
|
|
111 my $ssr_in_compound = 0;
|
|
|
112 my ($id,$seq);
|
|
|
113 while (<IN>)
|
|
|
114 {
|
|
|
115 next unless (($id,$seq) = /(.*?)\n(.*)/s);
|
|
|
116 my ($nr,%start,@order,%end,%motif,%repeats); # store info of all SSRs from each sequence
|
|
|
117 $seq =~ s/[\d\s>]//g; #remove digits, spaces, line breaks,...
|
|
|
118 $id =~ s/^\s*//g; $id =~ s/\s*$//g;$id =~ s/\s/_/g; #replace whitespace with "_"
|
|
|
119 $number_sequences++;
|
|
|
120 $size_sequences += length $seq;
|
|
|
121 for ($i=0; $i < scalar(@typ); $i++) #check each motif class
|
|
|
122 {
|
|
|
123 my $motiflen = $typ[$i];
|
|
|
124 my $minreps = $typrep{$typ[$i]} - 1;
|
|
|
125 if ($min_repeats > $typrep{$typ[$i]}) {$min_repeats = $typrep{$typ[$i]}}; #count repeats
|
|
|
126 my $search = "(([acgt]{$motiflen})\\2{$minreps,})";
|
|
|
127 while ( $seq =~ /$search/ig ) #scan whole sequence for that class
|
|
|
128 {
|
|
|
129 my $motif = uc $2;
|
|
|
130 my $redundant; #reject false type motifs [e.g. (TT)6 or (ACAC)5]
|
|
|
131 for ($j = $motiflen - 1; $j > 0; $j--)
|
|
|
132 {
|
|
|
133 my $redmotif = "([ACGT]{$j})\\1{".($motiflen/$j-1)."}";
|
|
|
134 $redundant = 1 if ( $motif =~ /$redmotif/ )
|
|
|
135 };
|
|
|
136 next if $redundant;
|
|
|
137 $motif{++$nr} = $motif;
|
|
|
138 my $ssr = uc $1;
|
|
|
139 $repeats{$nr} = length($ssr) / $motiflen;
|
|
|
140 $end{$nr} = pos($seq);
|
|
|
141 $start{$nr} = $end{$nr} - length($ssr) + 1;
|
|
|
142 # count repeats
|
|
|
143 # count_motifs doesn't required as statistic has been removed - modified by leshi
|
|
|
144 #$count_motifs{$motif{$nr}}++; #counts occurrence of individual motifs
|
|
|
145 $motif{$nr}->{$repeats{$nr}}++; #counts occurrence of specific SSR in its appearing repeat
|
|
|
146 $count_class{$typ[$i]}++; #counts occurrence in each motif class
|
|
|
147 if ($max_repeats < $repeats{$nr}) {$max_repeats = $repeats{$nr}};
|
|
|
148 };
|
|
|
149 };
|
|
|
150 next if (!$nr); #no SSRs
|
|
|
151 $ssr_containing_seqs{$nr}++;
|
|
|
152 @order = sort { $start{$a} <=> $start{$b} } keys %start; #put SSRs in right order
|
|
|
153 $i = 0;
|
|
|
154 my $count_seq; #counts
|
|
|
155 my ($start,$end,$ssrseq,$ssrtype,$size);
|
|
|
156 while ($i < $nr)
|
|
|
157 {
|
|
|
158 my $space = $amb + 1;
|
|
|
159 if (!$order[$i+1]) #last or only SSR
|
|
|
160 {
|
|
|
161 $count_seq++;
|
|
|
162 my $motiflen = length ($motif{$order[$i]});
|
|
|
163 $ssrtype = "p".$motiflen;
|
|
|
164 $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}";
|
|
|
165 $start = $start{$order[$i]}; $end = $end{$order[$i++]};
|
|
|
166 next
|
|
|
167 };
|
|
|
168 if (($start{$order[$i+1]} - $end{$order[$i]}) > $space)
|
|
|
169 {
|
|
|
170 $count_seq++;
|
|
|
171 my $motiflen = length ($motif{$order[$i]});
|
|
|
172 $ssrtype = "p".$motiflen;
|
|
|
173 $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}";
|
|
|
174 $start = $start{$order[$i]}; $end = $end{$order[$i++]};
|
|
|
175 next
|
|
|
176 };
|
|
|
177 my ($interssr);
|
|
|
178 if (($start{$order[$i+1]} - $end{$order[$i]}) < 1)
|
|
|
179 {
|
|
|
180 $count_seq++; $ssr_in_compound++;
|
|
|
181 $ssrtype = 'c*';
|
|
|
182 $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}($motif{$order[$i+1]})$repeats{$order[$i+1]}*";
|
|
|
183 $start = $start{$order[$i]}; $end = $end{$order[$i+1]}
|
|
|
184 }
|
|
|
185 else
|
|
|
186 {
|
|
|
187 $count_seq++; $ssr_in_compound++;
|
|
|
188 $interssr = lc substr($seq,$end{$order[$i]},($start{$order[$i+1]} - $end{$order[$i]}) - 1);
|
|
|
189 $ssrtype = 'c';
|
|
|
190 $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}$interssr($motif{$order[$i+1]})$repeats{$order[$i+1]}";
|
|
|
191 $start = $start{$order[$i]}; $end = $end{$order[$i+1]};
|
|
|
192 #$space -= length $interssr
|
|
|
193 };
|
|
|
194 while ($order[++$i + 1] and (($start{$order[$i+1]} - $end{$order[$i]}) <= $space))
|
|
|
195 {
|
|
|
196 if (($start{$order[$i+1]} - $end{$order[$i]}) < 1)
|
|
|
197 {
|
|
|
198 $ssr_in_compound++;
|
|
|
199 $ssrseq .= "($motif{$order[$i+1]})$repeats{$order[$i+1]}*";
|
|
|
200 $ssrtype = 'c*';
|
|
|
201 $end = $end{$order[$i+1]}
|
|
|
202 }
|
|
|
203 else
|
|
|
204 {
|
|
|
205 $ssr_in_compound++;
|
|
|
206 $interssr = lc substr($seq,$end{$order[$i]},($start{$order[$i+1]} - $end{$order[$i]}) - 1);
|
|
|
207 $ssrseq .= "$interssr($motif{$order[$i+1]})$repeats{$order[$i+1]}";
|
|
|
208 $end = $end{$order[$i+1]};
|
|
|
209 #$space -= length $interssr
|
|
|
210 }
|
|
|
211 };
|
|
|
212 $i++;
|
|
|
213 }
|
|
|
214 continue
|
|
|
215 {
|
|
|
216 print OUT "$id\t$count_seq\t$ssrtype\t$ssrseq\t",($end - $start + 1),"\t$start\t$end\n"
|
|
|
217 };
|
|
|
218 };
|
|
|
219
|
|
|
220 close (OUT);
|
|
|
221 #open (OUT,">$ARGV[0].statistics"); updated by Leshi chen for galaxy integration
|
|
|
222 # the statistics part has been removed as we only need misa for primer
|
|
|
223
|