# HG changeset patch
# User john-mccallum
# Date 1316059077 14400
# Node ID 3006582bfc7618b2e60600ef14811c8b8838bde1
Uploaded V1.0 MISA tools and helper scripts
diff -r 000000000000 -r 3006582bfc76 MISA/clean_fasta_header.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MISA/clean_fasta_header.sh Wed Sep 14 23:57:57 2011 -0400
@@ -0,0 +1,5 @@
+#!/bin/sh
+## clean_fasta_header.sh
+##Remove descriptions from header
+
+sed 's/\(>\w*\)\s*.*/\1/'
\ No newline at end of file
diff -r 000000000000 -r 3006582bfc76 MISA/clean_fasta_header.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MISA/clean_fasta_header.xml Wed Sep 14 23:57:57 2011 -0400
@@ -0,0 +1,53 @@
+
+
diff -r 000000000000 -r 3006582bfc76 MISA/design_MISA.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MISA/design_MISA.sh Wed Sep 14 23:57:57 2011 -0400
@@ -0,0 +1,17 @@
+#!/bin/sh
+#design primer sets from MISA output using Primer 3 and MISA helper scripts
+#USAGE sh design_MISA.sh
+
+#get directory
+SCRIPT=`readlink -f $0`
+SCRIPTPATH=`dirname $SCRIPT`
+
+perl $SCRIPTPATH/p3_in.pl $1 $2 temp.p3in
+
+cat temp.p3in | primer3_core > temp.p3out
+
+
+perl $SCRIPTPATH/p3_out.pl temp.p3out $1 $3
+
+rm -f temp.*
+
diff -r 000000000000 -r 3006582bfc76 MISA/design_MISA.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MISA/design_MISA.xml Wed Sep 14 23:57:57 2011 -0400
@@ -0,0 +1,27 @@
+
+ Design primer sets using MISA output
+ design_MISA.sh $inputMisaFile $inputFastaFile $outputfile
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Design SSR primer sets from MISA output using Primer3
+
+This tool uses helper scripts developed at IPK
+
+http://pgrc.ipk-gatersleben.de/misa/primer3.html
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 3006582bfc76 MISA/misa.ini
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MISA/misa.ini Wed Sep 14 23:57:57 2011 -0400
@@ -0,0 +1,2 @@
+definition(unit_size,min_repeats): 1-10 2-6 3-5 4-5 5-5 6-5
+interruptions(max_difference_between_2_SSRs): 100
diff -r 000000000000 -r 3006582bfc76 MISA/misa.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MISA/misa.pl Wed Sep 14 23:57:57 2011 -0400
@@ -0,0 +1,307 @@
+#!/usr/bin/perl -w
+# Author: Thomas Thiel
+# Program name: misa.pl
+
+###_______________________________________________________________________________
+###
+###Program name: misa.pl
+###Author: Thomas Thiel
+###Release date: 14/12/01 (version 1.0)
+###
+###_______________________________________________________________________________
+###
+## _______________________________________________________________________________
+##
+## DESCRIPTION: Tool for the identification and localization of
+## (I) perfect microsatellites as well as
+## (II) compound microsatellites (two individual microsatellites,
+## disrupted by a certain number of bases)
+##
+## SYNTAX: misa.pl FASTA_FILE MISA_OUTPUT STATISTICS_OUTPUT
+##
+## FASTA_FILE: Single file in FASTA format containing the sequence(s).
+##
+## In order to specify the search criteria, an additional file containing
+## the microsatellite search parameters is required named "misa.ini", which
+## has the following structure:
+## (a) Following a text string beginning with 'def', pairs of numbers are
+## expected, whereas the first number defines the unit size and the
+## second number the lower threshold of repeats for that specific unit.
+## (b) Following a text string beginning with 'int' a single number defines
+## the maximal number of bases between two adjacent microsatellites in
+## order to specify the compound microsatellite type.
+## Example:
+## definition(unit_size,min_repeats): 1-10 2-6 3-5 4-5 5-5 6-5
+## interruptions(max_difference_for_2_SSRs): 100
+##
+## EXAMPLE: misa.pl seqs.fasta seqs.misa seqs.statistics
+##
+## _______________________________________________________________________________
+##
+
+
+#§§§§§ DECLARATION §§§§§#
+
+# No arguments: print the syntax help, i.e. every line of this script that starts with two hashes and a blank #
+
+if (@ARGV == 0)
+ {
+ open (IN,"<$0");
+ while (<IN>) {if (/^\#\# (.*)/) {$message .= "$1\n"}};
+ close (IN);
+ die $message;
+ };
+
+# First argument is exactly -help or --help: print the banner, i.e. every line starting with three hashes #
+
+if ($ARGV[0] =~ /^--?help$/i) # anchored, so a FASTA name like "my-helper.fa" is not mistaken for a help request
+ {
+ open (IN,"<$0");
+ while (<IN>) {if (/^\#\#\#(.*)/) {$message .= "$1\n"}};
+ close (IN);
+ die $message;
+ };
+
+# Open FASTA file (argument 1) and the SSR results table (argument 2) #
+
+open (IN,"<$ARGV[0]") || die ("\nError: FASTA file doesn't exist !\n\n");
+#open (OUT,">$ARGV[0].misa"); updated by Leshi chen for galaxy integration
+open (OUT,">$ARGV[1]") || die ("\nError: Couldn't create results file !\n\n");
+print OUT "ID\tSSR nr.\tSSR type\tSSR\tsize\tstart\tend\n";
+
+# Read the search parameters: prefer the misa.ini stored next to this script and #
+# fall back to the formerly hard-coded Galaxy directory for existing installations #
+use FindBin;
+open (SPECS,"<$FindBin::Bin/misa.ini")
+ || open (SPECS,"\/root\/galaxy_dist\/tools\/pfr_2010\/"."misa.ini")
+ || die ("\nError: Specifications file doesn't exist ! \n\n misa.ini not found ! \n\n");
+my %typrep; # unit size => minimum number of repeats (from the "def..." line)
+my $amb = 0; # maximal number of bases between two SSRs of a compound SSR (from the "int..." line)
+while (<SPECS>)
+ {
+ %typrep = $1 =~ /(\d+)/gi if (/^def\S*\s+(.*)/i);
+ if (/^int\S*\s+(\d+)/i) {$amb = $1}
+ };
+my @typ = sort { $a <=> $b } keys %typrep;
+
+
+#§§§§§ CORE §§§§§#
+
+$/ = ">"; #read the FASTA file record-wise: one chunk up to the next ">" per read
+my $max_repeats = 1; #count repeats
+my $min_repeats = 1000; #count repeats
+my (%count_motif,%count_class); #count
+my ($number_sequences,$size_sequences,%ssr_containing_seqs); #stores number and size of all sequences examined
+my $ssr_in_compound = 0;
+my ($id,$seq);
+while (<IN>) #reading from the FASTA handle drives the loop and ends it at end of file
+ {
+ next unless (($id,$seq) = /(.*?)\n(.*)/s); #first line = header, remainder = sequence
+ my ($nr,%start,@order,%end,%motif,%repeats); # store info of all SSRs from each sequence
+ $seq =~ s/[\d\s>]//g; #remove digits, spaces, line breaks,...
+ $id =~ s/^\s*//g; $id =~ s/\s*$//g;$id =~ s/\s/_/g; #replace whitespace with "_"
+ $number_sequences++;
+ $size_sequences += length $seq;
+ for ($i=0; $i < scalar(@typ); $i++) #check each motif class
+ {
+ my $motiflen = $typ[$i];
+ my $minreps = $typrep{$typ[$i]} - 1;
+ if ($min_repeats > $typrep{$typ[$i]}) {$min_repeats = $typrep{$typ[$i]}}; #count repeats
+ my $search = "(([acgt]{$motiflen})\\2{$minreps,})";
+ while ( $seq =~ /$search/ig ) #scan whole sequence for that class
+ {
+ my $motif = uc $2;
+ my $redundant; #reject false type motifs [e.g. (TT)6 or (ACAC)5]
+ for ($j = $motiflen - 1; $j > 0; $j--)
+ {
+ my $redmotif = "([ACGT]{$j})\\1{".($motiflen/$j-1)."}";
+ $redundant = 1 if ( $motif =~ /$redmotif/ )
+ };
+ next if $redundant;
+ $motif{++$nr} = $motif;
+ my $ssr = uc $1; #still the whole SSR: captures of the redundancy test were local to its loop block
+ $repeats{$nr} = length($ssr) / $motiflen;
+ $end{$nr} = pos($seq);
+ $start{$nr} = $end{$nr} - length($ssr) + 1;
+ # count repeats
+ $count_motifs{$motif{$nr}}++; #counts occurrence of individual motifs
+ $motif{$nr}->{$repeats{$nr}}++; #counts occurrence of specific SSR in its appearing repeat
+ $count_class{$typ[$i]}++; #counts occurrence in each motif class
+ if ($max_repeats < $repeats{$nr}) {$max_repeats = $repeats{$nr}};
+ };
+ };
+ next if (!$nr); #no SSRs
+ $ssr_containing_seqs{$nr}++;
+ @order = sort { $start{$a} <=> $start{$b} } keys %start; #put SSRs in right order
+ $i = 0;
+ my $count_seq; #counts
+ my ($start,$end,$ssrseq,$ssrtype,$size);
+ while ($i < $nr)
+ {
+ my $space = $amb + 1;
+ if (!$order[$i+1]) #last or only SSR
+ {
+ $count_seq++;
+ my $motiflen = length ($motif{$order[$i]});
+ $ssrtype = "p".$motiflen;
+ $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}";
+ $start = $start{$order[$i]}; $end = $end{$order[$i++]};
+ next
+ };
+ if (($start{$order[$i+1]} - $end{$order[$i]}) > $space)
+ {
+ $count_seq++;
+ my $motiflen = length ($motif{$order[$i]});
+ $ssrtype = "p".$motiflen;
+ $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}";
+ $start = $start{$order[$i]}; $end = $end{$order[$i++]};
+ next
+ };
+ my ($interssr);
+ if (($start{$order[$i+1]} - $end{$order[$i]}) < 1)
+ {
+ $count_seq++; $ssr_in_compound++;
+ $ssrtype = 'c*';
+ $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}($motif{$order[$i+1]})$repeats{$order[$i+1]}*";
+ $start = $start{$order[$i]}; $end = $end{$order[$i+1]}
+ }
+ else
+ {
+ $count_seq++; $ssr_in_compound++;
+ $interssr = lc substr($seq,$end{$order[$i]},($start{$order[$i+1]} - $end{$order[$i]}) - 1);
+ $ssrtype = 'c';
+ $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}$interssr($motif{$order[$i+1]})$repeats{$order[$i+1]}";
+ $start = $start{$order[$i]}; $end = $end{$order[$i+1]};
+ #$space -= length $interssr
+ };
+ while ($order[++$i + 1] and (($start{$order[$i+1]} - $end{$order[$i]}) <= $space))
+ {
+ if (($start{$order[$i+1]} - $end{$order[$i]}) < 1)
+ {
+ $ssr_in_compound++;
+ $ssrseq .= "($motif{$order[$i+1]})$repeats{$order[$i+1]}*";
+ $ssrtype = 'c*';
+ $end = $end{$order[$i+1]}
+ }
+ else
+ {
+ $ssr_in_compound++;
+ $interssr = lc substr($seq,$end{$order[$i]},($start{$order[$i+1]} - $end{$order[$i]}) - 1);
+ $ssrseq .= "$interssr($motif{$order[$i+1]})$repeats{$order[$i+1]}";
+ $end = $end{$order[$i+1]};
+ #$space -= length $interssr
+ }
+ };
+ $i++;
+ }
+ continue
+ {
+ print OUT "$id\t$count_seq\t$ssrtype\t$ssrseq\t",($end - $start + 1),"\t$start\t$end\n"
+ };
+ };
+
+close (OUT);
+#open (OUT,">$ARGV[0].statistics"); updated by Leshi chen for galaxy integration
+open (OUT,">$ARGV[2]") || die ("\nError: Couldn't create statistics file !\n\n");
+
+#§§§§§ INFO §§§§§#
+
+#§§§ Specifications §§§#
+print OUT "Specifications\n==============\n\nSequence source file: \"$ARGV[0]\"\n\nDefinement of microsatellites (unit size / minimum number of repeats):\n";
+for ($i = 0; $i < scalar (@typ); $i++) {print OUT "($typ[$i]/$typrep{$typ[$i]}) "};print OUT "\n";
+if ($amb > 0) {print OUT "\nMaximal number of bases interrupting 2 SSRs in a compound microsatellite: $amb\n"};
+print OUT "\n\n\n";
+
+#§§§ OCCURRENCE OF SSRs §§§#
+
+#small calculations
+my @ssr_containing_seqs = values %ssr_containing_seqs;
+my $ssr_containing_seqs = 0;
+for ($i = 0; $i < scalar (@ssr_containing_seqs); $i++) {$ssr_containing_seqs += $ssr_containing_seqs[$i]};
+my @count_motifs = sort {length ($a) <=> length ($b) || $a cmp $b } keys %count_motifs;
+my @count_class = sort { $a <=> $b } keys %count_class;
+my $total = 0; #declared before it is summed up; stays 0 (not undefined) when no SSR was found
+for ($i = 0; $i < scalar (@count_class); $i++) {$total += $count_class{$count_class[$i]}};
+
+#§§§ Overview §§§#
+print OUT "RESULTS OF MICROSATELLITE SEARCH\n================================\n\n";
+print OUT "Total number of sequences examined: $number_sequences\n";
+print OUT "Total size of examined sequences (bp): $size_sequences\n";
+print OUT "Total number of identified SSRs: $total\n";
+print OUT "Number of SSR containing sequences: $ssr_containing_seqs\n";
+print OUT "Number of sequences containing more than 1 SSR: ",$ssr_containing_seqs - ($ssr_containing_seqs{1} || 0),"\n";
+print OUT "Number of SSRs present in compound formation: $ssr_in_compound\n\n\n";
+
+#§§§ Frequency of SSR classes §§§#
+print OUT "Distribution to different repeat type classes\n---------------------------------------------\n\n";
+print OUT "Unit size\tNumber of SSRs\n";
+for ($i = 0; $i < scalar (@count_class); $i++) {print OUT "$count_class[$i]\t$count_class{$count_class[$i]}\n"};
+print OUT "\n";
+
+#§§§ Frequency of SSRs: per motif and number of repeats §§§#
+print OUT "Frequency of identified SSR motifs\n----------------------------------\n\nRepeats";
+for ($i = $min_repeats;$i <= $max_repeats; $i++) {print OUT "\t$i"};
+print OUT "\ttotal\n";
+for ($i = 0; $i < scalar (@count_motifs); $i++)
+ {
+ my $typ = length ($count_motifs[$i]);
+ print OUT $count_motifs[$i];
+ for ($j = $min_repeats; $j <= $max_repeats; $j++)
+ {
+ if ($j < $typrep{$typ}) {print OUT "\t-";next};
+ if ($count_motifs[$i]->{$j}) {print OUT "\t$count_motifs[$i]->{$j}"} else {print OUT "\t"}; #hash named after the motif (symbolic reference), filled by the search loop
+ };
+ print OUT "\t$count_motifs{$count_motifs[$i]}\n";
+ };
+print OUT "\n";
+
+#§§§ Frequency of SSRs: summarizing redundant and reverse motifs §§§#
+# Eliminates %count_motifs !
+print OUT "Frequency of classified repeat types (considering sequence complementary)\n-------------------------------------------------------------------------\n\nRepeats";
+my (%red_rev,@red_rev); # groups
+for ($i = 0; $i < scalar (@count_motifs); $i++)
+ {
+ next if ($count_motifs{$count_motifs[$i]} eq 'X'); #motif was already assigned to a group
+ my (%group,@group,$red_rev); # store redundant/reverse motifs
+ my $reverse_motif = $actual_motif = $actual_motif_a = $count_motifs[$i];
+ $reverse_motif =~ tr/ACGT/TGCA/;
+ $reverse_motif = reverse $reverse_motif;
+ my $reverse_motif_a = $reverse_motif;
+ for ($j = 0; $j < length ($count_motifs[$i]); $j++) #try every cyclic rotation of the motif and of its reverse complement
+ {
+ if ($count_motifs{$actual_motif}) {$group{$actual_motif} = "1"; $count_motifs{$actual_motif}='X'};
+ if ($count_motifs{$reverse_motif}) {$group{$reverse_motif} = "1"; $count_motifs{$reverse_motif}='X'};
+ $actual_motif =~ s/(.)(.*)/$2$1/;
+ $reverse_motif =~ s/(.)(.*)/$2$1/;
+ $actual_motif_a = $actual_motif if ($actual_motif lt $actual_motif_a);
+ $reverse_motif_a = $reverse_motif if ($reverse_motif lt $reverse_motif_a)
+ };
+ if ($actual_motif_a lt $reverse_motif_a) {$red_rev = "$actual_motif_a/$reverse_motif_a"}
+ else {$red_rev = "$reverse_motif_a/$actual_motif_a"}; # group name
+ $red_rev{$red_rev}++;
+ @group = keys %group;
+ for ($j = 0; $j < scalar (@group); $j++)
+ {
+ for ($k = $min_repeats; $k <= $max_repeats; $k++)
+ {
+ if ($group[$j]->{$k}) {$red_rev->{"total"} += $group[$j]->{$k};$red_rev->{$k} += $group[$j]->{$k}}
+ }
+ }
+ };
+for ($i = $min_repeats; $i <= $max_repeats; $i++) {print OUT "\t$i"};
+print OUT "\ttotal\n";
+@red_rev = sort {length ($a) <=> length ($b) || $a cmp $b } keys %red_rev;
+for ($i = 0; $i < scalar (@red_rev); $i++)
+ {
+ my $typ = (length ($red_rev[$i])-1)/2; #group name is "motif/motif", so this is the unit size
+ print OUT $red_rev[$i];
+ for ($j = $min_repeats; $j <= $max_repeats; $j++)
+ {
+ if ($j < $typrep{$typ}) {print OUT "\t-";next};
+ if ($red_rev[$i]->{$j}) {print OUT "\t",$red_rev[$i]->{$j}}
+ else {print OUT "\t"}
+ };
+ print OUT "\t",$red_rev[$i]->{"total"},"\n";
+ };
+#add by Leshi to close the Out
+close (OUT);
diff -r 000000000000 -r 3006582bfc76 MISA/p3_in.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MISA/p3_in.pl Wed Sep 14 23:57:57 2011 -0400
@@ -0,0 +1,34 @@
+#!/usr/bin/perl -w
+# Author: Thomas Thiel
+# Program name: primer3_in.pl
+# Description: creates a PRIMER3 input file based on SSR search results
+# Usage: p3_in.pl MISA_RESULTS FASTA_FILE PRIMER3_INPUT
+
+open (IN,"<$ARGV[0]") || die ("\nError: Couldn't open misa.pl results file (*.misa) !\n\n");
+open (SRC,"<$ARGV[1]") || die ("\nError: Couldn't open source file containing original FASTA sequences !\n\n");
+open (OUT,">$ARGV[2]") || die ("\nError: Couldn't create Primer3 input file !\n\n");
+
+# Slurp the complete MISA results table; it is searched once per FASTA record.
+undef $/;
+$in = <IN>;
+close (IN);
+
+# Read the FASTA source record by record (records are separated by ">").
+$/= ">";
+
+while (<SRC>)
+ {
+ next unless (my ($id,$seq) = /(.*?)\n(.*)/s);
+ $seq =~ s/[\d\s>]//g;#remove digits, spaces, line breaks,...
+ # Normalise the header exactly like misa.pl does, so headers with descriptions still match.
+ $id =~ s/^\s*//g; $id =~ s/\s*$//g; $id =~ s/\s/_/g;
+ # \Q..\E: IDs often contain regex metacharacters ("|", "."); ^ stops an ID matching the tail of a longer one.
+ while ($in =~ /^\Q$id\E\t(\d+)\t\S+\t\S+\t(\d+)\t(\d+)/mg)
+ {
+ my ($ssr_nr,$size,$start) = ($1,$2,$3);
+ print OUT "PRIMER_SEQUENCE_ID=$id"."_$ssr_nr\nSEQUENCE=$seq\n";
+ print OUT "PRIMER_PRODUCT_SIZE_RANGE=100-280\n";
+ print OUT "TARGET=",$start-3,",",$size+6,"\n";
+ print OUT "PRIMER_MAX_END_STABILITY=250\n=\n"
+ };
+ };
diff -r 000000000000 -r 3006582bfc76 MISA/p3_misa_parameter.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MISA/p3_misa_parameter.pl Wed Sep 14 23:57:57 2011 -0400
@@ -0,0 +1,223 @@
+#!/usr/bin/perl -w
+# Author: Thomas Thiel
+# Program name: misa.pl
+
+###_______________________________________________________________________________
+###
+###Program name: p3_misa_parameter.pl
+###Author: Thomas Thiel
+###Release date: 14/12/01 (version 1.0)
+###
+###_______________________________________________________________________________
+###
+## _______________________________________________________________________________
+##
+## DESCRIPTION: Tool for the identification and localization of
+## (I) perfect microsatellites as well as
+## (II) compound microsatellites (two individual microsatellites,
+## disrupted by a certain number of bases)
+##
+## SYNTAX: p3_misa_parameter.pl FASTA_FILE MISA_OUTPUT DEFINITIONS INTERRUPTIONS
+##
+## FASTA_FILE: Single file in FASTA format containing the sequence(s).
+## MISA_OUTPUT: Tab-separated table of the identified microsatellites (written).
+##
+## Unlike misa.pl, this variant does not read a "misa.ini" file; the
+## microsatellite search parameters are passed as arguments instead:
+## (a) DEFINITIONS is one quoted string of unit_size-min_repeats pairs,
+## whereas the first number defines the unit size and the
+## second number the lower threshold of repeats for that specific unit.
+## (b) INTERRUPTIONS is a single number that defines
+## the maximal number of bases between two adjacent microsatellites in
+## order to specify the compound microsatellite type.
+## Example:
+## DEFINITIONS: "1-10 2-6 3-5 4-5 5-5 6-5"
+## INTERRUPTIONS: 100
+##
+## EXAMPLE: p3_misa_parameter.pl seqs.fasta seqs.misa "1-10 2-6 3-5 4-5 5-5 6-5" 100
+## Modified by Leshi Chen for primer design
+## _______________________________________________________________________________
+##
+
+
+#§§§§§ DECLARATION §§§§§#
+
+# No arguments: print the syntax help, i.e. every line of this script that starts with two hashes and a blank #
+
+
+if (@ARGV == 0)
+ {
+ open (IN,"<$0");
+ while (<IN>) {if (/^\#\# (.*)/) {$message .= "$1\n"}};
+ close (IN);
+ die $message;
+ };
+
+# First argument is exactly -help or --help: print the banner, i.e. every line starting with three hashes #
+
+if ($ARGV[0] =~ /^--?help$/i) # anchored, so a FASTA name like "my-helper.fa" is not mistaken for a help request
+ {
+ open (IN,"<$0");
+ while (<IN>) {if (/^\#\#\#(.*)/) {$message .= "$1\n"}};
+ close (IN);
+ die $message;
+ };
+
+# Open FASTA file (argument 1) and the SSR results table (argument 2) #
+
+open (IN,"<$ARGV[0]") || die ("\nError: FASTA file doesn't exist !\n\n");
+#open (OUT,">$ARGV[0].misa"); updated by Leshi chen for galaxy integration
+open (OUT,">$ARGV[1]") || die ("\nError: Couldn't create results file !\n\n");
+print OUT "ID\tSSR nr.\tSSR type\tSSR\tsize\tstart\tend\n";
+
+# Reading arguments updated by Leshi chen to get local path otherwise will create error #
+#use Cwd 'abs_path';
+#use Cwd 'getcwd';
+#print getcwd()&"misa.ini";
+#print OUT abs_path($0);
+#open (SPECS,"\/root\/galaxy_dist\/tools\/pfr_2010\/"."misa.ini") || die ("\nError: Specifications file doesn't exist ! \n\n misa.ini not found ! \n\n");
+my $arg_def= $ARGV[2]||''; # argument 3: unit_size-min_repeats pairs, e.g. "1-10 2-6 3-5"
+my $arg_interuption= $ARGV[3]||''; # argument 4: maximal number of bases between two SSRs of a compound SSR
+#my $tmb = '';
+#my $_ = '';
+my %typrep; # unit size => minimum number of repeats
+my $amb = 0;
+
+%typrep = $arg_def =~/(\d+)-(\d+)/gi;
+#print "1:" , $arg_def , "\n";
+#print "hh: ", %typrep , "\n";
+#print $arg_def , "\n";
+#print $arg_interuption ,"\n";
+#print $arg_def =~/(\d+)/gi , "\n";
+#%typrep = $arg_def =~/(\d+)/gi;
+print %typrep , "\n";
+$amb = $arg_interuption if ($arg_interuption =~ /^\d+$/); # keep the default 0 for a missing or non-numeric value
+print $amb , "\n";
+#while ()#
+ # {#
+ # %typrep = $1 =~ /(\d+)/gi if (/^def\S*\s+(.*)/i);#
+ # if (/^int\S*\s+(\d+)/i) {$amb = $1}#
+ # };#
+my @typ = sort { $a <=> $b } keys %typrep;
+print @typ . "\n"; # prints the number of unit sizes (array in scalar context)
+#die (%typrep , "--" , @typ , "--" , $amb);
+#§§§§§ CORE §§§§§#
+
+$/ = ">"; #read the FASTA file record-wise: one chunk up to the next ">" per read
+my $max_repeats = 1; #count repeats
+my $min_repeats = 1000; #count repeats
+my (%count_motif,%count_class); #count
+my ($number_sequences,$size_sequences,%ssr_containing_seqs); #stores number and size of all sequences examined
+my $ssr_in_compound = 0;
+my ($id,$seq);
+while (<IN>) #reading from the FASTA handle drives the loop and ends it at end of file
+ {
+ next unless (($id,$seq) = /(.*?)\n(.*)/s); #first line = header, remainder = sequence
+ my ($nr,%start,@order,%end,%motif,%repeats); # store info of all SSRs from each sequence
+ $seq =~ s/[\d\s>]//g; #remove digits, spaces, line breaks,...
+ $id =~ s/^\s*//g; $id =~ s/\s*$//g;$id =~ s/\s/_/g; #replace whitespace with "_"
+ $number_sequences++;
+ $size_sequences += length $seq;
+ for ($i=0; $i < scalar(@typ); $i++) #check each motif class
+ {
+ my $motiflen = $typ[$i];
+ my $minreps = $typrep{$typ[$i]} - 1;
+ if ($min_repeats > $typrep{$typ[$i]}) {$min_repeats = $typrep{$typ[$i]}}; #count repeats
+ my $search = "(([acgt]{$motiflen})\\2{$minreps,})";
+ while ( $seq =~ /$search/ig ) #scan whole sequence for that class
+ {
+ my $motif = uc $2;
+ my $redundant; #reject false type motifs [e.g. (TT)6 or (ACAC)5]
+ for ($j = $motiflen - 1; $j > 0; $j--)
+ {
+ my $redmotif = "([ACGT]{$j})\\1{".($motiflen/$j-1)."}";
+ $redundant = 1 if ( $motif =~ /$redmotif/ )
+ };
+ next if $redundant;
+ $motif{++$nr} = $motif;
+ my $ssr = uc $1; #still the whole SSR: captures of the redundancy test were local to its loop block
+ $repeats{$nr} = length($ssr) / $motiflen;
+ $end{$nr} = pos($seq);
+ $start{$nr} = $end{$nr} - length($ssr) + 1;
+ # count repeats
+ # count_motifs doesn't required as statistic has been removed - modified by leshi
+ #$count_motifs{$motif{$nr}}++; #counts occurrence of individual motifs
+ $motif{$nr}->{$repeats{$nr}}++; #counts occurrence of specific SSR in its appearing repeat
+ $count_class{$typ[$i]}++; #counts occurrence in each motif class
+ if ($max_repeats < $repeats{$nr}) {$max_repeats = $repeats{$nr}};
+ };
+ };
+ next if (!$nr); #no SSRs
+ $ssr_containing_seqs{$nr}++;
+ @order = sort { $start{$a} <=> $start{$b} } keys %start; #put SSRs in right order
+ $i = 0;
+ my $count_seq; #counts
+ my ($start,$end,$ssrseq,$ssrtype,$size);
+ while ($i < $nr)
+ {
+ my $space = $amb + 1;
+ if (!$order[$i+1]) #last or only SSR
+ {
+ $count_seq++;
+ my $motiflen = length ($motif{$order[$i]});
+ $ssrtype = "p".$motiflen;
+ $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}";
+ $start = $start{$order[$i]}; $end = $end{$order[$i++]};
+ next
+ };
+ if (($start{$order[$i+1]} - $end{$order[$i]}) > $space)
+ {
+ $count_seq++;
+ my $motiflen = length ($motif{$order[$i]});
+ $ssrtype = "p".$motiflen;
+ $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}";
+ $start = $start{$order[$i]}; $end = $end{$order[$i++]};
+ next
+ };
+ my ($interssr);
+ if (($start{$order[$i+1]} - $end{$order[$i]}) < 1)
+ {
+ $count_seq++; $ssr_in_compound++;
+ $ssrtype = 'c*';
+ $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}($motif{$order[$i+1]})$repeats{$order[$i+1]}*";
+ $start = $start{$order[$i]}; $end = $end{$order[$i+1]}
+ }
+ else
+ {
+ $count_seq++; $ssr_in_compound++;
+ $interssr = lc substr($seq,$end{$order[$i]},($start{$order[$i+1]} - $end{$order[$i]}) - 1);
+ $ssrtype = 'c';
+ $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}$interssr($motif{$order[$i+1]})$repeats{$order[$i+1]}";
+ $start = $start{$order[$i]}; $end = $end{$order[$i+1]};
+ #$space -= length $interssr
+ };
+ while ($order[++$i + 1] and (($start{$order[$i+1]} - $end{$order[$i]}) <= $space))
+ {
+ if (($start{$order[$i+1]} - $end{$order[$i]}) < 1)
+ {
+ $ssr_in_compound++;
+ $ssrseq .= "($motif{$order[$i+1]})$repeats{$order[$i+1]}*";
+ $ssrtype = 'c*';
+ $end = $end{$order[$i+1]}
+ }
+ else
+ {
+ $ssr_in_compound++;
+ $interssr = lc substr($seq,$end{$order[$i]},($start{$order[$i+1]} - $end{$order[$i]}) - 1);
+ $ssrseq .= "$interssr($motif{$order[$i+1]})$repeats{$order[$i+1]}";
+ $end = $end{$order[$i+1]};
+ #$space -= length $interssr
+ }
+ };
+ $i++;
+ }
+ continue
+ {
+ print OUT "$id\t$count_seq\t$ssrtype\t$ssrseq\t",($end - $start + 1),"\t$start\t$end\n"
+ };
+ };
+
+close (OUT);
+#open (OUT,">$ARGV[0].statistics"); updated by Leshi chen for galaxy integration
+# the statistics part has been removed as we only need misa for primer
+
diff -r 000000000000 -r 3006582bfc76 MISA/p3_misa_parameter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MISA/p3_misa_parameter.xml Wed Sep 14 23:57:57 2011 -0400
@@ -0,0 +1,46 @@
+
+
+ Detect simple sequence repeats using MISA
+ p3_misa_parameter.pl $inputfastaFile $misa_outputfile "1-$definition_1 2-$definition_2 3-$definition_3 4-$definition_4 5-$definition_5 6-$definition_6" "$interruptions"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Detect simple sequence repeats using **MISA** - MIcroSAtellite identification tool
+
+The MISA script was developed at IPK by Thomas Thiel
+
+http://pgrc.ipk-gatersleben.de/misa/
+
+CITATION
+---------
+
+Exploiting EST databases for the development and characterization of gene-derived SSR-markers in barley (Hordeum vulgare L.)
+
+T. Thiel, W. Michalek, R. Varshney and A. Graner
+
+THEORETICAL AND APPLIED GENETICS Volume 106, Number 3, 411-422
+
+DOI: 10.1007/s00122-002-1031-0
+
+**TIP**
+The primer design tool will work more predictably if FASTA header lines are simplified to include only the sequence ID
+
+
+
+
+
+
diff -r 000000000000 -r 3006582bfc76 MISA/p3_out.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MISA/p3_out.pl Wed Sep 14 23:57:57 2011 -0400
@@ -0,0 +1,76 @@
+#!/usr/bin/perl -w
+# Author: Thomas Thiel
+# Program name: prim_output.pl
+# Description: converts the Primer3 output into a table
+# Usage: p3_out.pl PRIMER3_OUTPUT MISA_RESULTS OUTPUT_TABLE
+#
+# Every Primer3 record is matched to its line of the MISA results table and
+# written as one row: the MISA columns followed by up to three primer pairs
+# (9 columns each). Records without any primer keep only the MISA columns.
+
+open (SRC,"<$ARGV[0]") || die ("\nError: Couldn't open Primer3 results file (*.p3out) !\n\n");
+open (IN,"<$ARGV[1]") || die ("\nError: Couldn't open source file containing MISA (*.misa) results !\n\n");
+open (OUT,">$ARGV[2]") || die ("\nError: Couldn't create file !\n\n");
+
+# Records with / without primers; only used by the disabled summary at the end.
+my ($count_failed,$count) = (0,0);
+print OUT "ID\tSSR nr.\tSSR type\tSSR\tsize\tstart\tend\t";
+print OUT "FORWARD PRIMER1 (5'-3')\tTm(°C)\tsize\tREVERSE PRIMER1 (5'-3')\tTm(°C)\tsize\tPRODUCT1 size (bp)\tstart (bp)\tend (bp)\t";
+print OUT "FORWARD PRIMER2 (5'-3')\tTm(°C)\tsize\tREVERSE PRIMER2 (5'-3')\tTm(°C)\tsize\tPRODUCT2 size (bp)\tstart (bp)\tend (bp)\t";
+print OUT "FORWARD PRIMER3 (5'-3')\tTm(°C)\tsize\tREVERSE PRIMER3 (5'-3')\tTm(°C)\tsize\tPRODUCT3 size (bp)\tstart (bp)\tend (bp)\n";
+
+# Slurp the complete MISA table; it is searched once per Primer3 record.
+undef $/;
+my $in = <IN>;
+close (IN);
+
+# Primer3 terminates every record with a line that holds a single "=".
+$/ = "=\n";
+
+while (my $record = <SRC>)
+ {
+ # The record name is the sequence ID, an underscore and the SSR number.
+ my ($id,$ssr_nr) = ($record =~ /PRIMER_SEQUENCE_ID=(\S+)_(\d+)/);
+ next unless defined $ssr_nr;
+
+ # \Q..\E: sequence IDs often contain regex metacharacters ("|", ".");
+ # ^ keeps an ID from matching the tail of a longer ID.
+ my ($misa) = ($in =~ /^(\Q$id\E\t$ssr_nr\t.*)$/m);
+ $misa = "$id\t$ssr_nr" unless defined $misa; # no MISA line found: keep the row identifiable
+
+ # No left primer means Primer3 returned no primer pair for this SSR.
+ if ($record !~ /PRIMER_LEFT_SEQUENCE=/) {$count_failed++; print OUT "$misa\n"; next};
+
+ # Collect 9 columns per primer pair, in the order of the table header.
+ my @info;
+ for my $n ('','_1','_2') # best primer pair, then the two alternatives
+  {
+  my @patterns =
+   (
+   qr/PRIMER_LEFT${n}_SEQUENCE=(.*)/, # forward primer: sequence
+   qr/PRIMER_LEFT${n}_TM=(.*)/, # forward primer: Tm
+   qr/PRIMER_LEFT${n}=\d+,(\d+)/, # forward primer: size
+   qr/PRIMER_RIGHT${n}_SEQUENCE=(.*)/, # reverse primer: sequence
+   qr/PRIMER_RIGHT${n}_TM=(.*)/, # reverse primer: Tm
+   qr/PRIMER_RIGHT${n}=\d+,(\d+)/, # reverse primer: size
+   qr/PRIMER_PRODUCT_SIZE${n}=(.*)/, # product: size
+   qr/PRIMER_LEFT${n}=(\d+),\d+/, # product: start
+   qr/PRIMER_RIGHT${n}=(\d+),\d+/, # product: end
+   );
+  for my $pattern (@patterns)
+   {
+   # A missing field (fewer than three pairs found) becomes an empty cell;
+   # reading $1 after a failed match would repeat the previous field's value.
+   push @info, ($record =~ $pattern ? $1 : '');
+   };
+  };
+
+ $count++;
+ print OUT "$misa\t",join("\t",@info),"\n"
+ };
+
+close (SRC);
+close (OUT) || die ("\nError: Couldn't write output file !\n\n");
+
+#print "\nPrimer modelling was successful for $count sequences.\n";
+#print "Primer modelling failed for $count_failed sequences.\n";
diff -r 000000000000 -r 3006582bfc76 MISA/tool_conf_entry_MISA.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MISA/tool_conf_entry_MISA.xml Wed Sep 14 23:57:57 2011 -0400
@@ -0,0 +1,5 @@
+