Mercurial > repos > john-mccallum > ssr_marker_design
changeset 0:3006582bfc76
Uploaded V1.0 MISA tools and helper scripts
author | john-mccallum |
---|---|
date | Wed, 14 Sep 2011 23:57:57 -0400 |
parents | |
children | e9f5d1afba70 |
files | MISA/clean_fasta_header.sh MISA/clean_fasta_header.xml MISA/design_MISA.sh MISA/design_MISA.xml MISA/misa.ini MISA/misa.pl MISA/p3_in.pl MISA/p3_misa_parameter.pl MISA/p3_misa_parameter.xml MISA/p3_out.pl MISA/tool_conf_entry_MISA.xml |
diffstat | 11 files changed, 795 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MISA/clean_fasta_header.sh Wed Sep 14 23:57:57 2011 -0400 @@ -0,0 +1,5 @@ +#!/bin/sh +## clean_fasta_header.sh +##Remove descriptions from header + +sed 's/\(>\w*\)\s*.*/\1/' \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MISA/clean_fasta_header.xml Wed Sep 14 23:57:57 2011 -0400 @@ -0,0 +1,53 @@ +<?xml version="1.0"?> +<tool id="clean_fasta_header_1" name="Clean fasta header"> + <description>Removes fasta description fields in header </description> + <command>sed 's/\(>\w*\)\s*.*/\1/' $inputFastaFile > $fasta_outputfile</command> + <inputs> + <param format="fasta" name="inputFastaFile" type="data" label="fasta File"/> + </inputs> + <outputs> + <data format="fasta" name="fasta_outputfile" /> + </outputs> +<help> +.. class:: infomark + +**TIP** + +This tool requires *fasta* format. + +It simply removes any additional definition strings from the header line prior to using tools that dont handle these. + +---- + +**Example** + +--Query sequence + +:: + + >contig00001 gene=isogroup00001 length=2159 + tttAaGCATTTAACACTGCATATTGATTGATATAGTTGTTCAGTACAAGCCAATTACATT + GTAGACATAAAACAAAGCATTCGAAACAGTTGAAATTTTGATTCCTCTATACTGGATCAG + GCGGTAATCA + >contig00003 gene=isogroup00001 length=2206 + ggTGGCTGCTTTCTCAAATCCACCCCTTCCCAAGGAAACCCTAAACTCGCAGATAAATTT + + + +--Output + +:: + + >contig00001 + ttAaGCATTTAACACTGCATATTGATTGATATAGTTGTTCAGTACAAGCCAATTACATT + GTAGACATAAAACAAAGCATTCGAAACAGTTGAAATTTTGATTCCTCTATACTGGATCAG + GCGGTAATCAGGGGAAGGAAACCATGGTGTAAGGCTGCATCCCATACTTTATCTATGTCA + >contig00003 + ggTGGCTGCTTTCTCAAATCCACCCCTTCCCAAGGAAACCCTAAACTCGCAGATAAATTT + GTAGGGTTTCTATGTCGACCGAGCGCCGTCGGAAAGTGAGCCTTTTCGACGTAGTTGAC + GAGACCTCAGTCTCTG... + + +</help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MISA/design_MISA.sh Wed Sep 14 23:57:57 2011 -0400 @@ -0,0 +1,17 @@ +#!/bin/sh +#design primer sets from MISA output using Primer 3 and MISA helper scripts +#USAGE sh design_MISA.sh <misa_file> <source_fasta_file> <output_file> + +#get directory +SCRIPT=`readlink -f $0` +SCRIPTPATH=`dirname $SCRIPT` + +perl $SCRIPTPATH/p3_in.pl $1 $2 temp.p3in + +cat temp.p3in | primer3_core > temp.p3out + + +perl $SCRIPTPATH/p3_out.pl temp.p3out $1 $3 + +rm -f temp.* +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MISA/design_MISA.xml Wed Sep 14 23:57:57 2011 -0400 @@ -0,0 +1,27 @@ +<tool id="design_MISA_1" name="MISA Primer Design"> + <description>Design primer sets using MISA output </description> + <command interpreter="bash">design_MISA.sh $inputMisaFile $inputFastaFile $outputfile</command> + <inputs> + <param format="misa" name="inputMisaFile" type="data" label="Misa Source file"/> + <param format="fasta" name="inputFastaFile" type="data" label="Fasta Source file"/> + </inputs> + <outputs> + <data format="tabular" name="outputfile" /> + </outputs> +<help> + +.. class:: infomark + +Design SSR primer sets from MISA output using Primer3 + +This tool uses helper scripts developed at IPK + +http://pgrc.ipk-gatersleben.de/misa/primer3.html + + + + +</help> + + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MISA/misa.ini Wed Sep 14 23:57:57 2011 -0400 @@ -0,0 +1,2 @@ +definition(unit_size,min_repeats): 1-10 2-6 3-5 4-5 5-5 6-5 +interruptions(max_difference_between_2_SSRs): 100
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MISA/misa.pl Wed Sep 14 23:57:57 2011 -0400 @@ -0,0 +1,307 @@ +#!/usr/bin/perl -w +# Author: Thomas Thiel +# Program name: misa.pl + +###_______________________________________________________________________________ +### +###Program name: misa.pl +###Author: Thomas Thiel +###Release date: 14/12/01 (version 1.0) +### +###_______________________________________________________________________________ +### +## _______________________________________________________________________________ +## +## DESCRIPTION: Tool for the identification and localization of +## (I) perfect microsatellites as well as +## (II) compound microsatellites (two individual microsatellites, +## disrupted by a certain number of bases) +## +## SYNTAX: misa.pl <FASTA file> +## +## <FASTAfile> Single file in FASTA format containing the sequence(s). +## +## In order to specify the search criteria, an additional file containing +## the microsatellite search parameters is required named "misa.ini", which +## has the following structure: +## (a) Following a text string beginning with 'def', pairs of numbers are +## expected, whereas the first number defines the unit size and the +## second number the lower threshold of repeats for that specific unit. +## (b) Following a text string beginning with 'int' a single number defines +## the maximal number of bases between two adjacent microsatellites in +## order to specify the compound microsatellite type. +## Example: +## definition(unit_size,min_repeats): 1-10 2-6 3-5 4-5 5-5 6-5 +## interruptions(max_difference_for_2_SSRs): 100 +## +## EXAMPLE: misa.pl seqs.fasta +## +## _______________________________________________________________________________ +## + + +#§§§§§ DECLARATION §§§§§# + +# Check for arguments. If none display syntax # + +if (@ARGV == 0) + { + open (IN,"<$0"); + while (<IN>) {if (/^\#\# (.*)/) {$message .= "$1\n"}}; + close (IN); + die $message; + }; + +# Check if help is required # + +if ($ARGV[0] =~ /-help/i) + { + open (IN,"<$0"); + while (<IN>) {if (/^\#\#\#(.*)/) {$message .= "$1\n"}}; + close (IN); + die $message; + }; + +# Open FASTA file # + +open (IN,"<$ARGV[0]") || die ("\nError: FASTA file doesn't exist !\n\n"); +#open (OUT,">$ARGV[0].misa"); updated by Leshi chen for galaxy integration +open (OUT,">$ARGV[1]"); +print OUT "ID\tSSR nr.\tSSR type\tSSR\tsize\tstart\tend\n"; + +# Reading arguments updated by Leshi chen to get local path otherwise will create error # +#use Cwd 'abs_path'; +#use Cwd 'getcwd'; +#print getcwd()&"misa.ini"; +#print OUT abs_path($0); +open (SPECS,"\/root\/galaxy_dist\/tools\/pfr_2010\/"."misa.ini") || die ("\nError: Specifications file doesn't exist ! \n\n misa.ini not found ! \n\n"); +my %typrep; +my $amb = 0; +while (<SPECS>) + { + %typrep = $1 =~ /(\d+)/gi if (/^def\S*\s+(.*)/i); + if (/^int\S*\s+(\d+)/i) {$amb = $1} + }; +my @typ = sort { $a <=> $b } keys %typrep; + + +#§§§§§ CORE §§§§§# + +$/ = ">"; +my $max_repeats = 1; #count repeats +my $min_repeats = 1000; #count repeats +my (%count_motif,%count_class); #count +my ($number_sequences,$size_sequences,%ssr_containing_seqs); #stores number and size of all sequences examined +my $ssr_in_compound = 0; +my ($id,$seq); +while (<IN>) + { + next unless (($id,$seq) = /(.*?)\n(.*)/s); + my ($nr,%start,@order,%end,%motif,%repeats); # store info of all SSRs from each sequence + $seq =~ s/[\d\s>]//g; #remove digits, spaces, line breaks,... + $id =~ s/^\s*//g; $id =~ s/\s*$//g;$id =~ s/\s/_/g; #replace whitespace with "_" + $number_sequences++; + $size_sequences += length $seq; + for ($i=0; $i < scalar(@typ); $i++) #check each motif class + { + my $motiflen = $typ[$i]; + my $minreps = $typrep{$typ[$i]} - 1; + if ($min_repeats > $typrep{$typ[$i]}) {$min_repeats = $typrep{$typ[$i]}}; #count repeats + my $search = "(([acgt]{$motiflen})\\2{$minreps,})"; + while ( $seq =~ /$search/ig ) #scan whole sequence for that class + { + my $motif = uc $2; + my $redundant; #reject false type motifs [e.g. (TT)6 or (ACAC)5] + for ($j = $motiflen - 1; $j > 0; $j--) + { + my $redmotif = "([ACGT]{$j})\\1{".($motiflen/$j-1)."}"; + $redundant = 1 if ( $motif =~ /$redmotif/ ) + }; + next if $redundant; + $motif{++$nr} = $motif; + my $ssr = uc $1; + $repeats{$nr} = length($ssr) / $motiflen; + $end{$nr} = pos($seq); + $start{$nr} = $end{$nr} - length($ssr) + 1; + # count repeats + $count_motifs{$motif{$nr}}++; #counts occurrence of individual motifs + $motif{$nr}->{$repeats{$nr}}++; #counts occurrence of specific SSR in its appearing repeat + $count_class{$typ[$i]}++; #counts occurrence in each motif class + if ($max_repeats < $repeats{$nr}) {$max_repeats = $repeats{$nr}}; + }; + }; + next if (!$nr); #no SSRs + $ssr_containing_seqs{$nr}++; + @order = sort { $start{$a} <=> $start{$b} } keys %start; #put SSRs in right order + $i = 0; + my $count_seq; #counts + my ($start,$end,$ssrseq,$ssrtype,$size); + while ($i < $nr) + { + my $space = $amb + 1; + if (!$order[$i+1]) #last or only SSR + { + $count_seq++; + my $motiflen = length ($motif{$order[$i]}); + $ssrtype = "p".$motiflen; + $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}"; + $start = $start{$order[$i]}; $end = $end{$order[$i++]}; + next + }; + if (($start{$order[$i+1]} - $end{$order[$i]}) > $space) + { + $count_seq++; + my $motiflen = length ($motif{$order[$i]}); + $ssrtype = "p".$motiflen; + $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}"; + $start = $start{$order[$i]}; $end = $end{$order[$i++]}; + next + }; + my ($interssr); + if (($start{$order[$i+1]} - $end{$order[$i]}) < 1) + { + $count_seq++; $ssr_in_compound++; + $ssrtype = 'c*'; + $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}($motif{$order[$i+1]})$repeats{$order[$i+1]}*"; + $start = $start{$order[$i]}; $end = $end{$order[$i+1]} + } + else + { + $count_seq++; $ssr_in_compound++; + $interssr = lc substr($seq,$end{$order[$i]},($start{$order[$i+1]} - $end{$order[$i]}) - 1); + $ssrtype = 'c'; + $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}$interssr($motif{$order[$i+1]})$repeats{$order[$i+1]}"; + $start = $start{$order[$i]}; $end = $end{$order[$i+1]}; + #$space -= length $interssr + }; + while ($order[++$i + 1] and (($start{$order[$i+1]} - $end{$order[$i]}) <= $space)) + { + if (($start{$order[$i+1]} - $end{$order[$i]}) < 1) + { + $ssr_in_compound++; + $ssrseq .= "($motif{$order[$i+1]})$repeats{$order[$i+1]}*"; + $ssrtype = 'c*'; + $end = $end{$order[$i+1]} + } + else + { + $ssr_in_compound++; + $interssr = lc substr($seq,$end{$order[$i]},($start{$order[$i+1]} - $end{$order[$i]}) - 1); + $ssrseq .= "$interssr($motif{$order[$i+1]})$repeats{$order[$i+1]}"; + $end = $end{$order[$i+1]}; + #$space -= length $interssr + } + }; + $i++; + } + continue + { + print OUT "$id\t$count_seq\t$ssrtype\t$ssrseq\t",($end - $start + 1),"\t$start\t$end\n" + }; + }; + +close (OUT); +#open (OUT,">$ARGV[0].statistics"); updated by Leshi chen for galaxy integration +open (OUT,">$ARGV[2]"); + +#§§§§§ INFO §§§§§# + +#§§§ Specifications §§§# +print OUT "Specifications\n==============\n\nSequence source file: \"$ARGV[0]\"\n\nDefinement of microsatellites (unit size / minimum number of repeats):\n"; +for ($i = 0; $i < scalar (@typ); $i++) {print OUT "($typ[$i]/$typrep{$typ[$i]}) "};print OUT "\n"; +if ($amb > 0) {print OUT "\nMaximal number of bases interrupting 2 SSRs in a compound microsatellite: $amb\n"}; +print OUT "\n\n\n"; + +#§§§ OCCURRENCE OF SSRs §§§# + +#small calculations +my @ssr_containing_seqs = values %ssr_containing_seqs; +my $ssr_containing_seqs = 0; +for ($i = 0; $i < scalar (@ssr_containing_seqs); $i++) {$ssr_containing_seqs += $ssr_containing_seqs[$i]}; +my @count_motifs = sort {length ($a) <=> length ($b) || $a cmp $b } keys %count_motifs; +my @count_class = sort { $a <=> $b } keys %count_class; +for ($i = 0; $i < scalar (@count_class); $i++) {$total += $count_class{$count_class[$i]}}; + +#§§§ Overview §§§# +print OUT "RESULTS OF MICROSATELLITE SEARCH\n================================\n\n"; +print OUT "Total number of sequences examined: $number_sequences\n"; +print OUT "Total size of examined sequences (bp): $size_sequences\n"; +print OUT "Total number of identified SSRs: $total\n"; +print OUT "Number of SSR containing sequences: $ssr_containing_seqs\n"; +print OUT "Number of sequences containing more than 1 SSR: ",$ssr_containing_seqs - ($ssr_containing_seqs{1} || 0),"\n"; +print OUT "Number of SSRs present in compound formation: $ssr_in_compound\n\n\n"; + +#§§§ Frequency of SSR classes §§§# +print OUT "Distribution to different repeat type classes\n---------------------------------------------\n\n"; +print OUT "Unit size\tNumber of SSRs\n"; +my $total = undef; +for ($i = 0; $i < scalar (@count_class); $i++) {print OUT "$count_class[$i]\t$count_class{$count_class[$i]}\n"}; +print OUT "\n"; + +#§§§ Frequency of SSRs: per motif and number of repeats §§§# +print OUT "Frequency of identified SSR motifs\n----------------------------------\n\nRepeats"; +for ($i = $min_repeats;$i <= $max_repeats; $i++) {print OUT "\t$i"}; +print OUT "\ttotal\n"; +for ($i = 0; $i < scalar (@count_motifs); $i++) + { + my $typ = length ($count_motifs[$i]); + print OUT $count_motifs[$i]; + for ($j = $min_repeats; $j <= $max_repeats; $j++) + { + if ($j < $typrep{$typ}) {print OUT "\t-";next}; + if ($count_motifs[$i]->{$j}) {print OUT "\t$count_motifs[$i]->{$j}"} else {print OUT "\t"}; + }; + print OUT "\t$count_motifs{$count_motifs[$i]}\n"; + }; +print OUT "\n"; + +#§§§ Frequency of SSRs: summarizing redundant and reverse motifs §§§# +# Eliminates %count_motifs ! +print OUT "Frequency of classified repeat types (considering sequence complementary)\n-------------------------------------------------------------------------\n\nRepeats"; +my (%red_rev,@red_rev); # groups +for ($i = 0; $i < scalar (@count_motifs); $i++) + { + next if ($count_motifs{$count_motifs[$i]} eq 'X'); + my (%group,@group,$red_rev); # store redundant/reverse motifs + my $reverse_motif = $actual_motif = $actual_motif_a = $count_motifs[$i]; + $reverse_motif =~ tr/ACGT/TGCA/; + $reverse_motif = reverse $reverse_motif; + my $reverse_motif_a = $reverse_motif; + for ($j = 0; $j < length ($count_motifs[$i]); $j++) + { + if ($count_motifs{$actual_motif}) {$group{$actual_motif} = "1"; $count_motifs{$actual_motif}='X'}; + if ($count_motifs{$reverse_motif}) {$group{$reverse_motif} = "1"; $count_motifs{$reverse_motif}='X'}; + $actual_motif =~ s/(.)(.*)/$2$1/; + $reverse_motif =~ s/(.)(.*)/$2$1/; + $actual_motif_a = $actual_motif if ($actual_motif lt $actual_motif_a); + $reverse_motif_a = $reverse_motif if ($reverse_motif lt $reverse_motif_a) + }; + if ($actual_motif_a lt $reverse_motif_a) {$red_rev = "$actual_motif_a/$reverse_motif_a"} + else {$red_rev = "$reverse_motif_a/$actual_motif_a"}; # group name + $red_rev{$red_rev}++; + @group = keys %group; + for ($j = 0; $j < scalar (@group); $j++) + { + for ($k = $min_repeats; $k <= $max_repeats; $k++) + { + if ($group[$j]->{$k}) {$red_rev->{"total"} += $group[$j]->{$k};$red_rev->{$k} += $group[$j]->{$k}} + } + } + }; +for ($i = $min_repeats; $i <= $max_repeats; $i++) {print OUT "\t$i"}; +print OUT "\ttotal\n"; +@red_rev = sort {length ($a) <=> length ($b) || $a cmp $b } keys %red_rev; +for ($i = 0; $i < scalar (@red_rev); $i++) + { + my $typ = (length ($red_rev[$i])-1)/2; + print OUT $red_rev[$i]; + for ($j = $min_repeats; $j <= $max_repeats; $j++) + { + if ($j < $typrep{$typ}) {print OUT "\t-";next}; + if ($red_rev[$i]->{$j}) {print OUT "\t",$red_rev[$i]->{$j}} + else {print OUT "\t"} + }; + print OUT "\t",$red_rev[$i]->{"total"},"\n"; + }; +#add by Leshi to close the Out +close (OUT);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MISA/p3_in.pl Wed Sep 14 23:57:57 2011 -0400 @@ -0,0 +1,34 @@ +#!/usr/bin/perl -w +# Author: Thomas Thiel +# Program name: primer3_in.pl +# Description: creates a PRIMER3 input file based on SSR search results + +open (IN,"<$ARGV[0]") || die ("\nError: Couldn't open misa.pl results file (*.misa) !\n\n"); + +#my $filename = $ARGV[0]; +#$filename =~ s/\.misa//; +open (SRC,"<$ARGV[1]") || die ("\nError: Couldn't open source file containing original FASTA sequences !\n\n"); +open (OUT,">$ARGV[2]"); + +undef $/; +$in = <IN>; +study $in; + +$/= ">"; + +#my $count; +while (<SRC>) + { + next unless (my ($id,$seq) = /(.*?)\n(.*)/s); + $seq =~ s/[\d\s>]//g;#remove digits, spaces, line breaks,... + while ($in =~ /$id\t(\d+)\t\S+\t\S+\t(\d+)\t(\d+)/g) + { + my ($ssr_nr,$size,$start) = ($1,$2,$3); + #$count++; + print OUT "PRIMER_SEQUENCE_ID=$id"."_$ssr_nr\nSEQUENCE=$seq\n"; + print OUT "PRIMER_PRODUCT_SIZE_RANGE=100-280\n"; + print OUT "TARGET=",$start-3,",",$size+6,"\n"; + print OUT "PRIMER_MAX_END_STABILITY=250\n=\n" + }; + }; +#print "\n$count records created.\n";
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MISA/p3_misa_parameter.pl Wed Sep 14 23:57:57 2011 -0400 @@ -0,0 +1,223 @@ +#!/usr/bin/perl -w +# Author: Thomas Thiel +# Program name: misa.pl + +###_______________________________________________________________________________ +### +###Program name:p3_ misa_parameter.pl +###Author: Thomas Thiel +###Release date: 14/12/01 (version 1.0) +### +###_______________________________________________________________________________ +### +## _______________________________________________________________________________ +## +## DESCRIPTION: Tool for the identification and localization of +## (I) perfect microsatellites as well as +## (II) compound microsatellites (two individual microsatellites, +## disrupted by a certain number of bases) +## +## SYNTAX: misa.pl <FASTA file> +## +## <FASTAfile> Single file in FASTA format containing the sequence(s). +## +## In order to specify the search criteria, an additional file containing +## the microsatellite search parameters is required named "misa.ini", which +## has the following structure: +## (a) Following a text string beginning with 'def', pairs of numbers are +## expected, whereas the first number defines the unit size and the +## second number the lower threshold of repeats for that specific unit. +## (b) Following a text string beginning with 'int' a single number defines +## the maximal number of bases between two adjacent microsatellites in +## order to specify the compound microsatellite type. +## Example: +## definition(unit_size,min_repeats): 1-10 2-6 3-5 4-5 5-5 6-5 +## interruptions(max_difference_for_2_SSRs): 100 +## +## EXAMPLE: misa.pl seqs.fasta +## Modified by Leshi Chen for primer design +## _______________________________________________________________________________ +## + + +#§§§§§ DECLARATION §§§§§# + +# Check for arguments. If none display syntax # + + +if (@ARGV == 0) + { + open (IN,"<$0"); + while (<IN>) {if (/^\#\# (.*)/) {$message .= "$1\n"}}; + close (IN); + die $message; + }; + +# Check if help is required # + +if ($ARGV[0] =~ /-help/i) + { + open (IN,"<$0"); + while (<IN>) {if (/^\#\#\#(.*)/) {$message .= "$1\n"}}; + close (IN); + die $message; + }; + +# Open FASTA file # + +open (IN,"<$ARGV[0]") || die ("\nError: FASTA file doesn't exist !\n\n"); +#open (OUT,">$ARGV[0].misa"); updated by Leshi chen for galaxy integration +open (OUT,">$ARGV[1]"); +print OUT "ID\tSSR nr.\tSSR type\tSSR\tsize\tstart\tend\n"; + +# Reading arguments updated by Leshi chen to get local path otherwise will create error # +#use Cwd 'abs_path'; +#use Cwd 'getcwd'; +#print getcwd()&"misa.ini"; +#print OUT abs_path($0); +#open (SPECS,"\/root\/galaxy_dist\/tools\/pfr_2010\/"."misa.ini") || die ("\nError: Specifications file doesn't exist ! \n\n misa.ini not found ! \n\n"); +my $arg_def= $ARGV[2]||''; +my $arg_interuption= $ARGV[3]||''; +#my $tmb = ''; +#my $_ = ''; +my %typrep; +my $amb = 0; + +%typrep = $arg_def =~/(\d+)-(\d+)/gi; +#print "1:" , $arg_def , "\n"; +#print "hh: ", %typrep , "\n"; +#print $arg_def , "\n"; +#print $arg_interuption ,"\n"; +#print $arg_def =~/(\d+)/gi , "\n"; +#%typrep = $arg_def =~/(\d+)/gi; +print %typrep , "\n"; +$amb = $arg_interuption; +print $amb , "\n"; +#while (<SPECS>)# + # {# + # %typrep = $1 =~ /(\d+)/gi if (/^def\S*\s+(.*)/i);# + # if (/^int\S*\s+(\d+)/i) {$amb = $1}# + # };# +my @typ = sort { $a <=> $b } keys %typrep; +print @typ . "\n"; +#die (%typrep , "--" , @typ , "--" , $amb); +#§§§§§ CORE §§§§§# + +$/ = ">"; +my $max_repeats = 1; #count repeats +my $min_repeats = 1000; #count repeats +my (%count_motif,%count_class); #count +my ($number_sequences,$size_sequences,%ssr_containing_seqs); #stores number and size of all sequences examined +my $ssr_in_compound = 0; +my ($id,$seq); +while (<IN>) + { + next unless (($id,$seq) = /(.*?)\n(.*)/s); + my ($nr,%start,@order,%end,%motif,%repeats); # store info of all SSRs from each sequence + $seq =~ s/[\d\s>]//g; #remove digits, spaces, line breaks,... + $id =~ s/^\s*//g; $id =~ s/\s*$//g;$id =~ s/\s/_/g; #replace whitespace with "_" + $number_sequences++; + $size_sequences += length $seq; + for ($i=0; $i < scalar(@typ); $i++) #check each motif class + { + my $motiflen = $typ[$i]; + my $minreps = $typrep{$typ[$i]} - 1; + if ($min_repeats > $typrep{$typ[$i]}) {$min_repeats = $typrep{$typ[$i]}}; #count repeats + my $search = "(([acgt]{$motiflen})\\2{$minreps,})"; + while ( $seq =~ /$search/ig ) #scan whole sequence for that class + { + my $motif = uc $2; + my $redundant; #reject false type motifs [e.g. (TT)6 or (ACAC)5] + for ($j = $motiflen - 1; $j > 0; $j--) + { + my $redmotif = "([ACGT]{$j})\\1{".($motiflen/$j-1)."}"; + $redundant = 1 if ( $motif =~ /$redmotif/ ) + }; + next if $redundant; + $motif{++$nr} = $motif; + my $ssr = uc $1; + $repeats{$nr} = length($ssr) / $motiflen; + $end{$nr} = pos($seq); + $start{$nr} = $end{$nr} - length($ssr) + 1; + # count repeats + # count_motifs doesn't required as statistic has been removed - modified by leshi + #$count_motifs{$motif{$nr}}++; #counts occurrence of individual motifs + $motif{$nr}->{$repeats{$nr}}++; #counts occurrence of specific SSR in its appearing repeat + $count_class{$typ[$i]}++; #counts occurrence in each motif class + if ($max_repeats < $repeats{$nr}) {$max_repeats = $repeats{$nr}}; + }; + }; + next if (!$nr); #no SSRs + $ssr_containing_seqs{$nr}++; + @order = sort { $start{$a} <=> $start{$b} } keys %start; #put SSRs in right order + $i = 0; + my $count_seq; #counts + my ($start,$end,$ssrseq,$ssrtype,$size); + while ($i < $nr) + { + my $space = $amb + 1; + if (!$order[$i+1]) #last or only SSR + { + $count_seq++; + my $motiflen = length ($motif{$order[$i]}); + $ssrtype = "p".$motiflen; + $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}"; + $start = $start{$order[$i]}; $end = $end{$order[$i++]}; + next + }; + if (($start{$order[$i+1]} - $end{$order[$i]}) > $space) + { + $count_seq++; + my $motiflen = length ($motif{$order[$i]}); + $ssrtype = "p".$motiflen; + $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}"; + $start = $start{$order[$i]}; $end = $end{$order[$i++]}; + next + }; + my ($interssr); + if (($start{$order[$i+1]} - $end{$order[$i]}) < 1) + { + $count_seq++; $ssr_in_compound++; + $ssrtype = 'c*'; + $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}($motif{$order[$i+1]})$repeats{$order[$i+1]}*"; + $start = $start{$order[$i]}; $end = $end{$order[$i+1]} + } + else + { + $count_seq++; $ssr_in_compound++; + $interssr = lc substr($seq,$end{$order[$i]},($start{$order[$i+1]} - $end{$order[$i]}) - 1); + $ssrtype = 'c'; + $ssrseq = "($motif{$order[$i]})$repeats{$order[$i]}$interssr($motif{$order[$i+1]})$repeats{$order[$i+1]}"; + $start = $start{$order[$i]}; $end = $end{$order[$i+1]}; + #$space -= length $interssr + }; + while ($order[++$i + 1] and (($start{$order[$i+1]} - $end{$order[$i]}) <= $space)) + { + if (($start{$order[$i+1]} - $end{$order[$i]}) < 1) + { + $ssr_in_compound++; + $ssrseq .= "($motif{$order[$i+1]})$repeats{$order[$i+1]}*"; + $ssrtype = 'c*'; + $end = $end{$order[$i+1]} + } + else + { + $ssr_in_compound++; + $interssr = lc substr($seq,$end{$order[$i]},($start{$order[$i+1]} - $end{$order[$i]}) - 1); + $ssrseq .= "$interssr($motif{$order[$i+1]})$repeats{$order[$i+1]}"; + $end = $end{$order[$i+1]}; + #$space -= length $interssr + } + }; + $i++; + } + continue + { + print OUT "$id\t$count_seq\t$ssrtype\t$ssrseq\t",($end - $start + 1),"\t$start\t$end\n" + }; + }; + +close (OUT); +#open (OUT,">$ARGV[0].statistics"); updated by Leshi chen for galaxy integration +# the statistics part has been removed as we only need misa for primer +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MISA/p3_misa_parameter.xml Wed Sep 14 23:57:57 2011 -0400 @@ -0,0 +1,46 @@ +<?xml version="1.0"?> +<tool id="p3_misa_1_parameter" name="Detect SSRs using MISA"> + <description>Detect simple sequence repeats using MISA</description> + <command interpreter="perl">p3_misa_parameter.pl $inputfastaFile $misa_outputfile "1-$definition_1 2-$definition_2 3-$definition_3 4-$definition_4 5-$definition_5 6-$definition_6" "$interruptions" </command> + <inputs> + <param format="fasta" name="inputfastaFile" type="data" label="fasta Source file"/> + <param name="definition_1" size="20" type="text" value="10" label="Min Repeats for Unit Size 1"> </param> + <param name="definition_2" size="20" type="text" value="6" label="Min Repeats for Unit Size 2"> </param> + <param name="definition_3" size="20" type="text" value="5" label="Min Repeats for Unit Size 3"> </param> + <param name="definition_4" size="20" type="text" value="5" label="Min Repeats for Unit Size 4"> </param> + <param name="definition_5" size="20" type="text" value="5" label="Min Repeats for Unit Size 5"> </param> + <param name="definition_6" size="20" type="text" value="5" label="Min Repeats for Unit Size 6"> </param> + <param name="interruptions" type="text" area="false" size="5" label="Interruptions:max_difference_between_2_SSRs" value="100" /> + </inputs> + <outputs> + <data format="tabular" name="misa_outputfile" /> + </outputs> +<help> + +.. class:: infomark + +Detect simple sequence repeats using **MISA** - MIcroSAtellite identification tool + +The MISA script was developed at IPK by Thomas Thiel + +http://pgrc.ipk-gatersleben.de/misa/ + +CITATION +--------- + +Exploiting EST databases for the development and characterization of gene-derived SSR-markers in barley (Hordeum vulgare L.) + +T. Thiel, W. Michalek, R. Varshney and A. Graner + +THEORETICAL AND APPLIED GENETICS Volume 106, Number 3, 411-422 + +DOI: 10.1007/s00122-002-1031-0 + +**TIP** +The primer design tool will work more predictably if fasta header lines are simplied to only include the sequence ID + + + +</help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MISA/p3_out.pl Wed Sep 14 23:57:57 2011 -0400 @@ -0,0 +1,76 @@ +#!/usr/bin/perl -w +# Author: Thomas Thiel +# Program name: prim_output.pl +# Description: converts the Primer3 output into an table + +open (SRC,"<$ARGV[0]") || die ("\nError: Couldn't open Primer3 results file (*.p3out) !\n\n"); +my $filename = $ARGV[0]; +$filename =~ s/\.p3out//; +open (IN,"<$ARGV[1]") || die ("\nError: Couldn't open source file containing MISA (*.misa) results !\n\n"); +open (OUT,">$ARGV[2]") || die ("nError: Couldn't create file !\n\n"); + +my ($seq_names_failed,$count_failed,$count); +print OUT "ID\tSSR nr.\tSSR type\tSSR\tsize\tstart\tend\t"; +print OUT "FORWARD PRIMER1 (5'-3')\tTm(°C)\tsize\tREVERSE PRIMER1 (5'-3')\tTm(°C)\tsize\tPRODUCT1 size (bp)\tstart (bp)\tend (bp)\t"; +print OUT "FORWARD PRIMER2 (5'-3')\tTm(°C)\tsize\tREVERSE PRIMER2 (5'-3')\tTm(°C)\tsize\tPRODUCT2 size (bp)\tstart (bp)\tend (bp)\t"; +print OUT "FORWARD PRIMER3 (5'-3')\tTm(°C)\tsize\tREVERSE PRIMER3 (5'-3')\tTm(°C)\tsize\tPRODUCT3 size (bp)\tstart (bp)\tend (bp)\n"; + +undef $/; +my $in = <IN>; +study $in; + +$/ = "=\n"; + +while (<SRC>) + { + my ($id,$ssr_nr) = (/PRIMER_SEQUENCE_ID=(\S+)_(\d+)/); + + $in =~ /($id\t$ssr_nr\t.*)\n/; + my $misa = $1; + + /PRIMER_LEFT_SEQUENCE=(.*)/ || do {$count_failed++;print OUT "$misa\n"; next}; + my $info = "$1\t"; + + /PRIMER_LEFT_TM=(.*)/; $info .= "$1\t"; + /PRIMER_LEFT=\d+,(\d+)/; $info .= "$1\t"; + + /PRIMER_RIGHT_SEQUENCE=(.*)/; $info .= "$1\t"; + /PRIMER_RIGHT_TM=(.*)/; $info .= "$1\t"; + /PRIMER_RIGHT=\d+,(\d+)/; $info .= "$1\t"; + + /PRIMER_PRODUCT_SIZE=(.*)/; $info .= "$1\t"; + /PRIMER_LEFT=(\d+),\d+/; $info .= "$1\t"; + /PRIMER_RIGHT=(\d+),\d+/; $info .= "$1\t"; + + + /PRIMER_LEFT_1_SEQUENCE=(.*)/; $info .= "$1\t"; + /PRIMER_LEFT_1_TM=(.*)/; $info .= "$1\t"; + /PRIMER_LEFT_1=\d+,(\d+)/; $info .= "$1\t"; + + /PRIMER_RIGHT_1_SEQUENCE=(.*)/; $info .= "$1\t"; + /PRIMER_RIGHT_1_TM=(.*)/; $info .= "$1\t"; + /PRIMER_RIGHT_1=\d+,(\d+)/; $info .= "$1\t"; + + /PRIMER_PRODUCT_SIZE_1=(.*)/; $info .= "$1\t"; + /PRIMER_LEFT_1=(\d+),\d+/; $info .= "$1\t"; + /PRIMER_RIGHT_1=(\d+),\d+/; $info .= "$1\t"; + + + /PRIMER_LEFT_2_SEQUENCE=(.*)/; $info .= "$1\t"; + /PRIMER_LEFT_2_TM=(.*)/; $info .= "$1\t"; + /PRIMER_LEFT_2=\d+,(\d+)/; $info .= "$1\t"; + + /PRIMER_RIGHT_2_SEQUENCE=(.*)/; $info .= "$1\t"; + /PRIMER_RIGHT_2_TM=(.*)/; $info .= "$1\t"; + /PRIMER_RIGHT_2=\d+,(\d+)/; $info .= "$1\t"; + + /PRIMER_PRODUCT_SIZE_2=(.*)/; $info .= "$1\t"; + /PRIMER_LEFT_2=(\d+),\d+/; $info .= "$1\t"; + /PRIMER_RIGHT_2=(\d+),\d+/; $info .= "$1"; + + $count++; + print OUT "$misa\t$info\n" + }; + +#print "\nPrimer modelling was successful for $count sequences.\n"; +#print "Primer modelling failed for $count_failed sequences.\n";
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MISA/tool_conf_entry_MISA.xml Wed Sep 14 23:57:57 2011 -0400 @@ -0,0 +1,5 @@ +<section name="SSR Marker Design" id="ssr_marker"> + <tool file="tools/MISA/p3_misa_parameter.xml" /> + <tool file="tools/MISA/clean_fasta_header.xml" /> + <tool file="tools/MISA/design_MISA.xml" /> + </section>