Repository 'multispecies_orthologous_microsats'
hg clone https://toolshed.g2.bx.psu.edu/repos/devteam/multispecies_orthologous_microsats

Changeset 0:275433d3a395 (2013-09-25)
Next changeset 1:f2aeaacb43c2 (2015-10-09)
Commit message:
Uploaded tool tarball.
added:
multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl
multispecies_MicrosatDataGenerator_interrupted_GALAXY.xml
test-data/regVariation/microsatellite/Galaxy17_masked_short.maf.gz
test-data/regVariation/microsatellite/Galaxy17_short_raw.txt
test-data/regVariation/microsatellite/Galaxy17_unmasked_short.maf.gz
b
diff -r 000000000000 -r 275433d3a395 multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl Wed Sep 25 11:26:57 2013 -0400
[
b'@@ -0,0 +1,5606 @@\n+#!/usr/bin/perl\n+use strict;\n+use warnings;\n+use Term::ANSIColor;\n+use File::Basename;\n+use IO::Handle;\n+use Cwd;\n+use File::Path;\n+use vars qw($distance @thresholds @tags $species_set @allspecies $printer $treeSpeciesNum $focalspec $mergestarts $mergeends $mergemicros $interrtypecord $microscanned $interrcord $interr_poscord $no_of_interruptionscord $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord $prinkter);\n+use File::Path qw(make_path remove_tree);\n+use File::Temp qw/ tempfile tempdir /;\n+my $tdir = tempdir( CLEANUP => 1 );\n+chdir $tdir;\n+my $dir = getcwd;  \n+#print "dir = $dir\\n";\n+\n+#$ENV{\'PATH\'} .= \':\' . dirname($0);\n+my $date = `date`;\n+\n+my ($mafile, $mafile_sputt, $orthfile, $threshold_array,  $allspeciesin, $tree_definition_all, $separation) = @ARGV;\n+if (!$mafile or !$mafile_sputt or !$orthfile or !$threshold_array or !$separation or !$tree_definition_all or !$allspeciesin) { die "missing arguments\\n"; }\n+\n+$tree_definition_all =~ s/\\s+//g;\n+$threshold_array =~ s/\\s+//g;\n+$allspeciesin =~ s/\\s+//g;\n+#-------------------------------------------------------------------------------\n+# WHICH SPUTNIK USED?\n+my $sputnikpath = ();\n+$sputnikpath = "sputnik_lowthresh_MATCH_MIN_SCORE3" ;\n+#$sputnikpath = "/Users/ydk/work/rhesus_microsat/codes/./sputnik_Mac-PowerPC";\n+#print "sputnik_Mac-PowerPC non-existant\\n" if !-e $sputnikpath;\n+#exit if !-e $sputnikpath;\n+#$sputnikpath = "bx-sputnik" ;\n+#print "ARGV input = @ARGV\\n";\n+#print "ARGV input :\\n mafile=$mafile\\n orthfile=$orthfile\\n threshold_array=$threshold_array\\n  species_set=$species_set\\n tree_definition=$tree_definition\\n separation=$separation\\n";\n+#-------------------------------------------------------------------------------\n+# RUNFILE\n+#-------------------------------------------------------------------------------\n+$distance = 1; #bp\n+$distance++;\n+my @tree_definitions=MakeTrees($tree_definition_all);\n+my $allspeciesset = $tree_definition_all;\n+$allspeciesset =~ s/[\\(\\) ]+//g;\n+@allspecies = split(/,/,$allspeciesset);\n+\n+my @outputfiles = ();\n+my $round = 0;\n+#my $tdir = tempdir( CLEANUP => 0 );\n+#chdir $tdir;\n+\n+foreach my $tree_definition (@tree_definitions){\n+\tmy @commas = ($tree_definition =~ /,/g) ;\n+\t#print "commas = @commas\\n"; <STDIN>;\n+\tnext if scalar(@commas) <= 1;\n+\t#print "species_set = $species_set\\n";\n+\t$treeSpeciesNum = scalar(@commas) + 1;\n+\t$species_set = $tree_definition;\n+\t$species_set =~ s/[\\)\\( ;]+//g;\n+\t#print "species_set = $species_set\\n"; <STDIN>;\n+\n+\t$round++;\n+\t#-------------------------------------------------------------------------------\n+\t# MICROSATELLITE THRESHOLD SETTINGS (LENGTH, BP)\n+\t$threshold_array=~ s/,/_/g;\n+\tmy @thresharr = split("_",$threshold_array);\n+\t@thresholds=@thresharr;\n+\t#my $threshold_array = join("_",($mono_threshold, $di_threshold, $tri_threshold, $tetra_threshold));\n+\t#print "current dit=$dir\\n";\n+\t#-------------------------------------------------------------------------------\n+\t# CREATE AXT FILES IN FORWARD AND REVERSE ORDERS IF NECESSARY\n+\tmy @chrfiles=();\n+\t\n+\t#my $mafile =  "/Users/ydk/work/rhesus_microsat/results/galay/align.txt"; #$ARGV[0];\n+\tmy $chromt=int(rand(10000));\n+\tmy $p_chr=$chromt;\n+\t\n+\t\n+\tmy @exactspeciesset_unarranged = split(/,/,$species_set);\n+\t$tree_definition=~s/[\\)\\(, ]/\\t/g;\n+\tmy @treespecies=split(/\\t+/,$tree_definition);\n+\tmy @exactspecies=();\n+\t\n+\tforeach my $spec (@treespecies){\n+\t\tforeach my $espec (@exactspeciesset_unarranged){\n+\t\t\tpush @exactspecies, $spec if $spec eq $espec;\n+\t\t}\n+\t}\n+\t#print "exactspecies=@exactspecies\\n";\n+\t$focalspec = $exactspecies[0];\n+\tmy $arranged_species_set=join(".",@exactspecies);\n+\tmy $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");\n+\tmy $chr_name_sputt = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt_sputt");\n+\t#print "sending to maftoAxt_multispecies: $mafile, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \\n'..b'4 xxxxxxxxxxxxxx  multiSpecies_orthFinder4 xxxxxxxxxxxxxx \n+\n+#xxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx  MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx  MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx \n+\n+sub MakeTrees{\n+\tmy $tree = $_[0];\n+\tmy @parts=($tree);\n+#\tmy @parts=();\n+\t\n+\twhile (1){\n+\t\t$tree =~ s/^\\(//g;\n+\t\t$tree =~ s/\\)$//g;\n+\t\tmy @arr = ();\n+\t\n+\t\tif ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\\(\\),]+)\\)$/){\n+\t\t\t@arr = $tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\\(\\),]+)$/;\n+\t\t\t$tree = $2;\n+\t\t\tpush @parts, $tree;\n+\t\t}\n+\t\telsif ($tree =~ /^\\(([a-zA-Z0-9_\\(\\),]+),([a-zA-Z0-9_]+)$/){\n+\t\t\t@arr = $tree =~ /^([a-zA-Z0-9_\\(\\),]+),([a-zA-Z0-9_]+)$/;\n+\t\t\t$tree = $1;\n+\t\t\tpush @parts, $tree;\n+\t\t}\n+\t\telsif ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){\n+\t\t\tlast;\n+\t\t}\n+\t}\t\n+\treturn @parts;\n+}\n+\n+#xxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx  qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx  qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx \n+\n+sub qualityFilter{\n+\tmy $unmaskedorthfile = $_[0];\n+\tmy $seqfile = $_[1];\n+\tmy $maskedorthfile = $_[2];\n+\tmy $filteredout = $maskedorthfile."_residue";\n+\topen (PMORTH, "<$unmaskedorthfile") or die "Cannot open unmaskedorthfile file: $unmaskedorthfile: $!";\n+\n+\tmy %keyhash = ();\n+\t\n+\twhile (my $line = <PMORTH>){\n+\t\tmy $key = join("\\t", $1,$2,$3,$4) if $line =~ /($focalspec)\\s+([a-zA-Z0-9\\-_]+)\\s+([0-9]+)\\s+([0-9]+)/;\n+\t\tpush @{$keyhash{$key}}, $line;\n+\t}\n+\t\n+\topen (SEQ, "<$seqfile") or die "Cannot open seqfile file: $seqfile: $!";\n+\topen (MORTH, ">$maskedorthfile") or die "Cannot open maskedorthfile file: $maskedorthfile: $!";\n+        open (RES, ">$filteredout") or die "Cannot open filteredout file: $filteredout: $!";\n+\n+\n+\t\n+\twhile (my $line = <SEQ>){\n+\t\tchomp $line;\n+\t\tif ($line =~ /($focalspec)\\s+([a-zA-Z0-9\\-_]+)\\s+([0-9]+)\\s+([0-9]+)/){\n+\t\t\tmy $key = join("\\t", $1,$2,$3,$4);\n+\t\t\tnext if !exists $keyhash{$key};\n+\t\t\tmy @orths = @{$keyhash{$key}} if exists $keyhash{$key};\n+\t\t\tdelete $keyhash{$key};\n+\t\t\t\n+\t\t\tmy $sine = <SEQ>;\n+\t\t\t\n+\t\t\tforeach my $orth (@orths){\n+\t\t\t\t#print "-----------------------------------------------------------------\\n";\n+\t\t\t\t#print $orth;\n+\t\t\t\tmy $orthcopy = $orth;\n+\t\t\t\t$orth =~ s/^>//;\n+\t\t\t\tmy @parts = split(/>/,$orth);\n+\t\t\t\t\n+\t\t\t\tmy @starts = ();\n+\t\t\t\tmy @ends = ();\n+\t\t\t\t\n+\t\t\t\tforeach my $part (@parts){\n+\t\t\t\t\tmy $no_of_species = adjustCoordinates($part);\n+\t\t\t\t\tmy @pields = split(/\\t/,$part);\n+\t\t\t\t\t\n+\t\t\t\t#\tprint "pields = @pields .. no_of_species = $no_of_species .. startcord = $pields[$startcord]\\n";\n+\t\t\t\t\t\n+\t\t\t\t\tpush @starts, $pields[$startcord];\n+\t\t\t\t\tpush @ends, $pields[$endcord];\n+\t\t\t\t}\n+\t\t\t\t\n+\t\t\t\t#print "starts = @starts ... ends = @ends\\n";\n+\t\t\t\t\n+\t\t\t\tmy $leftend = smallest_number(@starts)-10;\n+\t\t\t\tmy $rightend = largest_number(@ends)+10;\n+\n+\t\t\t\tmy $maskarea = substr($sine, $leftend, $rightend-$leftend+1);\n+\t\t\t\tprint RES $orth if $maskarea =~ /#/;\t\t\n+\t\t\t\t\n+\t\t\t\t\n+\t\t\t\tnext if $maskarea =~ /#/;\n+\t\t\t\t\n+\t\t\t\tprint MORTH $orthcopy;\n+\t\t\t}\n+\t\t}\n+\t\telse{\n+\t\t\tnext;\n+\t\t}\n+\t\t\n+\t\t\n+\t}\n+\t\n+#\tprint "UNDONE: ", scalar(keys %keyhash),"\\n";\n+#\tprint MORTH "UNDONE: ", scalar(keys %keyhash),"\\n";\n+\t\n+}\n+\n+sub adjustCoordinates{\n+\tmy $line = $_[0];\n+\tmy $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\\._\\-]+)|(scaffold[0-9a-zA-Z\\._\\-]+)|(supercontig[0-9a-zA-Z\\._\\-]+)/x/ig;\n+\tmy @got = ($line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\\._\\-]+)/x/g);\n+# print "line = $line\\n";\n+\t$infocord = 2 + (4*$no_of_species) - 1;\n+\t$typecord = 2 + (4*$no_of_species) + 1 - 1;\n+\t$motifcord = 2 + (4*$no_of_species) + 2 - 1;\n+\t$gapcord = $motifcord+1;\n+\t$startcord = $gapcord+1;\n+\t$strandcord = $startcord+1;\n+\t$endcord = $strandcord + 1;\n+\t$microsatcord = $endcord + 1;\n+\t$sequencepos = 2 + (5*$no_of_species) + 1 -1 ; \n+\t$interr_poscord = $microsatcord + 3;\n+\t$no_of_interruptionscord = $microsatcord + 4;\n+\t$interrcord = $microsatcord + 2;\n+#\tprint "$line\\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\\n" if $line !~ /calJac/i;\n+\treturn $no_of_species;\n+}\n+\n+\n+\n+\n'
b
diff -r 000000000000 -r 275433d3a395 multispecies_MicrosatDataGenerator_interrupted_GALAXY.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/multispecies_MicrosatDataGenerator_interrupted_GALAXY.xml Wed Sep 25 11:26:57 2013 -0400
b
@@ -0,0 +1,93 @@
+<tool id="multispecies_orthologous_microsats" name="Extract orthologous microsatellites" version="1.0.0">
+  <description> for multiple (>2) species alignments</description>
+  <command interpreter="perl">
+    multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl   
+    $input1 
+    $input2
+   $out_file1 
+   $thresholds 
+   $species 
+   "$treedefinition"
+   $separation 
+
+  </command>
+  <inputs>
+    <page>
+        <param format="maf" name="input1" type="data" label="Select unfiltered MAF alignments" help= "NOTE: Currently users are requested to select only the alignments that contain five, four or three species' genomes. )"/>
+        <param format="maf" name="input2" type="data" label="Select the filtered version of above MAF alignments" help= "NOTE: Please use the tool 'Filter nucleotides' to filter nucleotides based on quality, in multiple species. When using the Filter nucleotide tool, ensure that you click 'Select All' for the option 'Mask Species')"/>
+        <param name="separation" size="10" type="integer" value="10" label="Minimum base pair distance between adjacent microsatellite blocks"
+     help="A value of 10 means: Adjacent microsatellites separated by less than 10 base pairs will be excluded from the output."/>
+     <param name="thresholds" size="15" type="text" value="9,10,12,12" label="Minimum Threshold for the number of repeats for microsatellites"
+     help="A value of 9,10,12,12 means: All monos having fewer than 9 repeats, dis having fewer than 5 repeats, tris having fewer than 4 repeats, tetras having fewer than 3 repeats will be excluded from the output."/>
+    <param name="species" type="select" label="Select species" display="checkboxes" multiple="true" help="NOTE: Currently users are requested to select one of these three combinations: hg18-panTro2-ponAbe2, hg18-panTro2-ponAbe2-rheMac2 or hg18-panTro2-ponAbe2-rheMac2-calJac1">
+      <options>
+        <filter type="data_meta" ref="input1" key="species" />
+      </options>
+    </param>
+     <param name="treedefinition" size="200" type="text" value = "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)" label="Tree definition of all species above whether or not selected for microsatellite extraction" 
+     help="For example: ((((hg18,panTro2),ponAbe2),rheMac2),calJac1)"/>
+    </page>
+  </inputs>
+  <outputs>
+    <data format="txt" name="out_file1" metadata_source="input1"/>
+  </outputs>
+  <requirements>
+     <requirement type="binary">bx-sputnik</requirement>
+  </requirements>
+  <tests>
+    <test>
+      <param name="input1" value="regVariation/microsatellite/Galaxy17_unmasked_short.maf.gz"/>
+      <param name="input2" value="regVariation/microsatellite/Galaxy17_masked_short.maf.gz"/>
+      <param name="thresholds" value="9,10,12,12"/>
+      <param name="species" value="hg18,panTro2,ponAbe2,rheMac2,calJac1"/>
+      <param name="treedefinition" value="((((hg18,panTro2),ponAbe2),rheMac2),calJac1)"/>
+      <param name="separation" value="10"/>
+      <output name="out_file1" file="regVariation/microsatellite/Galaxy17_short_raw.txt"/>
+    </test>
+  </tests>
+
+ <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This tool finds ortholgous microsatellite blocks between aligned species
+  
+-----
+
+.. class:: warningmark
+
+**Note**
+
+A non-tabular format is created in which each row contains all information pertaining to a microsatellite locus from multiple species in the alignment.
+The rows read like this:
+
+>hg18 15 hg18 chr22 16092941 16093413 panTro2 chr22 16103944 16104421 ponAbe2 chr22 13797750 13798215 rheMac2 chr10 61890946 61891409 calJac1 Contig6986 140254 140728 mononucleotide A 0 13 + 29 aaaaa------aaaAAA >rheMac2 15 hg18 chr22 16092941 16093413 panTro2 chr22 16103944 16104421 ponAbe2 chr22 13797750 13798215 rheMac2 chr10 61890946 61891409 calJac1 Contig6986 140254 140728 mononucleotide A 0 13 + 29 aaaaaaaa---AAAAAA
+
+Information from each species starts with an ">" followed by the species name, for instance, ">rheMac2". Below we describe all information listed for a microsatellite sequence in each species.
+
+After the species tag the alignemnt number is listed.
+What follows is details of the alignment block from all the species, including the chromosome number, start and end coordinates in each species. For instance:
+
+hg18 chr22 16092941 16093413 panTro2 chr22 16103944 16104421 ponAbe2 chr22 13797750 13798215 rheMac2 chr10 61890946 61891409 calJac1 Contig6986 140254 140728
+
+suggests that the alignment block as five species: hg18, panTro2, ponAbe2, rheMac2 and calJac1.
+
+Then the type of microsatellite is written, for instance, "mononucleotide".
+
+Then the microsatellite motif.
+
+Then the number of gaps in the alignment, in the respective species (as noted above, rheMac2 in this case).
+
+Then the start coordinate, the strand, and the end coordinate WITHIN the alignment block.
+
+At the end is listed the microsatellite sequence.
+
+If the microsatellite contains interruptions (which are not important for this tool), then the interruptions' information will be written out after the microsatellite sequence.
+
+
+</help>  
+
+
+</tool>
b
diff -r 000000000000 -r 275433d3a395 test-data/regVariation/microsatellite/Galaxy17_masked_short.maf.gz
b
Binary file test-data/regVariation/microsatellite/Galaxy17_masked_short.maf.gz has changed
b
diff -r 000000000000 -r 275433d3a395 test-data/regVariation/microsatellite/Galaxy17_short_raw.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/regVariation/microsatellite/Galaxy17_short_raw.txt Wed Sep 25 11:26:57 2013 -0400
[
@@ -0,0 +1,5 @@
+>rheMac2 1 hg18 chr22 16115000 16115212 panTro2 chr22 16131432 16131644 ponAbe2 chr22 13834301 13834513 rheMac2 chr10 61914657 61914870 calJac1 Contig13114 14954 15124 mononucleotide A 1 198 + 206 aaaaaaaaa
+>calJac1 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 tetranucleotide AAAC 1 182 + 193 AAACAAACAAAC
+>hg18 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 mononucleotide T 7 568 + 577 TTTTTTTTTT >panTro2 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 mononucleotide T 6 568 + 577 TTTTTTTTTT >ponAbe2 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 mononucleotide T 5 568 + 576 TTTTTTTTT >rheMac2 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 mononucleotide T 22 568 + 579 TTTTTTTTTTTT
+>ponAbe2 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 tetranucleotide GAGG 8 821 + 833 GAGGGAGGGAGGG
+>hg18 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1 >panTro2 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1 >ponAbe2 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1 >rheMac2 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1 >calJac1 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1
b
diff -r 000000000000 -r 275433d3a395 test-data/regVariation/microsatellite/Galaxy17_unmasked_short.maf.gz
b
Binary file test-data/regVariation/microsatellite/Galaxy17_unmasked_short.maf.gz has changed