changeset 17:57df76d861e4

Modifications for ToolShed proprietary data types
author Jim Johnson <jj@umn.edu>
date Tue, 17 Jan 2012 11:08:15 -0600
parents 541e3c97c240
children 697156806162
files mothur/lib/galaxy/datatypes/converters/ref_to_seq_taxonomy_converter.py mothur/lib/galaxy/datatypes/converters/ref_to_seq_taxonomy_converter.xml mothur/lib/galaxy/datatypes/metagenomics.py mothur/tool-data/datatypes.conf.xml
diffstat 4 files changed, 217 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mothur/lib/galaxy/datatypes/converters/ref_to_seq_taxonomy_converter.py	Tue Jan 17 11:08:15 2012 -0600
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+"""
+convert a ref.taxonomy file to a seq.taxonomy file
+Usage:
+%python ref_to_seq_taxonomy_converter.py <ref.taxonomy_filename> <seq.taxonomy_filename>
+"""
+
+import sys, os, re
+from math import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):  # report msg on stderr, then abort the script
+    sys.stderr.write( "%s" % msg )
+    sys.exit( 1 )  # non-zero status: a bare sys.exit() would report success
+
+def __main__():
+    """Copy the ref.taxonomy file named by argv[1] to argv[2], ending every taxonomy field with one ';'."""
+    infile, outfile = open( sys.argv[1] ), open( sys.argv[2], 'w' )
+    try:
+        for line in infile:
+            line = line.rstrip() # eliminate trailing space and new line characters
+            if not line or line.startswith( '#' ):
+                continue
+            fields = line.split('\t')  # only columns 1 and 2 are used; a row without a 2nd column raises IndexError
+            outfile.write('%s\t%s;\n' % (fields[0], re.sub(';$','',fields[1])))  # strip an existing ';' before adding one
+    finally:
+        infile.close()
+        outfile.close()
+
+if __name__ == "__main__": __main__() 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mothur/lib/galaxy/datatypes/converters/ref_to_seq_taxonomy_converter.xml	Tue Jan 17 11:08:15 2012 -0600
@@ -0,0 +1,12 @@
+<tool id="CONVERTER_ref_to_seq_taxomony" name="Convert Ref taxonomy to Seq Taxonomy" version="1.0.0">
+  <description>converts 2 or 3 column sequence taxonomy file to a 2 column mothur taxonomy_outline format</description>
+  <command interpreter="python">ref_to_seq_taxonomy_converter.py $input $output</command>
+  <inputs>
+    <param name="input" type="data" format="ref.taxonomy" label="a Sequence Taxonomy file"/>
+  </inputs>
+  <outputs>
+    <data name="output" format="seq.taxonomy"/>
+  </outputs>
+  <help>
+  </help>
+</tool>
--- a/mothur/lib/galaxy/datatypes/metagenomics.py	Fri Dec 09 12:12:16 2011 -0600
+++ b/mothur/lib/galaxy/datatypes/metagenomics.py	Tue Jan 17 11:08:15 2012 -0600
@@ -4,18 +4,19 @@
 for Mothur
 """
 
-import data
-import logging, os, sys, time, tempfile, shutil, string, glob, re
+import logging, os, os.path, sys, time, tempfile, shutil, string, glob, re
 import galaxy.model
+from galaxy.datatypes import data
+from galaxy.datatypes.sniff import *
 from galaxy.datatypes import metadata
 from galaxy.datatypes import tabular
 from galaxy.datatypes import sequence
 from galaxy.datatypes.metadata import MetadataElement
+from galaxy.datatypes.data import Text
 from galaxy.datatypes.tabular import Tabular
 from galaxy.datatypes.sequence import Fasta
 from galaxy import util
 from galaxy.datatypes.images import Html
-from sniff import *
 
 log = logging.getLogger(__name__)
 
@@ -342,7 +343,7 @@
         Tabular.__init__( self, **kwd )
         self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
 
-class DistanceMatrix(data.Text):
+class DistanceMatrix( Text ):
     file_ext = 'dist'
     """Add metadata elements"""
     MetadataElement( name="sequence_count", default=0, desc="Number of sequences", readonly=False, optional=True, no_value=0 )
@@ -403,7 +404,7 @@
         """Initialize secondary structure map datatype"""
         Tabular.__init__( self, **kwd )
     def init_meta( self, dataset, copy_from=None ):
-        data.Text.init_meta( self, dataset, copy_from=copy_from )
+        Text.init_meta( self, dataset, copy_from=copy_from )
     def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
         dataset.metadata.sequences = 0 
 
@@ -543,7 +544,7 @@
         self.column_names = ['name']
         self.columns = 1
 
-class Oligos( data.Text ):
+class Oligos( Text ):
     file_ext = 'oligos'
 
     def sniff( self, filename ):
@@ -699,7 +700,7 @@
         self.masked = True
         self.filtered = True
 
-class LaneMask(data.Text):
+class LaneMask(Text):
     file_ext = 'filter'
 
     def sniff( self, filename ):
@@ -872,7 +873,7 @@
         Tabular.__init__( self, **kwd )
         self.column_names = ['taxlevel','rankID','taxon','daughterlevels','total']
 
-class Phylip(data.Text):
+class Phylip(Text):
     file_ext = 'phy'
 
     def sniff( self, filename ):
@@ -1033,6 +1034,124 @@
             out = "Can't create peek %s" % str( exc )
         return out
 
+class Newick( Text ):
+    """
+    The Newick Standard for representing trees in computer-readable form makes use of the correspondence between trees and nested parentheses.
+    http://evolution.genetics.washington.edu/phylip/newicktree.html
+    http://en.wikipedia.org/wiki/Newick_format
+    Example:
+    (B,(A,C,E),D);
+    or example with branch lengths:
+    (B:6.0,(A:5.0,C:3.0,E:4.0):5.0,D:11.0);
+    or an example with embedded comments but no branch lengths:
+    ((a [&&PRIME S=x], b [&&PRIME S=y]), c [&&PRIME S=z]);
+    Example with named interior node:
+    (B:6.0,(A:5.0,C:3.0,E:4.0)Ancestor1:5.0,D:11.0);
+    """
+    file_ext = 'tre'
+
+    def __init__(self, **kwd):
+        Text.__init__( self, **kwd )
+
+    def sniff( self, filename ):   ## TODO
+        """
+        Guess whether the file is in Newick format; returns True or False and never raises
+        Heuristic: the last non-space char of a tree should be a semicolon: ';'
+        and the first non-space char should be an open parenthesis: '('
+        (,,(,));                               no nodes are named
+        (A,B,(C,D));                           leaf nodes are named
+        (A,B,(C,D)E)F;                         all nodes are named
+        (:0.1,:0.2,(:0.3,:0.4):0.5);           all but root node have a distance to parent
+        (:0.1,:0.2,(:0.3,:0.4):0.5):0.0;       all have a distance to parent
+        (A:0.1,B:0.2,(C:0.3,D:0.4):0.5);       distances and leaf names (popular)
+        (A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;     distances and all names
+        ((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A;    a tree rooted on a leaf node (rare)
+        """
+        if not os.path.exists(filename):
+            return False
+        fh = None
+        try:
+            ## For now, guess this is a Newick file if it starts with a '(' and ends with a ';'
+            flen = os.path.getsize(filename)
+            fh = open( filename )
+            size = min(flen,2000)  # inspect at most 2000 chars at each end of the file
+            # check end of the file for a semicolon
+            fh.seek(flen - size)
+            buf = fh.read(size).strip()
+            if not buf.endswith(';'):
+                return False
+            # See if this starts with a open parenthesis (re-read only if buf is not the whole file)
+            if size < flen:
+                fh.seek(0)
+                buf = fh.read(size).strip()
+            return buf.startswith('(')
+        except Exception:
+            # sniffing is best-effort: an unreadable file is simply "not Newick"
+            return False
+        finally:
+            if fh is not None:  # fh is still None when open() itself failed
+                fh.close()
+
+class Nhx( Newick ):
+    """
+    New Hampshire eXtended (NHX): Newick with embedded [&&NHX:...] annotation tags
+    The Newick Standard for representing trees in computer-readable form makes use of the correspondence between trees and nested parentheses.
+    http://evolution.genetics.washington.edu/phylip/newicktree.html
+    http://en.wikipedia.org/wiki/Newick_format
+    Example:
+    (gene1_Hu[&&NHX:S=Hu_Homo_sapiens], (gene2_Hu[&&NHX:S=Hu_Homo_sapiens], gene2_Mu[&&NHX:S=Mu_Mus_musculus]));
+    """
+    file_ext = 'nhx'
+
+class Nexus( Text ):
+    """Text datatype for NEXUS files: a '#NEXUS' header followed by BEGIN ...; / END; blocks
+    http://en.wikipedia.org/wiki/Nexus_file
+    Example:
+    #NEXUS
+    BEGIN TAXA;
+          Dimensions NTax=4;
+          TaxLabels fish frog snake mouse;
+    END;
+    
+    BEGIN CHARACTERS;
+          Dimensions NChar=20;
+          Format DataType=DNA;
+          Matrix
+            fish   ACATA GAGGG TACCT CTAAG
+            frog   ACATA GAGGG TACCT CTAAG
+            snake  ACATA GAGGG TACCT CTAAG
+            mouse  ACATA GAGGG TACCT CTAAG
+    END;
+    
+    BEGIN TREES;
+          Tree best=(fish, (frog, (snake, mouse)));
+    END;
+    """
+    file_ext = 'nex'
+
+    def __init__(self, **kwd):
+        Text.__init__( self, **kwd )
+
+    def sniff( self, filename ):
+        """
+        Determines whether the file is in nexus format; returns True or False and never raises
+        First line should be:
+        #NEXUS
+        """
+        fh = None
+        try:
+            fh = open( filename )
+            line = fh.readline().strip()
+            # only the first line is examined; it must be exactly the '#NEXUS' marker
+            return line == '#NEXUS'
+        except Exception:
+            # sniffing is best-effort: an unreadable file is simply "not Nexus"
+            return False
+        finally:
+            # fh is still None when open() itself failed; otherwise close it exactly once
+            if fh is not None:
+                fh.close()
+
 
 ## Qiime Classes
 
@@ -1165,7 +1284,7 @@
     """
     file_ext = 'qiimeparams'
 
-class QiimePrefs(data.Text):
+class QiimePrefs(Text):
     """
     A text file, containing coloring preferences to be used by make_distance_histograms.py, make_2d_plots.py and make_3d_plots.py.
     Example:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mothur/tool-data/datatypes.conf.xml	Tue Jan 17 11:08:15 2012 -0600
@@ -0,0 +1,46 @@
+<?xml version="1.0"?>
+<datatypes>
+    <datatype_files>
+        <datatype_file name="metagenomics.py"/>
+    </datatype_files>
+    <registration>
+        <datatype extension="otu" type="galaxy.datatypes.metagenomics:Otu" display_in_upload="true"/>
+        <datatype extension="list" type="galaxy.datatypes.metagenomics:OtuList" display_in_upload="true"/>
+        <datatype extension="sabund" type="galaxy.datatypes.metagenomics:Sabund" display_in_upload="true"/>
+        <datatype extension="rabund" type="galaxy.datatypes.metagenomics:Rabund" display_in_upload="true"/>
+        <datatype extension="shared" type="galaxy.datatypes.metagenomics:SharedRabund" display_in_upload="true"/>
+        <datatype extension="relabund" type="galaxy.datatypes.metagenomics:RelAbund" display_in_upload="true"/>
+        <datatype extension="names" type="galaxy.datatypes.metagenomics:Names" display_in_upload="true"/>
+        <datatype extension="design" type="galaxy.datatypes.metagenomics:Design" display_in_upload="true"/>
+        <datatype extension="summary" type="galaxy.datatypes.metagenomics:Summary" display_in_upload="true"/>
+        <datatype extension="groups" type="galaxy.datatypes.metagenomics:Group" display_in_upload="true"/>
+        <datatype extension="oligos" type="galaxy.datatypes.metagenomics:Oligos" display_in_upload="true"/>
+        <datatype extension="align" type="galaxy.datatypes.metagenomics:SequenceAlignment" display_in_upload="true"/>
+        <datatype extension="accnos" type="galaxy.datatypes.metagenomics:AccNos" display_in_upload="true"/>
+        <datatype extension="map" type="galaxy.datatypes.metagenomics:SecondaryStructureMap" display_in_upload="true"/>
+        <datatype extension="align.check" type="galaxy.datatypes.metagenomics:AlignCheck" display_in_upload="true"/>
+        <datatype extension="align.report" type="galaxy.datatypes.metagenomics:AlignReport" display_in_upload="true"/>
+        <datatype extension="filter" type="galaxy.datatypes.metagenomics:LaneMask" display_in_upload="true"/>
+        <datatype extension="dist" type="galaxy.datatypes.metagenomics:DistanceMatrix" display_in_upload="true"/>
+        <datatype extension="pair.dist" type="galaxy.datatypes.metagenomics:PairwiseDistanceMatrix" display_in_upload="true"/>
+        <datatype extension="square.dist" type="galaxy.datatypes.metagenomics:SquareDistanceMatrix" display_in_upload="true"/>
+        <datatype extension="lower.dist" type="galaxy.datatypes.metagenomics:LowerTriangleDistanceMatrix" display_in_upload="true"/>
+        <datatype extension="ref.taxonomy" type="galaxy.datatypes.metagenomics:RefTaxonomy" display_in_upload="true">
+            <converter file="ref_to_seq_taxonomy_converter.xml" target_datatype="seq.taxonomy"/>
+        </datatype>
+        <datatype extension="seq.taxonomy" type="galaxy.datatypes.metagenomics:SequenceTaxonomy" display_in_upload="true"/>
+        <datatype extension="rdp.taxonomy" type="galaxy.datatypes.metagenomics:RDPSequenceTaxonomy" display_in_upload="true"/>
+        <datatype extension="cons.taxonomy" type="galaxy.datatypes.metagenomics:ConsensusTaxonomy" display_in_upload="true"/>
+        <datatype extension="tax.summary" type="galaxy.datatypes.metagenomics:TaxonomySummary" display_in_upload="true"/>
+        <datatype extension="freq" type="galaxy.datatypes.metagenomics:Frequency" display_in_upload="true"/>
+        <datatype extension="quan" type="galaxy.datatypes.metagenomics:Quantile" display_in_upload="true"/>
+        <datatype extension="filtered.quan" type="galaxy.datatypes.metagenomics:FilteredQuantile" display_in_upload="true"/>
+        <datatype extension="masked.quan" type="galaxy.datatypes.metagenomics:MaskedQuantile" display_in_upload="true"/>
+        <datatype extension="filtered.masked.quan" type="galaxy.datatypes.metagenomics:FilteredMaskedQuantile" display_in_upload="true"/>
+        <datatype extension="axes" type="galaxy.datatypes.metagenomics:Axes" display_in_upload="true"/>
+        <datatype extension="sff.flow" type="galaxy.datatypes.metagenomics:SffFlow" display_in_upload="true"/>
+        <datatype extension="tre" type="galaxy.datatypes.metagenomics:Newick" display_in_upload="true"/>
+        <datatype extension="nhx" type="galaxy.datatypes.metagenomics:Nhx" display_in_upload="true"/>
+        <datatype extension="nex" type="galaxy.datatypes.metagenomics:Nexus" display_in_upload="true"/>
+    </registration>
+</datatypes>