changeset 0:ccba8612695e draft

Uploaded
author qfab
date Thu, 29 May 2014 00:39:20 -0400
parents
children 6a7d52777409
files metagenomics_datatypes/ReadMe.txt metagenomics_datatypes/datatypes_conf.xml metagenomics_datatypes/metagenomics.py
diffstat 3 files changed, 443 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metagenomics_datatypes/ReadMe.txt	Thu May 29 00:39:20 2014 -0400
@@ -0,0 +1,29 @@
+Galaxy datatypes required by the Metagenomics Workflow
+======================================================================
+
+This wrapper for the metagenomics datatypes is based on the Mothur 
+toolsuite wrapper of James E Johnson - University of Minnesota.
+
+This wrapper will automatically register the datatypes required by
+the Metagenomics Workflow.
+
+
+Disclaimer
+======================================================================
+
+This source code is provided by QFAB Bioinformatics "as is", in the hope 
+that it will be useful, and any express or implied warranties, including, 
+but not limited to, the implied warranties of merchantability and fitness 
+for a particular purpose are disclaimed.
+IN NO EVENT SHALL QFAB BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES HOWEVER CAUSED AND ON ANY
+THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOURCE
+CODE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+License
+======================================================================
+This work by QFAB Bioinformatics (as part of the GVL project 
+http://genome.edu.au) is licensed under a Creative Commons 
+Attribution-NonCommercial-ShareAlike 4.0 International License.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metagenomics_datatypes/datatypes_conf.xml	Thu May 29 00:39:20 2014 -0400
@@ -0,0 +1,30 @@
<?xml version="1.0"?>
<datatypes>
    <datatype_files>
        <datatype_file name="metagenomics.py"/>
    </datatype_files>
    <registration>
        <datatype extension="rabund" type="galaxy.datatypes.metagenomics:Rabund" display_in_upload="true"/>
        <datatype extension="otu" type="galaxy.datatypes.metagenomics:Otu" display_in_upload="true"/>
        <datatype extension="list" type="galaxy.datatypes.metagenomics:OtuList" display_in_upload="true"/>
        <datatype extension="sabund" type="galaxy.datatypes.metagenomics:Sabund" display_in_upload="true"/>
        <!-- duplicate rabund registration removed (was listed twice) -->
        <!-- was GroupRabund, which does not exist: metagenomics.py defines GroupAbund -->
        <datatype extension="grpabund" type="galaxy.datatypes.metagenomics:GroupAbund" display_in_upload="true"/>
        <datatype extension="shared" type="galaxy.datatypes.metagenomics:SharedRabund" display_in_upload="true"/>
        <datatype extension="relabund" type="galaxy.datatypes.metagenomics:RelAbund" display_in_upload="true"/>
        <datatype extension="accnos" type="galaxy.datatypes.metagenomics:AccNos" display_in_upload="true"/>
        <!-- NOTE(review): tre is bound to galaxy.datatypes.data:Newick here, yet the
             sniffer below uses metagenomics:Newick (defined in metagenomics.py) - confirm
             which class should own the 'tre' extension -->
        <datatype extension="tre" type="galaxy.datatypes.data:Newick" display_in_upload="true"/>
        <datatype extension="nhx" type="galaxy.datatypes.metagenomics:Nhx" display_in_upload="true"/>
        <!-- NOTE(review): no Nexus class is defined in metagenomics.py - confirm before use -->
        <datatype extension="nex" type="galaxy.datatypes.metagenomics:Nexus" display_in_upload="true"/>
    </registration>
    <sniffers>
        <sniffer type="galaxy.datatypes.metagenomics:Rabund"/>
        <sniffer type="galaxy.datatypes.metagenomics:Otu"/>
        <sniffer type="galaxy.datatypes.metagenomics:Sabund"/>
        <sniffer type="galaxy.datatypes.metagenomics:GroupAbund"/>
        <sniffer type="galaxy.datatypes.metagenomics:SharedRabund"/>
        <sniffer type="galaxy.datatypes.metagenomics:RelAbund"/>
        <sniffer type="galaxy.datatypes.metagenomics:Newick"/>
        <!-- NOTE(review): no Nexus class is defined in metagenomics.py - confirm before use -->
        <sniffer type="galaxy.datatypes.metagenomics:Nexus"/>
    </sniffers>
</datatypes>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metagenomics_datatypes/metagenomics.py	Thu May 29 00:39:20 2014 -0400
@@ -0,0 +1,384 @@
+"""
+metagenomics class
+"""
+
+import logging, os, os.path, sys, time, tempfile, shutil, string, glob, re
+import galaxy.model
+from galaxy.datatypes.sniff import *
+from galaxy.datatypes.metadata import MetadataElement
+from galaxy.datatypes.data import Text, Data
+from galaxy.datatypes.tabular import Tabular
+from galaxy.datatypes.sequence import Fasta
+from galaxy import util
+from galaxy.datatypes.images import Html
+
+log = logging.getLogger(__name__)
+
class Otu( Text ):
    """mothur OTU (operational taxonomic unit) table: label<TAB>count[<TAB>...]."""
    file_ext = 'otu'
    MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=True, no_value=0 )
    MetadataElement( name="labels", default=[], desc="Label Names", readonly=True, visible=True, no_value=[] )

    def __init__(self, **kwd):
        Text.__init__( self, **kwd )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Scan the dataset once, recording the data/comment line counts, the
        maximum column count, and the sorted set of distance labels (column 1)."""
        if dataset.has_data():
            label_names = set()
            ncols = 0
            data_lines = 0
            comment_lines = 0
            # 'with' guarantees the handle is closed even if iteration fails
            # (the original's finally clause could raise NameError when open()
            # itself failed, because fh was never bound).
            with open( dataset.file_name ) as fh:
                for line in fh:
                    fields = line.strip().split('\t')
                    if len(fields) >= 2:
                        data_lines += 1
                        ncols = max(ncols, len(fields))
                        label_names.add(fields[0])
                    else:
                        # a line without at least label + count is not data
                        comment_lines += 1
            # Set the discovered metadata values for the dataset
            dataset.metadata.data_lines = data_lines
            dataset.metadata.columns = ncols
            dataset.metadata.labels = sorted(label_names)

    def sniff( self, filename ):
        """
        Determine whether the file is in otu (operational taxonomic unit) format:
        label<TAB>otu_count[<TAB>name(1..otu_count)]
        Accepts after five structurally valid lines, or fewer if the file ends first.
        """
        try:
            with open( filename ) as fh:
                count = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        break  # blank line treated as end of data, as before
                    if line[0] != '@':
                        linePieces = line.split('\t')
                        if len(linePieces) < 2:
                            return False
                        try:
                            # column 2 is the OTU count; the line must hold
                            # exactly label + count + <count> names
                            check = int(linePieces[1])
                            if check + 2 != len(linePieces):
                                return False
                        except ValueError:
                            return False
                        count += 1
                        if count == 5:
                            return True
            return 0 < count < 5
        except Exception:
            # unreadable file -> not this datatype (original raised NameError
            # from 'finally: fh.close()' when open() failed)
            return False
+
class OtuList( Otu ):
    """
    mothur list file (http://www.mothur.org/wiki/List_file):
    distance_label<TAB>otu_count<TAB>OTU1<TAB>OTU2 ... OTUn
    Column 1 is the distance at which sequences were assigned to OTUs,
    column 2 the number of OTUs formed, and each remaining column holds the
    comma-separated sequence names belonging to one OTU.
    """
    file_ext = 'list'

    def __init__(self, **kwd):
        Otu.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        Otu.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        # Delegate to Otu, forwarding the caller's overwrite flag (the original
        # hardcoded overwrite=True).  Per-column metadata is deliberately not
        # set: list files have too many columns to store per-column types.
        Otu.set_meta( self, dataset, overwrite = overwrite, **kwd )
+
class Sabund( Otu ):
    """
    mothur species-abundance file (http://www.mothur.org/wiki/Sabund_file):
    label<TAB>count[<TAB>value(1..count)]
    """
    file_ext = 'sabund'

    def __init__(self, **kwd):
        Otu.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        Otu.init_meta( self, dataset, copy_from=copy_from )

    def sniff( self, filename ):
        """
        Determine whether the file is in sabund format:
        label<TAB>count[<TAB>value(1..n)]
        All abundance values must parse as integers.  Accepts after five valid
        lines, or fewer if the file ends first.
        """
        try:
            with open( filename ) as fh:
                count = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        break  # blank line treated as end of data, as before
                    if line[0] != '@':
                        linePieces = line.split('\t')
                        if len(linePieces) < 2:
                            return False
                        try:
                            # column 2 is the bin count; the remaining columns
                            # must be exactly that many integer abundances
                            check = int(linePieces[1])
                            if check + 2 != len(linePieces):
                                return False
                            for piece in linePieces[2:]:
                                int(piece)
                        except ValueError:
                            return False
                        count += 1
                        if count >= 5:
                            return True
            return 0 < count < 5
        except Exception:
            # unreadable file -> not this datatype (original raised NameError
            # from 'finally: fh.close()' when open() failed)
            return False
+
class Rabund( Sabund ):
    """
    mothur rank-abundance file (http://www.mothur.org/wiki/Rabund_file).
    Same layout and sniffing rules as a sabund file; only the extension differs.
    """
    file_ext = 'rabund'

    def __init__(self, **kwd):
        Sabund.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        Sabund.init_meta( self, dataset, copy_from=copy_from )
+
class GroupAbund( Otu ):
    """
    Group abundance table: label<TAB>group<TAB>count[<TAB>value(1..n)].
    Since mothur v1.20 the first line may be a 'label<TAB>Group...' heading.
    """
    file_ext = 'grpabund'
    MetadataElement( name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[] )

    def __init__(self, **kwd):
        Otu.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        Otu.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, skip=1, max_data_lines = 100000, **kwd ):
        """Collect line counts plus label (column 1) and group (column 2) names,
        detecting whether the first line is a 'label<TAB>Group' header."""
        if dataset.has_data():
            label_names = set()
            group_names = set()
            data_lines = 0
            comment_lines = 0
            ncols = 0
            with open( dataset.file_name ) as fh:
                first = True
                for line in fh:
                    fields = line.strip().split('\t')
                    ncols = max(ncols, len(fields))
                    if len(fields) < 2:
                        # guard: the original indexed fields[1] unconditionally
                        # and raised IndexError on short or empty lines
                        comment_lines += 1
                    elif first and fields[0] == 'label' and fields[1] == 'Group':
                        skip = 1
                        comment_lines += 1
                    else:
                        if first:
                            skip = 0
                        data_lines += 1
                        label_names.add(fields[0])
                        group_names.add(fields[1])
                    first = False
            # Set the discovered metadata values for the dataset
            dataset.metadata.data_lines = data_lines
            dataset.metadata.columns = ncols
            dataset.metadata.labels = sorted(label_names)
            dataset.metadata.groups = sorted(group_names)
            # NOTE(review): no MetadataElement named 'skip' is declared in this
            # file - presumably supplied by a base class; confirm it persists.
            dataset.metadata.skip = skip

    def sniff( self, filename, vals_are_int=False):
        """
        Determine whether the file is in group-abundance (shared) format:
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line may be column headings (mothur >= 1.20).  Values are
        validated as int or float depending on vals_are_int.
        """
        try:
            with open( filename ) as fh:
                count = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        break  # blank line treated as end of data, as before
                    if line[0] != '@':
                        linePieces = line.split('\t')
                        if len(linePieces) < 3:
                            return False
                        # a 'label ...' heading is only permitted on the first line
                        if count > 0 or linePieces[0] != 'label':
                            try:
                                check = int(linePieces[2])
                                if check + 3 != len(linePieces):
                                    return False
                                for piece in linePieces[3:]:
                                    if vals_are_int:
                                        int(piece)
                                    else:
                                        float(piece)
                            except ValueError:
                                return False
                        count += 1
                        if count >= 5:
                            return True
            return 0 < count < 5
        except Exception:
            # unreadable file -> not this datatype (original raised NameError
            # from 'finally: fh.close()' when open() failed)
            return False
+
class SharedRabund( GroupAbund ):
    """
    mothur shared file (http://www.mothur.org/wiki/Shared_file).
    Analogous to an rabund file, but records how often each OTU is observed
    across multiple samples: column 1 is the list-file label, column 2 the
    group name for the row, column 3 the number of OTUs (and of remaining
    columns), and the remaining columns give the sequence counts per OTU
    for that group.
    """
    file_ext = 'shared'

    def __init__(self, **kwd):
        GroupAbund.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        GroupAbund.init_meta( self, dataset, copy_from=copy_from )

    def sniff( self, filename ):
        """
        Shared-format check: label<TAB>group<TAB>count[<TAB>value(1..n)],
        with a possible heading line (mothur >= 1.20).  Counts in a shared
        file must be integers, hence vals_are_int=True.
        """
        return GroupAbund.sniff( self, filename, True )
+
class RelAbund( GroupAbund ):
    """
    mothur relabund file (http://www.mothur.org/wiki/Relabund_file).
    Structured like a shared file: column 1 is the list-file label, column 2
    the group name, column 3 the number of OTUs (and of remaining columns),
    and the remaining columns give the relative abundance of each OTU for
    that group.
    """
    file_ext = 'relabund'

    def __init__(self, **kwd):
        GroupAbund.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        GroupAbund.init_meta( self, dataset, copy_from=copy_from )

    def sniff( self, filename ):
        """
        Relative-abundance format check: label<TAB>group<TAB>count[<TAB>value(1..n)],
        with a possible heading line (mothur >= 1.20).  Abundances are floats,
        hence vals_are_int=False.
        """
        return GroupAbund.sniff( self, filename, False )
+
class AccNos(Tabular):
    """mothur accnos file: one sequence name per line (a single 'name' column)."""
    file_ext = 'accnos'

    def __init__(self, **kwd):
        """Configure the tabular base for the single 'name' column."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name']
        self.columns = 1
+
class Newick( Text ):
    """
    The Newick Standard for representing trees in computer-readable form makes
    use of the correspondence between trees and nested parentheses.
    http://evolution.genetics.washington.edu/phylip/newicktree.html
    http://en.wikipedia.org/wiki/Newick_format
    Example:
    (B,(A,C,E),D);
    or example with branch lengths:
    (B:6.0,(A:5.0,C:3.0,E:4.0):5.0,D:11.0);
    or an example with embedded comments but no branch lengths:
    ((a [&&PRIME S=x], b [&&PRIME S=y]), c [&&PRIME S=z]);
    Example with a named interior node:
    (B:6.0,(A:5.0,C:3.0,E:4.0)Ancestor1:5.0,D:11.0);
    """
    file_ext = 'tre'

    def __init__(self, **kwd):
        Text.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determine whether the file is in Newick format.
        Heuristic: the last non-space character of a tree should be ';' and
        the first is usually '(' - only up to 2000 bytes from each end of the
        file are examined.
        (,,(,));                               no nodes are named
        (A,B,(C,D));                           leaf nodes are named
        (A,B,(C,D)E)F;                         all nodes are named
        (A:0.1,B:0.2,(C:0.3,D:0.4):0.5);       distances and leaf names (popular)
        """
        if not os.path.exists(filename):
            return False
        try:
            flen = os.path.getsize(filename)
            # renamed from 'len', which shadowed the builtin
            nbytes = min(flen, 2000)
            with open( filename ) as fh:
                # check the end of the file for a semicolon
                fh.seek(-nbytes, os.SEEK_END)
                buf = fh.read(nbytes).strip()
                if not buf.endswith(';'):
                    return False
                # see if the start of the file is an open parenthesis
                if nbytes < flen:
                    fh.seek(0)
                    buf = fh.read(nbytes).strip()
                return buf.startswith('(')
        except Exception:
            # The original's 'finally: close(fh)' raised NameError ('close'
            # is not a builtin), so every sniff that opened a file blew up;
            # 'with' now closes the handle correctly on all paths.
            return False
+
class Nhx( Newick ):
    """
    New Hampshire eXtended format: a Newick tree with extra annotations
    embedded in [&&NHX:...] comment blocks.
    http://evolution.genetics.washington.edu/phylip/newicktree.html
    http://en.wikipedia.org/wiki/Newick_format
    Example:
    (gene1_Hu[&&NHX:S=Hu_Homo_sapiens], (gene2_Hu[&&NHX:S=Hu_Homo_sapiens], gene2_Mu[&&NHX:S=Mu_Mus_musculus]));
    """
    file_ext = 'nhx'