Repository 'scikit_bio_diversity_beta_diversity'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/scikit_bio_diversity_beta_diversity

Changeset 0:63706c95c9ed (2016-09-23)
Next changeset 1:024a9b86f853 (2016-11-14)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scikit_bio commit d46d41c5fec10407bd6b5cb77a11d9b43b82b95e
added:
macros.xml
scikit_bio_diversity_beta_diversity.py
scikit_bio_diversity_beta_diversity.xml
test-data/input_abundance_1.tabular
test-data/input_tree_1.newick
test-data/output_weighted_unifrac_1.tabular
b
diff -r 000000000000 -r 63706c95c9ed macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Fri Sep 23 12:17:38 2016 -0400
[
@@ -0,0 +1,39 @@
+<macros>
+    <!-- Shared macros and tokens imported by the scikit-bio tool wrapper. -->
+
+    <!-- Package dependency; the yield placeholder lets an expanding tool add further requirements. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="0.4.2">scikit-bio</requirement>
+            <yield />
+        </requirements>
+    </xml>
+
+    <!-- Prints the installed scikit-bio version.
+         NOTE(review): the snippet uses the Python 2 print statement, so it fails when "python" resolves to Python 3. -->
+    <xml name="version_command">
+        <version_command><![CDATA[python -c "import skbio;print 'scikit-bio version', skbio.__version__"]]></version_command>
+    </xml>
+
+    <!-- Matches every non-zero exit code (positive and negative); no explicit level is set here. -->
+    <xml name="stdio">
+        <stdio>
+            <exit_code range="1:" />
+            <exit_code range=":-1" />
+        </stdio>
+    </xml>
+    <!-- Same value as the scikit-bio requirement version above; keep the two in sync. -->
+    <token name="@VERSION@">0.4.2</token>
+
+    <!-- Two alternative ways to supply a Newick tree: an optional dataset or pasted text. -->
+    <xml name="params_tree">
+        <param name="input_tree" type="data" format="txt" label="Newick Tree file" optional="True" help="You can provide a file or a string"/>
+        <param name="tree" type="text" value="" label="Newick Tree text" help="You can provide a file or a string">
+            <!-- All printable characters are declared valid; invalid_char is the empty string. -->
+            <sanitizer invalid_char="">
+                <valid initial="string.printable"/>
+            </sanitizer>
+        </param>
+    </xml>
+    <!-- BibTeX citation for scikit-bio. -->
+    <xml name="citations">
+        <citations>
+            <citation type="bibtex">@unpublished{scikit-bio:2016,
+      title  = "scikit-bio",
+      author = "Contributors",
+      url    = "http://scikit-bio.org/",
+      year   = "2016 (accessed April 1, 2016)"
+    }</citation>
+        </citations>
+    </xml>
+</macros>
b
diff -r 000000000000 -r 63706c95c9ed scikit_bio_diversity_beta_diversity.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scikit_bio_diversity_beta_diversity.py Fri Sep 23 12:17:38 2016 -0400
[
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+
+# Reports a beta diversity matrix for tabular input file
+# using scikit-bio
+# Daniel Blankenberg
+
+
+import sys
+import optparse
+import codecs
+from skbio.diversity import beta_diversity
+from skbio import TreeNode
+
+
+__VERSION__ = "0.0.1"  # version of this script, printed by --version
+
+DELIMITER = '\t'  # field separator used to split lines of the abundance file
+
+NEEDS_TREE = [ 'unweighted_unifrac', 'weighted_unifrac' ]  # metrics for which a newick tree (--tree) must be supplied
+
+NEEDS_OTU_NAMES = [ 'unweighted_unifrac', 'weighted_unifrac' ]  # metrics for which the OTU names are passed on as otu_ids
+
+
+def __main__():
+    parser = optparse.OptionParser( usage="%prog [options]" )
+    parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
+    parser.add_option( '-i', '--input', dest='input', action='store', type="string", default=None, help='Input abundance Filename' )
+    parser.add_option( '', '--otu_column', dest='otu_column', action='store', type="int", default=None, help='OTU ID Column (1 based)' )
+    parser.add_option( '', '--sample_columns', dest='sample_columns', action='store', type="string", default=None, help='Comma separated list of sample columns, unset to use all.' )
+    parser.add_option( '', '--header', dest='header', action='store_true', default=False, help='Abundance file has a header line' )
+    parser.add_option( '', '--distance_metric', dest='distance_metric', action='store', type="string", default=None, help='Distance metric to use' )
+    parser.add_option( '', '--tree', dest='tree', action='store', type="string", default=None, help='Newick Tree Filename' )
+    parser.add_option( '-o', '--output', dest='output', action='store', type="string", default=None, help='Output Filename' )
+    (options, args) = parser.parse_args()
+    if options.version:
+        print >> sys.stderr, "scikit-bio betadiversity from tabular file", __VERSION__
+        sys.exit()
+
+    if options.otu_column is not None:
+        otu_column = options.otu_column - 1
+    else:
+        otu_column = None
+
+    if options.sample_columns is None:
+        with open( options.input, 'rb' ) as fh:
+            line = fh.readline()
+            columns = range( len( line.split( DELIMITER ) ) )
+            if otu_column in columns:
+                columns.remove( otu_column )
+    else:
+        columns = map( lambda x: int( x ) - 1, options.sample_columns.split( "," ) )
+
+    max_col = max( columns + [otu_column] )
+    counts = [ [] for x in columns ]
+    sample_names = []
+    otu_names = []
+    with open( options.input, 'rb' ) as fh:
+        if options.header:
+            header = fh.readline().rstrip('\n\r').split( DELIMITER )
+            sample_names = [ header[i] for i in columns ]
+        else:
+            sample_names = [ "SAMPLE_%i" % x for x in range( len( columns ) ) ]
+        for i, line in enumerate( fh ):
+            fields = line.rstrip('\n\r').split( DELIMITER )
+            if len(fields) <= max_col:
+                print >> sys.stederr, "Bad data line: ", fields
+                continue
+            if otu_column is not None:
+                otu_names.append( fields[ otu_column ] )
+            else:
+                otu_names.append( "OTU_%i" % i )
+            for j, col in enumerate( columns ):
+                counts[ j ].append( int( fields[ col ] ) )
+
+    extra_kwds = {}
+    if options.distance_metric in NEEDS_OTU_NAMES:
+        extra_kwds['otu_ids'] = otu_names
+    if options.distance_metric in NEEDS_TREE:
+        assert options.tree, Exception( "You must provide a newick tree when using '%s'" % options.distance_metric )
+        # NB: TreeNode apparently needs unicode files
+        with codecs.open( options.tree, 'rb', 'utf-8' ) as fh:
+            extra_kwds['tree'] = TreeNode.read( fh )
+
+    bd_dm = beta_diversity( options.distance_metric, counts, ids=sample_names, **extra_kwds )
+    bd_dm.write( options.output )
+
+if __name__ == "__main__":  # only run when executed as a script, not when imported
+    __main__()
b
diff -r 000000000000 -r 63706c95c9ed scikit_bio_diversity_beta_diversity.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scikit_bio_diversity_beta_diversity.xml Fri Sep 23 12:17:38 2016 -0400
[
@@ -0,0 +1,123 @@
+<tool id="scikit_bio_diversity_beta_diversity" name="Beta Diversity" version="@VERSION@.0">
+    <description>
+        using scikit-bio
+    </description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="stdio" />
+    <expand macro="version_command" />
+    <!-- Runs the bundled Python script. The script path is quoted so that a tool directory
+         containing whitespace still works. The tree option is only passed for the two UniFrac
+         metrics: the tree dataset when one is selected, otherwise the config file below. -->
+    <command><![CDATA[
+        python "${__tool_directory__}/scikit_bio_diversity_beta_diversity.py"
+        --input "${input_abundance}"
+        #if $otu_column:
+            --otu_column "${otu_column}"
+        #end if
+        #if $sample_columns:
+            --sample_columns "${sample_columns}"
+        #end if
+        ${header}
+        --distance_metric "${distance_metric.beta_diversity_method}"
+        #if str( $distance_metric.beta_diversity_method ) in [ 'unweighted_unifrac', 'weighted_unifrac' ]:
+            --tree
+            #if $distance_metric.input_tree:
+                "${distance_metric.input_tree}"
+            #else:
+                "${input_tree_config_file}"
+            #end if
+        #end if
+        --output "${output_beta_diversity}"
+    ]]>
+    </command>
+    <!-- Holds the pasted Newick text for the UniFrac metrics; empty for every other metric. -->
+    <configfiles>
+        <configfile name="input_tree_config_file">#if str( $distance_metric.beta_diversity_method ) in [ 'unweighted_unifrac', 'weighted_unifrac' ] then $distance_metric.tree else ''#</configfile>
+    </configfiles>
+    <inputs>
+        <!-- The script reads one OTU per row and one sample per selected column. -->
+        <param name="input_abundance" type="data" format="tabular" label="File with abundance values for community" help="Rows are species/phyla/community classifiers (OTUs); columns are samples"/>
+        <param name="otu_column" label="Group name column" type="data_column" data_ref="input_abundance" value="1" optional="True" help="Species, phylum, etc"/>
+        <param name="sample_columns" label="Select Sample count columns" type="data_column" multiple="True" value="" optional="True" data_ref="input_abundance" help="Leave blank for all"/>
+        <param name="header" type="boolean" truevalue="--header" falsevalue="" checked="False" label="Input has a header line"/>
+        <!-- Only the two UniFrac metrics take extra inputs (a tree); all other metrics have empty when blocks. -->
+        <conditional name="distance_metric">
+            <param name="beta_diversity_method" type="select" multiple="False" label="Diversity index to compute">
+                <option value="unweighted_unifrac">unweighted_unifrac</option>
+                <option value="weighted_unifrac" selected="True">weighted_unifrac</option>
+                <option value="euclidean">euclidean</option>
+                <option value="minkowski">minkowski</option>
+                <option value="cityblock">cityblock</option>
+                <option value="seuclidean">seuclidean</option>
+                <option value="sqeuclidean">sqeuclidean</option>
+                <option value="cosine">cosine</option>
+                <option value="correlation">correlation</option>
+                <option value="hamming">hamming</option>
+                <option value="jaccard">jaccard</option>
+                <option value="chebyshev">chebyshev</option>
+                <option value="canberra">canberra</option>
+                <option value="braycurtis">braycurtis</option>
+                <option value="mahalanobis">mahalanobis</option>
+                <option value="yule">yule</option>
+                <option value="matching">matching</option>
+                <option value="dice">dice</option>
+                <option value="kulsinski">kulsinski</option>
+                <option value="rogerstanimoto">rogerstanimoto</option>
+                <option value="russellrao">russellrao</option>
+                <option value="sokalmichener">sokalmichener</option>
+                <option value="sokalsneath">sokalsneath</option>
+                <option value="wminkowski">wminkowski</option>
+            </param>
+            <when value="euclidean"/>
+            <when value="minkowski"/>
+            <when value="cityblock"/>
+            <when value="seuclidean"/>
+            <when value="sqeuclidean"/>
+            <when value="cosine"/>
+            <when value="correlation"/>
+            <when value="hamming"/>
+            <when value="jaccard"/>
+            <when value="chebyshev"/>
+            <when value="canberra"/>
+            <when value="braycurtis"/>
+            <when value="mahalanobis"/>
+            <when value="yule"/>
+            <when value="matching"/>
+            <when value="dice"/>
+            <when value="kulsinski"/>
+            <when value="rogerstanimoto"/>
+            <when value="russellrao"/>
+            <when value="sokalmichener"/>
+            <when value="sokalsneath"/>
+            <when value="wminkowski"/>
+            <when value="unweighted_unifrac">
+                <expand macro="params_tree" />
+            </when>
+            <when value="weighted_unifrac">
+                <expand macro="params_tree" />
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output_beta_diversity" label="${tool.name} on ${on_string} (${distance_metric.beta_diversity_method})"/>
+    </outputs>
+    <tests>
+        <!-- Weighted UniFrac over three sample columns, with a header line and a tree dataset. -->
+        <test>
+            <param name="input_abundance" ftype="tabular" value="input_abundance_1.tabular"/>
+            <param name="otu_column" value="1"/>
+            <param name="sample_columns" value="2,3,4"/>
+            <param name="header" value="True"/>
+            <conditional name="distance_metric">
+                <param name="beta_diversity_method" value="weighted_unifrac"/>
+                <param name="input_tree" value="input_tree_1.newick"/>
+                <param name="tree" value=""/>
+            </conditional>
+            <output name="output_beta_diversity" ftype="tabular" file="output_weighted_unifrac_1.tabular" />
+        </test>
+    </tests>
+    <help>
+        <![CDATA[
+        
+Calculates beta diversity using the selected metric.
+
+        ]]>
+    </help>
+    <expand macro="citations" />
+</tool>
b
diff -r 000000000000 -r 63706c95c9ed test-data/input_abundance_1.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_abundance_1.tabular Fri Sep 23 12:17:38 2016 -0400
[
@@ -0,0 +1,86 @@
+#ID sample_one sample_two sample_3
+Crenarchaeota 0 0 1
+Euryarchaeota 0 1 0
+AC1 0 1 2
+AD3 1 1 4
+Acidobacteria 13 14 372
+Actinobacteria 16758 1443 101451
+AncK6 0 0 0
+Aquificae 1 0 12
+Armatimonadetes 4 7 13
+BHI80-139 0 0 8
+BRC1 1 5 9
+Bacteroidetes 5868 270336 13264
+CD12 0 0 0
+Caldiserica 0 0 2
+Caldithrix 0 0 0
+Chlamydiae 1 1 13
+Chlorobi 3 9 11
+Chloroflexi 31 21 463
+Chrysiogenetes 0 0 2
+Cyanobacteria 5 16 123
+Deferribacteres 0 1 1
+EM19 0 0 0
+EM3 0 0 0
+Elusimicrobia 4 4 3
+FBP 0 0 0
+FCPU426 0 0 2
+Fibrobacteres 4 9 24
+Firmicutes 136317 71445 302692
+Fusobacteria 1268 1636 5463
+GAL15 0 0 0
+GN01 0 0 4
+GN02 0 3 48
+GN04 2 6 3
+GOUTA4 0 1 0
+Gemmatimonadetes 1 4 46
+H-178 0 0 0
+Hyd24-12 0 0 0
+KSB3 0 0 11
+Kazan-3B-28 0 0 1
+LCP-89 0 0 0
+LD1 1 1 1
+Lentisphaerae 0 2 12
+MAT-CR-M4-B07 0 0 0
+MVP-21 0 0 0
+MVS-104 0 0 0
+NC10 0 0 0
+NKB19 4 11 17
+NPL-UPA2 0 0 0
+Nitrospirae 2 1 9
+OD1 1 3 19
+OP1 2 2 102
+OP11 0 0 15
+OP3 0 1 8
+OP8 1 0 9
+OP9 1 0 57
+OctSpA1-106 0 0 0
+PAUC34f 0 0 0
+Planctomycetes 16 7 131
+Poribacteria 0 0 0
+Proteobacteria 48361 12121 153808
+SAR406 1 2 7
+SBR1093 0 0 3
+SC4 0 0 2
+SR1 16 4 61
+Spirochaetes 6 11 184
+Synergistetes 2 2 13
+TA06 0 0 0
+TM6 0 2 4
+TM7 76 61 2210
+TPD-58 0 0 0
+Tenericutes 2 3 25
+Thermotogae 1 0 11
+VHS-B3-43 0 0 0
+Verrucomicrobia 55 1240 44
+WPS-2 1 0 0
+WS1 1 0 5
+WS2 0 0 2
+WS3 1 3 0
+WS4 0 0 0
+WS5 0 1 1
+WS6 0 0 1
+WWE1 0 0 7
+ZB3 0 0 2
+[Caldithrix] 3 2 4
+[Thermi] 1 1 22
b
diff -r 000000000000 -r 63706c95c9ed test-data/input_tree_1.newick
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_tree_1.newick Fri Sep 23 12:17:38 2016 -0400
[
@@ -0,0 +1,1 @@
+((Crenarchaeota:1.00000,Euryarchaeota:1.00000,Nanoarchaeota:1.00000,'[Parvarchaeota]':1.00000)Archaea:1.00000,(AC1:1.00000,AD3:1.00000,Acidobacteria:1.00000,Actinobacteria:1.00000,AncK6:1.00000,Aquificae:1.00000,Armatimonadetes:1.00000,BHI80-139:1.00000,BRC1:1.00000,Bacteroidetes:1.00000,CD12:1.00000,Caldiserica:1.00000,Caldithrix:1.00000,Chlamydiae:1.00000,Chlorobi:1.00000,Chloroflexi:1.00000,Chrysiogenetes:1.00000,Cyanobacteria:1.00000,Deferribacteres:1.00000,Dictyoglomi:1.00000,EM19:1.00000,EM3:1.00000,Elusimicrobia:1.00000,FBP:1.00000,FCPU426:1.00000,Fibrobacteres:1.00000,Firmicutes:1.00000,Fusobacteria:1.00000,GAL15:1.00000,GN01:1.00000,GN02:1.00000,GN04:1.00000,GOUTA4:1.00000,Gemmatimonadetes:1.00000,H-178:1.00000,Hyd24-12:1.00000,KSB3:1.00000,Kazan-3B-28:1.00000,LCP-89:1.00000,LD1:1.00000,Lentisphaerae:1.00000,MAT-CR-M4-B07:1.00000,MVP-21:1.00000,MVS-104:1.00000,NC10:1.00000,NKB19:1.00000,NPL-UPA2:1.00000,Nitrospirae:1.00000,OC31:1.00000,OD1:1.00000,OP1:1.00000,OP11:1.00000,OP3:1.00000,OP8:1.00000,OP9:1.00000,OctSpA1-106:1.00000,PAUC34f:1.00000,Planctomycetes:1.00000,Poribacteria:1.00000,Proteobacteria:1.00000,SAR406:1.00000,SBR1093:1.00000,SC4:1.00000,SR1:1.00000,Spirochaetes:1.00000,Synergistetes:1.00000,TA06:1.00000,TM6:1.00000,TM7:1.00000,TPD-58:1.00000,Tenericutes:1.00000,Thermotogae:1.00000,VHS-B3-43:1.00000,Verrucomicrobia:1.00000,WPS-2:1.00000,WS1:1.00000,WS2:1.00000,WS3:1.00000,WS4:1.00000,WS5:1.00000,WS6:1.00000,WWE1:1.00000,ZB3:1.00000,'[Caldithrix]':1.00000,'[Thermi]':1.00000)Bacteria:1.00000)root:1.00000;
b
diff -r 000000000000 -r 63706c95c9ed test-data/output_weighted_unifrac_1.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_weighted_unifrac_1.tabular Fri Sep 23 12:17:38 2016 -0400
b
@@ -0,0 +1,4 @@
+ sample_one sample_two sample_3
+sample_one 0.0 1.45881907807 0.274219368588
+sample_two 1.45881907807 0.0 1.46956460092
+sample_3 0.274219368588 1.46956460092 0.0