Mercurial > repos > iuc > scikit_bio_diversity_beta_diversity
changeset 0:63706c95c9ed draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scikit_bio commit d46d41c5fec10407bd6b5cb77a11d9b43b82b95e
author | iuc |
---|---|
date | Fri, 23 Sep 2016 12:17:38 -0400 |
parents | |
children | 024a9b86f853 |
files | macros.xml scikit_bio_diversity_beta_diversity.py scikit_bio_diversity_beta_diversity.xml test-data/input_abundance_1.tabular test-data/input_tree_1.newick test-data/output_weighted_unifrac_1.tabular |
diffstat | 6 files changed, 341 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
<!-- macros.xml: macros shared by the scikit-bio Galaxy tool wrappers. -->
<macros>
    <!-- Tool dependencies; <yield /> lets an expanding tool append its own requirements. -->
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="0.4.2">scikit-bio</requirement>
            <yield />
        </requirements>
    </xml>

    <!-- print() with %-formatting emits the same text under Python 2 and Python 3;
         the bare print statement is a syntax error on Python 3. -->
    <xml name="version_command">
        <version_command><![CDATA[python -c "import skbio; print('scikit-bio version %s' % skbio.__version__)"]]></version_command>
    </xml>

    <!-- Any non-zero exit code (positive or negative) is matched by these ranges. -->
    <xml name="stdio">
        <stdio>
            <exit_code range="1:" />
            <exit_code range=":-1" />
        </stdio>
    </xml>
    <token name="@VERSION@">0.4.2</token>

    <!-- Tree inputs: a dataset (input_tree) and a free-text Newick string (tree). -->
    <xml name="params_tree">
        <param name="input_tree" type="data" format="txt" label="Newick Tree file" optional="True" help="You can provide a file or a string"/>
        <param name="tree" type="text" value="" label="Newick Tree text" help="You can provide a file or a string">
            <!-- Keep every printable character so Newick punctuation is not stripped. -->
            <sanitizer invalid_char="">
                <valid initial="string.printable"/>
            </sanitizer>
        </param>
    </xml>
    <xml name="citations">
        <citations>
            <citation type="bibtex">@unpublished{scikit-bio:2016,
              title  = "scikit-bio",
              author = "Contributors",
              url    = "http://scikit-bio.org/",
              year   = "2016 (accessed April 1, 2016)"
            }</citation>
        </citations>
    </xml>
</macros>
#!/usr/bin/env python

# File: scikit_bio_diversity_beta_diversity.py
# Reports a beta diversity matrix for tabular input file
# using scikit-bio
# Daniel Blankenberg


import sys
import optparse
import codecs
from skbio.diversity import beta_diversity
from skbio import TreeNode


__VERSION__ = "0.0.1"

DELIMITER = '\t'

# Metrics for which a tree is loaded and passed to beta_diversity as ``tree``.
NEEDS_TREE = [ 'unweighted_unifrac', 'weighted_unifrac' ]

# Metrics for which the OTU names are passed to beta_diversity as ``otu_ids``.
NEEDS_OTU_NAMES = [ 'unweighted_unifrac', 'weighted_unifrac' ]


def _resolve_sample_columns( filename, sample_columns, otu_column ):
    """Return the 0-based indices of the sample columns as a list.

    ``sample_columns`` is the raw comma separated, 1-based option value.
    When it is None, every column found on the first line of ``filename``
    is used, except ``otu_column`` (0-based, or None).
    """
    if sample_columns is not None:
        return [ int( x ) - 1 for x in sample_columns.split( "," ) ]
    with open( filename, 'r' ) as fh:
        first_line = fh.readline()
    # list() so that remove() also works where range() is not a list (Python 3).
    columns = list( range( len( first_line.split( DELIMITER ) ) ) )
    if otu_column in columns:
        columns.remove( otu_column )
    return columns


def _read_abundance( filename, columns, otu_column, has_header ):
    """Read the abundance table and return (sample_names, otu_names, counts).

    ``counts`` holds one list per entry of ``columns``; each list has one
    integer per accepted data line. Lines with too few fields are reported
    on stderr and skipped.
    """
    # Highest index that must exist on a data line. None is left out because
    # it cannot be compared with integers on Python 3.
    needed = list( columns )
    if otu_column is not None:
        needed.append( otu_column )
    max_col = max( needed ) if needed else -1

    counts = [ [] for _ in columns ]
    otu_names = []
    with open( filename, 'r' ) as fh:
        if has_header:
            header = fh.readline().rstrip( '\n\r' ).split( DELIMITER )
            sample_names = [ header[ i ] for i in columns ]
        else:
            sample_names = [ "SAMPLE_%i" % x for x in range( len( columns ) ) ]
        for i, line in enumerate( fh ):
            fields = line.rstrip( '\n\r' ).split( DELIMITER )
            if len( fields ) <= max_col:
                sys.stderr.write( "Bad data line: %r\n" % ( fields, ) )
                continue
            if otu_column is not None:
                otu_names.append( fields[ otu_column ] )
            else:
                # i counts every data line read, including skipped ones.
                otu_names.append( "OTU_%i" % i )
            for j, col in enumerate( columns ):
                counts[ j ].append( int( fields[ col ] ) )
    return sample_names, otu_names, counts


def __main__():
    """Parse the command line, compute the beta diversity matrix and write it.

    Exits through ``parser.error`` (usage message on stderr, status 2) when a
    required option is missing or when a tree-based metric is selected
    without ``--tree``.
    """
    parser = optparse.OptionParser( usage="%prog [options]" )
    parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
    parser.add_option( '-i', '--input', dest='input', action='store', type="string", default=None, help='Input abundance Filename' )
    parser.add_option( '--otu_column', dest='otu_column', action='store', type="int", default=None, help='OTU ID Column (1 based)' )
    parser.add_option( '--sample_columns', dest='sample_columns', action='store', type="string", default=None, help='Comma separated list of sample columns, unset to use all.' )
    parser.add_option( '--header', dest='header', action='store_true', default=False, help='Abundance file has a header line' )
    parser.add_option( '--distance_metric', dest='distance_metric', action='store', type="string", default=None, help='Distance metric to use' )
    parser.add_option( '--tree', dest='tree', action='store', type="string", default=None, help='Newick Tree Filename' )
    parser.add_option( '-o', '--output', dest='output', action='store', type="string", default=None, help='Output Filename' )
    (options, args) = parser.parse_args()
    if options.version:
        sys.stderr.write( "scikit-bio betadiversity from tabular file %s\n" % __VERSION__ )
        sys.exit()

    # Fail with a usage message rather than a traceback from open( None ).
    for required in ( 'input', 'distance_metric', 'output' ):
        if getattr( options, required ) is None:
            parser.error( "--%s is required" % required )

    # The command line is 1-based; everything below uses 0-based indices.
    if options.otu_column is not None:
        otu_column = options.otu_column - 1
    else:
        otu_column = None

    columns = _resolve_sample_columns( options.input, options.sample_columns, otu_column )
    sample_names, otu_names, counts = _read_abundance( options.input, columns, otu_column, options.header )

    extra_kwds = {}
    if options.distance_metric in NEEDS_OTU_NAMES:
        extra_kwds['otu_ids'] = otu_names
    if options.distance_metric in NEEDS_TREE:
        # An explicit check: an assert would be stripped when running with -O.
        if not options.tree:
            parser.error( "You must provide a newick tree when using '%s'" % options.distance_metric )
        # NB: TreeNode apparently needs unicode files
        with codecs.open( options.tree, 'rb', 'utf-8' ) as fh:
            extra_kwds['tree'] = TreeNode.read( fh )

    bd_dm = beta_diversity( options.distance_metric, counts, ids=sample_names, **extra_kwds )
    bd_dm.write( options.output )


if __name__ == "__main__":
    __main__()
<!-- scikit_bio_diversity_beta_diversity.xml: Galaxy wrapper around
     scikit_bio_diversity_beta_diversity.py. -->
<tool id="scikit_bio_diversity_beta_diversity" name="Beta Diversity" version="@VERSION@.0">
    <description>
        using scikit-bio
    </description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <expand macro="stdio" />
    <expand macro="version_command" />
    <!-- The script path is quoted so a tool directory containing whitespace still works. -->
    <command><![CDATA[
        python '${__tool_directory__}/scikit_bio_diversity_beta_diversity.py'
        --input "${input_abundance}"
        #if $otu_column:
            --otu_column "${otu_column}"
        #end if
        #if $sample_columns:
            --sample_columns "${sample_columns}"
        #end if
        ${header}
        --distance_metric "${distance_metric.beta_diversity_method}"
        #if str( $distance_metric.beta_diversity_method ) in [ 'unweighted_unifrac', 'weighted_unifrac' ]:
            --tree
            #if $distance_metric.input_tree:
                "${distance_metric.input_tree}"
            #else:
                "${input_tree_config_file}"
            #end if
        #end if
        --output "${output_beta_diversity}"
        ]]>
    </command>
    <!-- Holds the Newick text typed into the form; passed to the script when no tree dataset is selected. -->
    <configfiles>
        <configfile name="input_tree_config_file">#if str( $distance_metric.beta_diversity_method ) in [ 'unweighted_unifrac', 'weighted_unifrac' ] then $distance_metric.tree else ''#</configfile>
    </configfiles>
    <inputs>
        <!-- The script reads one group (OTU) per row and one sample per column. -->
        <param name="input_abundance" type="data" format="tabular" label="File with abundance values for community" help="Rows are species/phyla/community classifier; columns are samples"/>
        <param name="otu_column" label="Group name column" type="data_column" data_ref="input_abundance" value="1" optional="True" help="Species, phylum, etc"/>
        <param name="sample_columns" label="Select Sample count columns" type="data_column" multiple="True" value="" optional="True" data_ref="input_abundance" help="Leave blank for all"/>
        <param name="header" type="boolean" truevalue="--header" falsevalue="" checked="False" label="Input has a header line"/>
        <conditional name="distance_metric">
            <param name="beta_diversity_method" type="select" multiple="False" label="Diversity index to compute">
                <option value="unweighted_unifrac">unweighted_unifrac</option>
                <option value="weighted_unifrac" selected="True">weighted_unifrac</option>
                <option value="euclidean">euclidean</option>
                <option value="minkowski">minkowski</option>
                <option value="cityblock">cityblock</option>
                <option value="seuclidean">seuclidean</option>
                <option value="sqeuclidean">sqeuclidean</option>
                <option value="cosine">cosine</option>
                <option value="correlation">correlation</option>
                <option value="hamming">hamming</option>
                <option value="jaccard">jaccard</option>
                <option value="chebyshev">chebyshev</option>
                <option value="canberra">canberra</option>
                <option value="braycurtis">braycurtis</option>
                <option value="mahalanobis">mahalanobis</option>
                <option value="yule">yule</option>
                <option value="matching">matching</option>
                <option value="dice">dice</option>
                <option value="kulsinski">kulsinski</option>
                <option value="rogerstanimoto">rogerstanimoto</option>
                <option value="russellrao">russellrao</option>
                <option value="sokalmichener">sokalmichener</option>
                <option value="sokalsneath">sokalsneath</option>
                <option value="wminkowski">wminkowski</option>
            </param>
            <when value="euclidean"/>
            <when value="minkowski"/>
            <when value="cityblock"/>
            <when value="seuclidean"/>
            <when value="sqeuclidean"/>
            <when value="cosine"/>
            <when value="correlation"/>
            <when value="hamming"/>
            <when value="jaccard"/>
            <when value="chebyshev"/>
            <when value="canberra"/>
            <when value="braycurtis"/>
            <when value="mahalanobis"/>
            <when value="yule"/>
            <when value="matching"/>
            <when value="dice"/>
            <when value="kulsinski"/>
            <when value="rogerstanimoto"/>
            <when value="russellrao"/>
            <when value="sokalmichener"/>
            <when value="sokalsneath"/>
            <when value="wminkowski"/>
            <!-- Only the UniFrac metrics expose the tree parameters. -->
            <when value="unweighted_unifrac">
                <expand macro="params_tree" />
            </when>
            <when value="weighted_unifrac">
                <expand macro="params_tree" />
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data format="tabular" name="output_beta_diversity" label="${tool.name} on ${on_string} (${distance_metric.beta_diversity_method})"/>
    </outputs>
    <tests>
        <test>
            <param name="input_abundance" ftype="tabular" value="input_abundance_1.tabular"/>
            <param name="otu_column" value="1"/>
            <param name="sample_columns" value="2,3,4"/>
            <param name="header" value="True"/>
            <conditional name="distance_metric">
                <param name="beta_diversity_method" value="weighted_unifrac"/>
                <param name="input_tree" value="input_tree_1.newick"/>
                <param name="tree" value=""/>
            </conditional>
            <output name="output_beta_diversity" ftype="tabular" file="output_weighted_unifrac_1.tabular" />
        </test>
    </tests>
    <help>
        <![CDATA[

Calculates beta diversity using the selected metric.

        ]]>
    </help>
    <expand macro="citations" />
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input_abundance_1.tabular Fri Sep 23 12:17:38 2016 -0400 @@ -0,0 +1,86 @@ +#ID sample_one sample_two sample_3 +Crenarchaeota 0 0 1 +Euryarchaeota 0 1 0 +AC1 0 1 2 +AD3 1 1 4 +Acidobacteria 13 14 372 +Actinobacteria 16758 1443 101451 +AncK6 0 0 0 +Aquificae 1 0 12 +Armatimonadetes 4 7 13 +BHI80-139 0 0 8 +BRC1 1 5 9 +Bacteroidetes 5868 270336 13264 +CD12 0 0 0 +Caldiserica 0 0 2 +Caldithrix 0 0 0 +Chlamydiae 1 1 13 +Chlorobi 3 9 11 +Chloroflexi 31 21 463 +Chrysiogenetes 0 0 2 +Cyanobacteria 5 16 123 +Deferribacteres 0 1 1 +EM19 0 0 0 +EM3 0 0 0 +Elusimicrobia 4 4 3 +FBP 0 0 0 +FCPU426 0 0 2 +Fibrobacteres 4 9 24 +Firmicutes 136317 71445 302692 +Fusobacteria 1268 1636 5463 +GAL15 0 0 0 +GN01 0 0 4 +GN02 0 3 48 +GN04 2 6 3 +GOUTA4 0 1 0 +Gemmatimonadetes 1 4 46 +H-178 0 0 0 +Hyd24-12 0 0 0 +KSB3 0 0 11 +Kazan-3B-28 0 0 1 +LCP-89 0 0 0 +LD1 1 1 1 +Lentisphaerae 0 2 12 +MAT-CR-M4-B07 0 0 0 +MVP-21 0 0 0 +MVS-104 0 0 0 +NC10 0 0 0 +NKB19 4 11 17 +NPL-UPA2 0 0 0 +Nitrospirae 2 1 9 +OD1 1 3 19 +OP1 2 2 102 +OP11 0 0 15 +OP3 0 1 8 +OP8 1 0 9 +OP9 1 0 57 +OctSpA1-106 0 0 0 +PAUC34f 0 0 0 +Planctomycetes 16 7 131 +Poribacteria 0 0 0 +Proteobacteria 48361 12121 153808 +SAR406 1 2 7 +SBR1093 0 0 3 +SC4 0 0 2 +SR1 16 4 61 +Spirochaetes 6 11 184 +Synergistetes 2 2 13 +TA06 0 0 0 +TM6 0 2 4 +TM7 76 61 2210 +TPD-58 0 0 0 +Tenericutes 2 3 25 +Thermotogae 1 0 11 +VHS-B3-43 0 0 0 +Verrucomicrobia 55 1240 44 +WPS-2 1 0 0 +WS1 1 0 5 +WS2 0 0 2 +WS3 1 3 0 +WS4 0 0 0 +WS5 0 1 1 +WS6 0 0 1 +WWE1 0 0 7 +ZB3 0 0 2 +[Caldithrix] 3 2 4 +[Thermi] 1 1 22
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input_tree_1.newick Fri Sep 23 12:17:38 2016 -0400 @@ -0,0 +1,1 @@ +((Crenarchaeota:1.00000,Euryarchaeota:1.00000,Nanoarchaeota:1.00000,'[Parvarchaeota]':1.00000)Archaea:1.00000,(AC1:1.00000,AD3:1.00000,Acidobacteria:1.00000,Actinobacteria:1.00000,AncK6:1.00000,Aquificae:1.00000,Armatimonadetes:1.00000,BHI80-139:1.00000,BRC1:1.00000,Bacteroidetes:1.00000,CD12:1.00000,Caldiserica:1.00000,Caldithrix:1.00000,Chlamydiae:1.00000,Chlorobi:1.00000,Chloroflexi:1.00000,Chrysiogenetes:1.00000,Cyanobacteria:1.00000,Deferribacteres:1.00000,Dictyoglomi:1.00000,EM19:1.00000,EM3:1.00000,Elusimicrobia:1.00000,FBP:1.00000,FCPU426:1.00000,Fibrobacteres:1.00000,Firmicutes:1.00000,Fusobacteria:1.00000,GAL15:1.00000,GN01:1.00000,GN02:1.00000,GN04:1.00000,GOUTA4:1.00000,Gemmatimonadetes:1.00000,H-178:1.00000,Hyd24-12:1.00000,KSB3:1.00000,Kazan-3B-28:1.00000,LCP-89:1.00000,LD1:1.00000,Lentisphaerae:1.00000,MAT-CR-M4-B07:1.00000,MVP-21:1.00000,MVS-104:1.00000,NC10:1.00000,NKB19:1.00000,NPL-UPA2:1.00000,Nitrospirae:1.00000,OC31:1.00000,OD1:1.00000,OP1:1.00000,OP11:1.00000,OP3:1.00000,OP8:1.00000,OP9:1.00000,OctSpA1-106:1.00000,PAUC34f:1.00000,Planctomycetes:1.00000,Poribacteria:1.00000,Proteobacteria:1.00000,SAR406:1.00000,SBR1093:1.00000,SC4:1.00000,SR1:1.00000,Spirochaetes:1.00000,Synergistetes:1.00000,TA06:1.00000,TM6:1.00000,TM7:1.00000,TPD-58:1.00000,Tenericutes:1.00000,Thermotogae:1.00000,VHS-B3-43:1.00000,Verrucomicrobia:1.00000,WPS-2:1.00000,WS1:1.00000,WS2:1.00000,WS3:1.00000,WS4:1.00000,WS5:1.00000,WS6:1.00000,WWE1:1.00000,ZB3:1.00000,'[Caldithrix]':1.00000,'[Thermi]':1.00000)Bacteria:1.00000)root:1.00000;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_weighted_unifrac_1.tabular Fri Sep 23 12:17:38 2016 -0400 @@ -0,0 +1,4 @@ + sample_one sample_two sample_3 +sample_one 0.0 1.45881907807 0.274219368588 +sample_two 1.45881907807 0.0 1.46956460092 +sample_3 0.274219368588 1.46956460092 0.0