Galaxy |

Changeset 1:db64b6287cd6 (2014-08-20)

Previous changeset 0:e7cd19afda2e (2014-05-13) Next changeset 2:a31c10fe09c8 (2015-07-07)

Commit message:
Modified datatypes

removed:
Readme_lefse_Galaxy.txt
datatypes_conf.xml
format_input.py
format_input.xml
format_input_selector.py
lefse.py
lefse2circlader.py
lefse_met.png
lefse_ove.png
plot_cladogram.py
plot_cladogram.xml
plot_features.py
plot_features.xml
plot_res.py
plot_res.xml
plot_single_feature.xml
qiime2lefse.py
requirements.txt
run_lefse.py
run_lefse.xml
test-data/lefse_input
test-data/lefse_output_a

diff -r e7cd19afda2e -r db64b6287cd6 Readme_lefse_Galaxy.txt
--- a/Readme_lefse_Galaxy.txt Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,29 +0,0 @@
-Installation instructions for lefse in a galaxy environment.
-These instructions require the Mercurial versioning system, galaxy, and an internet connection.
-
-1. In the  "galaxy-dist/tools" directory install lefse by cloning the repository:
-
- bitbucket.org/biobakery/lefse_galaxy
-
-2.  Rename the galaxy_lefse  directory thus created to "lefse"
-
-3. Update member tool_conf.xml  in the galaxy directory adding the following:
-
-  <section name="LEfSe" id="lefse">
-   <tool file="lefse/format_input.xml" />
-   <tool file="lefse/run_lefse.xml" />
-   <tool file="lefse/plot_res.xml" />
-   <tool file="lefse/plot_cladogram.xml" />
-   <tool file="lefse/plot_single_feature.xml" />
-   <tool file="lefse/plot_features.xml" />
-  </section>
-
-
-4. Update member datatypes_conf.xml  in the galaxy directory adding the following:
- <datatype extension="lefse" type="galaxy.datatypes.data:Lefse" display_in_upload="true"/>
-    <datatype extension="lefse_res" type="galaxy.datatypes.tabular:LefseRes" display_in_upload="true"/>
-
-5. Copy the *.png members to galaxy-dist/static/images
-
-6. Recycle galaxy
-

diff -r e7cd19afda2e -r db64b6287cd6 datatypes_conf.xml
--- a/datatypes_conf.xml Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,7 +0,0 @@
-<?xml version="1.0"?>
-<datatypes>
-    <registration>
-    <datatype extension="lefse" type="galaxy.datatypes.data:Lefse" display_in_upload="true"/>
-    <datatype extension="lefse_res" type="galaxy.datatypes.tabular:LefseRes" display_in_upload="true"/>
-    </registration>
-</datatypes>

diff -r e7cd19afda2e -r db64b6287cd6 format_input.py
--- a/format_input.py Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

b'@@ -1,286 +0,0 @@\n-#!/usr/bin/env python\n-\n-import sys,os,argparse,pickle,re\n-\n-def read_input_file(inp_file):\n-\twith open(inp_file) as inp:\n-\t\treturn [[v.strip() for v in line.strip().split("\\t")] for line in inp.readlines()]\n-\n-def transpose(data):\n-\treturn zip(*data)\n-\n-def read_params(args):\n-\tparser = argparse.ArgumentParser(description=\'LEfSe formatting modules\')\n-\tparser.add_argument(\'input_file\', metavar=\'INPUT_FILE\', type=str, help="the input file, feature hierarchical level can be specified with | or . and those symbols must not be present for other reasons in the input file.")\n-\tparser.add_argument(\'output_file\', metavar=\'OUTPUT_FILE\', type=str,\n-\t\thelp="the output file containing the data for LEfSe")\n-\tparser.add_argument(\'--output_table\', type=str, required=False, default="",\n-\t\thelp="the formatted table in txt format")\n-\tparser.add_argument(\'-f\',dest="feats_dir", choices=["c","r"], type=str, default="r",\n-\t\thelp="set whether the features are on rows (default) or on columns")\n-\tparser.add_argument(\'-c\',dest="class", metavar="[1..n_feats]", type=int, default=1,\n-\t\thelp="set which feature use as class (default 1)")\t\n-\tparser.add_argument(\'-s\',dest="subclass", metavar="[1..n_feats]", type=int, default=None,\n-\t\thelp="set which feature use as subclass (default -1 meaning no subclass)")\n-\tparser.add_argument(\'-o\',dest="norm_v", metavar="float", type=float, default=-1.0,\n-\t\thelp="set the normalization value (default -1.0 meaning no normalization)")\n-\tparser.add_argument(\'-u\',dest="subject", metavar="[1..n_feats]", type=int, default=None,\n-\t\thelp="set which feature use as subject (default -1 meaning no subject)")\n-\tparser.add_argument(\'-m\',dest="missing_p", choices=["f","s"], type=str, default="d",\n-\t\thelp="set the policy to adopt with missin values: f removes the features with missing values, s removes samples with missing values (default f)")\n-\tparser.add_argument(\'-n\',dest="subcl_min_card", metavar="int", type=int, default=10,\n-\t\thelp="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)")\n-\n-\targs = parser.parse_args()\n-\t\n-\treturn vars(args)\t\n-\n-def remove_missing(data,roc):\n-\tif roc == "c": data = transpose(data)\n-\tmax_len = max([len(r) for r in data])\n-\tto_rem = []\n-\tfor i,r in enumerate(data):\n-\t\tif len([v for v in r if not( v == "" or v.isspace())]) < max_len: to_rem.append(i)\n-\tif len(to_rem):\n-\t\tfor i in to_rem.reverse():\n-\t\t\tdata.pop(i)\n-\tif roc == "c": return transpose(data)\n-\treturn data\n-\n- \n-def sort_by_cl(data,n,c,s,u): \n-\tdef sort_lines1(a,b): \n- \treturn int(a[c] > b[c])*2-1 \n-\tdef sort_lines2u(a,b): \n- \tif a[c] != b[c]: return int(a[c] > b[c])*2-1 \n-\t\treturn int(a[u] > b[u])*2-1\n-\tdef sort_lines2s(a,b): \n- \tif a[c] != b[c]: return int(a[c] > b[c])*2-1 \n-\t\treturn int(a[s] > b[s])*2-1\n-\tdef sort_lines3(a,b): \n- \tif a[c] != b[c]: return int(a[c] > b[c])*2-1 \n-\t\tif a[s] != b[s]: return int(a[s] > b[s])*2-1 \n-\t\treturn int(a[u] > b[u])*2-1 \n- if n == 3: data.sort(sort_lines3) \n- if n == 2: \n-\t if s is None: \n-\t data.sort(sort_lines2u)\n-\t else:\n-\t data.sort(sort_lines2s)\n- if n == 1: data.sort(sort_lines1) \n-\treturn data \n-\n-def group_small_subclas'..b')[:-1]))]\n-\t\t\t\t\tab =[]\n-\t\t\t\t\tfor l in [f for f in zip(*ab_all)]:\n-\t\t\t\t\t\tab.append(sum([float(ll) for ll in l]))\n-\t\t\t\t\tff[fc] = ab\n-\t\t\t\t\tadded = True\n-\t\t\tif added:\n-\t\t\t\tbreak\n-\t\t\n-\treturn ff\n-\t\t\t\t\n-\t\t\t\t\n-def add_missing_levels(ff):\n-\tif sum( [f.count(".") for f in ff] ) < 1: return ff\n-\t\n-\tclades2leaves = {}\n-\tfor f in ff:\n-\t\tfs = f.split(".")\n-\t\tif len(fs) < 2:\n-\t\t\tcontinue\n-\t\tfor l in range(len(fs)):\n-\t\t\tn = ".".join( fs[:l] )\n-\t\t\tif n in clades2leaves:\n-\t\t\t\tclades2leaves[n].append( f )\n-\t\t\telse:\n-\t\t\t\tclades2leaves[n] = [f]\n-\tfor k,v in clades2leaves.items():\n-\t\tif k and k not in ff:\n-\t\t\tff[k] = [sum(a) for a in zip(*[[float(fn) for fn in ff[vv]] for vv in v])]\n-\treturn ff\n-\n-\t\t\t\n-\n-def modify_feature_names(fn):\n-\tret = fn\n-\n-\tfor v in [\' \',r\'\\$\',r\'\\@\',r\'#\',r\'%\',r\'\\^\',r\'\\&\',r\'\\*\',r\'\\"\',r\'\\\'\']:\n- ret = [re.sub(v,"",f) for f in ret]\n- for v in ["/",r\'\$\',r\'\$\',r\'-\',r\'\\+\',r\'=\',r\'{\',r\'}\',r\'\\[\',r\'\\]\',\n-\t\tr\',\',r\'\\.\',r\';\',r\':\',r\'\\?\',r\'\\<\',r\'\\>\',r\'\\.\',r\'\\,\']:\n- ret = [re.sub(v,"_",f) for f in ret]\n- for v in ["\\|"]:\n- ret = [re.sub(v,".",f) for f in ret]\n-\t\n-\tret2 = []\n-\tfor r in ret:\n-\t\tif r[0] in [\'0\',\'1\',\'2\',\'3\',\'4\',\'5\',\'6\',\'7\',\'8\',\'9\']:\n-\t\t\tret2.append("f_"+r)\n-\t\telse: ret2.append(r)\t\n-\t\t\t\t\n-\treturn ret2 \n-\t\t\n-\n-def rename_same_subcl(cl,subcl):\n-\ttoc = []\n-\tfor sc in set(subcl):\n-\t\tif len(set([cl[i] for i in range(len(subcl)) if sc == subcl[i]])) > 1:\n-\t\t\ttoc.append(sc)\n-\tnew_subcl = []\n-\tfor i,sc in enumerate(subcl):\n-\t\tif sc in toc: new_subcl.append(cl[i]+"_"+sc)\n-\t\telse: new_subcl.append(sc)\n-\treturn new_subcl\n-\n-if __name__ == \'__main__\':\n-\tparams = read_params(sys.argv)\n-\n-\tif type(params[\'subclass\']) is int and int(params[\'subclass\']) < 1:\n-\t\tparams[\'subclass\'] = None\n-\tif type(params[\'subject\']) is int and int(params[\'subject\']) < 1:\n-\t\tparams[\'subject\'] = None\n-\tdata = read_input_file(sys.argv[1])\n-\n-\tif params[\'feats_dir\'] == "c":\n-\t\tdata = transpose(data)\n-\n-\tncl = 1\n-\tif not params[\'subclass\'] is None: ncl += 1\t\n-\tif not params[\'subject\'] is None: ncl += 1\t\n-\n-\tfirst_line = zip(*data)[0]\n-\t\n-\tfirst_line = modify_feature_names(list(first_line))\n-\n-\tdata = zip(\tfirst_line,\n-\t\t\t*sort_by_cl(zip(*data)[1:],\n-\t\t\t ncl,\n-\t\t\t params[\'class\']-1,\n-\t\t\t params[\'subclass\']-1 if not params[\'subclass\'] is None else None,\n-\t\t\t params[\'subject\']-1 if not params[\'subject\'] is None else None))\n-#\tdata.insert(0,first_line)\n-#\tdata = remove_missing(data,params[\'missing_p\'])\n-\tcls = {}\n-\n-\tcls_i = [(\'class\',params[\'class\']-1)]\n-\tif params[\'subclass\'] > 0: cls_i.append((\'subclass\',params[\'subclass\']-1))\n-\tif params[\'subject\'] > 0: cls_i.append((\'subject\',params[\'subject\']-1))\n-\tcls_i.sort(lambda x, y: -cmp(x[1],y[1]))\n-\tfor v in cls_i: cls[v[0]] = data.pop(v[1])[1:]\n-\tif not params[\'subclass\'] > 0: cls[\'subclass\'] = [str(cl)+"_subcl" for cl in cls[\'class\']]\n-\t\n-\tcls[\'subclass\'] = rename_same_subcl(cls[\'class\'],cls[\'subclass\'])\n-#\tif \'subclass\' in cls.keys(): cls = group_small_subclasses(cls,params[\'subcl_min_card\'])\n-\tclass_sl,subclass_sl,class_hierarchy = get_class_slices(zip(*cls.values()))\n- \n-\tfeats = dict([(d[0],d[1:]) for d in data])\n- \n-\tfeats = add_missing_levels(feats)\n- \n-\tfeats = numerical_values(feats,params[\'norm_v\'])\n-\tout = {}\n-\tout[\'feats\'] = feats\n-\tout[\'norm\'] = params[\'norm_v\'] \n-\tout[\'cls\'] = cls\n-\tout[\'class_sl\'] = class_sl\n-\tout[\'subclass_sl\'] = subclass_sl\n-\tout[\'class_hierarchy\'] = class_hierarchy\n-\n-\tif params[\'output_table\']:\n-\t\twith open( params[\'output_table\'], "w") as outf: \n-\t\t\tif \'class\' in cls: outf.write( "\\t".join(list(["class"])+list(cls[\'class\'])) + "\\n" )\n-\t\t\tif \'subclass\' in cls: outf.write( "\\t".join(list(["subclass"])+list(cls[\'subclass\'])) + "\\n" )\n-\t\t\tif \'subject\' in cls: outf.write( "\\t".join(list(["subject"])+list(cls[\'subject\'])) + "\\n" )\n-\t\t\tfor k,v in out[\'feats\'].items(): outf.write( "\\t".join([k]+[str(vv) for vv in v]) + "\\n" )\n-\n-\twith open(params[\'output_file\'], \'wb\') as back_file:\n-\t\tpickle.dump(out,back_file) \t\n-\n'

diff -r e7cd19afda2e -r db64b6287cd6 format_input.xml
--- a/format_input.xml Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

b'@@ -1,219 +0,0 @@\n-<tool id="LEfSe_for" name="A) Format Data for LEfSe" version="1.0">\n- <code file="format_input_selector.py"/> \n-\t<description></description>\n-\n-<command interpreter="python">format_input.py $inp_data $formatted_input -f $cond.feat_dir -c $cond.cls_n -s $cond.subcls_n -u $cond.subj_n -o $norm </command> \n- <inputs>\n- <page>\n-\t<param format="tabular" name="inp_data" type="data" label="Upload a tabular file of relative abundances and class labels (possibly also subclass and subjects labels) for LEfSe - See samples below - Please use Galaxy Get-Data/Upload-File. Use File-Type = Tabular" help=""/>\n-\t<param name="cond" type="data_column" data_ref="inp_data" accept_default="true" /> \n-\n-\t<conditional name="cond" type="data_column" data_ref="inp_data" accept_default="true">\n- <param name="feat_dir" type="select" data_ref="inp_data" label="Select whether the vectors (features and meta-data information) are listed in rows or columns" help="">\n- <option value="r" selected=\'True\'>Rows</option>\n- <option value="c">Columns</option>\n- </param>\n-\n- <when value="r">\n- <param name="cls_n" label="Select which row to use as class" size ="70" type=\'select\' dynamic_options="get_cols(inp_data,\'r\',\'cl\')" />\n- <param name="subcls_n" label="Select which row to use as subclass" type=\'select\' dynamic_options="get_cols(inp_data,\'r\',\'subclass\')" />\n- <param name="subj_n" label="Select which row to use as subject" type=\'select\' dynamic_options="get_cols(inp_data,\'r\',\'subject\')" />\n- </when>\n- <when value="c">\n- <param name="cls_n" label="Select which column to use as class" type=\'select\' dynamic_options="get_cols(inp_data,\'c\',\'cl\')" />\n- <param name="subcls_n" label="Select which column to use as subclass" type=\'select\' dynamic_options="get_cols(inp_data,\'c\',\'subclass\')" />\n- <param name="subj_n" label="Select which column to use as subject" type=\'select\' dynamic_options="get_cols(inp_data,\'c\',\'subject\')" />\n- </when>\n-\n- </conditional>\n-\n-\t<param name="norm" type="select" label="Per-sample normalization of the sum of the values to 1M (recommended when very low values are present)" help="">\n- <option value="1000000.0" selected=\'True\'>Yes</option>\n- <option value="-1">No</option>\n- </param>\n-\n-\n- </page>\n- </inputs>\n- <outputs>\n- <data format="lefse" name="formatted_input" />\n- </outputs>\n- \n- <tests>\n- <test>\n- <param name="inp_data" value="lefse_input" ftype="tabular" />\n- <param name="cond.feat_dir" value="r" />\n-\t <param name="cond.cls_n" value="1" />\n-\t <param name="cond.subcls" value="-1" />\n-\t <param name="cond.subj" value="-1" />\n-\t <param name="norm" value="1000000" />\n- <output name="formatted_input" file="lefse_output_a" />\n- </test>\n- </tests> \n- \n- \n- \n- \n-<help>\n-\t\n-\t\n-**What it does**\n-\n-LDA Effect Size (LEfSe) `(Segata et. al 2010)`_ is an algorithm for high-dimensional biomarker discovery and \n-explanation that identifies genomic features (genes, pathways, or taxa) characterizing \n-the differences between two or more biological conditions (or classes, see figure below). It \n-emphasizes both statistical significance and biological relevance, allowing \n-researchers to identify differentially abundant features that are also consistent with \n-biologically meaningful categories (subclasses). LEfSe first robustly \n-identifies features that are statistically different among biological classes. It then \n-performs additional tests to assess whether these differences are consistent with \n-respect to expected biological behavior. \n-\n-Specifically,'..b'e same name are processed as different subclasses). The subject vector (optional) reports a third dimension denoting meta-data (subject id, sample type, ... ) which is independent from the class and subclass definition. \n-\n-The labels can have a hierarchical organization (see example below) reflecting taxonomies (like NCBI or RDB microbial taxonomy, SEED subsystems or GO terms). The taxonomic levels are specified using the character \\|.\n-\n-The per-sample normalization is usually applied for metagenomic data in which the relative abundances are taken into account. \n-\n-------\n-\n-**Example**\n-\n-Although both column and row feature organization is accepted, given the high-dimensional nature of metagenomic data, the listing of the features in rows is preferred. A partial example of an input file follows (all values are separated by single-tab)::\n-\n-\tbodysite\t\t\t\tmucosal\t\tmucosal\t\tmucosal\t\tmucosal\t\tmucosal\t\tnon_mucosal\tnon_mucosal\tnon_mucosal\tnon_mucosal\tnon_mucosal\n-\tsubsite\t\t\t\t\toral\t\tgut\t\toral\t\toral\t\tgut\t\tskin\t\tnasal\t\tskin\t\tear\t\tnasal\n-\tid\t\t\t\t\t1023\t\t1023\t\t1672\t\t1876\t\t1672\t\t159005010\t1023\t\t1023\t\t1023\t\t1672\n-\tBacteria\t\t\t\t0.99999\t\t0.99999\t\t0.999993\t0.999989\t0.999997\t0.999927\t0.999977\t0.999987\t0.999997\t0.999993\n-\tBacteria|Actinobacteria\t\t\t0.311037\t0.000864363\t0.00446132\t0.0312045\t0.000773642\t0.359354\t0.761108\t0.603002\t0.95913\t\t0.753688\n-\tBacteria|Bacteroidetes\t\t\t0.0689602\t0.804293\t0.00983343\t0.0303561\t0.859838\t0.0195298\t0.0212741\t0.145729\t0.0115617\t0.0114511\n-\tBacteria|Firmicutes\t\t\t0.494223\t0.173411\t0.715345\t0.813046\t0.124552\t0.177961\t0.189178\t0.188964\t0.0226835\t0.192665\n-\tBacteria|Proteobacteria\t\t\t0.0914284\t0.0180378\t0.265664\t0.109549\t0.00941215\t0.430869\t0.0225884\t0.0532684\t0.00512034\t0.0365453\n-\tBacteria|Firmicutes|Clostridia\t\t0.090041\t0.170246\t0.00483188\t0.0465328\t0.122702\t0.0402301\t0.0460614\t0.135201\t0.0115835\t0.0537381\n-\n-In this case one may want to use bodysite as class, subsite as subclass and id as subject. Notice that the features have a hierarchical structure specified using the character \\|.\n-\n-**Example with the "mouse model dataset"**\n-\n-You can try the LEfSe modules using the dataset available here_. This is a 16S dataset from `(Garrett et. al 2010)`_ and `(Veiga et. al 2010)`_ for studying the characteristics of the fecal microbiota in a mouse model of spontaneous colitis. The dataset contains 30 abundance profiles (obtained processing the 16S reads with RDP) belonging to 10 rag2 (control) and 20 truc (case) mice. The metadata consists of class information only, as we don\'t have subject or subclass information. The dataset contains the features organized in rows; you need to select the first row as class, whereas you have to select "no subclass" and "no subject" options.\n- \n- \n-.. _here: http://www.huttenhower.org/webfm_send/73\n-.. _(Segata et. al 2011): http://www.ncbi.nlm.nih.gov/pubmed/21702898\n-.. _(Garrett et. al 2010): http://www.ncbi.nlm.nih.gov/pubmed/20833380\n-.. _(Veiga et. al 2010): http://www.ncbi.nlm.nih.gov/pubmed/20921388\n-.. _contact us: nsegata@hsph.harvard.edu\n-\n-\n-\n-\n-**How to Cite LEfSe**\n-\n-If you find LEfSe usefull in your research please city our paper `(Segata et. al 2010)`_:\n-\n-| `Nicola Segata`_, Jacques Izard, Levi Walron, Dirk Gevers, Larisa Miropolsky, Wendy Garrett, `Curtis Huttenhower`_.\n-| "`Metagenomic Biomarker Discovery and Explanation`_" \n-| Genome Biology, 2011 Jun 24;12(6):R60\n-\n-\n-Please do not hesitate to `contact us`_ for any questions of comments. \n-\n-.. _here: http://www.huttenhower.org/webfm_send/73\n-.. _(Segata et. al 2010): http://www.ncbi.nlm.nih.gov/pubmed/21702898\n-.. _(Garrett et. al 2010): http://www.ncbi.nlm.nih.gov/pubmed/20833380\n-.. _(Veiga et. al 2010): http://www.ncbi.nlm.nih.gov/pubmed/20921388\n-.. _contact us: nsegata@hsph.harvard.edu\n-.. _Nicola Segata: nsegata@hsph.harvard.edu\n-.. _Curtis Huttenhower: chuttenh@hsph.harvard.edu\n-.. _Metagenomic Biomarker Discovery and Explanation: http://genomebiology.com/2011/12/6/R60 \t\n-\t\n-\t\n-\t\n-\n-</help>\n-</tool>\n'

diff -r e7cd19afda2e -r db64b6287cd6 format_input_selector.py
--- a/format_input_selector.py Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,77 +0,0 @@
-from galaxy import datatypes,model
-import sys,string,time
-
-
-def timenow():
-    """return current time as a string
-    """
-    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
-
-def get_opt(data):
- return [('r','r',False),('c','c',False)]
-
-def red(st,l):
- if len(st) <= l: return st
- l1,l2 = l/2,l/2
- return st[:l1]+".."+st[len(st)-l2:]
-
-def get_row_names(data,t):
- if data == "": return []
- max_len =38
- fname = data.dataset.file_name
- opt = []
- rc = ''
-# lines = [(red(v.split()[0],max_len),'%s' % str(v.split()[0]),False) for i,v in enumerate(open(fname))]
- if t == 'b': lines = [(red(v.split()[0],max_len),'%d' % (i+1),False) for i,v in enumerate(open(fname)) if len(v.split()) > 3 ]
- else: lines = [(red(v.split()[0],max_len),'%d' % (i+1),False) for i,v in enumerate(open(fname))]
- return sorted(opt+lines)
-
-def get_cols(data,t,c):
- if data == "": return []
- max_len =32
- fname = data.dataset.file_name
- opt = []
- if c != 'cl':
- opt.append(('no '+c,'%d' % -1,False))
- if t == 'c':
- rc = ''
- lines = [(red((rc+"#"+str(i+1)+":"+v[0]),max_len),'%d' % (i+1),False) for i,v in enumerate(zip(*[line.split() for line in open(fname)]))]
- else:
- rc = ''
- lines = [(red((rc+"#"+str(i+1)+":"+v.split()[0]),max_len),'%d' % (i+1),False) for i,v in enumerate(open(fname))]
- return opt+lines
-
-"""
-def get_phecols(i,addNone,hint):
-   hint = hint.lower()
-   fname = i.dataset.file_name
-   try:
-        f = open(fname,'r')
-   except:
-        return [('get_phecols unable to open file "%s"' % fname,'None',False),]
-   header = f.next()
-   h = header.strip().split()
-   dat = [(x,'%d' % i,False) for i,x in enumerate(h)]
-   matches = [i for i,x in enumerate(h) if x.lower().find(hint) <> -1]
-   if len(matches) > 0:
-       sel = matches[0]
-       dat[sel] = (dat[sel][0],dat[sel][1],True)
-   if addNone:
-        dat.insert(0,('None - no Manhattan plot','0', False ))
-   return dat
-"""
-
-
-"""
-def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
-    outfile = 'out_html'
-    job_name = param_dict.get( 'name', 'Manhattan QQ plots' )
-    killme = string.punctuation + string.whitespace
-    trantab = string.maketrans(killme,'_'*len(killme))
-    newname = '%s.html' % job_name.translate(trantab)
-    data = out_data[outfile]
-    data.name = newname
-    data.info='%s run at %s' % (job_name,timenow())
-    out_data[outfile] = data
-    app.model.context.flush()
-"""

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/datatypes_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/datatypes_conf.xml Wed Aug 20 16:56:51 2014 -0400

@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<datatypes>
+    <registration>
+    <datatype extension="lefse" type="galaxy.datatypes.data:Text"  subclass="true" display_in_upload="true"/>
+    <datatype extension="lefse_internal_res" type="galaxy.datatypes.data:Text" subclass="true" display_in_upload="true"/>
+    <datatype extension="lefse_internal_for" type="galaxy.datatypes.data:Text"  subclass="true" display_in_upload="true"/>
+    </registration>
+</datatypes>

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/format_input.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/format_input.py Wed Aug 20 16:56:51 2014 -0400

[

b'@@ -0,0 +1,286 @@\n+#!/usr/bin/env python\n+\n+import sys,os,argparse,pickle,re\n+\n+def read_input_file(inp_file):\n+\twith open(inp_file) as inp:\n+\t\treturn [[v.strip() for v in line.strip().split("\\t")] for line in inp.readlines()]\n+\n+def transpose(data):\n+\treturn zip(*data)\n+\n+def read_params(args):\n+\tparser = argparse.ArgumentParser(description=\'LEfSe formatting modules\')\n+\tparser.add_argument(\'input_file\', metavar=\'INPUT_FILE\', type=str, help="the input file, feature hierarchical level can be specified with | or . and those symbols must not be present for other reasons in the input file.")\n+\tparser.add_argument(\'output_file\', metavar=\'OUTPUT_FILE\', type=str,\n+\t\thelp="the output file containing the data for LEfSe")\n+\tparser.add_argument(\'--output_table\', type=str, required=False, default="",\n+\t\thelp="the formatted table in txt format")\n+\tparser.add_argument(\'-f\',dest="feats_dir", choices=["c","r"], type=str, default="r",\n+\t\thelp="set whether the features are on rows (default) or on columns")\n+\tparser.add_argument(\'-c\',dest="class", metavar="[1..n_feats]", type=int, default=1,\n+\t\thelp="set which feature use as class (default 1)")\t\n+\tparser.add_argument(\'-s\',dest="subclass", metavar="[1..n_feats]", type=int, default=None,\n+\t\thelp="set which feature use as subclass (default -1 meaning no subclass)")\n+\tparser.add_argument(\'-o\',dest="norm_v", metavar="float", type=float, default=-1.0,\n+\t\thelp="set the normalization value (default -1.0 meaning no normalization)")\n+\tparser.add_argument(\'-u\',dest="subject", metavar="[1..n_feats]", type=int, default=None,\n+\t\thelp="set which feature use as subject (default -1 meaning no subject)")\n+\tparser.add_argument(\'-m\',dest="missing_p", choices=["f","s"], type=str, default="d",\n+\t\thelp="set the policy to adopt with missin values: f removes the features with missing values, s removes samples with missing values (default f)")\n+\tparser.add_argument(\'-n\',dest="subcl_min_card", metavar="int", type=int, default=10,\n+\t\thelp="set the minimum cardinality of each subclass (subclasses with low cardinalities will be grouped together, if the cardinality is still low, no pairwise comparison will be performed with them)")\n+\n+\targs = parser.parse_args()\n+\t\n+\treturn vars(args)\t\n+\n+def remove_missing(data,roc):\n+\tif roc == "c": data = transpose(data)\n+\tmax_len = max([len(r) for r in data])\n+\tto_rem = []\n+\tfor i,r in enumerate(data):\n+\t\tif len([v for v in r if not( v == "" or v.isspace())]) < max_len: to_rem.append(i)\n+\tif len(to_rem):\n+\t\tfor i in to_rem.reverse():\n+\t\t\tdata.pop(i)\n+\tif roc == "c": return transpose(data)\n+\treturn data\n+\n+ \n+def sort_by_cl(data,n,c,s,u): \n+\tdef sort_lines1(a,b): \n+ \treturn int(a[c] > b[c])*2-1 \n+\tdef sort_lines2u(a,b): \n+ \tif a[c] != b[c]: return int(a[c] > b[c])*2-1 \n+\t\treturn int(a[u] > b[u])*2-1\n+\tdef sort_lines2s(a,b): \n+ \tif a[c] != b[c]: return int(a[c] > b[c])*2-1 \n+\t\treturn int(a[s] > b[s])*2-1\n+\tdef sort_lines3(a,b): \n+ \tif a[c] != b[c]: return int(a[c] > b[c])*2-1 \n+\t\tif a[s] != b[s]: return int(a[s] > b[s])*2-1 \n+\t\treturn int(a[u] > b[u])*2-1 \n+ if n == 3: data.sort(sort_lines3) \n+ if n == 2: \n+\t if s is None: \n+\t data.sort(sort_lines2u)\n+\t else:\n+\t data.sort(sort_lines2s)\n+ if n == 1: data.sort(sort_lines1) \n+\treturn data \n+\n+def group_small_subclas'..b')[:-1]))]\n+\t\t\t\t\tab =[]\n+\t\t\t\t\tfor l in [f for f in zip(*ab_all)]:\n+\t\t\t\t\t\tab.append(sum([float(ll) for ll in l]))\n+\t\t\t\t\tff[fc] = ab\n+\t\t\t\t\tadded = True\n+\t\t\tif added:\n+\t\t\t\tbreak\n+\t\t\n+\treturn ff\n+\t\t\t\t\n+\t\t\t\t\n+def add_missing_levels(ff):\n+\tif sum( [f.count(".") for f in ff] ) < 1: return ff\n+\t\n+\tclades2leaves = {}\n+\tfor f in ff:\n+\t\tfs = f.split(".")\n+\t\tif len(fs) < 2:\n+\t\t\tcontinue\n+\t\tfor l in range(len(fs)):\n+\t\t\tn = ".".join( fs[:l] )\n+\t\t\tif n in clades2leaves:\n+\t\t\t\tclades2leaves[n].append( f )\n+\t\t\telse:\n+\t\t\t\tclades2leaves[n] = [f]\n+\tfor k,v in clades2leaves.items():\n+\t\tif k and k not in ff:\n+\t\t\tff[k] = [sum(a) for a in zip(*[[float(fn) for fn in ff[vv]] for vv in v])]\n+\treturn ff\n+\n+\t\t\t\n+\n+def modify_feature_names(fn):\n+\tret = fn\n+\n+\tfor v in [\' \',r\'\\$\',r\'\\@\',r\'#\',r\'%\',r\'\\^\',r\'\\&\',r\'\\*\',r\'\\"\',r\'\\\'\']:\n+ ret = [re.sub(v,"",f) for f in ret]\n+ for v in ["/",r\'\$\',r\'\$\',r\'-\',r\'\\+\',r\'=\',r\'{\',r\'}\',r\'\\[\',r\'\\]\',\n+\t\tr\',\',r\'\\.\',r\';\',r\':\',r\'\\?\',r\'\\<\',r\'\\>\',r\'\\.\',r\'\\,\']:\n+ ret = [re.sub(v,"_",f) for f in ret]\n+ for v in ["\\|"]:\n+ ret = [re.sub(v,".",f) for f in ret]\n+\t\n+\tret2 = []\n+\tfor r in ret:\n+\t\tif r[0] in [\'0\',\'1\',\'2\',\'3\',\'4\',\'5\',\'6\',\'7\',\'8\',\'9\']:\n+\t\t\tret2.append("f_"+r)\n+\t\telse: ret2.append(r)\t\n+\t\t\t\t\n+\treturn ret2 \n+\t\t\n+\n+def rename_same_subcl(cl,subcl):\n+\ttoc = []\n+\tfor sc in set(subcl):\n+\t\tif len(set([cl[i] for i in range(len(subcl)) if sc == subcl[i]])) > 1:\n+\t\t\ttoc.append(sc)\n+\tnew_subcl = []\n+\tfor i,sc in enumerate(subcl):\n+\t\tif sc in toc: new_subcl.append(cl[i]+"_"+sc)\n+\t\telse: new_subcl.append(sc)\n+\treturn new_subcl\n+\n+if __name__ == \'__main__\':\n+\tparams = read_params(sys.argv)\n+\n+\tif type(params[\'subclass\']) is int and int(params[\'subclass\']) < 1:\n+\t\tparams[\'subclass\'] = None\n+\tif type(params[\'subject\']) is int and int(params[\'subject\']) < 1:\n+\t\tparams[\'subject\'] = None\n+\tdata = read_input_file(sys.argv[1])\n+\n+\tif params[\'feats_dir\'] == "c":\n+\t\tdata = transpose(data)\n+\n+\tncl = 1\n+\tif not params[\'subclass\'] is None: ncl += 1\t\n+\tif not params[\'subject\'] is None: ncl += 1\t\n+\n+\tfirst_line = zip(*data)[0]\n+\t\n+\tfirst_line = modify_feature_names(list(first_line))\n+\n+\tdata = zip(\tfirst_line,\n+\t\t\t*sort_by_cl(zip(*data)[1:],\n+\t\t\t ncl,\n+\t\t\t params[\'class\']-1,\n+\t\t\t params[\'subclass\']-1 if not params[\'subclass\'] is None else None,\n+\t\t\t params[\'subject\']-1 if not params[\'subject\'] is None else None))\n+#\tdata.insert(0,first_line)\n+#\tdata = remove_missing(data,params[\'missing_p\'])\n+\tcls = {}\n+\n+\tcls_i = [(\'class\',params[\'class\']-1)]\n+\tif params[\'subclass\'] > 0: cls_i.append((\'subclass\',params[\'subclass\']-1))\n+\tif params[\'subject\'] > 0: cls_i.append((\'subject\',params[\'subject\']-1))\n+\tcls_i.sort(lambda x, y: -cmp(x[1],y[1]))\n+\tfor v in cls_i: cls[v[0]] = data.pop(v[1])[1:]\n+\tif not params[\'subclass\'] > 0: cls[\'subclass\'] = [str(cl)+"_subcl" for cl in cls[\'class\']]\n+\t\n+\tcls[\'subclass\'] = rename_same_subcl(cls[\'class\'],cls[\'subclass\'])\n+#\tif \'subclass\' in cls.keys(): cls = group_small_subclasses(cls,params[\'subcl_min_card\'])\n+\tclass_sl,subclass_sl,class_hierarchy = get_class_slices(zip(*cls.values()))\n+ \n+\tfeats = dict([(d[0],d[1:]) for d in data])\n+ \n+\tfeats = add_missing_levels(feats)\n+ \n+\tfeats = numerical_values(feats,params[\'norm_v\'])\n+\tout = {}\n+\tout[\'feats\'] = feats\n+\tout[\'norm\'] = params[\'norm_v\'] \n+\tout[\'cls\'] = cls\n+\tout[\'class_sl\'] = class_sl\n+\tout[\'subclass_sl\'] = subclass_sl\n+\tout[\'class_hierarchy\'] = class_hierarchy\n+\n+\tif params[\'output_table\']:\n+\t\twith open( params[\'output_table\'], "w") as outf: \n+\t\t\tif \'class\' in cls: outf.write( "\\t".join(list(["class"])+list(cls[\'class\'])) + "\\n" )\n+\t\t\tif \'subclass\' in cls: outf.write( "\\t".join(list(["subclass"])+list(cls[\'subclass\'])) + "\\n" )\n+\t\t\tif \'subject\' in cls: outf.write( "\\t".join(list(["subject"])+list(cls[\'subject\'])) + "\\n" )\n+\t\t\tfor k,v in out[\'feats\'].items(): outf.write( "\\t".join([k]+[str(vv) for vv in v]) + "\\n" )\n+\n+\twith open(params[\'output_file\'], \'wb\') as back_file:\n+\t\tpickle.dump(out,back_file) \t\n+\n'

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/format_input.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/format_input.xml Wed Aug 20 16:56:51 2014 -0400

b'@@ -0,0 +1,219 @@\n+<tool id="LEfSe_for" name="A) Format Data for LEfSe" version="1.0">\n+ <code file="format_input_selector.py"/> \n+\t<description></description>\n+\n+<command interpreter="python">format_input.py $inp_data $formatted_input -f $cond.feat_dir -c $cond.cls_n -s $cond.subcls_n -u $cond.subj_n -o $norm </command> \n+ <inputs>\n+ <page>\n+\t<param format="tabular" name="inp_data" type="data" label="Upload a tabular file of relative abundances and class labels (possibly also subclass and subjects labels) for LEfSe - See samples below - Please use Galaxy Get-Data/Upload-File. Use File-Type = tabular" help=""/>\n+\t<param name="cond" type="data_column" data_ref="inp_data" accept_default="true" /> \n+\n+\t<conditional name="cond" type="data_column" data_ref="inp_data" accept_default="true">\n+ <param name="feat_dir" type="select" data_ref="inp_data" label="Select whether the vectors (features and meta-data information) are listed in rows or columns" help="">\n+ <option value="r" selected=\'True\'>Rows</option>\n+ <option value="c">Columns</option>\n+ </param>\n+\n+ <when value="r">\n+ <param name="cls_n" label="Select which row to use as class" size ="70" type=\'select\' dynamic_options="get_cols(inp_data,\'r\',\'cl\')" />\n+ <param name="subcls_n" label="Select which row to use as subclass" type=\'select\' dynamic_options="get_cols(inp_data,\'r\',\'subclass\')" />\n+ <param name="subj_n" label="Select which row to use as subject" type=\'select\' dynamic_options="get_cols(inp_data,\'r\',\'subject\')" />\n+ </when>\n+ <when value="c">\n+ <param name="cls_n" label="Select which column to use as class" type=\'select\' dynamic_options="get_cols(inp_data,\'c\',\'cl\')" />\n+ <param name="subcls_n" label="Select which column to use as subclass" type=\'select\' dynamic_options="get_cols(inp_data,\'c\',\'subclass\')" />\n+ <param name="subj_n" label="Select which column to use as subject" type=\'select\' dynamic_options="get_cols(inp_data,\'c\',\'subject\')" />\n+ </when>\n+\n+ </conditional>\n+\n+\t<param name="norm" type="select" label="Per-sample normalization of the sum of the values to 1M (recommended when very low values are present)" help="">\n+ <option value="1000000.0" selected=\'True\'>Yes</option>\n+ <option value="-1">No</option>\n+ </param>\n+\n+\n+ </page>\n+ </inputs>\n+ <outputs>\n+ <data format="lefse_internal_for" name="formatted_input" />\n+ </outputs>\n+ \n+ <tests>\n+ <test>\n+ <param name="inp_data" value="lefse_input" ftype="tabular" />\n+ <param name="cond.feat_dir" value="r" />\n+\t <param name="cond.cls_n" value="1" />\n+\t <param name="cond.subcls" value="-1" />\n+\t <param name="cond.subj" value="-1" />\n+\t <param name="norm" value="1000000" />\n+ <output name="formatted_input" file="lefse_output_a" />\n+ </test>\n+ </tests> \n+ \n+ \n+ \n+ \n+<help>\n+\t\n+\t\n+**What it does**\n+\n+LDA Effect Size (LEfSe) `(Segata et. al 2010)`_ is an algorithm for high-dimensional biomarker discovery and \n+explanation that identifies genomic features (genes, pathways, or taxa) characterizing \n+the differences between two or more biological conditions (or classes, see figure below). It \n+emphasizes both statistical significance and biological relevance, allowing \n+researchers to identify differentially abundant features that are also consistent with \n+biologically meaningful categories (subclasses). LEfSe first robustly \n+identifies features that are statistically different among biological classes. It then \n+performs additional tests to assess whether these differences are consistent with \n+respect to expected biological behavior. \n+\n+'..b'e same name are processed as different subclasses). The subject vector (optional) reports a third dimension denoting meta-data (subject id, sample type, ... ) which is independent from the class and subclass definition. \n+\n+The labels can have a hierarchical organization (see example below) reflecting taxonomies (like NCBI or RDB microbial taxonomy, SEED subsystems or GO terms). The taxonomic levels are specified using the character \\|.\n+\n+The per-sample normalization is usually applied for metagenomic data in which the relative abundances are taken into account. \n+\n+------\n+\n+**Example**\n+\n+Although both column and row feature organization is accepted, given the high-dimensional nature of metagenomic data, the listing of the features in rows is preferred. A partial example of an input file follows (all values are separated by single-tab)::\n+\n+\tbodysite\t\t\t\tmucosal\t\tmucosal\t\tmucosal\t\tmucosal\t\tmucosal\t\tnon_mucosal\tnon_mucosal\tnon_mucosal\tnon_mucosal\tnon_mucosal\n+\tsubsite\t\t\t\t\toral\t\tgut\t\toral\t\toral\t\tgut\t\tskin\t\tnasal\t\tskin\t\tear\t\tnasal\n+\tid\t\t\t\t\t1023\t\t1023\t\t1672\t\t1876\t\t1672\t\t159005010\t1023\t\t1023\t\t1023\t\t1672\n+\tBacteria\t\t\t\t0.99999\t\t0.99999\t\t0.999993\t0.999989\t0.999997\t0.999927\t0.999977\t0.999987\t0.999997\t0.999993\n+\tBacteria|Actinobacteria\t\t\t0.311037\t0.000864363\t0.00446132\t0.0312045\t0.000773642\t0.359354\t0.761108\t0.603002\t0.95913\t\t0.753688\n+\tBacteria|Bacteroidetes\t\t\t0.0689602\t0.804293\t0.00983343\t0.0303561\t0.859838\t0.0195298\t0.0212741\t0.145729\t0.0115617\t0.0114511\n+\tBacteria|Firmicutes\t\t\t0.494223\t0.173411\t0.715345\t0.813046\t0.124552\t0.177961\t0.189178\t0.188964\t0.0226835\t0.192665\n+\tBacteria|Proteobacteria\t\t\t0.0914284\t0.0180378\t0.265664\t0.109549\t0.00941215\t0.430869\t0.0225884\t0.0532684\t0.00512034\t0.0365453\n+\tBacteria|Firmicutes|Clostridia\t\t0.090041\t0.170246\t0.00483188\t0.0465328\t0.122702\t0.0402301\t0.0460614\t0.135201\t0.0115835\t0.0537381\n+\n+In this case one may want to use bodysite as class, subsite as subclass and id as subject. Notice that the features have a hierarchical structure specified using the character \\|.\n+\n+**Example with the "mouse model dataset"**\n+\n+You can try the LEfSe modules using the dataset available here_. This is a 16S dataset from `(Garrett et. al 2010)`_ and `(Veiga et. al 2010)`_ for studying the characteristics of the fecal microbiota in a mouse model of spontaneous colitis. The dataset contains 30 abundance profiles (obtained processing the 16S reads with RDP) belonging to 10 rag2 (control) and 20 truc (case) mice. The metadata consists of class information only, as we don\'t have subject or subclass information. The dataset contains the features organized in rows; you need to select the first row as class, whereas you have to select "no subclass" and "no subject" options.\n+ \n+ \n+.. _here: http://www.huttenhower.org/webfm_send/73\n+.. _(Segata et. al 2011): http://www.ncbi.nlm.nih.gov/pubmed/21702898\n+.. _(Garrett et. al 2010): http://www.ncbi.nlm.nih.gov/pubmed/20833380\n+.. _(Veiga et. al 2010): http://www.ncbi.nlm.nih.gov/pubmed/20921388\n+.. _contact us: nsegata@hsph.harvard.edu\n+\n+\n+\n+\n+**How to Cite LEfSe**\n+\n+If you find LEfSe usefull in your research please city our paper `(Segata et. al 2010)`_:\n+\n+| `Nicola Segata`_, Jacques Izard, Levi Walron, Dirk Gevers, Larisa Miropolsky, Wendy Garrett, `Curtis Huttenhower`_.\n+| "`Metagenomic Biomarker Discovery and Explanation`_" \n+| Genome Biology, 2011 Jun 24;12(6):R60\n+\n+\n+Please do not hesitate to `contact us`_ for any questions of comments. \n+\n+.. _here: http://www.huttenhower.org/webfm_send/73\n+.. _(Segata et. al 2010): http://www.ncbi.nlm.nih.gov/pubmed/21702898\n+.. _(Garrett et. al 2010): http://www.ncbi.nlm.nih.gov/pubmed/20833380\n+.. _(Veiga et. al 2010): http://www.ncbi.nlm.nih.gov/pubmed/20921388\n+.. _contact us: nsegata@hsph.harvard.edu\n+.. _Nicola Segata: nsegata@hsph.harvard.edu\n+.. _Curtis Huttenhower: chuttenh@hsph.harvard.edu\n+.. _Metagenomic Biomarker Discovery and Explanation: http://genomebiology.com/2011/12/6/R60 \t\n+\t\n+\t\n+\t\n+\n+</help>\n+</tool>\n'

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/format_input_selector.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/format_input_selector.py Wed Aug 20 16:56:51 2014 -0400

[

@@ -0,0 +1,77 @@
+from galaxy import datatypes,model
+import sys,string,time
+
+
+def timenow():
+    """return current time as a string
+    """
+    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
+
+def get_opt(data):
+ return [('r','r',False),('c','c',False)]
+
+def red(st,l):
+ if len(st) <= l: return st
+ l1,l2 = l/2,l/2
+ return st[:l1]+".."+st[len(st)-l2:]
+
+def get_row_names(data,t):
+ if data == "": return []
+ max_len =38
+ fname = data.dataset.file_name
+ opt = []
+ rc = ''
+# lines = [(red(v.split()[0],max_len),'%s' % str(v.split()[0]),False) for i,v in enumerate(open(fname))]
+ if t == 'b': lines = [(red(v.split()[0],max_len),'%d' % (i+1),False) for i,v in enumerate(open(fname)) if len(v.split()) > 3 ]
+ else: lines = [(red(v.split()[0],max_len),'%d' % (i+1),False) for i,v in enumerate(open(fname))]
+ return sorted(opt+lines)
+
+def get_cols(data,t,c):
+ if data == "": return []
+ max_len =32
+ fname = data.dataset.file_name
+ opt = []
+ if c != 'cl':
+ opt.append(('no '+c,'%d' % -1,False))
+ if t == 'c':
+ rc = ''
+ lines = [(red((rc+"#"+str(i+1)+":"+v[0]),max_len),'%d' % (i+1),False) for i,v in enumerate(zip(*[line.split() for line in open(fname)]))]
+ else:
+ rc = ''
+ lines = [(red((rc+"#"+str(i+1)+":"+v.split()[0]),max_len),'%d' % (i+1),False) for i,v in enumerate(open(fname))]
+ return opt+lines
+
+"""
+def get_phecols(i,addNone,hint):
+   hint = hint.lower()
+   fname = i.dataset.file_name
+   try:
+        f = open(fname,'r')
+   except:
+        return [('get_phecols unable to open file "%s"' % fname,'None',False),]
+   header = f.next()
+   h = header.strip().split()
+   dat = [(x,'%d' % i,False) for i,x in enumerate(h)]
+   matches = [i for i,x in enumerate(h) if x.lower().find(hint) <> -1]
+   if len(matches) > 0:
+       sel = matches[0]
+       dat[sel] = (dat[sel][0],dat[sel][1],True)
+   if addNone:
+        dat.insert(0,('None - no Manhattan plot','0', False ))
+   return dat
+"""
+
+
+"""
+def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr):
+    outfile = 'out_html'
+    job_name = param_dict.get( 'name', 'Manhattan QQ plots' )
+    killme = string.punctuation + string.whitespace
+    trantab = string.maketrans(killme,'_'*len(killme))
+    newname = '%s.html' % job_name.translate(trantab)
+    data = out_data[outfile]
+    data.name = newname
+    data.info='%s run at %s' % (job_name,timenow())
+    out_data[outfile] = data
+    app.model.context.flush()
+"""

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/lefse.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/lefse.py Wed Aug 20 16:56:51 2014 -0400

[

b'@@ -0,0 +1,241 @@\n+import os,sys,math,pickle\n+import random as lrand\n+import rpy2.robjects as robjects\n+import argparse\n+import numpy\n+#import svmutil\n+\n+def init(): \n+\tlrand.seed(1982)\n+\trobjects.r(\'library(splines)\')\n+\trobjects.r(\'library(stats4)\')\n+\trobjects.r(\'library(survival)\')\n+\trobjects.r(\'library(mvtnorm)\')\n+\trobjects.r(\'library(modeltools)\')\n+\trobjects.r(\'library(coin)\')\n+\trobjects.r(\'library(MASS)\')\n+\n+def get_class_means(class_sl,feats):\n+\tmeans = {}\n+\tclk = class_sl.keys()\n+\tfor fk,f in feats.items():\n+\t\tmeans[fk] = [numpy.mean((f[class_sl[k][0]:class_sl[k][1]])) for k in clk] \n+\treturn clk,means \n+\t\n+def save_res(res,filename): \n+\twith open(filename, \'w\') as out:\n+\t\tfor k,v in res[\'cls_means\'].items():\n+\t\t\tout.write(k+"\\t"+str(math.log(max(max(v),1.0),10.0))+"\\t")\n+\t\t\tif k in res[\'lda_res_th\']:\n+\t\t\t\tfor i,vv in enumerate(v):\n+\t\t\t\t\tif vv == max(v):\n+\t\t\t\t\t\tout.write(str(res[\'cls_means_kord\'][i])+"\\t")\n+\t\t\t\t\t\tbreak\n+\t\t\t\tout.write(str(res[\'lda_res\'][k])) \n+\t\t\telse: out.write("\\t")\n+\t\t\tout.write( "\\t" + res[\'wilcox_res\'][k]+"\\n")\n+\n+def load_data(input_file, nnorm = False):\n+\twith open(input_file, \'rb\') as inputf:\n+\t\tinp = pickle.load(inputf)\n+\tif nnorm: return inp[\'feats\'],inp[\'cls\'],inp[\'class_sl\'],inp[\'subclass_sl\'],inp[\'class_hierarchy\'],inp[\'norm\'] \n+\telse: return inp[\'feats\'],inp[\'cls\'],inp[\'class_sl\'],inp[\'subclass_sl\'],inp[\'class_hierarchy\']\n+\n+def load_res(input_file):\n+\twith open(input_file, \'rb\') as inputf:\t\n+\t\tinp = pickle.load(inputf)\n+\treturn inp[\'res\'],inp[\'params\'],inp[\'class_sl\'],inp[\'subclass_sl\']\t\t\n+\n+\n+def test_kw_r(cls,feats,p,factors):\n+\trobjects.globalenv["y"] = robjects.FloatVector(feats)\n+\tfor i,f in enumerate(factors):\n+\t\trobjects.globalenv[\'x\'+str(i+1)] = robjects.FactorVector(robjects.StrVector(cls[f]))\n+\tfo = "y~x1"\n+ #for i,f in enumerate(factors[1:]):\n+ #\tif f == "subclass" and len(set(cls[f])) <= len(set(cls["class"])): continue\n+ #\tif len(set(cls[f])) == len(cls[f]): continue\n+ #\tfo += "+x"+str(i+2)\n+\tkw_res = robjects.r(\'kruskal.test(\'+fo+\',)$p.value\')\n+\treturn float(tuple(kw_res)[0]) < p, float(tuple(kw_res)[0])\n+\n+def test_rep_wilcoxon_r(sl,cl_hie,feats,th,multiclass_strat,mul_cor,fn,min_c,comp_only_same_subcl,curv=False):\n+\tcomp_all_sub = not comp_only_same_subcl\n+\ttot_ok = 0\n+\talpha_mtc = th\n+\tall_diff = []\n+\tfor pair in [(x,y) for x in cl_hie.keys() for y in cl_hie.keys() if x < y]:\n+\t\tdir_cmp = "not_set" #\n+\t\tl_subcl1, l_subcl2 = (len(cl_hie[pair[0]]), len(cl_hie[pair[1]]))\n+\t\tif mul_cor != 0: alpha_mtc = th*l_subcl1*l_subcl2 if mul_cor == 2 else 1.0-math.pow(1.0-th,l_subcl1*l_subcl2)\n+\t\tok = 0\n+\t\tcurv_sign = 0\n+\t\tfirst = True\n+\t\tfor i,k1 in enumerate(cl_hie[pair[0]]):\n+\t\t\tbr = False\n+\t\t\tfor j,k2 in enumerate(cl_hie[pair[1]]):\n+\t\t\t\tif not comp_all_sub and k1[len(pair[0]):] != k2[len(pair[1]):]: \n+\t\t\t\t\tok += 1\t\n+\t\t\t\t\tcontinue \n+\t\t\t\tcl1 = feats[sl[k1][0]:sl[k1][1]]\n+\t\t\t\tcl2 = feats[sl[k2][0]:sl[k2][1]]\n+\t\t\t\tmed_comp = False\n+\t\t\t\tif len(cl1) < min_c or len(cl2) < min_c: \n+\t\t\t\t\tmed_comp = True\n+\t\t\t\tsx,sy = numpy.median(cl1),numpy.median(cl2)\n+\t\t\t\tif cl1[0] == cl2[0] and len(set(cl1)) == 1 and len(set(cl2)) == 1: \n+\t\t\t\t\ttres, first = False, False\n+\t\t\t\telif not med_comp:\n+\t\t\t\t\trobjects.globalenv["x"] = robjects.FloatVector(cl1+cl2)\n+\t\t\t\t\trobjects.globalenv["y"] = robjects.FactorVector(robjects.StrVector(["a" for a in cl1]+["b" for b in cl2]))\t\n+\t\t\t\t\tpv = float(robjects.r(\'pvalue(wilcox_test(x~y,data=data.frame(x,y)))\')[0])\n+\t\t\t\t\ttres = pv < alpha_mtc*2.0\n+\t\t\t\tif first:\n+\t\t\t\t\tfirst = False\n+\t\t\t\t\tif not curv and ( med_comp or tres ): \n+\t\t\t\t\t\tdir_cmp = sx < sy\n+ #if sx == sy: br = True\n+\t\t\t\t\telif curv: \n+\t\t\t\t\t\tdir_cmp = None\n+\t\t\t\t\t\tif med_comp or tres: \n+\t\t\t\t\t\t\tcurv_sign += 1\n+\t\t\t\t\t\t\tdir_cmp = sx < sy\n+\t\t\t\t\telse: br = True\n+\t\t\t\telif not curv and med_comp:\n+\t\t\t\t\tif ((sx < sy) != dir_cmp or sx == sy): br = True\n+\t\t\t\telif curv:\n+\t\t\t\t\tif tres and dir_cmp == None:\n+\t\t\t\t\t\tcurv_sign += 1\n+\t\t\t\t\t\tdir_cmp = sx < sy\n+\t\t\t\t\tif tres and dir_cmp != (sx < sy):\n+\t\t\t'..b' for i,v in enumerate(feats[k]):\n+ if feats[\'class\'][i] == c:\n+ feats[k][i] = math.fabs(feats[k][i] + lrand.normalvariate(0.0,max(feats[k][i]*0.05,0.01)))\n+ rdict = {}\n+ for a,b in feats.items():\n+ if a == \'class\' or a == \'subclass\' or a == \'subject\':\n+ rdict[a] = robjects.StrVector(b)\n+ else: rdict[a] = robjects.FloatVector(b)\n+ robjects.globalenv["d"] = robjects.DataFrame(rdict)\n+ lfk = len(feats[fk[0]])\n+ rfk = int(float(len(feats[fk[0]]))*fract_sample)\n+ f = "class ~ "+fk[0]\n+ for k in fk[1:]: f += " + " + k.strip()\n+ ncl = len(set(cls[\'class\']))\n+ min_cl = int(float(min([cls[\'class\'].count(c) for c in set(cls[\'class\'])]))*fract_sample*fract_sample*0.5) \n+ min_cl = max(min_cl,1) \n+ pairs = [(a,b) for a in set(cls[\'class\']) for b in set(cls[\'class\']) if a > b]\n+\n+\tfor k in fk:\t\n+\t\tfor i in range(boots):\n+\t\t\tmeans[k].append([])\t\n+ for i in range(boots):\n+ for rtmp in range(1000):\n+ rand_s = [lrand.randint(0,lfk-1) for v in range(rfk)]\n+ if not contast_within_classes_or_few_per_class(feats,rand_s,min_cl,ncl): break\n+ rand_s = [r+1 for r in rand_s]\n+\t\tmeans[k][i] = []\n+\t\tfor p in pairs:\n+\t \trobjects.globalenv["rand_s"] = robjects.IntVector(rand_s)\n+ \trobjects.globalenv["sub_d"] = robjects.r(\'d[rand_s,]\')\n+ \tz = robjects.r(\'z <- suppressWarnings(lda(as.formula(\'+f+\'),data=sub_d,tol=\'+str(tol_min)+\'))\')\n+\t\t\trobjects.r(\'w <- z$scaling[,1]\')\n+\t\t\trobjects.r(\'w.unit <- w/sqrt(sum(w^2))\')\n+\t\t\trobjects.r(\'ss <- sub_d[,-match("class",colnames(sub_d))]\')\n+\t\t\tif \'subclass\' in feats:\n+\t\t\t\trobjects.r(\'ss <- ss[,-match("subclass",colnames(ss))]\')\n+\t\t\tif \'subject\' in feats:\n+\t\t\t\trobjects.r(\'ss <- ss[,-match("subject",colnames(ss))]\')\n+\t\t\trobjects.r(\'xy.matrix <- as.matrix(ss)\')\n+\t\t\trobjects.r(\'LD <- xy.matrix%*%w.unit\')\n+\t\t\trobjects.r(\'effect.size <- abs(mean(LD[sub_d[,"class"]=="\'+p[0]+\'"]) - mean(LD[sub_d[,"class"]=="\'+p[1]+\'"]))\')\n+\t\t\tscal = robjects.r(\'wfinal <- w.unit * effect.size\')\n+\t\t\trres = robjects.r(\'mm <- z$means\')\n+\t\t\trowns = list(rres.rownames)\n+\t\t\tlenc = len(list(rres.colnames))\n+\t\t\tcoeff = [abs(float(v)) if not math.isnan(float(v)) else 0.0 for v in scal]\n+ \tres = dict([(pp,[float(ff) for ff in rres.rx(pp,True)] if pp in rowns else [0.0]*lenc ) for pp in [p[0],p[1]]])\n+\t\t\tfor j,k in enumerate(fk):\n+\t\t\t\tgm = abs(res[p[0]][j] - res[p[1]][j])\n+ \tmeans[k][i].append((gm+coeff[j])*0.5)\n+ res = {}\n+ for k in fk:\n+\t\tm = max([numpy.mean([means[k][kk][p] for kk in range(boots)]) for p in range(len(pairs))])\n+ res[k] = math.copysign(1.0,m)*math.log(1.0+math.fabs(m),10)\n+ return res,dict([(k,x) for k,x in res.items() if math.fabs(x) > lda_th])\n+\n+\n+def test_svm(cls,feats,cl_sl,boots,fract_sample,lda_th,tol_min,nsvm):\n+\treturn NULL\n+"""\n+\tfk = feats.keys()\n+\tclss = list(set(cls[\'class\']))\n+\ty = [clss.index(c)*2-1 for c in list(cls[\'class\'])]\n+\txx = [feats[f] for f in fk]\n+\tif nsvm: \n+\t\tmaxs = [max(v) for v in xx]\n+\t\tmins = [min(v) for v in xx]\n+\t\tx = [ dict([(i+1,(v-mins[i])/(maxs[i]-mins[i])) for i,v in enumerate(f)]) for f in zip(*xx)]\n+\telse: x = [ dict([(i+1,v) for i,v in enumerate(f)]) for f in zip(*xx)]\n+\t\n+\tlfk = len(feats[fk[0]])\n+\trfk = int(float(len(feats[fk[0]]))*fract_sample)\n+\tmm = []\n+\n+\tbest_c = svmutil.svm_ms(y, x, [pow(2.0,i) for i in range(-5,10)],\'-t 0 -q\')\n+\tfor i in range(boots):\n+\t\trand_s = [lrand.randint(0,lfk-1) for v in range(rfk)]\n+\t\tr = svmutil.svm_w([y[yi] for yi in rand_s], [x[xi] for xi in rand_s], best_c,\'-t 0 -q\')\n+\t\tmm.append(r[:len(fk)])\n+\tm = [numpy.mean(v) for v in zip(*mm)]\n+\tres = dict([(v,m[i]) for i,v in enumerate(fk)])\n+\treturn res,dict([(k,x) for k,x in res.items() if math.fabs(x) > lda_th])\n+"""\t\n'

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/lefse2circlader.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/lefse2circlader.py Wed Aug 20 16:56:51 2014 -0400

[

@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+from __future__ import with_statement
+
+import sys
+import os
+import argparse
+
+def read_params(args):
+    parser = argparse.ArgumentParser(description='Convert LEfSe output to '
+                        'Circlader input')
+    parser.add_argument(    'inp_f', metavar='INPUT_FILE', nargs='?',
+                            default=None, type=str,
+                            help="the input file [stdin if not present]")
+    parser.add_argument(    'out_f', metavar='OUTPUT_FILE', nargs='?',
+                            default=None, type=str,
+                            help="the output file [stdout if not present]")
+    parser.add_argument('-l', metavar='levels with label', default=0, type=int)
+
+    return vars(parser.parse_args())
+
+def lefse2circlader(par):
+    finp,fout = bool(par['inp_f']), bool(par['out_f'])
+
+    with open(par['inp_f']) if finp else sys.stdin as inpf:
+        put_bm = (l.strip().split('\t') for l in inpf.readlines())
+    biomarkers = [p for p in put_bm if len(p) > 2]
+
+    circ = [    [   b[0],
+                    "" if b[0].count('.') > par['l'] else b[0].split('.')[-1],
+                    b[2],
+                    b[2]+"_col" ] for b in biomarkers]
+
+    with open(par['out_f'],'w') if fout else sys.stdout as out_file:
+        for c in circ:
+            out_file.write( "\t".join( c ) + "\n" )
+
+if __name__ == '__main__':
+    params = read_params(sys.argv)
+    lefse2circlader(params)
+
+
+

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/lefse_ove.png

Binary file home/ubuntu/lefse_to_export/lefse_ove.png has changed

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/plot_cladogram.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/plot_cladogram.py Wed Aug 20 16:56:51 2014 -0400

[

b'@@ -0,0 +1,342 @@\n+#!/usr/bin/env python\n+\n+import os,sys,matplotlib,argparse,string\n+matplotlib.use(\'Agg\')\n+from pylab import *\n+from lefse import *\n+import numpy as np\n+\n+colors = [\'r\',\'g\',\'b\',\'m\',\'c\',[1.0,0.5,0.0],[0.0,1.0,0.0],[0.33,0.125,0.0],[0.75,0.75,0.75],\'k\']\n+dark_colors = [[0.4,0.0,0.0],[0.0,0.2,0.0],[0.0,0.0,0.4],\'m\',\'c\',[1.0,0.5,0.0],[0.0,1.0,0.0],[0.33,0.125,0.0],[0.75,0.75,0.75],\'k\']\n+\n+class CladeNode:\n+\tdef __init__(self, name, abundance, viz = True):\n+ \t\tself.id = name\n+ \t\tself.name = name.split(".")\n+\t\tself.last_name = self.name[-1]\n+\t\tself.abundance = abundance\n+\t\tself.pos = (-1.0,-1.0)\n+\t\tself.children = {}\n+\t\tself.isleaf = True\n+\t\tself.color = \'y\'\n+\t\tself.next_leaf = -1\n+\t\tself.prev_leaf = -1\n+\t\tself.viz = viz\n+\tdef __repr__(self):\n+\t\treturn self.last_name\n+\tdef add_child(self,node):\n+\t\tself.isleaf = False\n+\t\tself.children[node.__repr__()] = node\n+\tdef get_children(self):\n+\t\tck = sorted(self.children.keys())\n+\t\treturn [self.children[k] for k in ck]\n+\tdef get_color(self):\n+\t\treturn self.color\n+\tdef set_color(self,c):\n+\t\tself.color = c\n+\tdef set_pos(self,pos):\n+\t\tself.pos = pos\n+\n+def read_params(args):\n+\tparser = argparse.ArgumentParser(description=\'Cladoplot\')\n+\tparser.add_argument(\'input_file\', metavar=\'INPUT_FILE\', type=str, help="tab delimited input file")\n+\tparser.add_argument(\'output_file\', metavar=\'OUTPUT_FILE\', type=str, help="the file for the output image")\n+\tparser.add_argument(\'--clade_sep\',dest="clade_sep", type=float, default=1.5)\n+\tparser.add_argument(\'--max_lev\',dest="max_lev", type=int, default=-1)\n+\tparser.add_argument(\'--max_point_size\',dest="max_point_size", type=float, default=6.0)\n+\tparser.add_argument(\'--min_point_size\',dest="min_point_size", type=float, default=1)\n+\tparser.add_argument(\'--point_edge_width\',dest="markeredgewidth", type=float, default=.25)\n+\tparser.add_argument(\'--siblings_connector_width\',dest="siblings_connector_width", type=float, default=2)\n+\tparser.add_argument(\'--parents_connector_width\',dest="parents_connector_width", type=float, default=0.75)\n+\tparser.add_argument(\'--radial_start_lev\',dest="radial_start_lev", type=int, default=1)\n+\tparser.add_argument(\'--labeled_start_lev\',dest="labeled_start_lev", type=int, default=2)\n+\tparser.add_argument(\'--labeled_stop_lev\',dest="labeled_stop_lev", type=int, default=5)\n+\tparser.add_argument(\'--abrv_start_lev\',dest="abrv_start_lev", type=int, default=3)\n+\tparser.add_argument(\'--abrv_stop_lev\',dest="abrv_stop_lev", type=int, default=5)\n+\tparser.add_argument(\'--expand_void_lev\',dest="expand_void_lev", type=int, default=1)\n+\tparser.add_argument(\'--class_legend_vis\',dest="class_legend_vis", type=int, default=1)\n+\tparser.add_argument(\'--colored_connector\',dest="colored_connectors", type=int, default=1)\n+\tparser.add_argument(\'--alpha\',dest="alpha", type=float, default=0.2)\n+\tparser.add_argument(\'--title\',dest="title", type=str, default="Cladogram")\n+\tparser.add_argument(\'--sub_clade\',dest="sub_clade", type=str, default="")\n+\tparser.add_argument(\'--title_font_size\',dest="title_font_size", type=str, default="14")\n+\tparser.add_argument(\'--right_space_prop\',dest="r_prop", type=float, default=0.1)\n+\tparser.add_argument(\'--left_space_prop\',dest="l_prop", type=float, default=0.1)\n+\tparser.add_argument(\'--label_font_size\',dest="label_font_size", type=str, default="6")\n+\tparser.add_argument(\'--background_color\',dest="back_color", type=str, choices=["k","w"], default="w", help="set the color of the background")\n+\tparser.add_argument(\'--colored_labels\',dest="col_lab", type=int, choices=[0,1], default=1, help="draw the label with class color (1) or in black (0)")\n+\tparser.add_argument(\'--class_legend_font_size\',dest="class_legend_font_size", type=str, default="10")\n+\tparser.add_argument(\'--dpi\',dest="dpi", type=int, default=72)\n+\tparser.add_argument(\'--format\', dest="format", choices=["png","svg","pdf"], default="svg", type=str, help="the format for the output file")\n+\tparser.add_argument(\'--all_feats\', dest="all_feats", type=str, d'..b'rc_ext = 0.65 if dim > 0.1 else 1.0 \n+\t\tclto = (de-l+1)*dim+dim*(dd+1-(l-dd-1))*perc_ext\n+\t\tclto = (de-l+1)*dim+dim*(dd-(l-params[\'labeled_start_lev\'])+1)*perc_ext\n+\t\tdes = float(180.0*(fr_0+fr_1)/np.pi)*0.5-90\n+\t\tlab = ""\n+\t\ttxt = father.last_name\n+\t\tif params[\'abrv_start_lev\'] < l <= params[\'abrv_stop_lev\'] + 1:\n+\t\t\tide = u_i.next()\n+\t\t\tlab = str(ide)+": "+father.last_name \n+\t\t\ttxt = str(ide)\n+#\t\tax.bar(fr_0, clto, width = fr_1-fr_0, bottom = float(l-1)/float(depth-1), alpha = params[\'alpha\'], color=col, edgecolor=col)\n+\t\tax.bar(fr_0, clto, width = fr_1-fr_0, bottom = float(l-1)/float(de), alpha = params[\'alpha\'], color=col, edgecolor=col)\n+\t\tax.bar(0.0, 0.0, width = 0.0, bottom = 0.0, alpha = 1.0, color=col, edgecolor=params[\'fore_color\'], label=lab)\n+\t\tif l <= params[\'abrv_stop_lev\'] + 1:\n+\t\t\tif not params[\'col_lab\']: col = params[\'fore_color\']\n+\t\t\telse: \n+\t\t\t\tif col not in colors: col = params[\'fore_color\']\n+\t\t\t\telse: col = dark_colors[colors.index(col)%len(dark_colors)]\n+\t\t\tax.text((fr_0+fr_1)*0.5, clto+float(l-1)/float(de)-dim*perc_ext/2.0, txt, size = params[\'label_font_size\'], rotation=des, ha ="center", va="center", color=col)\t\n+ return fr_0, fr_1\n+\n+def draw_tree(out_file,tree,params):\n+\tplt_size = 7\n+\tnlev = tree[\'nlev\']\n+\tpt_scale = (params[\'min_point_size\'],max(1.0,((tree[\'max_abs\']-tree[\'min_abs\']))/(params[\'max_point_size\']-params[\'min_point_size\'])))\n+\tdepth = len(nlev)\n+\tsep = (2.0*np.pi)/float(nlev[-1]) \n+\tseps = [params[\'clade_sep\']*sep/float(depth-i+1) for i in range(1,len(tree[\'nlev\'])+1)]\n+\ttotseps = sum([s*nlev[i] for i,s in enumerate(seps[:-1])])\n+\tclade_sep_err = True if totseps > np.pi else False\n+\twhile totseps > np.pi:\n+\t\tparams[\'clade_sep\'] *= 0.75 \t \n+\t\tseps = [params[\'clade_sep\']*sep/(float(depth-i+1)*0.25) for i in range(1,len(tree[\'nlev\'])+1)]\n+\t\ttotseps = sum([s*nlev[i] for i,s in enumerate(seps[:-1])])\n+\tif clade_sep_err: print \'clade_sep parameter too large, lowered to\',params[\'clade_sep\']\n+\n+\tfig = plt.figure(edgecolor=params[\'back_color\'],facecolor=params[\'back_color\'])\n+\tax = fig.add_subplot(111, polar=True, frame_on=False, axis_bgcolor=params[\'back_color\'] )\n+\tplt.subplots_adjust(right=1.0-params[\'r_prop\'],left=params[\'l_prop\']) \t\n+\tax.grid(False)\n+\txticks([])\n+\tyticks([])\n+\n+\tds = (2.0*np.pi-totseps)/float(nlev[-1])\n+\n+\tadd_all_pos(tree[\'root\'],0.0,ds,seps,0.0,depth)\n+\t\n+\tplot_lines(tree[\'root\'],params,depth,ax,0)\n+\tplot_points(tree[\'root\'],params,pt_scale,ax)\n+\tplot_names(tree[\'root\'],params,depth,ax,uniqueid(),seps)\n+\n+\tr = np.arange(0, 3.0, 0.01)\n+\ttheta = 2*np.pi*r\n+\t\n+\tdef get_col_attr(x):\n+ \t\treturn hasattr(x, \'set_color\') and not hasattr(x, \'set_facecolor\')\n+\n+\th, l = ax.get_legend_handles_labels()\n+\tif len(l) > 0:\n+\t\tleg = ax.legend(bbox_to_anchor=(1.05, 1), frameon=False, loc=2, borderaxespad=0.,prop={\'size\':params[\'label_font_size\']})\n+\t\tif leg != None:\n+\t\t\tgca().add_artist(leg)\n+\t\t\tfor o in leg.findobj(get_col_attr):\n+\t o.set_color(params[\'fore_color\'])\n+\t\n+\tcll = sorted(tree[\'classes\']) if params[\'all_feats\'] == "" else sorted(params[\'all_feats\'].split(":"))\n+\tnll = [ax.bar(0.0, 0.0, width = 0.0, bottom = 0.0, color=colors[i%len(colors)], label=c) for i,c in enumerate(cll) if c in tree[\'classes\']]\n+\tcl = [c for c in cll if c in tree[\'classes\']]\n+\n+\tax.set_title(params[\'title\'],size=params[\'title_font_size\'],color=params[\'fore_color\'])\n+\n+\tif params[\'class_legend_vis\']:\n+\t\tl2 = legend(nll, cl, loc=2, prop={\'size\':params[\'class_legend_font_size\']}, frameon=False)\n+\t\tif l2 != None:\n+\t\t\tfor o in l2.findobj(get_col_attr):\n+ \t\t\t\to.set_color(params[\'fore_color\'])\n+\n+\tplt.savefig(out_file,format=params[\'format\'],facecolor=params[\'back_color\'],edgecolor=params[\'fore_color\'],dpi=params[\'dpi\'])\n+\tplt.close()\t\n+\n+if __name__ == \'__main__\':\n+\tparams = read_params(sys.argv)\n+\tparams[\'fore_color\'] = \'w\' if params[\'back_color\'] == \'k\' else \'k\'\n+\tclad_tree = read_data(params[\'input_file\'],params)\t\n+\tdraw_tree(params[\'output_file\'],clad_tree,params)\n+\t\n'

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/plot_cladogram.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/plot_cladogram.xml Wed Aug 20 16:56:51 2014 -0400

b'@@ -0,0 +1,211 @@\n+<tool id="LEfSe_cla" name="D) Plot Cladogram" version="1.0">\n+ <description></description>\n+<command interpreter="python">plot_cladogram.py $inp_data $cladogram --title "$textm.title" --expand_void_lev $structural.expand_void_lev --max_lev $structural.max_lev --colored_label $graphical.colored_labels --labeled_start_lev $textm.labeled_start_lev --labeled_stop_lev $textm.labeled_stop_lev --abrv_start_lev $textm.abrv_start_lev --abrv_stop_lev $textm.abrv_stop_lev --radial_start_lev $graphical.radial_start_lev --max_point_size $graphical.max_point_size --min_point_size $graphical.min_point_size --point_edge_width $graphical.point_edge_width --siblings_connector_width $graphical.siblings_connector_width --parents_connector_width $graphical.parents_connector_width --alpha $graphical.alpha --clade_sep $graphical.clade_sep --title_font_size $textm.title_font_size --label_font_size $textm.label_font_size --class_legend_font_size $textm.class_legend_font_size --sub_clade "$structural.root" --background_color $graphical.background_color --format $for --right_space_prop $graphical.right_space_prop --left_space_prop 0.01 --dpi $dpi</command> \n+ <inputs>\n+ <page>\n+\t<param format="lefse_internal_res" name="inp_data" type="data" label="Select data" help=""/>\n+\t\t\n+\t<conditional name="structural">\n+\t\t<param name="structural_choice" type="select" label="Set structural parameters of the cladogram" help="">\n+ \t\t<option value="d" selected=\'True\'>Default</option>\n+ \t\t<option value="a">Advanced</option>\n+ \t</param>\n+\t\t<when value="d">\n+\t\t\t<param name="root" type="hidden" value=""/>\n+\t\t\t<param name="expand_void_lev" type="hidden" value="0"/>\n+\t\t\t<param name="max_lev" type="hidden" value="6"/>\n+\t\t</when>\n+\t\t<when value="a">\n+\t\t\t<param name="root" type="text" size="10" value="" label="Root of the tree (insert the full name of the clade separating the levels with \'.\', nothing means the highest level clade)"/>\n+\t\t\t<param name="expand_void_lev" type="select" label="Expand terminal non-leaf levels">\n+\t\t\t\t<option value="1" >Yes</option>\n+ \t <option value="0" selected=\'True\'>No</option>\n+\t </param>\n+\t\t\t<param name="max_lev" type="integer" size="2" value="6" label="Maximum number of taxonomic levels"/>\n+\t\t\t</when>\n+\t</conditional>\n+\n+\t<conditional name="textm">\n+ \t<param name="text_choice" type="select" label="Set text and label options (font size, abbreviations, ...)" help="">\n+ \t\t<option value="d" selected=\'True\'>Default</option>\n+ \t\t<option value="a">Advanced</option>\n+ \t</param>\n+ \t<when value="d">\n+\t\t\t<param name="title" type="hidden" value=""/>\n+\t\t\t<param name="label_font_size" type="hidden" value="7"/>\n+\t\t\t<param name="title_font_size" type="hidden" value="14" />\n+\t\t\t<param name="class_legend_font_size" type="hidden" value="10"/>\n+\t\t\t<param name="labeled_start_lev" type="hidden" value="2"/>\t\n+\t\t\t<param name="labeled_stop_lev" type="hidden" value="6"/>\t\n+\t\t\t<param name="abrv_start_lev" type="hidden" value="4"/>\n+\t\t\t<param name="abrv_stop_lev" type="hidden" value="6"/>\n+ \t</when>\n+ \t<when value="a">\n+\t\t\t<param name="title" type="text" size="10" value="" label="The title of the cladogram"/>\n+\t\t\t<param name="title_font_size" type="integer" size="2" value="14" label="Title font size"/>\n+\t\t\t<param name="label_font_size" type="integer" size="2" value="7" label="Label font size"/>\n+\t\t\t<param name="class_legend_font_size" type="integer" size="2" value="10" label="Class font size"/>\n+\t\t\t<param name="labeled_start_lev" type="integer" size="2" value="2" label="Starting level for drawing the labels"/>\t\n+\t\t\t<param name="labeled_stop_lev" type="integer" size="2" value="6" label="Last level for drawing the labels"/>\t\n+\t\t\t<param name="abrv_start_lev" type="integer" size="2" value="4" label="First level for abbreviating the labels"/>\n+\t\t\t<param name="abrv_stop_lev" type="integer" size="2" value="6" label="Last level for abbre'..b'anced parameter settings**\n+\n+*Structural parameters*\n+ * Root of the tree: selects any taxa of the tree to be the root of the cladogram (only the clades below the root will be visualized). Taxa levels are separated by ".", so for, example, Bacteria.Firmicutes will generate only the cladogram of Firmicutes.\n+ * Expand terminal non-leaf levels: whether to expand a non-leaf taxa without children up to the level of the leaves naming the new levels with the expanding taxa name. \n+ * Maximum number of taxonomic levels: you can limit the levels of the cladogram to a desired level.\n+*Text and label options*\n+\t* The title of the cladogram: optional title for the plot (default is no title).\n+\t* Title font size: set the font size of the title only.\n+\t* Label font size: set the font size of the labels (and of the label legend) used in the cladogram to denote taxa.\n+\t* Class font size: set the font of the legend for the class names and colors.\n+\t* Starting level for drawing the labels: you can avoid naming the more internal clades if they are not informative.\n+\t* Last level for drawing the labels: you may want to remove the most external labels as they may be to dense or overlapping.\n+\t* First level for abbreviating the labels: set the starting level for substituting the full clade names with with an abbreviation supported by a legend table (recommended for the most external taxa).\n+\t* Last level for abbreviating the labels: set the more external level for substituting the full clade names with with an abbreviation supported by a legend table.\n+*Graphical options*\n+\t* Number of external levels drawn in the radial representation: the connection between the taxa in last level and the corresponding parent is represented with a straight edge. The same representation may be used for more internal levels as well.\n+\t* Max dimension of the circles representing taxa: the sizes of the taxa represent the highest logarithmic abundance between classes, and this option sets the maximum diameter of the graphical representation.\n+\t* Min dimension of the circles representing taxa: the taxon diameter of the smallest logarithmic taxa abundance.\n+\t* Width of the edges of the circles representing taxa: the taxon circles have an external border whose thickness is regulated by this option.\n+\t* Width of the lines connecting sibling taxa: set the thickness of the lines connecting sibling taxa in the non-radial representation.\n+\t* Width of the lines connecting parent with son taxa: set the line thickness of the child-parent connection both in radial and non-radial representation.\n+\t* The alpha value for the transparency of clade labeling: the alpha value is responsible for the transparency of the differential clade highlighting. Since the transparency is additive, the alpha value should not be higher than 1/s where s is the number of levels with differential clades. \n+\t* Ration of the horizontal space to be given to the right side: in case the label legend requires more space (because of long labels) you may increase the right panel increasing this value.\n+\t* Whether to write the labels with the class color or in black: set whether the clade names inside the cladogram will be displayed with the class color or in black.\n+\t* Background color: whether to generate plots with black or white backgrounds, adjusting properly the other colors.\n+\t* Set the space between clades at the lower level: set the separation between low-level taxa belonging to different super-clades. Depending on the density of the leaf-level this parameter is automatically adjusted.\n+\n+\n+------\n+\n+**Examples**\n+\n+The dataset provided here_ and described in the "Introduction" module produces the following image (alpha values of LEfSe - step B - are set to 0.01)\n+\n+\n+.. _here: http://www.huttenhower.org/webfm_send/73 \n+\n+Focusing the cladogram on the Firmicutes phylum only and playing a bit with the graphical options, we can obtain the following plot:\n+\n+ \n+\n+ </help>\n+</tool>\n'

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/plot_features.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/plot_features.py Wed Aug 20 16:56:51 2014 -0400

[

@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+
+import os,sys,matplotlib,zipfile,argparse,string
+matplotlib.use('Agg')
+from pylab import *
+from lefse import *
+import random as rand
+
+colors = ['r','g','b','m','c']
+
+def read_params(args):
+ parser = argparse.ArgumentParser(description='Cladoplot')
+ parser.add_argument('input_file_1', metavar='INPUT_FILE', type=str, help="dataset files")
+ parser.add_argument('input_file_2', metavar='INPUT_FILE', type=str, help="LEfSe output file")
+ parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str, help="the file for the output (the zip file if an archive is required, the output directory otherwise)")
+ parser.add_argument('--width',dest="width", type=float, default=10.0 )
+ parser.add_argument('--height',dest="height", type=float, default=4.0)
+ parser.add_argument('--top',dest="top", type=float, default=-1.0, help="set maximum y limit (-1.0 means automatic limit)")
+ parser.add_argument('--bot',dest="bot", type=float, default=0.0, help="set minimum y limit (default 0.0, -1.0 means automatic limit)")
+ parser.add_argument('--title_font_size',dest="title_font_size", type=str, default="14")
+ parser.add_argument('--class_font_size',dest="class_font_size", type=str, default="14")
+ parser.add_argument('--class_label_pos',dest="class_label_pos", type=str, choices=["up","down"], default="up")
+ parser.add_argument('--subcl_mean',dest="subcl_mean", type=str, choices=["y","n"], default="y")
+ parser.add_argument('--subcl_median',dest="subcl_median", type=str, choices=["y","n"], default="y")
+ parser.add_argument('--font_size',dest="font_size", type=str, default="10")
+ parser.add_argument('-n',dest="unused", metavar="flt", type=float, default=-1.0,help="unused")
+ parser.add_argument('--format', dest="format", default="png", choices=["png","pdf","svg"], type=str, help="the format for the output file")
+ parser.add_argument('-f', dest="f", default="diff", choices=["all","diff","one"], type=str, help="wheter to plot all features (all), only those differentially abundant according to LEfSe or only one (the one given with --feature_name) ")
+ parser.add_argument('--feature_name', dest="feature_name", default="", type=str, help="The name of the feature to plot (levels separated by .) ")
+ parser.add_argument('--feature_num', dest="feature_num", default="-1", type=int, help="The number of the feature to plot ")
+ parser.add_argument('--archive', dest="archive", default="none", choices=["zip","none"], type=str, help="")
+ parser.add_argument('--background_color',dest="back_color", type=str, choices=["k","w"], default="w", help="set the color of the background")
+ parser.add_argument('--dpi',dest="dpi", type=int, default=72)
+
+ args = parser.parse_args()
+
+ return vars(args)
+
+def read_data(file_data,file_feats,params):
+ with open(file_feats, 'r') as features:
+ feats_to_plot = [(f.split()[:-1],len(f.split()) == 5) for f in features.readlines()]
+ if not feats_to_plot:
+ print "No features to plot\n"
+ sys.exit(0)
+ feats,cls,class_sl,subclass_sl,class_hierarchy,params['norm_v'] = load_data(file_data, True)
+ if params['feature_num'] > 0:
+ params['feature_name'] = [line.split()[0] for line in open(params['input_file_2'])][params['feature_num']-1]
+ features = {}
+ for f in feats_to_plot:
+ if params['f'] == "diff" and not f[1]: continue
+ if params['f'] == "one" and f[0][0] != params['feature_name']: continue
+ features[f[0][0]] = {'dim':float(f[0][1]), 'abundances':feats[f[0][0]], 'sig':f[1], 'cls':cls, 'class_sl':class_sl, 'subclass_sl':subclass_sl, 'class_hierarchy':class_hierarchy}
+ if not features:
+                print "No features to plot\n"
+                sys.exit(0)
+ return features
+
+def plot(name,k_n,feat,params):
+ fig = plt.figure(figsize=(params['width'], params['height']),edgecolor=params['fore_color'],facecolor=params['back_color'])
+ ax = fig.add_subplot(111,axis_bgcolor=params['back_color'])
+ subplots_adjust(bottom=0.15)
+
+ max_m = 0.0
+ norm = 1.0 if float(params['norm_v']) < 0.0 else float(params['norm_v'])
+ for v in feat['subclass_sl'].values():
+ fr,to  = v[0], v[1]
+ median = numpy.mean(feat['abundances'][fr:to])
+ if median > max_m: max_m = median
+ max_m /= norm
+ max_v = max_m*3 if max_m*3 < max(feat['abundances'])*1.1/norm else max(feat['abundances'])/norm
+ min_v = max(0.0,min(feat['abundances'])*0.9/norm)
+
+ if params['top'] > 0.0: max_v = params['top']
+ if params['bot'] >= 0.0: min_v = params['bot']
+
+ if max_v == 0.0: max_v = 0.0001
+ if max_v == min_v: max_v = min_v*1.1
+
+ cl_sep = max(int(sum([vv[1]/norm - vv[0]/norm for vv in feat['class_sl'].values()])/150.0),1)
+ seps = []
+ xtics = []
+ x2tics = []
+ last_fr = 0.0
+ for i,cl in enumerate(sorted(feat['class_hierarchy'].keys())):
+ for j,subcl in enumerate(feat['class_hierarchy'][cl]):
+ fr = feat['subclass_sl'][subcl][0]
+ to = feat['subclass_sl'][subcl][1]
+ val = feat['abundances'][fr:to]
+ fr += cl_sep*i
+ to += cl_sep*i
+ pos = arange(fr,to)
+ max_x = to
+ col = colors[j%len(colors)]
+ vv = [v1/norm for v1 in val]
+ median = numpy.median(vv)
+ mean = numpy.mean(vv)
+ valv = [max(min(v/norm,max_v),min_v) for v in val]
+ ax.bar(pos,valv, align='center', color=col, edgecolor=col, linewidth=0.1 )
+ if params['subcl_median'] == 'y': ax.plot([fr,to-1],[median,median],"k--",linewidth=1,color=params['fore_color'])
+ if params['subcl_mean'] == 'y': ax.plot([fr,to-1],[mean,mean],"-",linewidth=1,color=params['fore_color'])
+ nna = subcl if subcl.count("_") == 0 or not subcl.startswith(cl) else "_".join(subcl.split(cl)[1:])
+ if nna == "subcl" or nna == "_subcl": nna = " "
+ xtics.append(((fr+(to-fr)/2),nna))
+ seps.append(float(to))
+ x2tics.append(((last_fr+(to-last_fr)/2),cl))
+ last_fr = to + float(cl_sep)
+ for s in seps[:-1]:
+ ax.plot([s,s],[min_v,max_v],"-",linewidth=5,color=params['fore_color'])
+ ax.set_title(k_n, size=params['title_font_size'])
+ xticks([x[0] for x in xtics],[x[1] for x in xtics],rotation=-30, ha = 'left', fontsize=params['font_size'], color=params['fore_color'])
+ yticks(fontsize=params['font_size'])
+
+ ylabel('Relative abundance')
+ ax.set_ylim((min_v,max_v))
+ a,b = ax.get_xlim()
+ ax.set_xlim((0-float(last_fr)/float(b-a),max_x))
+ ax.yaxis.grid(True)
+
+ def get_col_attr(x):
+                return hasattr(x, 'set_color') and not hasattr(x, 'set_facecolor')
+ def get_edgecol_attr(x):
+                return hasattr(x, 'set_edgecolor')
+
+
+ for o in fig.findobj(get_col_attr):
+                o.set_color(params['fore_color'])
+ for o in fig.findobj(get_edgecol_attr):
+ if o.get_edgecolor() == params['back_color']:
+                 o.set_edgecolor(params['fore_color'])
+
+ for t in x2tics:
+ m = ax.get_ylim()[1]*0.97 if params['class_label_pos']=='up' else 0.07
+ plt.text(t[0],m, "class: "+t[1], ha ="center", size=params['class_font_size'], va="top", bbox = dict(boxstyle="round", ec='k', fc='y'))
+
+
+ plt.savefig(name,format=params['format'],facecolor=params['back_color'],edgecolor=params['fore_color'],dpi=params['dpi'])
+ plt.close()
+ return name
+
+if __name__ == '__main__':
+ params = read_params(sys.argv)
+ params['fore_color'] = 'w' if params['back_color'] == 'k' else 'k'
+ features = read_data(params['input_file_1'],params['input_file_2'],params)
+ if params['archive'] == "zip": file = zipfile.ZipFile(params['output_file'], "w")
+ for k,f in features.items():
+ print "Exporting ", k
+ if params['archive'] == "zip":
+ of = plot("/tmp/"+str(int(f['sig']))+"_"+"-".join(k.split("."))+"."+params['format'],k,f,params)
+ file.write(of, os.path.basename(of), zipfile.ZIP_DEFLATED)
+ else:
+ if params['f'] == 'one': plot(params['output_file'],k,f,params)
+ else: plot(params['output_file']+str(int(f['sig']))+"_"+"-".join(k.split("."))+"."+params['format'],k,f,params)
+ if params['archive'] == "zip": file.close()

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/plot_features.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/plot_features.xml Wed Aug 20 16:56:51 2014 -0400

b'@@ -0,0 +1,159 @@\n+<tool id="LEfSe_fea" name="F) Plot Differential Features" version="1.0">\n+ <description></description>\n+<command interpreter="python">plot_features.py $inp_data $inp_res $arch --title_font_size $graphical.title_font_size --background_color $graphical.background_color --class_label_pos $graphical.class_label_pos --class_font_size $graphical.class_font_size --top $graphical.top --bot $graphical.bot --font_size $graphical.font_size --width $graphical.width --height $graphical.height -f $feature_set --archive "zip" --format $for --dpi $dpi --subcl_mean $graphical.subcl_mean --subcl_median $graphical.subcl_median </command> \n+ <inputs>\n+ <page>\n+\t<param format="lefse_internal_for" name="inp_data" type="data" label="The formatted datasets" help=""/>\n+\t<param format="lefse_internal_res" name="inp_res" type="data" label="The LEfSe output" help=""/>\n+\n+\t<param name="feature_set" type="select" label="Do you want to plot all features or only those detected as biomarkers?">\n+\t\t<option value="diff" selected="diff">Biomarkers only</option>\n+\t\t<option value="all">All</option>\n+\t</param> \n+\n+\n+\t<conditional name="graphical">\n+\t<param name="graphical_choice" type="select" label="Set some graphical options to personalize the output">\n+ <option value="d" selected=\'True\'>Default</option>\n+ <option value="a">Advanced</option>\n+\t</param>\n+ \t<when value="d">\n+\t\t<param name="top" type="hidden" value="-1.0" />\n+\t\t<param name="bot" type="hidden" value="0.0" />\n+ <param name="title_font_size" type="hidden" value="14" />\n+ <param name="class_font_size" type="hidden" value="12" />\n+ <param name="font_size" type="hidden" value="8" />\n+ <param name="width" type="hidden" value="7.0" />\n+ <param name="height" type="hidden" value="4.0" />\n+ <param name="background_color" type="hidden" value="w" />\n+\t\t<param name="width" type="hidden" value="7.0" />\n+ <param name="height" type="hidden" value="4.0" />\n+\t\t<param name="class_label_pos" type="hidden" value="up" />\n+\t\t<param name="subcl_mean" type="hidden" value="y" />\n+\t\t<param name="subcl_median" type="hidden" value="y" />\n+\t</when>\n+\n+\t<when value="a">\n+\t\t<param name="top" type="float" size="2" value="-1.0" label="Set the maximum y value (-1.0 means automatic maximum setting based on maximum class median)"/>\n+\t\t<param name="bot" type="float" size="2" value="0.0" label="Set the minimum y value (-1.0 means automatic minimum setting based on minimum class median)"/>\n+\t\t<param name="title_font_size" type="integer" size="2" value="14" label="Title font size"/>\n+\t <param name="class_font_size" type="integer" size="2" value="12" label="Class font size"/>\n+\t <param name="font_size" type="integer" size="2" value="8" label="Size of subclasses names and y values"/>\n+\t <param name="width" type="float" size="2" value="7.0" label="Width of the plot"/>\n+\t <param name="height" type="float" size="2" value="4.0" label="Height of the plot"/>\n+\t\t<param name="background_color" type="select" label="Background color">\n+\t\t\t<option value="w" selected=\'True\'>White</option>\n+ <option value="k">Black</option>\n+ </param>\n+\t\t<param name="class_label_pos" type="select" label="Class label position">\n+\t\t\t<option value="up" selected=\'True\'>Top</option>\n+ <option value="down">Bottom</option>\n+ </param>\n+\t\t<param name="subcl_mean" type="select" label="Whether to plot the subclass means (straight line)">\n+ <option value="y" selected=\'True\'>Yes</option>\n+ <option value="n">No</option>\n+ </param>\n+\t\t<param name="subcl_median" type="select" label="Whether to plot the subclass medians (dotted line)">\n+ <option value="y" selected=\'True\'>Yes</option>\n+ <option value='..b'df</option>\n+ </param>\n+ <param name="dpi" type="select" label="Set the dpi resolution of the output">\n+ <option value="72">72</option>\n+ <option value="150" selected="True">150</option>\n+ <option value="300">300</option>\n+ <option value="600">600</option>\n+ <option value="1200">1200</option>\n+ </param>\n+\n+\t</page>\n+ </inputs>\n+ <outputs>\n+ <data format="zip" name="arch" >\n+ </data>\n+ </outputs>\n+ <tests>\n+ <test>\n+ <param name="input1" value="13.bed" dbkey="hg18" ftype="bed"/>\n+ <param name="maf_source" value="cached"/>\n+ <param name="maf_identifier" value="17_WAY_MULTIZ_hg18"/>\n+ <param name="species" value="hg18,mm8"/>\n+ <param name="overwrite_with_gaps" value="True"/>\n+ <output name="out_file1" file="interval_maf_to_merged_fasta_out3.fasta" />\n+ </test>\n+ <test>\n+ <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>\n+ <param name="maf_source" value="cached"/>\n+ <param name="maf_identifier" value="8_WAY_MULTIZ_hg17"/>\n+ <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>\n+ <param name="overwrite_with_gaps" value="True"/>\n+ <output name="out_file1" file="interval_maf_to_merged_fasta_out.dat" />\n+ </test>\n+ <test>\n+ <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>\n+ <param name="maf_source" value="user"/>\n+ <param name="maf_file" value="5.maf"/>\n+ <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>\n+ <param name="overwrite_with_gaps" value="True"/>\n+ <output name="out_file1" file="interval_maf_to_merged_fasta_user_out.dat" />\n+ </test>\n+ </tests>\n+ <help>\n+**What it does**\n+\n+This module plots the raw data of the features detected by LEfSe as biomarkers as abundance histograms\n+with class and subclass information. The features are exported as images and\n+the user can download all the images in a .zip archive. It is also possible to\n+export all the features (instead of the biomarkers only). For exporting or\n+analyzing few features only the "Plot One Feature" module is recommended. \n+\n+------\n+\n+**Input format**\n+\n+The module accepts two datasets: the data formatted with the "Format Input for\n+LEfSe" module and the output of the LEfSe analysis. Both datasets are necessary\n+to run the module.\n+\n+------\n+\n+**Output format**\n+\n+The module generates zip archives containing images in png, svg or pdf format. \n+\n+------\n+\n+**Advanced parameter settings**\n+ \n+*Graphical options*\n+ * Set the maximum y value: -1 means automatic parameter setting that is computed as the minimum between the highest abundance value and three times the highest subclass median.\n+ * Set the minimum y value: -1 means automatic parameter setting that is computed as the maximum between 0 and the 90% of the smallest abundance value.\n+ * Title font size: set the font size of the title only.\n+ * Class font size: set the font of the legend for the class names and colors.\n+ * Size of subclasses names and y values: set the font size for the axis labels.\n+ * Width of the plot: horizontal size (in inches) of the plot.\n+ * Height of the plot: vertical size (in inches) of the plot.\n+ * Background color: whether to generate plots with black or white backgrounds, adjusting properly the other colors.\n+ * Class label position: whether to place the class labels on the top or on the bottom of the plot.\n+ * Plot subclass means (straight line): whether to plot the subclass means with straight horizontal lines.\n+ * Plot subclass medians (dotted line): whether to plot the subclass medians with dotted horizontal lines.\n+\n+------\n+\n+**Examples**\n+\n+Please see the examples reported for the "Plot One Feature" module (E). This\n+module just produces multiple plots in the same way and compresses them into a\n+.zip archive. \n+\n+ </help>\n+</tool>\n'

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/plot_res.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/plot_res.py Wed Aug 20 16:56:51 2014 -0400

[

@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+
+import os,sys
+import matplotlib
+matplotlib.use('Agg')
+from pylab import *
+
+from lefse import *
+import argparse
+
+colors = ['r','g','b','m','c','y','k','w']
+
+def read_params(args):
+ parser = argparse.ArgumentParser(description='Plot results')
+ parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="tab delimited input file")
+ parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str, help="the file for the output image")
+ parser.add_argument('--feature_font_size', dest="feature_font_size", type=int, default=7, help="the file for the output image")
+ parser.add_argument('--format', dest="format", choices=["png","svg","pdf"], default='png', type=str, help="the format for the output file")
+ parser.add_argument('--dpi',dest="dpi", type=int, default=72)
+ parser.add_argument('--title',dest="title", type=str, default="")
+ parser.add_argument('--title_font_size',dest="title_font_size", type=str, default="12")
+ parser.add_argument('--class_legend_font_size',dest="class_legend_font_size", type=str, default="10")
+ parser.add_argument('--width',dest="width", type=float, default=7.0 )
+ parser.add_argument('--height',dest="height", type=float, default=4.0, help="only for vertical histograms")
+ parser.add_argument('--left_space',dest="ls", type=float, default=0.2 )
+ parser.add_argument('--right_space',dest="rs", type=float, default=0.1 )
+ parser.add_argument('--orientation',dest="orientation", type=str, choices=["h","v"], default="h" )
+ parser.add_argument('--autoscale',dest="autoscale", type=int, choices=[0,1], default=1 )
+ parser.add_argument('--background_color',dest="back_color", type=str, choices=["k","w"], default="w", help="set the color of the background")
+ parser.add_argument('--subclades', dest="n_scl", type=int, default=1, help="number of label levels to be dislayed (starting from the leaves, -1 means all the levels, 1 is default )")
+ parser.add_argument('--max_feature_len', dest="max_feature_len", type=int, default=60, help="Maximum length of feature strings (def 60)")
+ parser.add_argument('--all_feats', dest="all_feats", type=str, default="")
+ args = parser.parse_args()
+ return vars(args)
+
+def read_data(input_file,output_file):
+ with open(input_file, 'r') as inp:
+ rows = [line.strip().split()[:-1] for line in inp.readlines() if len(line.strip().split())>3]
+ classes = list(set([v[2] for v in rows if len(v)>2]))
+ if len(classes) < 1:
+ print "No differentially abundant features found in "+input_file
+ os.system("touch "+output_file)
+ sys.exit()
+ data = {}
+ data['rows'] = rows
+ data['cls'] = classes
+ return data
+
+def plot_histo_hor(path,params,data,bcl):
+ cls2 = []
+ if params['all_feats'] != "":
+ cls2 = sorted(params['all_feats'].split(":"))
+ cls = sorted(data['cls'])
+ if bcl: data['rows'].sort(key=lambda ab: fabs(float(ab[3]))*(cls.index(ab[2])*2-1))
+ else:
+ mmax = max([fabs(float(a)) for a in zip(*data['rows'])[3]])
+ data['rows'].sort(key=lambda ab: fabs(float(ab[3]))/mmax+(cls.index(ab[2])+1))
+ pos = arange(len(data['rows']))
+ head = 0.75
+ tail = 0.5
+ ht = head + tail
+ ints = max(len(pos)*0.2,1.5)
+ fig = plt.figure(figsize=(params['width'], ints + ht), edgecolor=params['back_color'],facecolor=params['back_color'])
+        ax = fig.add_subplot(111,frame_on=False,axis_bgcolor=params['back_color'])
+ ls, rs = params['ls'], 1.0-params['rs']
+ plt.subplots_adjust(left=ls,right=rs,top=1-head*(1.0-ints/(ints+ht)), bottom=tail*(1.0-ints/(ints+ht)))
+
+ fig.canvas.set_window_title('LDA results')
+
+ l_align = {'horizontalalignment':'left', 'verticalalignment':'baseline'}
+        r_align = {'horizontalalignment':'right', 'verticalalignment':'baseline'}
+ added = []
+ m = 1 if data['rows'][0][2] == cls[0] else -1
+ for i,v in enumerate(data['rows']):
+ indcl = cls.index(v[2])
+ lab = str(v[2]) if str(v[2]) not in added else ""
+ added.append(str(v[2]))
+ col = colors[indcl%len(colors)]
+ if len(cls2) > 0:
+ col = colors[cls2.index(v[2])%len(colors)]
+ vv = fabs(float(v[3])) * (m*(indcl*2-1)) if bcl else fabs(float(v[3]))
+ ax.barh(pos[i],vv, align='center', color=col, label=lab, height=0.8, edgecolor=params['fore_color'])
+ mv = max([abs(float(v[3])) for v in data['rows']])
+ for i,r in enumerate(data['rows']):
+ indcl = cls.index(data['rows'][i][2])
+ if params['n_scl'] < 0: rr = r[0]
+ else: rr = r[0].split(".")[-min(r[0].count("."),params['n_scl'])]
+ if len(rr) > params['max_feature_len']: rr = rr[:params['max_feature_len']/2-2]+" [..]"+rr[-params['max_feature_len']/2+2:]
+                if m*(indcl*2-1) < 0 and bcl: ax.text(mv/40.0,float(i)-0.3,rr, l_align, size=params['feature_font_size'],color=params['fore_color'])
+                else: ax.text(-mv/40.0,float(i)-0.3,rr, r_align, size=params['feature_font_size'],color=params['fore_color'])
+ ax.set_title(params['title'],size=params['title_font_size'],y=1.0+head*(1.0-ints/(ints+ht))*0.8,color=params['fore_color'])
+
+ ax.set_yticks([])
+ ax.set_xlabel("LDA SCORE (log 10)")
+ ax.xaxis.grid(True)
+ xlim = ax.get_xlim()
+ if params['autoscale']:
+ ran = arange(0.0001,round(round((abs(xlim[0])+abs(xlim[1]))/10,4)*100,0)/100)
+ if len(ran) > 1 and len(ran) < 100:
+ ax.set_xticks(arange(xlim[0],xlim[1]+0.0001,min(xlim[1]+0.0001,round(round((abs(xlim[0])+abs(xlim[1]))/10,4)*100,0)/100)))
+ ax.set_ylim((pos[0]-1,pos[-1]+1))
+ leg = ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=5, borderaxespad=0., frameon=False,prop={'size':params['class_legend_font_size']})
+
+ def get_col_attr(x):
+                return hasattr(x, 'set_color') and not hasattr(x, 'set_facecolor')
+ for o in leg.findobj(get_col_attr):
+                o.set_color(params['fore_color'])
+ for o in ax.findobj(get_col_attr):
+                o.set_color(params['fore_color'])
+
+
+ plt.savefig(path,format=params['format'],facecolor=params['back_color'],edgecolor=params['fore_color'],dpi=params['dpi'])
+ plt.close()
+
+def plot_histo_ver(path,params,data):
+ cls = data['cls']
+ mmax = max([fabs(float(a)) for a in zip(*data['rows'])[1]])
+ data['rows'].sort(key=lambda ab: fabs(float(ab[3]))/mmax+(cls.index(ab[2])+1))
+ pos = arange(len(data['rows']))
+ if params['n_scl'] < 0: nam = [d[0] for d in data['rows']]
+ else: nam = [d[0].split(".")[-min(d[0].count("."),params['n_scl'])] for d in data['rows']]
+ fig = plt.figure(edgecolor=params['back_color'],facecolor=params['back_color'],figsize=(params['width'], params['height']))
+ ax = fig.add_subplot(111,axis_bgcolor=params['back_color'])
+ plt.subplots_adjust(top=0.9, left=params['ls'], right=params['rs'], bottom=0.3)
+ fig.canvas.set_window_title('LDA results')
+ l_align = {'horizontalalignment':'left', 'verticalalignment':'baseline'}
+ r_align = {'horizontalalignment':'right', 'verticalalignment':'baseline'}
+ added = []
+ for i,v in enumerate(data['rows']):
+ indcl = data['cls'].index(v[2])
+ lab = str(v[2]) if str(v[2]) not in added else ""
+ added.append(str(v[2]))
+ col = colors[indcl%len(colors)]
+ vv = fabs(float(v[3]))
+ ax.bar(pos[i],vv, align='center', color=col, label=lab)
+ xticks(pos,nam,rotation=-20, ha = 'left',size=params['feature_font_size'])
+ ax.set_title(params['title'],size=params['title_font_size'])
+ ax.set_ylabel("LDA SCORE (log 10)")
+ ax.yaxis.grid(True)
+ a,b = ax.get_xlim()
+ dx = float(len(pos))/float((b-a))
+ ax.set_xlim((0-dx,max(pos)+dx))
+ plt.savefig(path,format=params['format'],facecolor=params['back_color'],edgecolor=params['fore_color'],dpi=params['dpi'])
+ plt.close()
+
+if __name__ == '__main__':
+ params = read_params(sys.argv)
+ params['fore_color'] = 'w' if params['back_color'] == 'k' else 'k'
+ data = read_data(params['input_file'],params['output_file'])
+ if params['orientation'] == 'v': plot_histo_ver(params['output_file'],params,data)
+ else: plot_histo_hor(params['output_file'],params,data,len(data['cls']) == 2)
+
+

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/plot_res.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/plot_res.xml Wed Aug 20 16:56:51 2014 -0400

b'@@ -0,0 +1,156 @@\n+<tool id="LEfSe_res" name="C) Plot LEfSe Results" version="1.0">\n+ <description></description>\n+<command interpreter="python">./plot_res.py $inp_data $res --title "$textm.title" --subclades $textm.subclades --title_font_size $textm.title_font_size --max_feature_len $textm.max_f_len --feature_font_size $textm.feature_font_size --class_legend_font_size $textm.class_legend_font_size --width $graphical.width --format $for --dpi $dpi --left_space $graphical.left_space_prop --right_space $graphical.right_space_prop --background_color $graphical.background_color</command> \n+ <inputs>\n+ <page>\n+\t<param format="lefse_internal_res" name="inp_data" type="data" label="Select data" help=""/>\n+\n+\t<conditional name="textm">\n+ <param name="text_choice" type="select" label="Set text and label options (font size, abbreviations, ...)" help="">\n+ <option value="d" selected=\'True\'>Default</option>\n+ <option value="a">Advanced</option>\n+ </param>\n+\t\t<when value="d">\n+\t\t\t<param name="title" type="hidden" value=""/>\n+\t\t\t<param name="title_font_size" type="hidden" value="14"/>\n+\t\t\t<param name="feature_font_size" type="hidden" value="7" />\n+\t\t <param name="class_legend_font_size" type="hidden" value="10" />\n+\t\t\t<param name="subclades" type="hidden" value="1" />\n+\t\t\t<param name="max_f_len" type="hidden" value="60" />\n+\t\t</when>\n+ <when value="a">\n+\t\t\t<param name="title" type="text" size="10" value="" label="The title of the cladogram"/>\n+\t \t<param name="subclades" type="integer" size="2" value="1" label="Number of label levels to be displayed (-1 means all)"/>\n+\t \t<param name="max_f_len" type="integer" size="2" value="60" label="Maximum length of feature names "/>\n+\t\t\t<param name="title_font_size" type="integer" size="2" value="14" label="Title font size"/>\n+\t\t\t<param name="feature_font_size" type="integer" size="2" value="7" label="Label font size"/>\n+\t \t<param name="class_legend_font_size" type="integer" size="2" value="10" label="Class font size"/>\n+\t\t</when>\n+\t</conditional>\n+\n+\t<conditional name="graphical">\n+ <param name="text_choice" type="select" label="Set some graphical options to personalize the output" help="">\n+ <option value="d" selected=\'True\'>Default</option>\n+ <option value="a">Advanced</option>\n+ </param>\n+ <when value="d">\n+\t\t\t<param name="width" type="hidden" value="7.0" />\n+\t\t\t<param name="left_space_prop" type="hidden" value="0.2" />\n+\t\t\t<param name="right_space_prop" type="hidden" value="0.1" />\n+\t\t\t<param name="background_color" type="hidden" value="w"/>\n+ </when>\n+ <when value="a">\n+\t\t\t<param name="width" type="float" size="2" value="7.0" label="Width of the plot"/>\n+\t\t\t<param name="left_space_prop" type="float" size="2" value="0.2" label="Fraction of the total width to be reserved for the space at the left of the plot "/>\n+\t\t\t<param name="right_space_prop" type="float" size="2" value="0.1" label="Fraction of the total width to be reserved for the space at the right of the plot "/>\n+\t\t\t<param name="background_color" type="select" label="Whether to optimize the colors for a white or black background">\n+ <option value="w" selected=\'True\'>White</option>\n+ <option value="k">Black</option>\n+ </param>\n+\t\n+ </when>\n+ </conditional>\n+\n+\n+\n+\t<param name="for" type="select" label="Output format">\n+ <option value="png" selected="png">png</option>\n+ <option value="svg">svg</option>\n+ <option value="pdf">pdf</option>\n+ </param>\n+ <param name="dpi" type="select" label="Set the dpi resolution of the output">\n+ <option value="72">72</option>\n+ <option value="150" selected="True">150</option'..b'ram name="maf_source" value="cached"/>\n+ <param name="maf_identifier" value="17_WAY_MULTIZ_hg18"/>\n+ <param name="species" value="hg18,mm8"/>\n+ <param name="overwrite_with_gaps" value="True"/>\n+ <output name="out_file1" file="interval_maf_to_merged_fasta_out3.fasta" />\n+ </test>\n+ <test>\n+ <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>\n+ <param name="maf_source" value="cached"/>\n+ <param name="maf_identifier" value="8_WAY_MULTIZ_hg17"/>\n+ <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>\n+ <param name="overwrite_with_gaps" value="True"/>\n+ <output name="out_file1" file="interval_maf_to_merged_fasta_out.dat" />\n+ </test>\n+ <test>\n+ <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>\n+ <param name="maf_source" value="user"/>\n+ <param name="maf_file" value="5.maf"/>\n+ <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>\n+ <param name="overwrite_with_gaps" value="True"/>\n+ <output name="out_file1" file="interval_maf_to_merged_fasta_user_out.dat" />\n+ </test>\n+ </tests>\n+ <help>\n+**What it does**\n+\n+This module plots the biomarkers found by LEfSe ranking them accordingly to their effect size and associating them with the class with the highest median. \n+\n+------\n+\n+**Input format**\n+\n+The module accepts the output of the LEfSe module (B) only.\n+\n+------\n+\n+**Output format**\n+\n+The module generate images in png, svg or pdf format. The png format is recommended for exploratory runs as it can be easily visualized internally in Galaxy, whereas the vectorial svg and pdf formats are recommended for the final publication-ready image to be downloaded.\n+\n+------\n+\n+**Parameters**\n+\n+In addition to the output format and the dpi resolution two sets of parameters can be tuned: text and label options for regulating the plot annotation and graphical options for personalizing the appearance of the plot. The default settings of the parameters should give satisfactory outputs in the great majority of the cases.\n+\n+**Advanced parameter settings**\n+\n+*Text and label options*\n+\t* The title of the cladogram: optional title for the plot (default is no title).\n+\t* Number of label levels to be displayed: when dealing with hierarchical features this option sets the level to be displayed in the labels (-1 means the last level only, 1 means the highest level, 2 means the first two levels and so on).\t\n+\t* Maximum length of feature names: set the length of the feature names above which the names will be abbreviated (in the middle of the string).\n+\t* Title font size: set the font size of the title only.\n+\t* Label font size: set the font size of the labels (and of the label legend) used in the cladogram to denote taxa.\n+\t* Class font size: set the font of the legend for the class names and colors.\n+*Graphical options*\n+\t* Width of the plot: set the inches for the width of the plot (the height is automatically set based on the number of biomarkers to be displayed).\n+\t* Fraction of the total width to be reserved for the space at the left of the plot: this option permits the user to enlarge the space on the left of the plot for the cases in which the feature labels are long and need more space.\n+\t* Fraction of the total width to be reserved for the space at the right of the plot: this option permits the user to enlarge the space on the right of the plot for the cases in which the feature labels are long and need more space. \n+\t* Whether to optimize the colors for a white or black background: this option permits the user to generate output plots with black backgrounds, adjusting properly the colors of the cladogram.\n+\n+\n+------\n+\n+**Example**\n+\n+The dataset provided here_ and described in the "Introduction" module produces the following image (alpha values of LEfSe - step B - are set to 0.01)\n+\n+.. image:: ../galaxy/static/images/plot_res_ex.png\n+.. _here: http://www.huttenhower.org/webfm_send/73 \n+\n+\n+\n+\n+ </help>\n+</tool>\n'

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/plot_single_feature.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/plot_single_feature.xml Wed Aug 20 16:56:51 2014 -0400

b'@@ -0,0 +1,151 @@\n+<tool id="LEfSe_sfe" name="E) Plot One Feature" version="1.0">\n+<code file="format_input_selector.py"/> \n+ <description></description>\n+<command interpreter="python">plot_features.py $inp_data $inp_res $arch --title_font_size $graphical.title_font_size --background_color $graphical.background_color --class_label_pos $graphical.class_label_pos --class_font_size $graphical.class_font_size --top $graphical.top --bot $graphical.bot --font_size $graphical.font_size --width $graphical.width --height $graphical.height -f one --feature_num $featd.feat --archive "none" --format $for --dpi $dpi --subcl_mean $graphical.subcl_mean --subcl_median $graphical.subcl_median </command> \n+<inputs>\n+\t<page>\n+\t<param format="lefse_internal_for" name="inp_data" type="data" label="The formatted datasets" help=""/>\n+\t<param format="lefse_internal_res" name="inp_res" type="data" label="The LEfSe output" help=""/>\n+\n+\t<param name="featd" type="data_column" data_ref="inp_res" value="1" optional="true" force_select="false" accept_default="false" /> \n+\n+\t<conditional name="featd" type="data_column" data_ref="inp_res" accept_default="true"> \n+ <param name="feat_dir" type="select" data_ref="inp_res" label="Select the feature names among biomarkers or all features" help="">\n+ <option value="b" selected=\'True\'>Biomarkers only</option> \n+ <option value="a">All features</option> \n+ </param>\n+ \n+ <when value="b">\n+\t\t<param name="feat" label="Select the feature to plot" data_ref="inp_res" type=\'select\' force_select="false" dynamic_options="get_row_names(inp_res,\'b\')" value="1" optional="true" accept_default="false" />\n+ </when>\n+ <when value="a">\n+\t\t<param name="feat" label="Select the feature to plot" data_ref="inp_res" type=\'select\' force_select="false" dynamic_options="get_row_names(inp_res,\'a\')" value="1" optional="true" accept_default="false" />\n+ </when>\n+\t</conditional> \n+\n+\t<conditional name="graphical">\n+\t<param name="graphical_choice" type="select" label="Set some graphical options to personalize the output">\n+ <option value="d" selected=\'True\'>Default</option>\n+ <option value="a">Advanced</option>\n+\t</param>\n+ \t<when value="d">\n+\t\t<param name="top" type="hidden" value="-1.0" />\n+\t\t<param name="bot" type="hidden" value="0.0" />\n+ <param name="title_font_size" type="hidden" value="14" />\n+ <param name="class_font_size" type="hidden" value="12" />\n+ <param name="font_size" type="hidden" value="8" />\n+ <param name="width" type="hidden" value="7.0" />\n+ <param name="height" type="hidden" value="4.0" />\n+ <param name="background_color" type="hidden" value="w" />\n+\t\t<param name="width" type="hidden" value="7.0" />\n+ <param name="height" type="hidden" value="4.0" />\n+\t\t<param name="class_label_pos" type="hidden" value="up" />\n+\t\t<param name="subcl_mean" type="hidden" value="y" />\n+\t\t<param name="subcl_median" type="hidden" value="y" />\n+\t</when>\n+\n+\t<when value="a">\n+\t\t<param name="top" type="float" size="2" value="-1.0" label="Set the maximum y value (-1.0 means automatic maximum setting based on maximum class median)"/>\n+\t\t<param name="bot" type="float" size="2" value="0.0" label="Set the minimum y value (-1.0 means automatic minimum setting based on minimum class median)"/>\n+\t\t<param name="title_font_size" type="integer" size="2" value="14" label="Title font size"/>\n+\t <param name="class_font_size" type="integer" size="2" value="12" label="Class names font size"/>\n+\t <param name="font_size" type="integer" size="2" value="8" label="Size of subclasses names and y values"/>\n+\t <param name="width" type="float" size="2" value="7.0" label="Width of the plot"/>\n+\t <param name="height" type="float" size="2" value="4.0" label="Height of the plot"/>\n+\t\t<param name="background_color" type='..b'tom</option>\n+ </param>\n+\t\t<param name="subcl_mean" type="select" label="Plot subclass means (straight line)">\n+ <option value="y" selected=\'True\'>Yes</option>\n+ <option value="n">No</option>\n+ </param>\n+\t\t<param name="subcl_median" type="select" label="Plot subclass medians (dotted line)">\n+ <option value="y" selected=\'True\'>Yes</option>\n+ <option value="n">No</option>\n+ </param>\n+\n+ </when>\n+\n+ </conditional>\t\t\n+\n+\n+ \t<param name="for" type="select" label="Output format">\n+ <option value="png" selected="png">png</option>\n+ <option value="svg">svg</option>\n+ <option value="pdf">pdf</option>\n+ </param>\n+ <param name="dpi" type="select" label="Set the dpi resolution of the output">\n+ <option value="72">72</option>\n+ <option value="150" selected="True">150</option>\n+ <option value="300">300</option>\n+ <option value="600">600</option>\n+ <option value="1200">1200</option>\n+ </param>\n+\n+\t</page>\n+</inputs>\n+ <outputs>\n+ <data format="png" name="arch" >\n+\t<change_format> \n+\t<when input="for" value="svg" format="svg" />\n+\t</change_format>\n+ </data>\n+ </outputs>\n+ <help>\n+**What it does**\n+\n+This module plots the raw data of a single feature as an abundance histogram with class and subclass information. You can select the feature to plot among the set of features detected by LEfSe as biomarker or among the full set of features.\n+\n+------\n+\n+**Input format**\n+\n+The module accepts two datasets: the data formatted with the "Format Input for\n+LEfSe" module and the output of the LEfSe analysis. Both datasets are necessary\n+to run the module. \n+\n+------\n+\n+**Output format**\n+\n+The module generates images in png, svg or pdf format. The png format is recommended for exploratory runs as it can be easily visualized internally in Galaxy, whereas the vectorial svg and pdf format are recommended for the final publication-ready image to be downloaded.\n+\n+------\n+\n+**Advanced parameter settings**\n+\n+*Graphical options*\n+\t* Set the maximum y value: set the maximum value on the y-axis. -1 means automatic parameter setting that is computed as the minimum between the highest abundance value and three times the highest subclass median.\n+\t* Set the minimum y value: -1 means automatic parameter setting that is computed as the maximum between 0 and the 90% of the smallest abundance value.\n+\t* Title font size: set the font size of the title only.\n+\t* Class font size: set the font of the legend for the class names and colors.\n+\t* Size of subclasses names and y values: set the fond size for the axis labels.\t\n+\t* Width of the plot: horizontal size (in inches) of the plot.\n+\t* Height of the plot: vertical size (in inches) of the plot.\n+\t* Background color: whether to generate plots with black or white backgrounds, adjusting properly the other colors.\n+\t* Class label position: whether to place the class labels on the top or on the bottom of the plot.\n+\t* Plot subclass means (straight line): whether to plot the subclass means with straight horizontal lines.\n+\t* Plot subclass medians (dotted line): whether to plot the subclass medians with dotted horizontal lines.\n+\n+------\n+\n+**Examples**\n+\n+Selecting the Clostridia clade from the biomarkers detected by LEfSe in the dataset provided here_ and described in the "Introduction", we obtain the following image:\n+\n+.. _here: http://www.huttenhower.org/webfm_send/73 \n+\n+Another example, taken from the analysis we detailed in `(Segata et. al 2011)`_ that compares the viral and bacterial microbiomes using metagenomic data from `(Dinsdale et. al 2008)`_:\n+\n+\n+\n+.. _(Segata et. al 2011): http://www.ncbi.nlm.nih.gov/pubmed/21702898\n+.. _(Dinsdale et. al 2008): http://www.ncbi.nlm.nih.gov/pubmed/18337718\n+\n+ </help>\n+</tool>\n'

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/qiime2lefse.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/qiime2lefse.py Wed Aug 20 16:56:51 2014 -0400

[

@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+
+import sys
+
+def read_params(args):
+    import argparse as ap
+    import textwrap
+
+    p = ap.ArgumentParser( description= "TBA" )
+
+    p.add_argument( '--in', metavar='INPUT_FILE', type=str,
+                    nargs='?', default=sys.stdin,
+                    help=   "the Qiime OTU table file "
+                            "[ stdin if not present ]" )
+    p.add_argument( '--md', metavar='METADATA_FILE', type=str,
+                    nargs='?', default=None,
+                    help=   "the Qiime OTU table file "
+                            "[ only OTU table without metadata if not present ]" )
+    p.add_argument( '--out', metavar='OUTPUT_FILE', type=str,
+                    nargs = '?', default=sys.stdout,
+                    help=   "the output file "
+                            "[stdout if not present]")
+
+    p.add_argument( '-c', metavar="class attribute",
+                    type=str,
+                    help =  "the attribute to use as class"   )
+    p.add_argument( '-s', metavar="subclass attribute",
+                    type=str,
+                    help =  "the attribute to use as subclass"   )
+    p.add_argument( '-u', metavar="subject attribute",
+                    type=str,
+                    help =  "the attribute to use as subject"   )
+
+
+
+    return vars(p.parse_args())
+
+
+
+def qiime2lefse(  fin, fmd, fout, all_md, sel_md ):
+    with (fin if fin==sys.stdin else open(fin)) as inpf :
+        lines = [list(ll) for ll in
+                    (zip(*[l.strip().split('\t')
+                        for l in inpf.readlines()[1:]]) ) ]
+    for i,(l1,l2) in enumerate(zip( lines[0], lines[-1] )):
+        if not l2 == 'Consensus Lineage':
+            lines[-1][i] = l2+"|"+l1
+
+    data = dict([(l[0],l[1:]) for l in lines[1:]])
+
+    md = {}
+    if fmd:
+        with open(fmd) as inpf:
+            mdlines = [l.strip().split('\t') for l in inpf.readlines()]
+
+        mdf = mdlines[0][1:]
+
+        for l in mdlines:
+            mdd = dict(zip(mdf,l[1:]))
+            md[l[0]] = mdd
+
+    selected_md = md.values()[0].keys() if md else []
+
+    if not all_md:
+        selected_md = [s for s in sel_md if s]
+
+    out_m = [   selected_md +
+                list([d.replace(";","|").replace("\"","") for d in data[ 'Consensus Lineage' ]])    ]
+    for k,v in data.items():
+        if k == 'Consensus Lineage':
+            continue
+        out_m.append( [md[k][kmd] for kmd in selected_md] + list(v) )
+
+    with (fout if fout == sys.stdout else open( fout, "w" )) as outf:
+        for l in zip(*out_m):
+            outf.write( "\t".join(l) + "\n" )
+
+if __name__ == '__main__':
+    pars = read_params( sys.argv )
+
+    qiime2lefse(   fin     = pars['in'],
+                   fmd     = pars['md'],
+                   fout    = pars['out'],
+                   all_md  = not pars['c'] and not pars['s'] and not pars['u'],
+                   sel_md  = [pars['c'],pars['s'],pars['u']])

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/requirements.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/requirements.txt Wed Aug 20 16:56:51 2014 -0400

@@ -0,0 +1,5 @@
+
+- R
+- R libraries: splines, stats4, survival, mvtnorm, modeltools, coin, MASS
+- python libraries: rpy2 (v. 2.1 or higher), numpy, matplotlib (v. 1.0 or higher), argparse
+

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/run_lefse.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/run_lefse.py Wed Aug 20 16:56:51 2014 -0400

[

@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+
+import os,sys,math,pickle
+from lefse import *
+
+def read_params(args):
+        parser = argparse.ArgumentParser(description='LEfSe 1.0')
+        parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="the input file")
+        parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str,
+                help="the output file containing the data for the visualization module")
+        parser.add_argument('-o',dest="out_text_file", metavar='str', type=str, default="",
+                help="set the file for exporting the result (only concise textual form)")
+        parser.add_argument('-a',dest="anova_alpha", metavar='float', type=float, default=0.05,
+                help="set the alpha value for the Anova test (default 0.05)")
+        parser.add_argument('-w',dest="wilcoxon_alpha", metavar='float', type=float, default=0.05,
+                help="set the alpha value for the Wilcoxon test (default 0.05)")
+        parser.add_argument('-l',dest="lda_abs_th", metavar='float', type=float, default=2.0,
+                help="set the threshold on the absolute value of the logarithmic LDA score (default 2.0)")
+        parser.add_argument('--nlogs',dest="nlogs", metavar='int', type=int, default=3,
+ help="max log ingluence of LDA coeff")
+        parser.add_argument('--verbose',dest="verbose", metavar='int', choices=[0,1], type=int, default=0,
+ help="verbose execution (default 0)")
+        parser.add_argument('--wilc',dest="wilc", metavar='int', choices=[0,1], type=int, default=1,
+ help="wheter to perform the Wicoxon step (default 1)")
+ parser.add_argument('-r',dest="rank_tec", metavar='str', choices=['lda','svm'], type=str, default='lda',
+ help="select LDA or SVM for effect size (default LDA)")
+ parser.add_argument('--svm_norm',dest="svm_norm", metavar='int', choices=[0,1], type=int, default=1,
+ help="whether to normalize the data in [0,1] for SVM feature waiting (default 1 strongly suggested)")
+        parser.add_argument('-b',dest="n_boots", metavar='int', type=int, default=30,
+                help="set the number of bootstrap iteration for LDA (default 30)")
+        parser.add_argument('-e',dest="only_same_subcl", metavar='int', type=int, default=0,
+                help="set whether perform the wilcoxon test only among the subclasses with the same name (default 0)")
+        parser.add_argument('-c',dest="curv", metavar='int', type=int, default=0,
+                help="set whether perform the wilcoxon test ing the Curtis's approach [BETA VERSION] (default 0)")
+        parser.add_argument('-f',dest="f_boots", metavar='float', type=float, default=0.67,
+                help="set the subsampling fraction value for each bootstrap iteration (default 0.66666)")
+        parser.add_argument('-s',dest="strict", choices=[0,1,2], type=int, default=0,
+                help="set the multiple testing correction options. 0 no correction (more strict, default), 1 correction for independent comparisons, 2 correction for independent comparison")
+#       parser.add_argument('-m',dest="m_boots", type=int, default=5,
+#               help="minimum cardinality of classes in each bootstrapping iteration (default 5)")
+        parser.add_argument('--min_c',dest="min_c", metavar='int', type=int, default=10,
+                help="minimum number of samples per subclass for performing wilcoxon test (default 10)")
+        parser.add_argument('-t',dest="title", metavar='str', type=str, default="",
+                help="set the title of the analysis (default input file without extension)")
+        parser.add_argument('-y',dest="multiclass_strat", choices=[0,1], type=int, default=0,
+                help="(for multiclass tasks) set whether the test is performed in a one-against-one ( 1 - more strict!) or in a one-against-all setting ( 0 - less strict) (default 0)")
+        args = parser.parse_args()
+
+        params = vars(args)
+        if params['title'] == "": params['title'] = params['input_file'].split("/")[-1].split('.')[0]
+        return params
+
+
+
+if __name__ == '__main__':
+ init()
+ params = read_params(sys.argv)
+ feats,cls,class_sl,subclass_sl,class_hierarchy = load_data(params['input_file'])
+ kord,cls_means = get_class_means(class_sl,feats)
+ wilcoxon_res = {}
+ kw_n_ok = 0
+ nf = 0
+ for feat_name,feat_values in feats.items():
+ if params['verbose']:
+ print "Testing feature",str(nf),": ",feat_name,
+ nf += 1
+ kw_ok,pv = test_kw_r(cls,feat_values,params['anova_alpha'],sorted(cls.keys()))
+ if not kw_ok:
+ if params['verbose']: print "\tkw ko"
+ del feats[feat_name]
+ wilcoxon_res[feat_name] = "-"
+ continue
+ if params['verbose']: print "\tkw ok\t",
+
+ if not params['wilc']: continue
+ kw_n_ok += 1
+ res_wilcoxon_rep = test_rep_wilcoxon_r(subclass_sl,class_hierarchy,feat_values,params['wilcoxon_alpha'],params['multiclass_strat'],params['strict'],feat_name,params['min_c'],params['only_same_subcl'],params['curv'])
+ wilcoxon_res[feat_name] = str(pv) if res_wilcoxon_rep else "-"
+ if not res_wilcoxon_rep:
+ if params['verbose']: print "wilc ko"
+ del feats[feat_name]
+ elif params['verbose']: print "wilc ok\t"
+
+ if len(feats) > 0:
+ print "Number of significantly discriminative features:", len(feats), "(", kw_n_ok, ") before internal wilcoxon"
+ if params['lda_abs_th'] < 0.0:
+ lda_res,lda_res_th = dict([(k,0.0) for k,v in feats.items()]), dict([(k,v) for k,v in feats.items()])
+ else:
+ if params['rank_tec'] == 'lda': lda_res,lda_res_th = test_lda_r(cls,feats,class_sl,params['n_boots'],params['f_boots'],params['lda_abs_th'],0.0000000001,params['nlogs'])
+ elif params['rank_tec'] == 'svm': lda_res,lda_res_th = test_svm(cls,feats,class_sl,params['n_boots'],params['f_boots'],params['lda_abs_th'],0.0,params['svm_norm'])
+ else: lda_res,lda_res_th = dict([(k,0.0) for k,v in feats.items()]), dict([(k,v) for k,v in feats.items()])
+ else:
+ print "Number of significantly discriminative features:", len(feats), "(", kw_n_ok, ") before internal wilcoxon"
+ print "No features with significant differences between the two classes"
+ lda_res,lda_res_th = {},{}
+ outres = {}
+ outres['lda_res_th'] = lda_res_th
+ outres['lda_res'] = lda_res
+ outres['cls_means'] = cls_means
+ outres['cls_means_kord'] = kord
+ outres['wilcox_res'] = wilcoxon_res
+ print "Number of discriminative features with abs LDA score >",params['lda_abs_th'],":",len(lda_res_th)
+ save_res(outres,params["output_file"])

diff -r e7cd19afda2e -r db64b6287cd6 home/ubuntu/lefse_to_export/run_lefse.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/home/ubuntu/lefse_to_export/run_lefse.xml Wed Aug 20 16:56:51 2014 -0400

@@ -0,0 +1,101 @@
+<tool id="LEfSe_run" name="B) LDA Effect Size (LEfSe)" version="1.0">
+  <description></description>
+
+<command interpreter="python">run_lefse.py $inp_data $output -l $lda_th -a $kw_alpha -w $w_alpha -e $w_pol -y $multiclass -f 0.9</command>
+  <inputs>
+    <page>
+ <param format="lefse_internal_for" name="inp_data" type="data" label="Select data" help=""/>
+ <param name="kw_alpha" type="float" size="2" value="0.05" label="Alpha value for the factorial Kruskal-Wallis test among classes"/>
+ <param name="w_alpha" type="float" size="2" value="0.05" label="Alpha value for the pairwise Wilcoxon test between subclasses"/>
+ <param name="lda_th" type="float" size="2" value="2.0" label="Threshold on the logarithmic LDA score for discriminative features"/>
+ <param name="w_pol" type="select" label="Do you want the pairwise comparisons among subclasses to be performed only among the subclasses with the same name?" help="">
+          <option value="0" selected="0">No</option>
+          <option value="1">Yes</option>
+        </param>
+
+ <param name="multiclass" type="select" label="Set the strategy for multi-class analysis" help="">
+          <option value="1" selected="True">All-against-all (more strict)</option>
+          <option value="0">One-against-all (less strict)</option>
+        </param>
+    </page>
+   </inputs>
+  <outputs>
+    <data format="lefse_internal_res" name="output" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="13.bed" dbkey="hg18" ftype="bed"/>
+      <param name="maf_source" value="cached"/>
+      <param name="maf_identifier" value="17_WAY_MULTIZ_hg18"/>
+      <param name="species" value="hg18,mm8"/>
+      <param name="overwrite_with_gaps" value="True"/>
+      <output name="out_file1" file="interval_maf_to_merged_fasta_out3.fasta" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>
+      <param name="maf_source" value="cached"/>
+      <param name="maf_identifier" value="8_WAY_MULTIZ_hg17"/>
+      <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>
+      <param name="overwrite_with_gaps" value="True"/>
+      <output name="out_file1" file="interval_maf_to_merged_fasta_out.dat" />
+    </test>
+    <test>
+      <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>
+      <param name="maf_source" value="user"/>
+      <param name="maf_file" value="5.maf"/>
+      <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>
+      <param name="overwrite_with_gaps" value="True"/>
+      <output name="out_file1" file="interval_maf_to_merged_fasta_user_out.dat" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+Lda Effective Size (LEfSe) is a biomarker discovery and explanation tool for high-dimensional data. It couples statistical significance with biological consistency and effect size estimation. For an overview of LEfSe please refer to the "Introduction" module or to `(Segata et. al 2011)`_.
+
+The scheme and the description below illustrates how the algorithm works:
+
+.. image:: https://bytebucket.org/biobakery/galaxy_lefse/wiki/lefse_met.png
+
+Input data consist of a collection of m samples (columns) each made up of n numerical features (rows, typically normalized per-sample, red representing high values and green low). These samples are labeled with a class (taking two or more possible values) that represents the main biological hypothesis under investigation; they may also have one or more subclass labels reflecting within-class groupings.
+
+- Step 1: the Kruskall-Wallis test analyzes all features, testing whether the values in different classes are differentially distributed. Features violating the null hypothesis are further analyzed in Step 2.
+
+- Step 2: the pairwise Wilcoxon test checks whether all pairwise comparisons between subclasses within different classes significantly agree with the class level trend.
+
+- Step 3: the resulting subset of vectors is used to build a Linear Discriminant Analysis model from which the relative difference among classes is used to rank the features. The final output thus consists of a list of features that are discriminative with respect to the classes, consistent with the subclass grouping within classes, and ranked according to the effect size with which they differentiate classes.
+
+**Input format**
+
+The input for this module must be generated with the **"Format Input for LEfSe"** tool.
+
+------
+
+**Output format**
+
+The output consists of a tabular file listing all the features, the logarithm value of the highest mean among all the classes, and if the feature is discriminative, the class with the highest mean and the logarithmic LDA score.
+
+The output file can be conveniently visualized with the "Plot LEfSe Results" module and, if feature names define a hierarchy, with the "Plot Cladogram" module. The output can also be used for generating the histograms of the raw data of the discriminative features using the "Plot Differential Features" module.
+
+------
+
+**Parameters**
+
+The input parameters are the alpha-values for the factorial Kruskal-Wallis test and for the pairwise Wilcoxon test among subclasses (steps 1 and 2 in the schematic picture above) and the non-negative threshold for the logarithmic LDA score. Moreover, the user can decide the pairwise Wilcoxon test to be applied only among subclasses in different classes with the same name (less stringent) and select the multi-class strategy to be the All-against-all (more stringent) or the One-against-all (less stringent).
+
+.. _here: http://www.huttenhower.org/webfm_send/73
+.. _(Segata et. al 2011): http://www.ncbi.nlm.nih.gov/pubmed/21702898
+.. _(Garrett et. al 2010): http://www.ncbi.nlm.nih.gov/pubmed/20833380
+.. _(Veiga et. al 2010): http://www.ncbi.nlm.nih.gov/pubmed/20921388
+.. _contact us: nsegata@hsph.harvard.edu
+
+**Example**
+
+For the mouse model dataset (see the "Introduction" module) it is suggested to use alpha=0.01 as the sample size is not very large.
+
+  </help>
+</tool>

diff -r e7cd19afda2e -r db64b6287cd6 lefse.py
--- a/lefse.py Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

b'@@ -1,242 +0,0 @@\n-import os,sys,math,pickle\n-import random as lrand\n-import rpy2.robjects as robjects\n-import argparse\n-import numpy\n-#import svmutil\n-\n-def init(): \n-\tlrand.seed(1982)\n-\trobjects.r(\'library(splines)\')\n-\trobjects.r(\'library(stats4)\')\n-\trobjects.r(\'library(survival)\')\n-\trobjects.r(\'library(mvtnorm)\')\n-\trobjects.r(\'library(modeltools)\')\n-\trobjects.r(\'library(coin)\')\n-\trobjects.r(\'library(MASS)\')\n-\n-def get_class_means(class_sl,feats):\n-\tmeans = {}\n-\tclk = class_sl.keys()\n-\tfor fk,f in feats.items():\n-\t\tmeans[fk] = [numpy.mean((f[class_sl[k][0]:class_sl[k][1]])) for k in clk] \n-\treturn clk,means \n-\t\n-def save_res(res,filename): \n-\twith open(filename, \'w\') as out:\n-\t\tfor k,v in res[\'cls_means\'].items():\n-\t\t\tout.write(k+"\\t"+str(math.log(max(max(v),1.0),10.0))+"\\t")\n-\t\t\tif k in res[\'lda_res_th\']:\n-\t\t\t\tfor i,vv in enumerate(v):\n-\t\t\t\t\tif vv == max(v):\n-\t\t\t\t\t\tout.write(str(res[\'cls_means_kord\'][i])+"\\t")\n-\t\t\t\t\t\tbreak\n-\t\t\t\tout.write(str(res[\'lda_res\'][k])) \n-\t\t\telse: out.write("\\t")\n-\t\t\tout.write( "\\t" + res[\'wilcox_res\'][k]+"\\n")\n-\n-def load_data(input_file, nnorm = False):\n- \n-\twith open(input_file, \'rb\') as inputf:\n-\t\tinp = pickle.load(inputf)\n-\tif nnorm: return inp[\'feats\'],inp[\'cls\'],inp[\'class_sl\'],inp[\'subclass_sl\'],inp[\'class_hierarchy\'],inp[\'norm\'] \n-\telse: return inp[\'feats\'],inp[\'cls\'],inp[\'class_sl\'],inp[\'subclass_sl\'],inp[\'class_hierarchy\']\n-\n-def load_res(input_file):\n-\twith open(input_file, \'rb\') as inputf:\t\n-\t\tinp = pickle.load(inputf)\n-\treturn inp[\'res\'],inp[\'params\'],inp[\'class_sl\'],inp[\'subclass_sl\']\t\t\n-\n-\n-def test_kw_r(cls,feats,p,factors):\n-\trobjects.globalenv["y"] = robjects.FloatVector(feats)\n-\tfor i,f in enumerate(factors):\n-\t\trobjects.globalenv[\'x\'+str(i+1)] = robjects.FactorVector(robjects.StrVector(cls[f]))\n-\tfo = "y~x1"\n-\tfor i,f in enumerate(factors[1:]):\n-\t\tif f == "subclass" and len(set(cls[f])) <= len(set(cls["class"])): continue\n-\t\tif len(set(cls[f])) == len(cls[f]): continue\n-\t\tfo += "+x"+str(i+2)\n-\tkw_res = robjects.r(\'kruskal.test(\'+fo+\',)$p.value\')\n-\treturn float(tuple(kw_res)[0]) < p, float(tuple(kw_res)[0])\n-\n-def test_rep_wilcoxon_r(sl,cl_hie,feats,th,multiclass_strat,mul_cor,fn,min_c,comp_only_same_subcl,curv=False):\n-\tcomp_all_sub = not comp_only_same_subcl\n-\ttot_ok = 0\n-\talpha_mtc = th\n-\tall_diff = []\n-\tfor pair in [(x,y) for x in cl_hie.keys() for y in cl_hie.keys() if x < y]:\n-\t\tdir_cmp = "not_set" #\n-\t\tl_subcl1, l_subcl2 = (len(cl_hie[pair[0]]), len(cl_hie[pair[1]]))\n-\t\tif mul_cor != 0: alpha_mtc = th*l_subcl1*l_subcl2 if mul_cor == 2 else 1.0-math.pow(1.0-th,l_subcl1*l_subcl2)\n-\t\tok = 0\n-\t\tcurv_sign = 0\n-\t\tfirst = True\n-\t\tfor i,k1 in enumerate(cl_hie[pair[0]]):\n-\t\t\tbr = False\n-\t\t\tfor j,k2 in enumerate(cl_hie[pair[1]]):\n-\t\t\t\tif not comp_all_sub and k1[len(pair[0]):] != k2[len(pair[1]):]: \n-\t\t\t\t\tok += 1\t\n-\t\t\t\t\tcontinue \n-\t\t\t\tcl1 = feats[sl[k1][0]:sl[k1][1]]\n-\t\t\t\tcl2 = feats[sl[k2][0]:sl[k2][1]]\n-\t\t\t\tmed_comp = False\n-\t\t\t\tif len(cl1) < min_c or len(cl2) < min_c: \n-\t\t\t\t\tmed_comp = True\n-\t\t\t\tsx,sy = numpy.median(cl1),numpy.median(cl2)\n-\t\t\t\tif cl1[0] == cl2[0] and len(set(cl1)) == 1 and len(set(cl2)) == 1: \n-\t\t\t\t\ttres, first = False, False\n-\t\t\t\telif not med_comp:\n-\t\t\t\t\trobjects.globalenv["x"] = robjects.FloatVector(cl1+cl2)\n-\t\t\t\t\trobjects.globalenv["y"] = robjects.FactorVector(robjects.StrVector(["a" for a in cl1]+["b" for b in cl2]))\t\n-\t\t\t\t\tpv = float(robjects.r(\'pvalue(wilcox_test(x~y,data=data.frame(x,y)))\')[0])\n-\t\t\t\t\ttres = pv < alpha_mtc*2.0\n-\t\t\t\tif first:\n-\t\t\t\t\tfirst = False\n-\t\t\t\t\tif not curv and ( med_comp or tres ): \n-\t\t\t\t\t\tdir_cmp = sx < sy\n-\t\t\t\t\t\tif sx == sy: br = True\n-\t\t\t\t\telif curv: \n-\t\t\t\t\t\tdir_cmp = None\n-\t\t\t\t\t\tif med_comp or tres: \n-\t\t\t\t\t\t\tcurv_sign += 1\n-\t\t\t\t\t\t\tdir_cmp = sx < sy\n-\t\t\t\t\telse: br = True\n-\t\t\t\telif not curv and med_comp:\n-\t\t\t\t\tif ((sx < sy) != dir_cmp or sx == sy): br = True\n-\t\t\t\telif curv:\n-\t\t\t\t\tif tres and dir_cmp == None:\n-\t\t\t\t\t\tcurv_sign += 1\n-\t\t\t\t\t\tdir_cmp = sx < sy\n-\t\t\t\t\tif tres and dir_cmp != (sx < sy):\n-\t\t\t\t\t\tbr = True\n-\t\t\t\t\t\tcurv_sign = '..b' for i,v in enumerate(feats[k]):\n- if feats[\'class\'][i] == c:\n- feats[k][i] = math.fabs(feats[k][i] + lrand.normalvariate(0.0,max(feats[k][i]*0.05,0.01)))\n- rdict = {}\n- for a,b in feats.items():\n- if a == \'class\' or a == \'subclass\' or a == \'subject\':\n- rdict[a] = robjects.StrVector(b)\n- else: rdict[a] = robjects.FloatVector(b)\n- robjects.globalenv["d"] = robjects.DataFrame(rdict)\n- lfk = len(feats[fk[0]])\n- rfk = int(float(len(feats[fk[0]]))*fract_sample)\n- f = "class ~ "+fk[0]\n- for k in fk[1:]: f += " + " + k.strip()\n- ncl = len(set(cls[\'class\']))\n- min_cl = int(float(min([cls[\'class\'].count(c) for c in set(cls[\'class\'])]))*fract_sample*fract_sample*0.5) \n- min_cl = max(min_cl,1) \n- pairs = [(a,b) for a in set(cls[\'class\']) for b in set(cls[\'class\']) if a > b]\n-\n-\tfor k in fk:\t\n-\t\tfor i in range(boots):\n-\t\t\tmeans[k].append([])\t\n- for i in range(boots):\n- for rtmp in range(1000):\n- rand_s = [lrand.randint(0,lfk-1) for v in range(rfk)]\n- if not contast_within_classes_or_few_per_class(feats,rand_s,min_cl,ncl): break\n- rand_s = [r+1 for r in rand_s]\n-\t\tmeans[k][i] = []\n-\t\tfor p in pairs:\n-\t \trobjects.globalenv["rand_s"] = robjects.IntVector(rand_s)\n- \trobjects.globalenv["sub_d"] = robjects.r(\'d[rand_s,]\')\n- \tz = robjects.r(\'z <- suppressWarnings(lda(as.formula(\'+f+\'),data=sub_d,tol=\'+str(tol_min)+\'))\')\n-\t\t\trobjects.r(\'w <- z$scaling[,1]\')\n-\t\t\trobjects.r(\'w.unit <- w/sqrt(sum(w^2))\')\n-\t\t\trobjects.r(\'ss <- sub_d[,-match("class",colnames(sub_d))]\')\n-\t\t\tif \'subclass\' in feats:\n-\t\t\t\trobjects.r(\'ss <- ss[,-match("subclass",colnames(ss))]\')\n-\t\t\tif \'subject\' in feats:\n-\t\t\t\trobjects.r(\'ss <- ss[,-match("subject",colnames(ss))]\')\n-\t\t\trobjects.r(\'xy.matrix <- as.matrix(ss)\')\n-\t\t\trobjects.r(\'LD <- xy.matrix%*%w.unit\')\n-\t\t\trobjects.r(\'effect.size <- abs(mean(LD[sub_d[,"class"]=="\'+p[0]+\'"]) - mean(LD[sub_d[,"class"]=="\'+p[1]+\'"]))\')\n-\t\t\tscal = robjects.r(\'wfinal <- w.unit * effect.size\')\n-\t\t\trres = robjects.r(\'mm <- z$means\')\n-\t\t\trowns = list(rres.rownames)\n-\t\t\tlenc = len(list(rres.colnames))\n-\t\t\tcoeff = [abs(float(v)) if not math.isnan(float(v)) else 0.0 for v in scal]\n- \tres = dict([(pp,[float(ff) for ff in rres.rx(pp,True)] if pp in rowns else [0.0]*lenc ) for pp in [p[0],p[1]]])\n-\t\t\tfor j,k in enumerate(fk):\n-\t\t\t\tgm = abs(res[p[0]][j] - res[p[1]][j])\n- \tmeans[k][i].append((gm+coeff[j])*0.5)\n- res = {}\n- for k in fk:\n-\t\tm = max([numpy.mean([means[k][kk][p] for kk in range(boots)]) for p in range(len(pairs))])\n- res[k] = math.copysign(1.0,m)*math.log(1.0+math.fabs(m),10)\n- return res,dict([(k,x) for k,x in res.items() if math.fabs(x) > lda_th])\n-\n-\n-def test_svm(cls,feats,cl_sl,boots,fract_sample,lda_th,tol_min,nsvm):\n-\treturn NULL\n-"""\n-\tfk = feats.keys()\n-\tclss = list(set(cls[\'class\']))\n-\ty = [clss.index(c)*2-1 for c in list(cls[\'class\'])]\n-\txx = [feats[f] for f in fk]\n-\tif nsvm: \n-\t\tmaxs = [max(v) for v in xx]\n-\t\tmins = [min(v) for v in xx]\n-\t\tx = [ dict([(i+1,(v-mins[i])/(maxs[i]-mins[i])) for i,v in enumerate(f)]) for f in zip(*xx)]\n-\telse: x = [ dict([(i+1,v) for i,v in enumerate(f)]) for f in zip(*xx)]\n-\t\n-\tlfk = len(feats[fk[0]])\n-\trfk = int(float(len(feats[fk[0]]))*fract_sample)\n-\tmm = []\n-\n-\tbest_c = svmutil.svm_ms(y, x, [pow(2.0,i) for i in range(-5,10)],\'-t 0 -q\')\n-\tfor i in range(boots):\n-\t\trand_s = [lrand.randint(0,lfk-1) for v in range(rfk)]\n-\t\tr = svmutil.svm_w([y[yi] for yi in rand_s], [x[xi] for xi in rand_s], best_c,\'-t 0 -q\')\n-\t\tmm.append(r[:len(fk)])\n-\tm = [numpy.mean(v) for v in zip(*mm)]\n-\tres = dict([(v,m[i]) for i,v in enumerate(fk)])\n-\treturn res,dict([(k,x) for k,x in res.items() if math.fabs(x) > lda_th])\n-"""\t\n'

diff -r e7cd19afda2e -r db64b6287cd6 lefse2circlader.py
--- a/lefse2circlader.py Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,43 +0,0 @@
-#!/usr/bin/env python
-
-from __future__ import with_statement
-
-import sys
-import os
-import argparse
-
-def read_params(args):
-    parser = argparse.ArgumentParser(description='Convert LEfSe output to '
-                        'Circlader input')
-    parser.add_argument(    'inp_f', metavar='INPUT_FILE', nargs='?',
-                            default=None, type=str,
-                            help="the input file [stdin if not present]")
-    parser.add_argument(    'out_f', metavar='OUTPUT_FILE', nargs='?',
-                            default=None, type=str,
-                            help="the output file [stdout if not present]")
-    parser.add_argument('-l', metavar='levels with label', default=0, type=int)
-
-    return vars(parser.parse_args())
-
-def lefse2circlader(par):
-    finp,fout = bool(par['inp_f']), bool(par['out_f'])
-
-    with open(par['inp_f']) if finp else sys.stdin as inpf:
-        put_bm = (l.strip().split('\t') for l in inpf.readlines())
-    biomarkers = [p for p in put_bm if len(p) > 2]
-
-    circ = [    [   b[0],
-                    "" if b[0].count('.') > par['l'] else b[0].split('.')[-1],
-                    b[2],
-                    b[2]+"_col" ] for b in biomarkers]
-
-    with open(par['out_f'],'w') if fout else sys.stdout as out_file:
-        for c in circ:
-            out_file.write( "\t".join( c ) + "\n" )
-
-if __name__ == '__main__':
-    params = read_params(sys.argv)
-    lefse2circlader(params)
-
-
-

diff -r e7cd19afda2e -r db64b6287cd6 lefse_met.png

Binary file lefse_met.png has changed

diff -r e7cd19afda2e -r db64b6287cd6 lefse_ove.png

Binary file lefse_ove.png has changed

diff -r e7cd19afda2e -r db64b6287cd6 plot_cladogram.py
--- a/plot_cladogram.py Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

b'@@ -1,342 +0,0 @@\n-#!/usr/bin/env python\n-\n-import os,sys,matplotlib,argparse,string\n-matplotlib.use(\'Agg\')\n-from pylab import *\n-from lefse import *\n-import numpy as np\n-\n-colors = [\'r\',\'g\',\'b\',\'m\',\'c\',[1.0,0.5,0.0],[0.0,1.0,0.0],[0.33,0.125,0.0],[0.75,0.75,0.75],\'k\']\n-dark_colors = [[0.4,0.0,0.0],[0.0,0.2,0.0],[0.0,0.0,0.4],\'m\',\'c\',[1.0,0.5,0.0],[0.0,1.0,0.0],[0.33,0.125,0.0],[0.75,0.75,0.75],\'k\']\n-\n-class CladeNode:\n-\tdef __init__(self, name, abundance, viz = True):\n- \t\tself.id = name\n- \t\tself.name = name.split(".")\n-\t\tself.last_name = self.name[-1]\n-\t\tself.abundance = abundance\n-\t\tself.pos = (-1.0,-1.0)\n-\t\tself.children = {}\n-\t\tself.isleaf = True\n-\t\tself.color = \'y\'\n-\t\tself.next_leaf = -1\n-\t\tself.prev_leaf = -1\n-\t\tself.viz = viz\n-\tdef __repr__(self):\n-\t\treturn self.last_name\n-\tdef add_child(self,node):\n-\t\tself.isleaf = False\n-\t\tself.children[node.__repr__()] = node\n-\tdef get_children(self):\n-\t\tck = sorted(self.children.keys())\n-\t\treturn [self.children[k] for k in ck]\n-\tdef get_color(self):\n-\t\treturn self.color\n-\tdef set_color(self,c):\n-\t\tself.color = c\n-\tdef set_pos(self,pos):\n-\t\tself.pos = pos\n-\n-def read_params(args):\n-\tparser = argparse.ArgumentParser(description=\'Cladoplot\')\n-\tparser.add_argument(\'input_file\', metavar=\'INPUT_FILE\', type=str, help="tab delimited input file")\n-\tparser.add_argument(\'output_file\', metavar=\'OUTPUT_FILE\', type=str, help="the file for the output image")\n-\tparser.add_argument(\'--clade_sep\',dest="clade_sep", type=float, default=1.5)\n-\tparser.add_argument(\'--max_lev\',dest="max_lev", type=int, default=-1)\n-\tparser.add_argument(\'--max_point_size\',dest="max_point_size", type=float, default=6.0)\n-\tparser.add_argument(\'--min_point_size\',dest="min_point_size", type=float, default=1)\n-\tparser.add_argument(\'--point_edge_width\',dest="markeredgewidth", type=float, default=.25)\n-\tparser.add_argument(\'--siblings_connector_width\',dest="siblings_connector_width", type=float, default=2)\n-\tparser.add_argument(\'--parents_connector_width\',dest="parents_connector_width", type=float, default=0.75)\n-\tparser.add_argument(\'--radial_start_lev\',dest="radial_start_lev", type=int, default=1)\n-\tparser.add_argument(\'--labeled_start_lev\',dest="labeled_start_lev", type=int, default=2)\n-\tparser.add_argument(\'--labeled_stop_lev\',dest="labeled_stop_lev", type=int, default=5)\n-\tparser.add_argument(\'--abrv_start_lev\',dest="abrv_start_lev", type=int, default=3)\n-\tparser.add_argument(\'--abrv_stop_lev\',dest="abrv_stop_lev", type=int, default=5)\n-\tparser.add_argument(\'--expand_void_lev\',dest="expand_void_lev", type=int, default=1)\n-\tparser.add_argument(\'--class_legend_vis\',dest="class_legend_vis", type=int, default=1)\n-\tparser.add_argument(\'--colored_connector\',dest="colored_connectors", type=int, default=1)\n-\tparser.add_argument(\'--alpha\',dest="alpha", type=float, default=0.2)\n-\tparser.add_argument(\'--title\',dest="title", type=str, default="Cladogram")\n-\tparser.add_argument(\'--sub_clade\',dest="sub_clade", type=str, default="")\n-\tparser.add_argument(\'--title_font_size\',dest="title_font_size", type=str, default="14")\n-\tparser.add_argument(\'--right_space_prop\',dest="r_prop", type=float, default=0.1)\n-\tparser.add_argument(\'--left_space_prop\',dest="l_prop", type=float, default=0.1)\n-\tparser.add_argument(\'--label_font_size\',dest="label_font_size", type=str, default="6")\n-\tparser.add_argument(\'--background_color\',dest="back_color", type=str, choices=["k","w"], default="w", help="set the color of the background")\n-\tparser.add_argument(\'--colored_labels\',dest="col_lab", type=int, choices=[0,1], default=1, help="draw the label with class color (1) or in black (0)")\n-\tparser.add_argument(\'--class_legend_font_size\',dest="class_legend_font_size", type=str, default="10")\n-\tparser.add_argument(\'--dpi\',dest="dpi", type=int, default=72)\n-\tparser.add_argument(\'--format\', dest="format", choices=["png","svg","pdf"], default="svg", type=str, help="the format for the output file")\n-\tparser.add_argument(\'--all_feats\', dest="all_feats", type=str, d'..b'rc_ext = 0.65 if dim > 0.1 else 1.0 \n-\t\tclto = (de-l+1)*dim+dim*(dd+1-(l-dd-1))*perc_ext\n-\t\tclto = (de-l+1)*dim+dim*(dd-(l-params[\'labeled_start_lev\'])+1)*perc_ext\n-\t\tdes = float(180.0*(fr_0+fr_1)/np.pi)*0.5-90\n-\t\tlab = ""\n-\t\ttxt = father.last_name\n-\t\tif params[\'abrv_start_lev\'] < l <= params[\'abrv_stop_lev\'] + 1:\n-\t\t\tide = u_i.next()\n-\t\t\tlab = str(ide)+": "+father.last_name \n-\t\t\ttxt = str(ide)\n-#\t\tax.bar(fr_0, clto, width = fr_1-fr_0, bottom = float(l-1)/float(depth-1), alpha = params[\'alpha\'], color=col, edgecolor=col)\n-\t\tax.bar(fr_0, clto, width = fr_1-fr_0, bottom = float(l-1)/float(de), alpha = params[\'alpha\'], color=col, edgecolor=col)\n-\t\tax.bar(0.0, 0.0, width = 0.0, bottom = 0.0, alpha = 1.0, color=col, edgecolor=params[\'fore_color\'], label=lab)\n-\t\tif l <= params[\'abrv_stop_lev\'] + 1:\n-\t\t\tif not params[\'col_lab\']: col = params[\'fore_color\']\n-\t\t\telse: \n-\t\t\t\tif col not in colors: col = params[\'fore_color\']\n-\t\t\t\telse: col = dark_colors[colors.index(col)%len(dark_colors)]\n-\t\t\tax.text((fr_0+fr_1)*0.5, clto+float(l-1)/float(de)-dim*perc_ext/2.0, txt, size = params[\'label_font_size\'], rotation=des, ha ="center", va="center", color=col)\t\n- return fr_0, fr_1\n-\n-def draw_tree(out_file,tree,params):\n-\tplt_size = 7\n-\tnlev = tree[\'nlev\']\n-\tpt_scale = (params[\'min_point_size\'],max(1.0,((tree[\'max_abs\']-tree[\'min_abs\']))/(params[\'max_point_size\']-params[\'min_point_size\'])))\n-\tdepth = len(nlev)\n-\tsep = (2.0*np.pi)/float(nlev[-1]) \n-\tseps = [params[\'clade_sep\']*sep/float(depth-i+1) for i in range(1,len(tree[\'nlev\'])+1)]\n-\ttotseps = sum([s*nlev[i] for i,s in enumerate(seps[:-1])])\n-\tclade_sep_err = True if totseps > np.pi else False\n-\twhile totseps > np.pi:\n-\t\tparams[\'clade_sep\'] *= 0.75 \t \n-\t\tseps = [params[\'clade_sep\']*sep/(float(depth-i+1)*0.25) for i in range(1,len(tree[\'nlev\'])+1)]\n-\t\ttotseps = sum([s*nlev[i] for i,s in enumerate(seps[:-1])])\n-\tif clade_sep_err: print \'clade_sep parameter too large, lowered to\',params[\'clade_sep\']\n-\n-\tfig = plt.figure(edgecolor=params[\'back_color\'],facecolor=params[\'back_color\'])\n-\tax = fig.add_subplot(111, polar=True, frame_on=False, axis_bgcolor=params[\'back_color\'] )\n-\tplt.subplots_adjust(right=1.0-params[\'r_prop\'],left=params[\'l_prop\']) \t\n-\tax.grid(False)\n-\txticks([])\n-\tyticks([])\n-\n-\tds = (2.0*np.pi-totseps)/float(nlev[-1])\n-\n-\tadd_all_pos(tree[\'root\'],0.0,ds,seps,0.0,depth)\n-\t\n-\tplot_lines(tree[\'root\'],params,depth,ax,0)\n-\tplot_points(tree[\'root\'],params,pt_scale,ax)\n-\tplot_names(tree[\'root\'],params,depth,ax,uniqueid(),seps)\n-\n-\tr = np.arange(0, 3.0, 0.01)\n-\ttheta = 2*np.pi*r\n-\t\n-\tdef get_col_attr(x):\n- \t\treturn hasattr(x, \'set_color\') and not hasattr(x, \'set_facecolor\')\n-\n-\th, l = ax.get_legend_handles_labels()\n-\tif len(l) > 0:\n-\t\tleg = ax.legend(bbox_to_anchor=(1.05, 1), frameon=False, loc=2, borderaxespad=0.,prop={\'size\':params[\'label_font_size\']})\n-\t\tif leg != None:\n-\t\t\tgca().add_artist(leg)\n-\t\t\tfor o in leg.findobj(get_col_attr):\n-\t o.set_color(params[\'fore_color\'])\n-\t\n-\tcll = sorted(tree[\'classes\']) if params[\'all_feats\'] == "" else sorted(params[\'all_feats\'].split(":"))\n-\tnll = [ax.bar(0.0, 0.0, width = 0.0, bottom = 0.0, color=colors[i%len(colors)], label=c) for i,c in enumerate(cll) if c in tree[\'classes\']]\n-\tcl = [c for c in cll if c in tree[\'classes\']]\n-\n-\tax.set_title(params[\'title\'],size=params[\'title_font_size\'],color=params[\'fore_color\'])\n-\n-\tif params[\'class_legend_vis\']:\n-\t\tl2 = legend(nll, cl, loc=2, prop={\'size\':params[\'class_legend_font_size\']}, frameon=False)\n-\t\tif l2 != None:\n-\t\t\tfor o in l2.findobj(get_col_attr):\n- \t\t\t\to.set_color(params[\'fore_color\'])\n-\n-\tplt.savefig(out_file,format=params[\'format\'],facecolor=params[\'back_color\'],edgecolor=params[\'fore_color\'],dpi=params[\'dpi\'])\n-\tplt.close()\t\n-\n-if __name__ == \'__main__\':\n-\tparams = read_params(sys.argv)\n-\tparams[\'fore_color\'] = \'w\' if params[\'back_color\'] == \'k\' else \'k\'\n-\tclad_tree = read_data(params[\'input_file\'],params)\t\n-\tdraw_tree(params[\'output_file\'],clad_tree,params)\n-\t\n'

diff -r e7cd19afda2e -r db64b6287cd6 plot_cladogram.xml
--- a/plot_cladogram.xml Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

b'@@ -1,211 +0,0 @@\n-<tool id="LEfSe_cla" name="D) Plot Cladogram" version="1.0">\n- <description></description>\n-<command interpreter="python">plot_cladogram.py $inp_data $cladogram --title "$textm.title" --expand_void_lev $structural.expand_void_lev --max_lev $structural.max_lev --colored_label $graphical.colored_labels --labeled_start_lev $textm.labeled_start_lev --labeled_stop_lev $textm.labeled_stop_lev --abrv_start_lev $textm.abrv_start_lev --abrv_stop_lev $textm.abrv_stop_lev --radial_start_lev $graphical.radial_start_lev --max_point_size $graphical.max_point_size --min_point_size $graphical.min_point_size --point_edge_width $graphical.point_edge_width --siblings_connector_width $graphical.siblings_connector_width --parents_connector_width $graphical.parents_connector_width --alpha $graphical.alpha --clade_sep $graphical.clade_sep --title_font_size $textm.title_font_size --label_font_size $textm.label_font_size --class_legend_font_size $textm.class_legend_font_size --sub_clade "$structural.root" --background_color $graphical.background_color --format $for --right_space_prop $graphical.right_space_prop --left_space_prop 0.01 --dpi $dpi</command> \n- <inputs>\n- <page>\n-\t<param format="lefse_res" name="inp_data" type="data" label="Select data" help=""/>\n-\t\t\n-\t<conditional name="structural">\n-\t\t<param name="structural_choice" type="select" label="Set structural parameters of the cladogram" help="">\n- \t\t<option value="d" selected=\'True\'>Default</option>\n- \t\t<option value="a">Advanced</option>\n- \t</param>\n-\t\t<when value="d">\n-\t\t\t<param name="root" type="hidden" value=""/>\n-\t\t\t<param name="expand_void_lev" type="hidden" value="0"/>\n-\t\t\t<param name="max_lev" type="hidden" value="6"/>\n-\t\t</when>\n-\t\t<when value="a">\n-\t\t\t<param name="root" type="text" size="10" value="" label="Root of the tree (insert the full name of the clade separating the levels with \'.\', nothing means the highest level clade)"/>\n-\t\t\t<param name="expand_void_lev" type="select" label="Expand terminal non-leaf levels">\n-\t\t\t\t<option value="1" >Yes</option>\n- \t <option value="0" selected=\'True\'>No</option>\n-\t </param>\n-\t\t\t<param name="max_lev" type="integer" size="2" value="6" label="Maximum number of taxonomic levels"/>\n-\t\t\t</when>\n-\t</conditional>\n-\n-\t<conditional name="textm">\n- \t<param name="text_choice" type="select" label="Set text and label options (font size, abbreviations, ...)" help="">\n- \t\t<option value="d" selected=\'True\'>Default</option>\n- \t\t<option value="a">Advanced</option>\n- \t</param>\n- \t<when value="d">\n-\t\t\t<param name="title" type="hidden" value=""/>\n-\t\t\t<param name="label_font_size" type="hidden" value="7"/>\n-\t\t\t<param name="title_font_size" type="hidden" value="14" />\n-\t\t\t<param name="class_legend_font_size" type="hidden" value="10"/>\n-\t\t\t<param name="labeled_start_lev" type="hidden" value="2"/>\t\n-\t\t\t<param name="labeled_stop_lev" type="hidden" value="6"/>\t\n-\t\t\t<param name="abrv_start_lev" type="hidden" value="4"/>\n-\t\t\t<param name="abrv_stop_lev" type="hidden" value="6"/>\n- \t</when>\n- \t<when value="a">\n-\t\t\t<param name="title" type="text" size="10" value="" label="The title of the cladogram"/>\n-\t\t\t<param name="title_font_size" type="integer" size="2" value="14" label="Title font size"/>\n-\t\t\t<param name="label_font_size" type="integer" size="2" value="7" label="Label font size"/>\n-\t\t\t<param name="class_legend_font_size" type="integer" size="2" value="10" label="Class font size"/>\n-\t\t\t<param name="labeled_start_lev" type="integer" size="2" value="2" label="Starting level for drawing the labels"/>\t\n-\t\t\t<param name="labeled_stop_lev" type="integer" size="2" value="6" label="Last level for drawing the labels"/>\t\n-\t\t\t<param name="abrv_start_lev" type="integer" size="2" value="4" label="First level for abbreviating the labels"/>\n-\t\t\t<param name="abrv_stop_lev" type="integer" size="2" value="6" label="Last level for abbreviating t'..b'anced parameter settings**\n-\n-*Structural parameters*\n- * Root of the tree: selects any taxa of the tree to be the root of the cladogram (only the clades below the root will be visualized). Taxa levels are separated by ".", so for, example, Bacteria.Firmicutes will generate only the cladogram of Firmicutes.\n- * Expand terminal non-leaf levels: whether to expand a non-leaf taxa without children up to the level of the leaves naming the new levels with the expanding taxa name. \n- * Maximum number of taxonomic levels: you can limit the levels of the cladogram to a desired level.\n-*Text and label options*\n-\t* The title of the cladogram: optional title for the plot (default is no title).\n-\t* Title font size: set the font size of the title only.\n-\t* Label font size: set the font size of the labels (and of the label legend) used in the cladogram to denote taxa.\n-\t* Class font size: set the font of the legend for the class names and colors.\n-\t* Starting level for drawing the labels: you can avoid naming the more internal clades if they are not informative.\n-\t* Last level for drawing the labels: you may want to remove the most external labels as they may be to dense or overlapping.\n-\t* First level for abbreviating the labels: set the starting level for substituting the full clade names with with an abbreviation supported by a legend table (recommended for the most external taxa).\n-\t* Last level for abbreviating the labels: set the more external level for substituting the full clade names with with an abbreviation supported by a legend table.\n-*Graphical options*\n-\t* Number of external levels drawn in the radial representation: the connection between the taxa in last level and the corresponding parent is represented with a straight edge. The same representation may be used for more internal levels as well.\n-\t* Max dimension of the circles representing taxa: the sizes of the taxa represent the highest logarithmic abundance between classes, and this option sets the maximum diameter of the graphical representation.\n-\t* Min dimension of the circles representing taxa: the taxon diameter of the smallest logarithmic taxa abundance.\n-\t* Width of the edges of the circles representing taxa: the taxon circles have an external border whose thickness is regulated by this option.\n-\t* Width of the lines connecting sibling taxa: set the thickness of the lines connecting sibling taxa in the non-radial representation.\n-\t* Width of the lines connecting parent with son taxa: set the line thickness of the child-parent connection both in radial and non-radial representation.\n-\t* The alpha value for the transparency of clade labeling: the alpha value is responsible for the transparency of the differential clade highlighting. Since the transparency is additive, the alpha value should not be higher than 1/s where s is the number of levels with differential clades. \n-\t* Ration of the horizontal space to be given to the right side: in case the label legend requires more space (because of long labels) you may increase the right panel increasing this value.\n-\t* Whether to write the labels with the class color or in black: set whether the clade names inside the cladogram will be displayed with the class color or in black.\n-\t* Background color: whether to generate plots with black or white backgrounds, adjusting properly the other colors.\n-\t* Set the space between clades at the lower level: set the separation between low-level taxa belonging to different super-clades. Depending on the density of the leaf-level this parameter is automatically adjusted.\n-\n-\n-------\n-\n-**Examples**\n-\n-The dataset provided here_ and described in the "Introduction" module produces the following image (alpha values of LEfSe - step B - are set to 0.01)\n-\n-\n-.. _here: http://www.huttenhower.org/webfm_send/73 \n-\n-Focusing the cladogram on the Firmicutes phylum only and playing a bit with the graphical options, we can obtain the following plot:\n-\n- \n-\n- </help>\n-</tool>\n'

diff -r e7cd19afda2e -r db64b6287cd6 plot_features.py
--- a/plot_features.py Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,153 +0,0 @@
-#!/usr/bin/env python
-
-import os,sys,matplotlib,zipfile,argparse,string
-matplotlib.use('Agg')
-from pylab import *
-from lefse import *
-import random as rand
-
-colors = ['r','g','b','m','c']
-
-def read_params(args):
- parser = argparse.ArgumentParser(description='Cladoplot')
- parser.add_argument('input_file_1', metavar='INPUT_FILE', type=str, help="dataset files")
- parser.add_argument('input_file_2', metavar='INPUT_FILE', type=str, help="LEfSe output file")
- parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str, help="the file for the output (the zip file if an archive is required, the output directory otherwise)")
- parser.add_argument('--width',dest="width", type=float, default=10.0 )
- parser.add_argument('--height',dest="height", type=float, default=4.0)
- parser.add_argument('--top',dest="top", type=float, default=-1.0, help="set maximum y limit (-1.0 means automatic limit)")
- parser.add_argument('--bot',dest="bot", type=float, default=0.0, help="set minimum y limit (default 0.0, -1.0 means automatic limit)")
- parser.add_argument('--title_font_size',dest="title_font_size", type=str, default="14")
- parser.add_argument('--class_font_size',dest="class_font_size", type=str, default="14")
- parser.add_argument('--class_label_pos',dest="class_label_pos", type=str, choices=["up","down"], default="up")
- parser.add_argument('--subcl_mean',dest="subcl_mean", type=str, choices=["y","n"], default="y")
- parser.add_argument('--subcl_median',dest="subcl_median", type=str, choices=["y","n"], default="y")
- parser.add_argument('--font_size',dest="font_size", type=str, default="10")
- parser.add_argument('-n',dest="unused", metavar="flt", type=float, default=-1.0,help="unused")
- parser.add_argument('--format', dest="format", default="png", choices=["png","pdf","svg"], type=str, help="the format for the output file")
- parser.add_argument('-f', dest="f", default="diff", choices=["all","diff","one"], type=str, help="wheter to plot all features (all), only those differentially abundant according to LEfSe or only one (the one given with --feature_name) ")
- parser.add_argument('--feature_name', dest="feature_name", default="", type=str, help="The name of the feature to plot (levels separated by .) ")
- parser.add_argument('--feature_num', dest="feature_num", default="-1", type=int, help="The number of the feature to plot ")
- parser.add_argument('--archive', dest="archive", default="none", choices=["zip","none"], type=str, help="")
- parser.add_argument('--background_color',dest="back_color", type=str, choices=["k","w"], default="w", help="set the color of the background")
- parser.add_argument('--dpi',dest="dpi", type=int, default=72)
-
- args = parser.parse_args()
-
- return vars(args)
-
-def read_data(file_data,file_feats,params):
- with open(file_feats, 'r') as features:
- feats_to_plot = [(f.split()[:-1],len(f.split()) == 5) for f in features.readlines()]
- if not feats_to_plot:
- print "No features to plot\n"
- sys.exit(0)
- feats,cls,class_sl,subclass_sl,class_hierarchy,params['norm_v'] = load_data(file_data, True)
- if params['feature_num'] > 0:
- params['feature_name'] = [line.split()[0] for line in open(params['input_file_2'])][params['feature_num']-1]
- features = {}
- for f in feats_to_plot:
- if params['f'] == "diff" and not f[1]: continue
- if params['f'] == "one" and f[0][0] != params['feature_name']: continue
- features[f[0][0]] = {'dim':float(f[0][1]), 'abundances':feats[f[0][0]], 'sig':f[1], 'cls':cls, 'class_sl':class_sl, 'subclass_sl':subclass_sl, 'class_hierarchy':class_hierarchy}
- if not features:
-                print "No features to plot\n"
-                sys.exit(0)
- return features
-
-def plot(name,k_n,feat,params):
- fig = plt.figure(figsize=(params['width'], params['height']),edgecolor=params['fore_color'],facecolor=params['back_color'])
- ax = fig.add_subplot(111,axis_bgcolor=params['back_color'])
- subplots_adjust(bottom=0.15)
-
- max_m = 0.0
- norm = 1.0 if float(params['norm_v']) < 0.0 else float(params['norm_v'])
- for v in feat['subclass_sl'].values():
- fr,to  = v[0], v[1]
- median = numpy.mean(feat['abundances'][fr:to])
- if median > max_m: max_m = median
- max_m /= norm
- max_v = max_m*3 if max_m*3 < max(feat['abundances'])*1.1/norm else max(feat['abundances'])/norm
- min_v = max(0.0,min(feat['abundances'])*0.9/norm)
-
- if params['top'] > 0.0: max_v = params['top']
- if params['bot'] >= 0.0: min_v = params['bot']
-
- if max_v == 0.0: max_v = 0.0001
- if max_v == min_v: max_v = min_v*1.1
-
- cl_sep = max(int(sum([vv[1]/norm - vv[0]/norm for vv in feat['class_sl'].values()])/150.0),1)
- seps = []
- xtics = []
- x2tics = []
- last_fr = 0.0
- for i,cl in enumerate(sorted(feat['class_hierarchy'].keys())):
- for j,subcl in enumerate(feat['class_hierarchy'][cl]):
- fr = feat['subclass_sl'][subcl][0]
- to = feat['subclass_sl'][subcl][1]
- val = feat['abundances'][fr:to]
- fr += cl_sep*i
- to += cl_sep*i
- pos = arange(fr,to)
- max_x = to
- col = colors[j%len(colors)]
- vv = [v1/norm for v1 in val]
- median = numpy.median(vv)
- mean = numpy.mean(vv)
- valv = [max(min(v/norm,max_v),min_v) for v in val]
- ax.bar(pos,valv, align='center', color=col, edgecolor=col, linewidth=0.1 )
- if params['subcl_median'] == 'y': ax.plot([fr,to-1],[median,median],"k--",linewidth=1,color=params['fore_color'])
- if params['subcl_mean'] == 'y': ax.plot([fr,to-1],[mean,mean],"-",linewidth=1,color=params['fore_color'])
- nna = subcl if subcl.count("_") == 0 or not subcl.startswith(cl) else "_".join(subcl.split(cl)[1:])
- if nna == "subcl" or nna == "_subcl": nna = " "
- xtics.append(((fr+(to-fr)/2),nna))
- seps.append(float(to))
- x2tics.append(((last_fr+(to-last_fr)/2),cl))
- last_fr = to + float(cl_sep)
- for s in seps[:-1]:
- ax.plot([s,s],[min_v,max_v],"-",linewidth=5,color=params['fore_color'])
- ax.set_title(k_n, size=params['title_font_size'])
- xticks([x[0] for x in xtics],[x[1] for x in xtics],rotation=-30, ha = 'left', fontsize=params['font_size'], color=params['fore_color'])
- yticks(fontsize=params['font_size'])
-
- ylabel('Relative abundance')
- ax.set_ylim((min_v,max_v))
- a,b = ax.get_xlim()
- ax.set_xlim((0-float(last_fr)/float(b-a),max_x))
- ax.yaxis.grid(True)
-
- def get_col_attr(x):
-                return hasattr(x, 'set_color') and not hasattr(x, 'set_facecolor')
- def get_edgecol_attr(x):
-                return hasattr(x, 'set_edgecolor')
-
-
- for o in fig.findobj(get_col_attr):
-                o.set_color(params['fore_color'])
- for o in fig.findobj(get_edgecol_attr):
- if o.get_edgecolor() == params['back_color']:
-                 o.set_edgecolor(params['fore_color'])
-
- for t in x2tics:
- m = ax.get_ylim()[1]*0.97 if params['class_label_pos']=='up' else 0.07
- plt.text(t[0],m, "class: "+t[1], ha ="center", size=params['class_font_size'], va="top", bbox = dict(boxstyle="round", ec='k', fc='y'))
-
-
- plt.savefig(name,format=params['format'],facecolor=params['back_color'],edgecolor=params['fore_color'],dpi=params['dpi'])
- plt.close()
- return name
-
-if __name__ == '__main__':
- params = read_params(sys.argv)
- params['fore_color'] = 'w' if params['back_color'] == 'k' else 'k'
- features = read_data(params['input_file_1'],params['input_file_2'],params)
- if params['archive'] == "zip": file = zipfile.ZipFile(params['output_file'], "w")
- for k,f in features.items():
- print "Exporting ", k
- if params['archive'] == "zip":
- of = plot("/tmp/"+str(int(f['sig']))+"_"+"-".join(k.split("."))+"."+params['format'],k,f,params)
- file.write(of, os.path.basename(of), zipfile.ZIP_DEFLATED)
- else:
- if params['f'] == 'one': plot(params['output_file'],k,f,params)
- else: plot(params['output_file']+str(int(f['sig']))+"_"+"-".join(k.split("."))+"."+params['format'],k,f,params)
- if params['archive'] == "zip": file.close()

diff -r e7cd19afda2e -r db64b6287cd6 plot_features.xml
--- a/plot_features.xml Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

b'@@ -1,159 +0,0 @@\n-<tool id="LEfSe_fea" name="F) Plot Differential Features" version="1.0">\n- <description></description>\n-<command interpreter="python">plot_features.py $inp_data $inp_res $arch --title_font_size $graphical.title_font_size --background_color $graphical.background_color --class_label_pos $graphical.class_label_pos --class_font_size $graphical.class_font_size --top $graphical.top --bot $graphical.bot --font_size $graphical.font_size --width $graphical.width --height $graphical.height -f $feature_set --archive "zip" --format $for --dpi $dpi --subcl_mean $graphical.subcl_mean --subcl_median $graphical.subcl_median </command> \n- <inputs>\n- <page>\n-\t<param format="lefse" name="inp_data" type="data" label="The formatted datasets" help=""/>\n-\t<param format="lefse_res" name="inp_res" type="data" label="The LEfSe output" help=""/>\n-\n-\t<param name="feature_set" type="select" label="Do you want to plot all features or only those detected as biomarkers?">\n-\t\t<option value="diff" selected="diff">Biomarkers only</option>\n-\t\t<option value="all">All</option>\n-\t</param> \n-\n-\n-\t<conditional name="graphical">\n-\t<param name="graphical_choice" type="select" label="Set some graphical options to personalize the output">\n- <option value="d" selected=\'True\'>Default</option>\n- <option value="a">Advanced</option>\n-\t</param>\n- \t<when value="d">\n-\t\t<param name="top" type="hidden" value="-1.0" />\n-\t\t<param name="bot" type="hidden" value="0.0" />\n- <param name="title_font_size" type="hidden" value="14" />\n- <param name="class_font_size" type="hidden" value="12" />\n- <param name="font_size" type="hidden" value="8" />\n- <param name="width" type="hidden" value="7.0" />\n- <param name="height" type="hidden" value="4.0" />\n- <param name="background_color" type="hidden" value="w" />\n-\t\t<param name="width" type="hidden" value="7.0" />\n- <param name="height" type="hidden" value="4.0" />\n-\t\t<param name="class_label_pos" type="hidden" value="up" />\n-\t\t<param name="subcl_mean" type="hidden" value="y" />\n-\t\t<param name="subcl_median" type="hidden" value="y" />\n-\t</when>\n-\n-\t<when value="a">\n-\t\t<param name="top" type="float" size="2" value="-1.0" label="Set the maximum y value (-1.0 means automatic maximum setting based on maximum class median)"/>\n-\t\t<param name="bot" type="float" size="2" value="0.0" label="Set the minimum y value (-1.0 means automatic minimum setting based on minimum class median)"/>\n-\t\t<param name="title_font_size" type="integer" size="2" value="14" label="Title font size"/>\n-\t <param name="class_font_size" type="integer" size="2" value="12" label="Class font size"/>\n-\t <param name="font_size" type="integer" size="2" value="8" label="Size of subclasses names and y values"/>\n-\t <param name="width" type="float" size="2" value="7.0" label="Width of the plot"/>\n-\t <param name="height" type="float" size="2" value="4.0" label="Height of the plot"/>\n-\t\t<param name="background_color" type="select" label="Background color">\n-\t\t\t<option value="w" selected=\'True\'>White</option>\n- <option value="k">Black</option>\n- </param>\n-\t\t<param name="class_label_pos" type="select" label="Class label position">\n-\t\t\t<option value="up" selected=\'True\'>Top</option>\n- <option value="down">Bottom</option>\n- </param>\n-\t\t<param name="subcl_mean" type="select" label="Whether to plot the subclass means (straight line)">\n- <option value="y" selected=\'True\'>Yes</option>\n- <option value="n">No</option>\n- </param>\n-\t\t<param name="subcl_median" type="select" label="Whether to plot the subclass medians (dotted line)">\n- <option value="y" selected=\'True\'>Yes</option>\n- <option value="n">No</option>\n- '..b'df</option>\n- </param>\n- <param name="dpi" type="select" label="Set the dpi resolution of the output">\n- <option value="72">72</option>\n- <option value="150" selected="True">150</option>\n- <option value="300">300</option>\n- <option value="600">600</option>\n- <option value="1200">1200</option>\n- </param>\n-\n-\t</page>\n- </inputs>\n- <outputs>\n- <data format="zip" name="arch" >\n- </data>\n- </outputs>\n- <tests>\n- <test>\n- <param name="input1" value="13.bed" dbkey="hg18" ftype="bed"/>\n- <param name="maf_source" value="cached"/>\n- <param name="maf_identifier" value="17_WAY_MULTIZ_hg18"/>\n- <param name="species" value="hg18,mm8"/>\n- <param name="overwrite_with_gaps" value="True"/>\n- <output name="out_file1" file="interval_maf_to_merged_fasta_out3.fasta" />\n- </test>\n- <test>\n- <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>\n- <param name="maf_source" value="cached"/>\n- <param name="maf_identifier" value="8_WAY_MULTIZ_hg17"/>\n- <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>\n- <param name="overwrite_with_gaps" value="True"/>\n- <output name="out_file1" file="interval_maf_to_merged_fasta_out.dat" />\n- </test>\n- <test>\n- <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>\n- <param name="maf_source" value="user"/>\n- <param name="maf_file" value="5.maf"/>\n- <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>\n- <param name="overwrite_with_gaps" value="True"/>\n- <output name="out_file1" file="interval_maf_to_merged_fasta_user_out.dat" />\n- </test>\n- </tests>\n- <help>\n-**What it does**\n-\n-This module plots the raw data of the features detected by LEfSe as biomarkers as abundance histograms\n-with class and subclass information. The features are exported as images and\n-the user can download all the images in a .zip archive. It is also possible to\n-export all the features (instead of the biomarkers only). For exporting or\n-analyzing few features only the "Plot One Feature" module is recommended. \n-\n-------\n-\n-**Input format**\n-\n-The module accepts two datasets: the data formatted with the "Format Input for\n-LEfSe" module and the output of the LEfSe analysis. Both datasets are necessary\n-to run the module.\n-\n-------\n-\n-**Output format**\n-\n-The module generates zip archives containing images in png, svg or pdf format. \n-\n-------\n-\n-**Advanced parameter settings**\n- \n-*Graphical options*\n- * Set the maximum y value: -1 means automatic parameter setting that is computed as the minimum between the highest abundance value and three times the highest subclass median.\n- * Set the minimum y value: -1 means automatic parameter setting that is computed as the maximum between 0 and the 90% of the smallest abundance value.\n- * Title font size: set the font size of the title only.\n- * Class font size: set the font of the legend for the class names and colors.\n- * Size of subclasses names and y values: set the font size for the axis labels.\n- * Width of the plot: horizontal size (in inches) of the plot.\n- * Height of the plot: vertical size (in inches) of the plot.\n- * Background color: whether to generate plots with black or white backgrounds, adjusting properly the other colors.\n- * Class label position: whether to place the class labels on the top or on the bottom of the plot.\n- * Plot subclass means (straight line): whether to plot the subclass means with straight horizontal lines.\n- * Plot subclass medians (dotted line): whether to plot the subclass medians with dotted horizontal lines.\n-\n-------\n-\n-**Examples**\n-\n-Please see the examples reported for the "Plot One Feature" module (E). This\n-module just produces multiple plots in the same way and compresses them into a\n-.zip archive. \n-\n- </help>\n-</tool>\n'

diff -r e7cd19afda2e -r db64b6287cd6 plot_res.py
--- a/plot_res.py Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,153 +0,0 @@
-#!/usr/bin/env python
-
-import os,sys
-import matplotlib
-matplotlib.use('Agg')
-from pylab import *
-
-from lefse import *
-import argparse
-
-colors = ['r','g','b','m','c','y','k','w']
-
-def read_params(args):
- parser = argparse.ArgumentParser(description='Plot results')
- parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="tab delimited input file")
- parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str, help="the file for the output image")
- parser.add_argument('--feature_font_size', dest="feature_font_size", type=int, default=7, help="the file for the output image")
- parser.add_argument('--format', dest="format", choices=["png","svg","pdf"], default='png', type=str, help="the format for the output file")
- parser.add_argument('--dpi',dest="dpi", type=int, default=72)
- parser.add_argument('--title',dest="title", type=str, default="")
- parser.add_argument('--title_font_size',dest="title_font_size", type=str, default="12")
- parser.add_argument('--class_legend_font_size',dest="class_legend_font_size", type=str, default="10")
- parser.add_argument('--width',dest="width", type=float, default=7.0 )
- parser.add_argument('--height',dest="height", type=float, default=4.0, help="only for vertical histograms")
- parser.add_argument('--left_space',dest="ls", type=float, default=0.2 )
- parser.add_argument('--right_space',dest="rs", type=float, default=0.1 )
- parser.add_argument('--orientation',dest="orientation", type=str, choices=["h","v"], default="h" )
- parser.add_argument('--autoscale',dest="autoscale", type=int, choices=[0,1], default=1 )
- parser.add_argument('--background_color',dest="back_color", type=str, choices=["k","w"], default="w", help="set the color of the background")
- parser.add_argument('--subclades', dest="n_scl", type=int, default=1, help="number of label levels to be dislayed (starting from the leaves, -1 means all the levels, 1 is default )")
- parser.add_argument('--max_feature_len', dest="max_feature_len", type=int, default=60, help="Maximum length of feature strings (def 60)")
- parser.add_argument('--all_feats', dest="all_feats", type=str, default="")
- args = parser.parse_args()
- return vars(args)
-
-def read_data(input_file,output_file):
- with open(input_file, 'r') as inp:
- rows = [line.strip().split()[:-1] for line in inp.readlines() if len(line.strip().split())>3]
- classes = list(set([v[2] for v in rows if len(v)>2]))
- if len(classes) < 1:
- print "No differentially abundant features found in "+input_file
- os.system("touch "+output_file)
- sys.exit()
- data = {}
- data['rows'] = rows
- data['cls'] = classes
- return data
-
-def plot_histo_hor(path,params,data,bcl):
- cls2 = []
- if params['all_feats'] != "":
- cls2 = sorted(params['all_feats'].split(":"))
- cls = sorted(data['cls'])
- if bcl: data['rows'].sort(key=lambda ab: fabs(float(ab[3]))*(cls.index(ab[2])*2-1))
- else:
- mmax = max([fabs(float(a)) for a in zip(*data['rows'])[3]])
- data['rows'].sort(key=lambda ab: fabs(float(ab[3]))/mmax+(cls.index(ab[2])+1))
- pos = arange(len(data['rows']))
- head = 0.75
- tail = 0.5
- ht = head + tail
- ints = max(len(pos)*0.2,1.5)
- fig = plt.figure(figsize=(params['width'], ints + ht), edgecolor=params['back_color'],facecolor=params['back_color'])
-        ax = fig.add_subplot(111,frame_on=False,axis_bgcolor=params['back_color'])
- ls, rs = params['ls'], 1.0-params['rs']
- plt.subplots_adjust(left=ls,right=rs,top=1-head*(1.0-ints/(ints+ht)), bottom=tail*(1.0-ints/(ints+ht)))
-
- fig.canvas.set_window_title('LDA results')
-
- l_align = {'horizontalalignment':'left', 'verticalalignment':'baseline'}
-        r_align = {'horizontalalignment':'right', 'verticalalignment':'baseline'}
- added = []
- m = 1 if data['rows'][0][2] == cls[0] else -1
- for i,v in enumerate(data['rows']):
- indcl = cls.index(v[2])
- lab = str(v[2]) if str(v[2]) not in added else ""
- added.append(str(v[2]))
- col = colors[indcl%len(colors)]
- if len(cls2) > 0:
- col = colors[cls2.index(v[2])%len(colors)]
- vv = fabs(float(v[3])) * (m*(indcl*2-1)) if bcl else fabs(float(v[3]))
- ax.barh(pos[i],vv, align='center', color=col, label=lab, height=0.8, edgecolor=params['fore_color'])
- mv = max([abs(float(v[3])) for v in data['rows']])
- for i,r in enumerate(data['rows']):
- indcl = cls.index(data['rows'][i][2])
- if params['n_scl'] < 0: rr = r[0]
- else: rr = r[0].split(".")[-min(r[0].count("."),params['n_scl'])]
- if len(rr) > params['max_feature_len']: rr = rr[:params['max_feature_len']/2-2]+" [..]"+rr[-params['max_feature_len']/2+2:]
-                if m*(indcl*2-1) < 0 and bcl: ax.text(mv/40.0,float(i)-0.3,rr, l_align, size=params['feature_font_size'],color=params['fore_color'])
-                else: ax.text(-mv/40.0,float(i)-0.3,rr, r_align, size=params['feature_font_size'],color=params['fore_color'])
- ax.set_title(params['title'],size=params['title_font_size'],y=1.0+head*(1.0-ints/(ints+ht))*0.8,color=params['fore_color'])
-
- ax.set_yticks([])
- ax.set_xlabel("LDA SCORE (log 10)")
- ax.xaxis.grid(True)
- xlim = ax.get_xlim()
- if params['autoscale']:
- ran = arange(0.0001,round(round((abs(xlim[0])+abs(xlim[1]))/10,4)*100,0)/100)
- if len(ran) > 1 and len(ran) < 100:
- ax.set_xticks(arange(xlim[0],xlim[1]+0.0001,min(xlim[1]+0.0001,round(round((abs(xlim[0])+abs(xlim[1]))/10,4)*100,0)/100)))
- ax.set_ylim((pos[0]-1,pos[-1]+1))
- leg = ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=5, borderaxespad=0., frameon=False,prop={'size':params['class_legend_font_size']})
-
- def get_col_attr(x):
-                return hasattr(x, 'set_color') and not hasattr(x, 'set_facecolor')
- for o in leg.findobj(get_col_attr):
-                o.set_color(params['fore_color'])
- for o in ax.findobj(get_col_attr):
-                o.set_color(params['fore_color'])
-
-
- plt.savefig(path,format=params['format'],facecolor=params['back_color'],edgecolor=params['fore_color'],dpi=params['dpi'])
- plt.close()
-
-def plot_histo_ver(path,params,data):
- cls = data['cls']
- mmax = max([fabs(float(a)) for a in zip(*data['rows'])[1]])
- data['rows'].sort(key=lambda ab: fabs(float(ab[3]))/mmax+(cls.index(ab[2])+1))
- pos = arange(len(data['rows']))
- if params['n_scl'] < 0: nam = [d[0] for d in data['rows']]
- else: nam = [d[0].split(".")[-min(d[0].count("."),params['n_scl'])] for d in data['rows']]
- fig = plt.figure(edgecolor=params['back_color'],facecolor=params['back_color'],figsize=(params['width'], params['height']))
- ax = fig.add_subplot(111,axis_bgcolor=params['back_color'])
- plt.subplots_adjust(top=0.9, left=params['ls'], right=params['rs'], bottom=0.3)
- fig.canvas.set_window_title('LDA results')
- l_align = {'horizontalalignment':'left', 'verticalalignment':'baseline'}
- r_align = {'horizontalalignment':'right', 'verticalalignment':'baseline'}
- added = []
- for i,v in enumerate(data['rows']):
- indcl = data['cls'].index(v[2])
- lab = str(v[2]) if str(v[2]) not in added else ""
- added.append(str(v[2]))
- col = colors[indcl%len(colors)]
- vv = fabs(float(v[3]))
- ax.bar(pos[i],vv, align='center', color=col, label=lab)
- xticks(pos,nam,rotation=-20, ha = 'left',size=params['feature_font_size'])
- ax.set_title(params['title'],size=params['title_font_size'])
- ax.set_ylabel("LDA SCORE (log 10)")
- ax.yaxis.grid(True)
- a,b = ax.get_xlim()
- dx = float(len(pos))/float((b-a))
- ax.set_xlim((0-dx,max(pos)+dx))
- plt.savefig(path,format=params['format'],facecolor=params['back_color'],edgecolor=params['fore_color'],dpi=params['dpi'])
- plt.close()
-
-if __name__ == '__main__':
- params = read_params(sys.argv)
- params['fore_color'] = 'w' if params['back_color'] == 'k' else 'k'
- data = read_data(params['input_file'],params['output_file'])
- if params['orientation'] == 'v': plot_histo_ver(params['output_file'],params,data)
- else: plot_histo_hor(params['output_file'],params,data,len(data['cls']) == 2)
-
-

diff -r e7cd19afda2e -r db64b6287cd6 plot_res.xml
--- a/plot_res.xml Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

b'@@ -1,156 +0,0 @@\n-<tool id="LEfSe_res" name="C) Plot LEfSe Results" version="1.0">\n- <description></description>\n-<command interpreter="python">./plot_res.py $inp_data $res --title "$textm.title" --subclades $textm.subclades --title_font_size $textm.title_font_size --max_feature_len $textm.max_f_len --feature_font_size $textm.feature_font_size --class_legend_font_size $textm.class_legend_font_size --width $graphical.width --format $for --dpi $dpi --left_space $graphical.left_space_prop --right_space $graphical.right_space_prop --background_color $graphical.background_color</command> \n- <inputs>\n- <page>\n-\t<param format="lefse_res" name="inp_data" type="data" label="Select data" help=""/>\n-\n-\t<conditional name="textm">\n- <param name="text_choice" type="select" label="Set text and label options (font size, abbreviations, ...)" help="">\n- <option value="d" selected=\'True\'>Default</option>\n- <option value="a">Advanced</option>\n- </param>\n-\t\t<when value="d">\n-\t\t\t<param name="title" type="hidden" value=""/>\n-\t\t\t<param name="title_font_size" type="hidden" value="14"/>\n-\t\t\t<param name="feature_font_size" type="hidden" value="7" />\n-\t\t <param name="class_legend_font_size" type="hidden" value="10" />\n-\t\t\t<param name="subclades" type="hidden" value="1" />\n-\t\t\t<param name="max_f_len" type="hidden" value="60" />\n-\t\t</when>\n- <when value="a">\n-\t\t\t<param name="title" type="text" size="10" value="" label="The title of the cladogram"/>\n-\t \t<param name="subclades" type="integer" size="2" value="1" label="Number of label levels to be displayed (-1 means all)"/>\n-\t \t<param name="max_f_len" type="integer" size="2" value="60" label="Maximum length of feature names "/>\n-\t\t\t<param name="title_font_size" type="integer" size="2" value="14" label="Title font size"/>\n-\t\t\t<param name="feature_font_size" type="integer" size="2" value="7" label="Label font size"/>\n-\t \t<param name="class_legend_font_size" type="integer" size="2" value="10" label="Class font size"/>\n-\t\t</when>\n-\t</conditional>\n-\n-\t<conditional name="graphical">\n- <param name="text_choice" type="select" label="Set some graphical options to personalize the output" help="">\n- <option value="d" selected=\'True\'>Default</option>\n- <option value="a">Advanced</option>\n- </param>\n- <when value="d">\n-\t\t\t<param name="width" type="hidden" value="7.0" />\n-\t\t\t<param name="left_space_prop" type="hidden" value="0.2" />\n-\t\t\t<param name="right_space_prop" type="hidden" value="0.1" />\n-\t\t\t<param name="background_color" type="hidden" value="w"/>\n- </when>\n- <when value="a">\n-\t\t\t<param name="width" type="float" size="2" value="7.0" label="Width of the plot"/>\n-\t\t\t<param name="left_space_prop" type="float" size="2" value="0.2" label="Fraction of the total width to be reserved for the space at the left of the plot "/>\n-\t\t\t<param name="right_space_prop" type="float" size="2" value="0.1" label="Fraction of the total width to be reserved for the space at the right of the plot "/>\n-\t\t\t<param name="background_color" type="select" label="Whether to optimize the colors for a white or black background">\n- <option value="w" selected=\'True\'>White</option>\n- <option value="k">Black</option>\n- </param>\n-\t\n- </when>\n- </conditional>\n-\n-\n-\n-\t<param name="for" type="select" label="Output format">\n- <option value="png" selected="png">png</option>\n- <option value="svg">svg</option>\n- <option value="pdf">pdf</option>\n- </param>\n- <param name="dpi" type="select" label="Set the dpi resolution of the output">\n- <option value="72">72</option>\n- <option value="150" selected="True">150</option>\n- '..b'ram name="maf_source" value="cached"/>\n- <param name="maf_identifier" value="17_WAY_MULTIZ_hg18"/>\n- <param name="species" value="hg18,mm8"/>\n- <param name="overwrite_with_gaps" value="True"/>\n- <output name="out_file1" file="interval_maf_to_merged_fasta_out3.fasta" />\n- </test>\n- <test>\n- <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>\n- <param name="maf_source" value="cached"/>\n- <param name="maf_identifier" value="8_WAY_MULTIZ_hg17"/>\n- <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>\n- <param name="overwrite_with_gaps" value="True"/>\n- <output name="out_file1" file="interval_maf_to_merged_fasta_out.dat" />\n- </test>\n- <test>\n- <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>\n- <param name="maf_source" value="user"/>\n- <param name="maf_file" value="5.maf"/>\n- <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>\n- <param name="overwrite_with_gaps" value="True"/>\n- <output name="out_file1" file="interval_maf_to_merged_fasta_user_out.dat" />\n- </test>\n- </tests>\n- <help>\n-**What it does**\n-\n-This module plots the biomarkers found by LEfSe ranking them accordingly to their effect size and associating them with the class with the highest median. \n-\n-------\n-\n-**Input format**\n-\n-The module accepts the output of the LEfSe module (B) only.\n-\n-------\n-\n-**Output format**\n-\n-The module generate images in png, svg or pdf format. The png format is recommended for exploratory runs as it can be easily visualized internally in Galaxy, whereas the vectorial svg and pdf formats are recommended for the final publication-ready image to be downloaded.\n-\n-------\n-\n-**Parameters**\n-\n-In addition to the output format and the dpi resolution two sets of parameters can be tuned: text and label options for regulating the plot annotation and graphical options for personalizing the appearance of the plot. The default settings of the parameters should give satisfactory outputs in the great majority of the cases.\n-\n-**Advanced parameter settings**\n-\n-*Text and label options*\n-\t* The title of the cladogram: optional title for the plot (default is no title).\n-\t* Number of label levels to be displayed: when dealing with hierarchical features this option sets the level to be displayed in the labels (-1 means the last level only, 1 means the highest level, 2 means the first two levels and so on).\t\n-\t* Maximum length of feature names: set the length of the feature names above which the names will be abbreviated (in the middle of the string).\n-\t* Title font size: set the font size of the title only.\n-\t* Label font size: set the font size of the labels (and of the label legend) used in the cladogram to denote taxa.\n-\t* Class font size: set the font of the legend for the class names and colors.\n-*Graphical options*\n-\t* Width of the plot: set the inches for the width of the plot (the height is automatically set based on the number of biomarkers to be displayed).\n-\t* Fraction of the total width to be reserved for the space at the left of the plot: this option permits the user to enlarge the space on the left of the plot for the cases in which the feature labels are long and need more space.\n-\t* Fraction of the total width to be reserved for the space at the right of the plot: this option permits the user to enlarge the space on the right of the plot for the cases in which the feature labels are long and need more space. \n-\t* Whether to optimize the colors for a white or black background: this option permits the user to generate output plots with black backgrounds, adjusting properly the colors of the cladogram.\n-\n-\n-------\n-\n-**Example**\n-\n-The dataset provided here_ and described in the "Introduction" module produces the following image (alpha values of LEfSe - step B - are set to 0.01)\n-\n-.. image:: ../galaxy/static/images/plot_res_ex.png\n-.. _here: http://www.huttenhower.org/webfm_send/73 \n-\n-\n-\n-\n- </help>\n-</tool>\n'

diff -r e7cd19afda2e -r db64b6287cd6 plot_single_feature.xml
--- a/plot_single_feature.xml Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

b'@@ -1,151 +0,0 @@\n-<tool id="LEfSe_sfe" name="E) Plot One Feature" version="1.0">\n-<code file="format_input_selector.py"/> \n- <description></description>\n-<command interpreter="python">plot_features.py $inp_data $inp_res $arch --title_font_size $graphical.title_font_size --background_color $graphical.background_color --class_label_pos $graphical.class_label_pos --class_font_size $graphical.class_font_size --top $graphical.top --bot $graphical.bot --font_size $graphical.font_size --width $graphical.width --height $graphical.height -f one --feature_num $featd.feat --archive "none" --format $for --dpi $dpi --subcl_mean $graphical.subcl_mean --subcl_median $graphical.subcl_median </command> \n-<inputs>\n-\t<page>\n-\t<param format="lefse" name="inp_data" type="data" label="The formatted datasets" help=""/>\n-\t<param format="lefse_res" name="inp_res" type="data" label="The LEfSe output" help=""/>\n-\n-\t<param name="featd" type="data_column" data_ref="inp_res" value="1" optional="true" force_select="false" accept_default="false" /> \n-\n-\t<conditional name="featd" type="data_column" data_ref="inp_res" accept_default="true"> \n- <param name="feat_dir" type="select" data_ref="inp_res" label="Select the feature names among biomarkers or all features" help="">\n- <option value="b" selected=\'True\'>Biomarkers only</option> \n- <option value="a">All features</option> \n- </param>\n- \n- <when value="b">\n-\t\t<param name="feat" label="Select the feature to plot" data_ref="inp_res" type=\'select\' force_select="false" dynamic_options="get_row_names(inp_res,\'b\')" value="1" optional="true" accept_default="false" />\n- </when>\n- <when value="a">\n-\t\t<param name="feat" label="Select the feature to plot" data_ref="inp_res" type=\'select\' force_select="false" dynamic_options="get_row_names(inp_res,\'a\')" value="1" optional="true" accept_default="false" />\n- </when>\n-\t</conditional> \n-\n-\t<conditional name="graphical">\n-\t<param name="graphical_choice" type="select" label="Set some graphical options to personalize the output">\n- <option value="d" selected=\'True\'>Default</option>\n- <option value="a">Advanced</option>\n-\t</param>\n- \t<when value="d">\n-\t\t<param name="top" type="hidden" value="-1.0" />\n-\t\t<param name="bot" type="hidden" value="0.0" />\n- <param name="title_font_size" type="hidden" value="14" />\n- <param name="class_font_size" type="hidden" value="12" />\n- <param name="font_size" type="hidden" value="8" />\n- <param name="width" type="hidden" value="7.0" />\n- <param name="height" type="hidden" value="4.0" />\n- <param name="background_color" type="hidden" value="w" />\n-\t\t<param name="width" type="hidden" value="7.0" />\n- <param name="height" type="hidden" value="4.0" />\n-\t\t<param name="class_label_pos" type="hidden" value="up" />\n-\t\t<param name="subcl_mean" type="hidden" value="y" />\n-\t\t<param name="subcl_median" type="hidden" value="y" />\n-\t</when>\n-\n-\t<when value="a">\n-\t\t<param name="top" type="float" size="2" value="-1.0" label="Set the maximum y value (-1.0 means automatic maximum setting based on maximum class median)"/>\n-\t\t<param name="bot" type="float" size="2" value="0.0" label="Set the minimum y value (-1.0 means automatic minimum setting based on minimum class median)"/>\n-\t\t<param name="title_font_size" type="integer" size="2" value="14" label="Title font size"/>\n-\t <param name="class_font_size" type="integer" size="2" value="12" label="Class names font size"/>\n-\t <param name="font_size" type="integer" size="2" value="8" label="Size of subclasses names and y values"/>\n-\t <param name="width" type="float" size="2" value="7.0" label="Width of the plot"/>\n-\t <param name="height" type="float" size="2" value="4.0" label="Height of the plot"/>\n-\t\t<param name="background_color" type="select" label="Backgr'..b'tom</option>\n- </param>\n-\t\t<param name="subcl_mean" type="select" label="Plot subclass means (straight line)">\n- <option value="y" selected=\'True\'>Yes</option>\n- <option value="n">No</option>\n- </param>\n-\t\t<param name="subcl_median" type="select" label="Plot subclass medians (dotted line)">\n- <option value="y" selected=\'True\'>Yes</option>\n- <option value="n">No</option>\n- </param>\n-\n- </when>\n-\n- </conditional>\t\t\n-\n-\n- \t<param name="for" type="select" label="Output format">\n- <option value="png" selected="png">png</option>\n- <option value="svg">svg</option>\n- <option value="pdf">pdf</option>\n- </param>\n- <param name="dpi" type="select" label="Set the dpi resolution of the output">\n- <option value="72">72</option>\n- <option value="150" selected="True">150</option>\n- <option value="300">300</option>\n- <option value="600">600</option>\n- <option value="1200">1200</option>\n- </param>\n-\n-\t</page>\n-</inputs>\n- <outputs>\n- <data format="png" name="arch" >\n-\t<change_format> \n-\t<when input="for" value="svg" format="svg" />\n-\t</change_format>\n- </data>\n- </outputs>\n- <help>\n-**What it does**\n-\n-This module plots the raw data of a single feature as an abundance histogram with class and subclass information. You can select the feature to plot among the set of features detected by LEfSe as biomarker or among the full set of features.\n-\n-------\n-\n-**Input format**\n-\n-The module accepts two datasets: the data formatted with the "Format Input for\n-LEfSe" module and the output of the LEfSe analysis. Both datasets are necessary\n-to run the module. \n-\n-------\n-\n-**Output format**\n-\n-The module generates images in png, svg or pdf format. The png format is recommended for exploratory runs as it can be easily visualized internally in Galaxy, whereas the vectorial svg and pdf format are recommended for the final publication-ready image to be downloaded.\n-\n-------\n-\n-**Advanced parameter settings**\n-\n-*Graphical options*\n-\t* Set the maximum y value: set the maximum value on the y-axis. -1 means automatic parameter setting that is computed as the minimum between the highest abundance value and three times the highest subclass median.\n-\t* Set the minimum y value: -1 means automatic parameter setting that is computed as the maximum between 0 and the 90% of the smallest abundance value.\n-\t* Title font size: set the font size of the title only.\n-\t* Class font size: set the font of the legend for the class names and colors.\n-\t* Size of subclasses names and y values: set the fond size for the axis labels.\t\n-\t* Width of the plot: horizontal size (in inches) of the plot.\n-\t* Height of the plot: vertical size (in inches) of the plot.\n-\t* Background color: whether to generate plots with black or white backgrounds, adjusting properly the other colors.\n-\t* Class label position: whether to place the class labels on the top or on the bottom of the plot.\n-\t* Plot subclass means (straight line): whether to plot the subclass means with straight horizontal lines.\n-\t* Plot subclass medians (dotted line): whether to plot the subclass medians with dotted horizontal lines.\n-\n-------\n-\n-**Examples**\n-\n-Selecting the Clostridia clade from the biomarkers detected by LEfSe in the dataset provided here_ and described in the "Introduction", we obtain the following image:\n-\n-.. _here: http://www.huttenhower.org/webfm_send/73 \n-\n-Another example, taken from the analysis we detailed in `(Segata et. al 2011)`_ that compares the viral and bacterial microbiomes using metagenomic data from `(Dinsdale et. al 2008)`_:\n-\n-\n-\n-.. _(Segata et. al 2011): http://www.ncbi.nlm.nih.gov/pubmed/21702898\n-.. _(Dinsdale et. al 2008): http://www.ncbi.nlm.nih.gov/pubmed/18337718\n-\n- </help>\n-</tool>\n'

diff -r e7cd19afda2e -r db64b6287cd6 qiime2lefse.py
--- a/qiime2lefse.py Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,85 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-def read_params(args):
-    import argparse as ap
-    import textwrap
-
-    p = ap.ArgumentParser( description= "TBA" )
-
-    p.add_argument( '--in', metavar='INPUT_FILE', type=str,
-                    nargs='?', default=sys.stdin,
-                    help=   "the Qiime OTU table file "
-                            "[ stdin if not present ]" )
-    p.add_argument( '--md', metavar='METADATA_FILE', type=str,
-                    nargs='?', default=None,
-                    help=   "the Qiime OTU table file "
-                            "[ only OTU table without metadata if not present ]" )
-    p.add_argument( '--out', metavar='OUTPUT_FILE', type=str,
-                    nargs = '?', default=sys.stdout,
-                    help=   "the output file "
-                            "[stdout if not present]")
-
-    p.add_argument( '-c', metavar="class attribute",
-                    type=str,
-                    help =  "the attribute to use as class"   )
-    p.add_argument( '-s', metavar="subclass attribute",
-                    type=str,
-                    help =  "the attribute to use as subclass"   )
-    p.add_argument( '-u', metavar="subject attribute",
-                    type=str,
-                    help =  "the attribute to use as subject"   )
-
-
-
-    return vars(p.parse_args())
-
-
-
-def qiime2lefse(  fin, fmd, fout, all_md, sel_md ):
-    with (fin if fin==sys.stdin else open(fin)) as inpf :
-        lines = [list(ll) for ll in
-                    (zip(*[l.strip().split('\t')
-                        for l in inpf.readlines()[1:]]) ) ]
-    for i,(l1,l2) in enumerate(zip( lines[0], lines[-1] )):
-        if not l2 == 'Consensus Lineage':
-            lines[-1][i] = l2+"|"+l1
-
-    data = dict([(l[0],l[1:]) for l in lines[1:]])
-
-    md = {}
-    if fmd:
-        with open(fmd) as inpf:
-            mdlines = [l.strip().split('\t') for l in inpf.readlines()]
-
-        mdf = mdlines[0][1:]
-
-        for l in mdlines:
-            mdd = dict(zip(mdf,l[1:]))
-            md[l[0]] = mdd
-
-    selected_md = md.values()[0].keys() if md else []
-
-    if not all_md:
-        selected_md = [s for s in sel_md if s]
-
-    out_m = [   selected_md +
-                list([d.replace(";","|").replace("\"","") for d in data[ 'Consensus Lineage' ]])    ]
-    for k,v in data.items():
-        if k == 'Consensus Lineage':
-            continue
-        out_m.append( [md[k][kmd] for kmd in selected_md] + list(v) )
-
-    with (fout if fout == sys.stdout else open( fout, "w" )) as outf:
-        for l in zip(*out_m):
-            outf.write( "\t".join(l) + "\n" )
-
-if __name__ == '__main__':
-    pars = read_params( sys.argv )
-
-    qiime2lefse(   fin     = pars['in'],
-                   fmd     = pars['md'],
-                   fout    = pars['out'],
-                   all_md  = not pars['c'] and not pars['s'] and not pars['u'],
-                   sel_md  = [pars['c'],pars['s'],pars['u']])

diff -r e7cd19afda2e -r db64b6287cd6 requirements.txt
--- a/requirements.txt Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,5 +0,0 @@
-
-- R
-- R libraries: splines, stats4, survival, mvtnorm, modeltools, coin, MASS
-- python libraries: rpy2 (v. 2.1 or higher), numpy, matplotlib (v. 1.0 or higher), argparse
-

diff -r e7cd19afda2e -r db64b6287cd6 run_lefse.py
--- a/run_lefse.py Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,103 +0,0 @@
-#!/usr/bin/env python
-
-import os,sys,math,pickle
-from lefse import *
-
-def read_params(args):
-        parser = argparse.ArgumentParser(description='LEfSe 1.0')
-        parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="the input file")
-        parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str,
-                help="the output file containing the data for the visualization module")
-        parser.add_argument('-o',dest="out_text_file", metavar='str', type=str, default="",
-                help="set the file for exporting the result (only concise textual form)")
-        parser.add_argument('-a',dest="anova_alpha", metavar='float', type=float, default=0.05,
-                help="set the alpha value for the Anova test (default 0.05)")
-        parser.add_argument('-w',dest="wilcoxon_alpha", metavar='float', type=float, default=0.05,
-                help="set the alpha value for the Wilcoxon test (default 0.05)")
-        parser.add_argument('-l',dest="lda_abs_th", metavar='float', type=float, default=2.0,
-                help="set the threshold on the absolute value of the logarithmic LDA score (default 2.0)")
-        parser.add_argument('--nlogs',dest="nlogs", metavar='int', type=int, default=3,
- help="max log ingluence of LDA coeff")
-        parser.add_argument('--verbose',dest="verbose", metavar='int', choices=[0,1], type=int, default=0,
- help="verbose execution (default 0)")
-        parser.add_argument('--wilc',dest="wilc", metavar='int', choices=[0,1], type=int, default=1,
- help="wheter to perform the Wicoxon step (default 1)")
- parser.add_argument('-r',dest="rank_tec", metavar='str', choices=['lda','svm'], type=str, default='lda',
- help="select LDA or SVM for effect size (default LDA)")
- parser.add_argument('--svm_norm',dest="svm_norm", metavar='int', choices=[0,1], type=int, default=1,
- help="whether to normalize the data in [0,1] for SVM feature waiting (default 1 strongly suggested)")
-        parser.add_argument('-b',dest="n_boots", metavar='int', type=int, default=30,
-                help="set the number of bootstrap iteration for LDA (default 30)")
-        parser.add_argument('-e',dest="only_same_subcl", metavar='int', type=int, default=0,
-                help="set whether perform the wilcoxon test only among the subclasses with the same name (default 0)")
-        parser.add_argument('-c',dest="curv", metavar='int', type=int, default=0,
-                help="set whether perform the wilcoxon test ing the Curtis's approach [BETA VERSION] (default 0)")
-        parser.add_argument('-f',dest="f_boots", metavar='float', type=float, default=0.67,
-                help="set the subsampling fraction value for each bootstrap iteration (default 0.66666)")
-        parser.add_argument('-s',dest="strict", choices=[0,1,2], type=int, default=0,
-                help="set the multiple testing correction options. 0 no correction (more strict, default), 1 correction for independent comparisons, 2 correction for independent comparison")
-#       parser.add_argument('-m',dest="m_boots", type=int, default=5,
-#               help="minimum cardinality of classes in each bootstrapping iteration (default 5)")
-        parser.add_argument('--min_c',dest="min_c", metavar='int', type=int, default=10,
-                help="minimum number of samples per subclass for performing wilcoxon test (default 10)")
-        parser.add_argument('-t',dest="title", metavar='str', type=str, default="",
-                help="set the title of the analysis (default input file without extension)")
-        parser.add_argument('-y',dest="multiclass_strat", choices=[0,1], type=int, default=0,
-                help="(for multiclass tasks) set whether the test is performed in a one-against-one ( 1 - more strict!) or in a one-against-all setting ( 0 - less strict) (default 0)")
-        args = parser.parse_args()
-
-        params = vars(args)
-        if params['title'] == "": params['title'] = params['input_file'].split("/")[-1].split('.')[0]
-        return params
-
-
-
-if __name__ == '__main__':
- init()
- params = read_params(sys.argv)
- feats,cls,class_sl,subclass_sl,class_hierarchy = load_data(params['input_file'])
- kord,cls_means = get_class_means(class_sl,feats)
- wilcoxon_res = {}
- kw_n_ok = 0
- nf = 0
- for feat_name,feat_values in feats.items():
- if params['verbose']:
- print "Testing feature",str(nf),": ",feat_name,
- nf += 1
- kw_ok,pv = test_kw_r(cls,feat_values,params['anova_alpha'],sorted(cls.keys()))
- if not kw_ok:
- if params['verbose']: print "\tkw ko"
- del feats[feat_name]
- wilcoxon_res[feat_name] = "-"
- continue
- if params['verbose']: print "\tkw ok\t",
-
- if not params['wilc']: continue
- kw_n_ok += 1
- res_wilcoxon_rep = test_rep_wilcoxon_r(subclass_sl,class_hierarchy,feat_values,params['wilcoxon_alpha'],params['multiclass_strat'],params['strict'],feat_name,params['min_c'],params['only_same_subcl'],params['curv'])
- wilcoxon_res[feat_name] = str(pv) if res_wilcoxon_rep else "-"
- if not res_wilcoxon_rep:
- if params['verbose']: print "wilc ko"
- del feats[feat_name]
- elif params['verbose']: print "wilc ok\t"
-
- if len(feats) > 0:
- print "Number of significantly discriminative features:", len(feats), "(", kw_n_ok, ") before internal wilcoxon"
- if params['lda_abs_th'] < 0.0:
- lda_res,lda_res_th = dict([(k,0.0) for k,v in feats.items()]), dict([(k,v) for k,v in feats.items()])
- else:
- if params['rank_tec'] == 'lda': lda_res,lda_res_th = test_lda_r(cls,feats,class_sl,params['n_boots'],params['f_boots'],params['lda_abs_th'],0.0000000001,params['nlogs'])
- elif params['rank_tec'] == 'svm': lda_res,lda_res_th = test_svm(cls,feats,class_sl,params['n_boots'],params['f_boots'],params['lda_abs_th'],0.0,params['svm_norm'])
- else: lda_res,lda_res_th = dict([(k,0.0) for k,v in feats.items()]), dict([(k,v) for k,v in feats.items()])
- else:
- print "Number of significantly discriminative features:", len(feats), "(", kw_n_ok, ") before internal wilcoxon"
- print "No features with significant differences between the two classes"
- lda_res,lda_res_th = {},{}
- outres = {}
- outres['lda_res_th'] = lda_res_th
- outres['lda_res'] = lda_res
- outres['cls_means'] = cls_means
- outres['cls_means_kord'] = kord
- outres['wilcox_res'] = wilcoxon_res
- print "Number of discriminative features with abs LDA score >",params['lda_abs_th'],":",len(lda_res_th)
- save_res(outres,params["output_file"])

diff -r e7cd19afda2e -r db64b6287cd6 run_lefse.xml
--- a/run_lefse.xml Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,101 +0,0 @@
-<tool id="LEfSe_run" name="B) LDA Effect Size (LEfSe)" version="1.0">
-  <description></description>
-
-<command interpreter="python">run_lefse.py $inp_data $output -l $lda_th -a $kw_alpha -w $w_alpha -e $w_pol -y $multiclass -f 0.9</command>
-  <inputs>
-    <page>
- <param format="lefse" name="inp_data" type="data" label="Select data" help=""/>
- <param name="kw_alpha" type="float" size="2" value="0.05" label="Alpha value for the factorial Kruskal-Wallis test among classes"/>
- <param name="w_alpha" type="float" size="2" value="0.05" label="Alpha value for the pairwise Wilcoxon test between subclasses"/>
- <param name="lda_th" type="float" size="2" value="2.0" label="Threshold on the logarithmic LDA score for discriminative features"/>
- <param name="w_pol" type="select" label="Do you want the pairwise comparisons among subclasses to be performed only among the subclasses with the same name?" help="">
-          <option value="0" selected="0">No</option>
-          <option value="1">Yes</option>
-        </param>
-
- <param name="multiclass" type="select" label="Set the strategy for multi-class analysis" help="">
-          <option value="1" selected="True">All-against-all (more strict)</option>
-          <option value="0">One-against-all (less strict)</option>
-        </param>
-    </page>
-   </inputs>
-  <outputs>
-    <data format="lefse_res" name="output" />
-  </outputs>
-  <tests>
-    <test>
-      <param name="input1" value="13.bed" dbkey="hg18" ftype="bed"/>
-      <param name="maf_source" value="cached"/>
-      <param name="maf_identifier" value="17_WAY_MULTIZ_hg18"/>
-      <param name="species" value="hg18,mm8"/>
-      <param name="overwrite_with_gaps" value="True"/>
-      <output name="out_file1" file="interval_maf_to_merged_fasta_out3.fasta" />
-    </test>
-    <test>
-      <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>
-      <param name="maf_source" value="cached"/>
-      <param name="maf_identifier" value="8_WAY_MULTIZ_hg17"/>
-      <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>
-      <param name="overwrite_with_gaps" value="True"/>
-      <output name="out_file1" file="interval_maf_to_merged_fasta_out.dat" />
-    </test>
-    <test>
-      <param name="input1" value="1.bed" dbkey="hg17" ftype="bed"/>
-      <param name="maf_source" value="user"/>
-      <param name="maf_file" value="5.maf"/>
-      <param name="species" value="canFam1,hg17,mm5,panTro1,rn3"/>
-      <param name="overwrite_with_gaps" value="True"/>
-      <output name="out_file1" file="interval_maf_to_merged_fasta_user_out.dat" />
-    </test>
-  </tests>
-  <help>
-**What it does**
-
-Lda Effective Size (LEfSe) is a biomarker discovery and explanation tool for high-dimensional data. It couples statistical significance with biological consistency and effect size estimation. For an overview of LEfSe please refer to the "Introduction" module or to `(Segata et. al 2011)`_.
-
-The scheme and the description below illustrates how the algorithm works:
-
-.. image:: https://bytebucket.org/biobakery/galaxy_lefse/wiki/lefse_met.png
-
-Input data consist of a collection of m samples (columns) each made up of n numerical features (rows, typically normalized per-sample, red representing high values and green low). These samples are labeled with a class (taking two or more possible values) that represents the main biological hypothesis under investigation; they may also have one or more subclass labels reflecting within-class groupings.
-
-- Step 1: the Kruskall-Wallis test analyzes all features, testing whether the values in different classes are differentially distributed. Features violating the null hypothesis are further analyzed in Step 2.
-
-- Step 2: the pairwise Wilcoxon test checks whether all pairwise comparisons between subclasses within different classes significantly agree with the class level trend.
-
-- Step 3: the resulting subset of vectors is used to build a Linear Discriminant Analysis model from which the relative difference among classes is used to rank the features. The final output thus consists of a list of features that are discriminative with respect to the classes, consistent with the subclass grouping within classes, and ranked according to the effect size with which they differentiate classes.
-
-**Input format**
-
-The input for this module must be generated with the **"Format Input for LEfSe"** tool.
-
-------
-
-**Output format**
-
-The output consists of a tabular file listing all the features, the logarithm value of the highest mean among all the classes, and if the feature is discriminative, the class with the highest mean and the logarithmic LDA score.
-
-The output file can be conveniently visualized with the "Plot LEfSe Results" module and, if feature names define a hierarchy, with the "Plot Cladogram" module. The output can also be used for generating the histograms of the raw data of the discriminative features using the "Plot Differential Features" module.
-
-------
-
-**Parameters**
-
-The input parameters are the alpha-values for the factorial Kruskal-Wallis test and for the pairwise Wilcoxon test among subclasses (steps 1 and 2 in the schematic picture above) and the non-negative threshold for the logarithmic LDA score. Moreover, the user can decide the pairwise Wilcoxon test to be applied only among subclasses in different classes with the same name (less stringent) and select the multi-class strategy to be the All-against-all (more stringent) or the One-against-all (less stringent).
-
-.. _here: http://www.huttenhower.org/webfm_send/73
-.. _(Segata et. al 2011): http://www.ncbi.nlm.nih.gov/pubmed/21702898
-.. _(Garrett et. al 2010): http://www.ncbi.nlm.nih.gov/pubmed/20833380
-.. _(Veiga et. al 2010): http://www.ncbi.nlm.nih.gov/pubmed/20921388
-.. _contact us: nsegata@hsph.harvard.edu
-
-**Example**
-
-For the mouse model dataset (see the "Introduction" module) it is suggested to use alpha=0.01 as the sample size is not very large.
-
-  </help>
-</tool>

diff -r e7cd19afda2e -r db64b6287cd6 test-data/lefse_input
--- a/test-data/lefse_input Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

b'@@ -1,340 +0,0 @@\n-class\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\ttruc\trag2\trag2\trag2\trag2\trag2\trag2\trag2\trag2\trag2\trag2\n-Archaea\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Crenarchaeota\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Crenarchaeota|Thermoprotei\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Crenarchaeota|Thermoprotei|Desulfurococcales\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Crenarchaeota|Thermoprotei|Sulfolobales\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Crenarchaeota|Thermoprotei|Sulfolobales|Sulfolobaceae\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Crenarchaeota|Thermoprotei|Thermoproteales\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Crenarchaeota|Thermoprotei|Thermoproteales|Thermoproteaceae\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Crenarchaeota|Thermoprotei|Thermoproteales|Thermoproteaceae|Caldivirga\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Euryarchaeota\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Euryarchaeota|Halobacteria\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Euryarchaeota|Halobacteria|Halobacteriales\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Euryarchaeota|Halobacteria|Halobacteriales|Halobacteriaceae\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Euryarchaeota|Methanomicrobia\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Euryarchaeota|Methanopyri\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Euryarchaeota|Methanopyri|Methanopyrales\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Euryarchaeota|Methanopyri|Methanopyrales|Methanopyraceae\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Archaea|Euryarchaeota|Methanopyri|Methanopyrales|Methanopyraceae|Methanopyrus\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria\t99.9276759884\t99.9902190923\t99.9595436524\t100.0\t99.9808135073\t99.9103827935\t99.910001125\t99.9718309859\t100.0\t99.9383730485\t99.8693095186\t99.9816681943\t100.0\t100.0\t99.869303825\t99.9558985667\t99.9800876145\t99.7702539299\t99.9909033021\t100.0\t99.9226978452\t99.9476896251\t99.9853436905\t99.9903241413\t100.0\t98.5993721323\t99.9886169607\t99.940284247\t100.0\t99.9723871324\n-Bacteria|Acidobacteria\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Actinobacteria\t0.0723240115718\t0.0880281690141\t0.121369042803\t0.232045480914\t0.134305448964\t0.307258993727\t0.044999437507\t0.0422535211268\t0.773313115997\t0.0410846343468\t0.130690481377\t0.733272227314\t1.31532172058\t0.882723833544\t0.653480874793\t0.8710033076'..b'oteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae|Plesiomonas\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae|Rahnella\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae|Trabulsiella\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Oceanospirillales\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Oceanospirillales|Hahellaceae\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Oceanospirillales|Hahellaceae|Halospina\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Oceanospirillales|Oceanospirillaceae\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Oceanospirillales|Oceanospirillaceae|Oceanobacter\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Pasteurellales\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Pasteurellales|Pasteurellaceae\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Pseudomonadales\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0384073742158\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0108459869848\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Pseudomonadales|Moraxellaceae\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0128024580719\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Pseudomonadales|Moraxellaceae|Acinetobacter\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0128024580719\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Pseudomonadales|Pseudomonadaceae\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0256049161439\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0108459869848\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Pseudomonadales|Pseudomonadaceae|Pseudomonas\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Thiotrichales\t0.0482160077146\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Thiotrichales|Thiotrichaceae\t0.0482160077146\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|Proteobacteria|Gammaproteobacteria|Thiotrichales|Thiotrichaceae|Thiothrix\t0.0482160077146\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n-Bacteria|TM7\t0.0\t0.00978090766823\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0252206809584\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0113830392715\t0.0\t0.0\t0.0\n-Bacteria|Verrucomicrobia\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n'

diff -r e7cd19afda2e -r db64b6287cd6 test-data/lefse_output_a
--- a/test-data/lefse_output_a Tue May 13 21:57:00 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

b"@@ -1,11553 +0,0 @@\n-(dp0\n-S'class_sl'\n-p1\n-(dp2\n-S'rag2'\n-p3\n-(I0\n-I10\n-tp4\n-sS'truc'\n-p5\n-(I10\n-I30\n-tp6\n-ssS'class_hierarchy'\n-p7\n-(dp8\n-g3\n-(lp9\n-S'rag2_subcl'\n-p10\n-asg5\n-(lp11\n-S'truc_subcl'\n-p12\n-assS'feats'\n-p13\n-(dp14\n-S'Bacteria.Actinobacteria.Actinobacteria.Coriobacteriales.Coriobacteriaceae'\n-p15\n-(lp16\n-F552.4098881370206\n-aF1403.08679094\n-aF1195.0286806852373\n-aF3073.770491803074\n-aF734.7538574575511\n-aF2378.4748030299997\n-aF1064.4474550016819\n-aF879.5074758127307\n-aF1046.755059315813\n-aF1160.42935885933\n-aF1626.89804772\n-aF1182.6783115031262\n-aF2787.5409041296684\n-aF3087.0344552912557\n-aF7831.458195452485\n-aF5758.157389638737\n-aF3656.99873897\n-aF4858.3955445\n-aF3850.3850385039605\n-aF872.4100327155379\n-aF411.0996916756056\n-aF6216.83093252\n-aF281.7695125387165\n-aF337.79979732034064\n-aF2178.370066630947\n-aF1055.4596046800034\n-aF1740.34110686\n-aF1133.2361988057178\n-aF586.9118654018354\n-aF482.5090470451561\n-asS'Bacteria.Firmicutes.Bacilli.Lactobacillales.Streptococcaceae'\n-p17\n-(lp18\n-F1104.8197762720406\n-aF1002.20485067\n-aF1075.525812617714\n-aF796.9034608381303\n-aF0.0\n-aF743.273375948\n-aF1354.7513063621402\n-aF1465.8457930262184\n-aF348.91835310560447\n-aF386.8097862877777\n-aF1952.2776572700002\n-aF1182.6783115031262\n-aF484.7897224575951\n-aF896.2358095993966\n-aF1544.2311934697857\n-aF2617.344268014877\n-aF1513.2408575\n-aF2014.45668918\n-aF733.4066740004686\n-aF3925.845147221923\n-aF616.6495375124077\n-aF2122.82031842\n-aF422.654268808575\n-aF1463.7991217218096\n-aF2434.6488980004706\n-aF4989.445403948199\n-aF5685.1142824\n-aF1133.2361988057178\n-aF1565.0983077408946\n-aF7961.3992762360685\n-asS'Bacteria.Firmicutes.Clostridia.Clostridiales.Eubacteriaceae'\n-p19\n-(lp20\n-F0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-asS'Archaea.Euryarchaeota.Methanopyri.Methanopyrales'\n-p21\n-(lp22\n-F0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-asS'Bacteria.Firmicutes.Bacilli.Bacillales.Bacillaceae.Jeotgalibacillus'\n-p23\n-(lp24\n-F0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-asS'Bacteria.Actinobacteria.Actinobacteria.Actinomycetales.Micromonosporaceae'\n-p25\n-(lp26\n-F0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-asS'Bacteria.Firmicutes.Bacilli.Lactobacillales.Lactobacillaceae.Paralactobacillus'\n-p27\n-(lp28\n-F0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-asS'Bacteria.Firmicutes.Clostridia.Clostridiales.Lachnospiraceae.Anaerostipes'\n-p29\n-(lp30\n-F138.102472033755\n-aF200.440970134\n-aF119.50286806852372\n-aF455.3734061927887\n-aF734.7538574575511\n-aF0.0\n-aF0.0\n-aF732.9228965111089\n-aF174.45917655230195\n-aF386.8097862877777\n-aF325.379609544\n-aF90.9752547306866\n-aF605.987153072495\n-aF1095.3993228459296\n-aF882.4178248404493\n-aF0.0\n-aF126.103404792\n-aF236.99490461\n-aF183.35166850061725\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF1125.9993244044717\n-aF128.1394156837612\n-aF0.0\n-aF0.0\n-aF80.94544277159405\n-aF0.0\n-aF0.0\n-asS'Archaea.Crenarchaeota'\n-p31\n-(lp32\n-F0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-asS'Bacteria.Bacteroidetes.Sphingobacteria'\n-p33\n-(lp34\n-F0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0."..b"aF1000000.0\n-aF1000000.0000000001\n-aF1000000.0000000001\n-aF1000000.0\n-aF1000000.0\n-aF999999.9999999999\n-aF1000000.0\n-aF999999.9999999999\n-aF1000000.0\n-aF999999.9999999999\n-aF1000000.0\n-aF1000000.0\n-aF1000000.0\n-aF999999.9999999999\n-aF1000000.0\n-aF1000000.0\n-aF1000000.0\n-aF1000000.0000000001\n-aF999999.9999999999\n-aF1000000.0\n-aF1000000.0\n-aF1000000.0000000001\n-aF1000000.0\n-aF1000000.0\n-aF1000000.0\n-aF1000000.0\n-aF1000000.0\n-aF1000000.0000000001\n-asS'Bacteria.Bacteroidetes.Bacteroidia.Bacteroidales.Porphyromonadaceae'\n-p695\n-(lp696\n-F21129.678221271544\n-aF104830.62737999999\n-aF24617.5908221299\n-aF144125.68306014413\n-aF15429.831006617704\n-aF214360.04162300003\n-aF66479.58196241505\n-aF67575.49105831565\n-aF113921.84228879426\n-aF198820.23015208586\n-aF40563.9913232\n-aF39847.16157200993\n-aF6787.056114408335\n-aF32065.325632351745\n-aF48091.771453810485\n-aF41005.06019893645\n-aF18663.3039092\n-aF23936.4853656\n-aF32086.541987533004\n-aF5888.767720832885\n-aF205.5498458378028\n-aF21683.093252500003\n-aF1831.5018314991567\n-aF4503.997297597869\n-aF39210.661199417096\n-aF12185.76089042368\n-aF17171.365587700002\n-aF8337.380605468485\n-aF586.9118654018354\n-aF13510.253317276378\n-asS'Bacteria.Actinobacteria'\n-p697\n-(lp698\n-F828.6148322055308\n-aF134495.89096\n-aF4899.617590818478\n-aF142873.4061929762\n-aF27920.646583388974\n-aF144195.034934\n-aF25159.667118239755\n-aF103048.9592494041\n-aF18492.672714626053\n-aF29590.94865103301\n-aF1735.3579175700002\n-aF1728.529839887646\n-aF2787.5409041296684\n-aF3087.0344552912557\n-aF8713.876020290933\n-aF6543.360670042199\n-aF8827.238335439999\n-aF13153.217205800001\n-aF7334.066740004686\n-aF1308.615049077312\n-aF411.0996916756056\n-aF7733.13115997\n-aF422.654268808575\n-aF450.3997297597869\n-aF3075.345976424281\n-aF1343.3122241418232\n-aF2320.45480914\n-aF1214.181641575411\n-aF880.3677981027531\n-aF723.7635705667334\n-asS'Bacteria.Proteobacteria.Betaproteobacteria.Burkholderiales.Comamonadaceae.Verminephrobacter'\n-p699\n-(lp700\n-F0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-asS'Bacteria.Cyanobacteria.Cyanobacteria'\n-p701\n-(lp702\n-F0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-asS'Bacteria.Proteobacteria.Deltaproteobacteria.Desulfobacterales.Desulfobacteraceae'\n-p703\n-(lp704\n-F0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-aF0.0\n-assS'subclass_sl'\n-p705\n-(dp706\n-g10\n-(I0\n-I10\n-tp707\n-sg12\n-(I10\n-I30\n-tp708\n-ssS'norm'\n-p709\n-F1000000.0\n-sS'cls'\n-p710\n-(dp711\n-S'class'\n-p712\n-(S'rag2'\n-p713\n-S'rag2'\n-p714\n-S'rag2'\n-p715\n-S'rag2'\n-p716\n-S'rag2'\n-p717\n-S'rag2'\n-p718\n-S'rag2'\n-p719\n-S'rag2'\n-p720\n-S'rag2'\n-p721\n-g3\n-S'truc'\n-p722\n-S'truc'\n-p723\n-S'truc'\n-p724\n-S'truc'\n-p725\n-S'truc'\n-p726\n-S'truc'\n-p727\n-S'truc'\n-p728\n-S'truc'\n-p729\n-S'truc'\n-p730\n-S'truc'\n-p731\n-S'truc'\n-p732\n-S'truc'\n-p733\n-S'truc'\n-p734\n-S'truc'\n-p735\n-S'truc'\n-p736\n-S'truc'\n-p737\n-S'truc'\n-p738\n-S'truc'\n-p739\n-S'truc'\n-p740\n-g5\n-tp741\n-sS'subclass'\n-p742\n-(lp743\n-S'rag2_subcl'\n-p744\n-aS'rag2_subcl'\n-p745\n-aS'rag2_subcl'\n-p746\n-aS'rag2_subcl'\n-p747\n-aS'rag2_subcl'\n-p748\n-aS'rag2_subcl'\n-p749\n-aS'rag2_subcl'\n-p750\n-aS'rag2_subcl'\n-p751\n-aS'rag2_subcl'\n-p752\n-ag10\n-aS'truc_subcl'\n-p753\n-aS'truc_subcl'\n-p754\n-aS'truc_subcl'\n-p755\n-aS'truc_subcl'\n-p756\n-aS'truc_subcl'\n-p757\n-aS'truc_subcl'\n-p758\n-aS'truc_subcl'\n-p759\n-aS'truc_subcl'\n-p760\n-aS'truc_subcl'\n-p761\n-aS'truc_subcl'\n-p762\n-aS'truc_subcl'\n-p763\n-aS'truc_subcl'\n-p764\n-aS'truc_subcl'\n-p765\n-aS'truc_subcl'\n-p766\n-aS'truc_subcl'\n-p767\n-aS'truc_subcl'\n-p768\n-aS'truc_subcl'\n-p769\n-aS'truc_subcl'\n-p770\n-aS'truc_subcl'\n-p771\n-ag12\n-ass.\n\\ No newline at end of file\n"