Galaxy |

Changeset 0:00912e6e974f (2016-04-24)

Next changeset 1:085b26768dae (2016-04-24)

Commit message:
metaphlan2 Huttenhower Lab: Initial upload

added:
metaphlan2_hutlab/metaphlan2.py
metaphlan2_hutlab/metaphlan2.xml
metaphlan2_hutlab/tool_dependencies.xml

diff -r 000000000000 -r 00912e6e974f metaphlan2_hutlab/metaphlan2.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaphlan2_hutlab/metaphlan2.py Sun Apr 24 13:19:46 2016 -0400

[

b'@@ -0,0 +1,1283 @@\n+#!/usr/bin/env python\n+\n+from __future__ import with_statement \n+\n+# ==============================================================================\n+# MetaPhlAn v2.x: METAgenomic PHyLogenetic ANalysis for taxonomic classification\n+# of metagenomic data\n+#\n+# Authors: Nicola Segata (nicola.segata@unitn.it), \n+# Duy Tin Truong (duytin.truong@unitn.it)\n+#\n+# Please type "./metaphlan2.py -h" for usage help\n+#\n+# ==============================================================================\n+\n+__author__ = \'Nicola Segata (nicola.segata@unitn.it), Duy Tin Truong (duytin.truong@unitn.it)\'\n+__version__ = \'2.5.0\'\n+__date__ = \'28 April 2015\'\n+\n+\n+import sys\n+import os\n+import stat\n+import re\n+from binascii import b2a_uu \n+\n+try:\n+ import numpy as np \n+except ImportError:\n+ sys.stderr.write("Error! numpy python library not detected!!\\n")\n+ sys.exit(1)\n+import tempfile as tf\n+import argparse as ap\n+import subprocess as subp\n+import multiprocessing as mp\n+from collections import defaultdict as defdict\n+import bz2 \n+import itertools\n+from distutils.version import LooseVersion\n+try:\n+ import cPickle as pickle\n+except:\n+ import pickle\n+\n+import cStringIO\n+\n+#*************************************************************\n+#* Imports related to biom file generation *\n+#*************************************************************\n+try:\n+ import biom\n+ import biom.table\n+ import numpy as np\n+except ImportError:\n+ sys.stderr.write("Warning! Biom python library not detected!"\n+ "\\n Exporting to biom format will not work!\\n")\n+try:\n+ import json\n+except ImportError:\n+ sys.stderr.write("Warning! json python library not detected!"\n+ "\\n Exporting to biom format will not work!\\n")\n+\n+# This set contains the markers that after careful validation are found to have low precision or recall\n+# We esclude the markers here to avoid generating a new marker DB when changing just few markers\n+markers_to_exclude = \\\n+ set([\n+ \'NC_001782.1\',\'GeneID:17099689\',\'gi|419819595|ref|NZ_AJRE01000517.1|:1-118\',\n+ \'GeneID:10498696\', \'GeneID:10498710\', \'GeneID:10498726\', \'GeneID:10498735\',\n+ \'GeneID:10498757\', \'GeneID:10498760\', \'GeneID:10498761\', \'GeneID:10498763\',\n+ \'GeneID:11294465\', \'GeneID:14181982\', \'GeneID:14182132\', \'GeneID:14182146\',\n+ \'GeneID:14182148\', \'GeneID:14182328\', \'GeneID:14182639\', \'GeneID:14182647\',\n+ \'GeneID:14182650\', \'GeneID:14182663\', \'GeneID:14182683\', \'GeneID:14182684\',\n+ \'GeneID:14182691\', \'GeneID:14182803\', \'GeneID:14296322\', \'GeneID:1489077\',\n+ \'GeneID:1489080\', \'GeneID:1489081\', \'GeneID:1489084\', \'GeneID:1489085\',\n+ \'GeneID:1489088\', \'GeneID:1489089\', \'GeneID:1489090\', \'GeneID:1489528\',\n+ \'GeneID:1489530\', \'GeneID:1489531\', \'GeneID:1489735\', \'GeneID:1491873\',\n+ \'GeneID:1491889\', \'GeneID:1491962\', \'GeneID:1491963\', \'GeneID:1491964\',\n+ \'GeneID:1491965\', \'GeneID:17099689\', \'GeneID:1724732\', \'GeneID:17494231\',\n+ \'GeneID:2546403\', \'GeneID:2703374\', \'GeneID:2703375\', \'GeneID:2703498\',\n+ \'GeneID:2703531\', \'GeneID:2772983\', \'GeneID:2772989\', \'GeneID:2772991\',\n+ \'GeneID:2772993\', \'GeneID:2772995\', \'GeneID:2773037\', \'GeneID:2777387\',\n+ \'GeneID:2777399\', \'GeneID:2777400\', \'GeneID:2777439\', \'GeneID:2777493\',\n+ \'GeneID:2777494\', \'GeneID:3077424\', \'GeneID:3160801\', \'GeneID:3197323\',\n+ \'GeneID:3197355\', \'GeneID:3197400\', \'GeneID:3197428\', \'GeneID:3783722\',\n+ \'GeneID:3783750\', \'GeneID:3953004\', \'GeneID:3959334\', \'GeneID:3964368\',\n+ \'GeneID:3964370\', \'GeneID:4961452\', \'GeneID:5075645\', \'GeneID:5075646\',\n+ \'GeneID:5075647\', \'GeneID:5075648\', \'GeneID:5075649\', \'GeneID:5075650\',\n+ \'GeneID:5075651\', \'GeneID:5075652\', \'GeneID:5075653\', \'GeneID:5075654\',\n+ \'GeneID:5075655\', \'GeneID:5075656\', \'GeneID:5075657\', \'GeneID:5075658\',\n+ \'GeneID:5075659\', \'GeneID:5075660\', \''..b'[0].count("|"))) ): \n+ outf.write( "\\t".join( [k,str(v)] ) + "\\n" ) \n+ else:\n+ outf.write( "unclassified\\t100.0\\n" )\n+ maybe_generate_biom_file(pars, outpred)\n+ elif pars[\'t\'] == \'rel_ab_w_read_stats\':\n+ cl2ab, rr = tree.relative_abundances( \n+ pars[\'tax_lev\']+"__" if pars[\'tax_lev\'] != \'a\' else None )\n+ outpred = [(k,round(v*100.0,5)) for k,v in cl2ab.items() if v > 0.0]\n+ totl = 0\n+ if outpred:\n+ outf.write( "\\t".join( [ "#clade_name",\n+ "relative_abundance",\n+ "coverage",\n+ "average_genome_length_in_the_clade",\n+ "estimated_number_of_reads_from_the_clade" ]) +"\\n" )\n+\n+ for k,v in sorted( outpred, reverse=True,\n+ key=lambda x:x[1]+(100.0*(8-x[0].count("|"))) ): \n+ outf.write( "\\t".join( [ k,\n+ str(v),\n+ str(rr[k][0]) if k in rr else "-",\n+ str(rr[k][1]) if k in rr else "-",\n+ str(int(round(rr[k][2],0)) if k in rr else "-") \n+ ] ) + "\\n" ) \n+ if "|" not in k:\n+ totl += (int(round(rr[k][2],0)) if k in rr else 0)\n+\n+ outf.write( "#estimated total number of reads from known clades: " + str(totl)+"\\n")\n+ else:\n+ outf.write( "unclassified\\t100.0\\n" )\n+ maybe_generate_biom_file(pars, outpred)\n+\n+ elif pars[\'t\'] == \'clade_profiles\':\n+ cl2pr = tree.clade_profiles( pars[\'tax_lev\']+"__" if pars[\'tax_lev\'] != \'a\' else None )\n+ for c,p in cl2pr.items():\n+ mn,n = zip(*p)\n+ outf.write( "\\t".join( [""]+[str(s) for s in mn] ) + "\\n" )\n+ outf.write( "\\t".join( [c]+[str(s) for s in n] ) + "\\n" )\n+ elif pars[\'t\'] == \'marker_ab_table\':\n+ cl2pr = tree.clade_profiles( pars[\'tax_lev\']+"__" if pars[\'tax_lev\'] != \'a\' else None )\n+ for v in cl2pr.values():\n+ outf.write( "\\n".join(["\\t".join([str(a),str(b/float(pars[\'nreads\'])) if pars[\'nreads\'] else str(b)]) \n+ for a,b in v if b > 0.0]) + "\\n" )\n+ elif pars[\'t\'] == \'marker_pres_table\':\n+ cl2pr = tree.clade_profiles( pars[\'tax_lev\']+"__" if pars[\'tax_lev\'] != \'a\' else None )\n+ for v in cl2pr.values():\n+ strout = ["\\t".join([str(a),"1"]) for a,b in v if b > pars[\'pres_th\']]\n+ if strout:\n+ outf.write( "\\n".join(strout) + "\\n" )\n+\n+ elif pars[\'t\'] == \'marker_counts\':\n+ outf.write( "\\n".join( ["\\t".join([m,str(c)]) for m,c in tree.markers2counts().items() ]) +"\\n" )\n+\n+ elif pars[\'t\'] == \'clade_specific_strain_tracker\':\n+ cl2pr = tree.clade_profiles( None, get_all = True )\n+ cl2ab, _ = tree.relative_abundances( None )\n+ strout = []\n+ for cl,v in cl2pr.items():\n+ if cl.endswith(pars[\'clade\']) and cl2ab[cl]*100.0 < pars[\'min_ab\']:\n+ strout = []\n+ break\n+ if pars[\'clade\'] in cl:\n+ strout += ["\\t".join([str(a),str(int(b > pars[\'pres_th\']))]) for a,b in v]\n+ if strout:\n+ strout = sorted(strout,key=lambda x:x[0])\n+ outf.write( "\\n".join(strout) + "\\n" )\n+ else:\n+ sys.stderr.write("Clade "+pars[\'clade\']+" not present at an abundance >"+str(round(pars[\'min_ab\'],2))+"%, "\n+ "so no clade specific markers are reported\\n")\n'

diff -r 000000000000 -r 00912e6e974f metaphlan2_hutlab/metaphlan2.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaphlan2_hutlab/metaphlan2.xml Sun Apr 24 13:19:46 2016 -0400

[

b'@@ -0,0 +1,210 @@\n+<tool id="metaphlan2_hutlab" name="MetaPhlAn2" version="2.5.0">\n+ <requirements>\n+ <requirement type="package" version="2.5.0">metaphlan2_hutlab</requirement>\n+ <requirement type="package">bowtie2</requirement>\n+ </requirements>\n+\n+\t<description>metagenomic profiler V2</description>\n+\t<command interpreter="python">metaphlan2.py $input \n+\t\t--mpa_pkl \\${METAPHLAN2_PATH}/db_v20/mpa_v20_m200.pkl \n+\t\t--bowtie2db \\${METAPHLAN2_PATH}//db_v20/mpa_v20_m200 \n+\t\t--input_type fastq\n+\t\t--no_map \n+\t\t--bt2_ps $PresetsForBowtie2 \n+\t\t#if $str($gchoice_post_mapping.global_choice_post_mapping) == "1":\n+\t\t\t--tax_lev $gchoice_post_mapping.Taxonomic_Level\n+\t\t\t--min_cu_len $gchoice_post_mapping.min_cu_len\n+\n+\t\t\t#if $str($gchoice_post_mapping.Ignore_Viruses) == "1":\n+\t\t\t\t--ignore_viruses\n+\t\t\t#end if\t\t\n+\t\t\t#if $str($gchoice_post_mapping.Ignore_Eukaryotes) == "1":\n+\t\t\t\t--ignore_eukaryotes \n+\t\t\t#end if\t\n+\t\t\t#if $str($gchoice_post_mapping.Ignore_Bacteria) == "1":\n+\t\t\t\t--ignore_bacteria\n+\t\t\t#end if\t\n+\t\t\t#if $str($gchoice_post_mapping.Ignore_Archaea) == "1":\n+\t\t\t\t--ignore_archaea\n+\t\t\t#end if\t\n+\n+\t\t\t--stat_q $gchoice_post_mapping.stat_q\n+\t\t#end if\n+ \n+\t\t#if $str($gchoice_additional_analysis_types.global_additional_analysis_types) == "1":\n+ \t\t\t-t $gchoice_additional_analysis_types.Analysis_Type \n+\t\t\t#if int($gchoice_additional_analysis_types.nreads) > 0:\n+\t\t\t\t--nreads $gchoice_additional_analysis_types.nreads\n+\t\t\t#end if\t\n+\t\t\t\n+\t\t\t#if int($gchoice_additional_analysis_types.pres_th) > 0:\n+\t\t\t\t--pres_th $gchoice_additional_analysis_types.pres_th\n+\t\t\t#end if\t\n+\t\t\t\n+\t\t\t#if $str($gchoice_additional_analysis_types.clade) != " ":\n+\t\t\t\t--clade $gchoice_additional_analysis_types.clade \n+\t\t\t\t#if int($gchoice_additional_analysis_types.min_ab) > 0:\n+\t\t\t\t\t\t --min_ab $gchoice_additional_analysis_types.min_ab \n+\t\t\t\t#end if\t\t\t\t\n+\t\t\t#end if\t\t\t\t\n+\t\t\t\n+\t\t#end if\n+ \t\t-o $output \n+\t\t\n+\t\t\t\n+\t\t#if $str($gchoice_biom.global_biom) == "1":\n+\t\t\t--biom $biom_output\n+\t\t\t--mdelim $gchoice_biom.MetadataDelimiterChar\n+\t\t#end if\t\n+\t</command>\n+\t<inputs>\n+\t\t<param format="fastq" name="input" type="data" label="Input metagenome (fastq of metagenomic reads, loaded with the Get Data module )"></param>\n+\n+\t\t<param name="PresetsForBowtie2" type="select" format="text" >\n+\t\t\t<label>Sensitivity options for read-marker similarity (as described by BowTie2)</label>\n+\t\t\t\t<option value="very-sensitive">Very Sensitive</option>\n+\t\t\t\t<option value="sensitive">Sensitive</option>\n+\t\t\t\t<option value="very-sensitive-local">Very Sensitive Local</option>\n+\t\t\t\t<option value="sensitive-local">Sensitive Local</option>\n+\n+\t\t</param>\t\n+\t\t\n+\t\t\n+\t\t\n+\t<conditional name="gchoice_post_mapping">\n+ \t<param name="global_choice_post_mapping" type="select" label="Display Post Mapping Advanced Parameters" multiple="False" help="Select Post Mapping advanced choices ">\n+ \t\t<option value="0" selected=\'True\'>No</option>\n+\t\t\t\t<option value="1">Yes</option>\n+ \t</param>\n+ \t<when value="0">\n+ \t\t\t\t<param name="min_cu_len" type="hidden" value="" />\n+\t\t\t</when>\n+\n+ \t<when value="1">\n+\t\t\t\t<param name="Taxonomic_Level" type="select" value="a" format="text" >\n+\t\t\t\t\t<label>Taxonomic Level</label>\n+\t\t\t\t\t\t<option value="a">All taxonomic levels</option>\n+\t\t\t\t\t\t<option value="k">Kingdoms (Bacteria and Archaea) only</option>\n+\t\t\t\t\t\t<option value="p">Phyla only</option>\n+\t\t\t\t\t\t<option value="o">Orders only</option>\n+\t\t\t\t\t\t<option value="f">Families only</option>\n+\t\t\t\t\t\t<option value="g">Genera only</option>\n+\t\t\t\t\t\t<option value="s">Species only</option>\n+\t\t\t\t</param>\n+\t\t\t\t<param name="min_cu_len" type="integer" size="4" value="2000" label="Minimum total nucleotide length for the markers in a clade "/>\n+\n+\t\t\t\t<param name="Ignore_Viruses" type="select" label="Profile viral organisms" value="0" >\n+\t\t\t\t\t\t<option value="0">Yes</option>\n+\t\t\t\t\t\t<option value="1">No</option>\n+\t\t\t\t</param>\n+\t\t\t\t\n+\t\t\t\t<param name="Ignore_Eukaryotes" type="select" l'..b'>No</option>\n+\t\t\t\t<option value="1">Yes</option>\n+ \t</param>\n+ \t<when value="0">\n+ \t\t\t\t<param name="Analysis_Type" type="hidden" value="" />\n+\t\t\t</when>\n+\n+ \t<when value="1">\n+\t\t\t\t<param name="Analysis_Type" type="select" value="rel_ab" format="text" >\n+\t\t\t\t\t<label>Analysis Type: Type of Analysis to perform</label>\n+\t\t\t\t\t\t<option value="rel_ab">Profiling a metagenomes in terms of relative abundances</option>\n+\t\t\t\t\t\t<option value="reads_map">Mapping from reads to clades (only reads hitting a marker)</option>\n+\t\t\t\t\t\t<option value="clade_profiles">Normalized marker counts for clades with at least a non-null marker</option>\n+\t\t\t\t\t\t<option value="marker_ab_table">normalized marker counts (only when > 0.0 and normalized by metagenome size if --nreads is specified)</option>\n+\t\t\t\t\t\t<option value="marker_pres_table">list of markers present in the sample (threshold at 1.0 if not differently specified with --pres_th</option>\n+\t\t\t\t</param>\n+\t\t\t\t\n+\t\t\t\t<param name="nreads" type="integer" size="4" value="0" label="The total number of reads in the original metagenome "/>\n+\t\t\t\t<param name="pres_th" type="integer" size="4" value="0" label="Threshold for calling a marker present"/>\n+\t\t\t\t<param name="clade" label="The clade for clade specific strain tracker analysis" value=" " type="text" format="text"/>\n+\t\t\t\t<param name="min_ab" type="integer" size="4" value="0" min="0" max="100" label="The minimum percentage abundance for the clade in the clade specific strain tracker analysis"/>\t\n+\t\n+\t\t\t</when>\n+ </conditional>\n+\t\t\n+\t\t\n+\t\t\n+\t\t\n+\t\t<conditional name="gchoice_biom">\n+ \t<param name="global_biom" type="select" label="Display additional biom advanced parameters" help="Select additional biom choices ">\n+ \t\t<option value="0" selected=\'True\'>No</option>\n+\t\t\t\t<option value="1">Yes</option>\n+ \t</param>\n+ \t<when value="0">\n+\t\t\t</when>\n+\n+ \t<when value="1">\n+\t\t\t\t<param name="MetadataDelimiterChar" label="Delimiter for bug metadata if biom file requested - defaults to pipe. e.g. the pipe in k__Bacteria|p__Proteobacteria" value="|" type="text" format="text"/>\n+\t\t\t</when>\n+ </conditional>\n+ \n+\n+\t</inputs>\n+\t<outputs>\n+\t\t<data name="output" format="tabular" />\n+ <data name="biom_output" format="text">\n+ <filter>gchoice_biom["global_biom"] == "1"</filter>\n+ </data>\n+\t \n+\n+\t</outputs>\n+ \n+ <help>\n+**MetaPhlAn** is a computational tool for profiling the composition of microbial communities *(Bacteria, Archaea, Eukaryotes and Viruses)* from metagenomic shotgun sequencing data with species level resolution. \n+\n+From version 2.0 MetaPhlAn is also able to identify specific strains (in the not-so-frequent cases in which the sample contains a previously sequenced strains) and to track strains across samples for all species.\n+\n+MetaPhlAn 2.0 relies on ~1M unique clade-specific marker genes identified from ~17,000 reference genomes (~13,500 bacterial and archaeal, ~3,500 viral, and ~110 eukaryotic), allowing:\n+\n+* Unambiguous taxonomic assignments \n+\n+* Accurate estimation of organismal relative abundance \n+\n+* Species-level resolution for bacteria, archaea, eukaryotes and viruses \n+\n+* Strain identification and tracking\n+\n+* Orders of magnitude speedups compared to existing methods.\n+\n+EXAMPLE\n+-------\n+\n+Here is an infographic of the application of the Human Microbiome Project results obtained applying MetaPhlAn on the 690 shotgun sequencing samples. The image has been produced with GraPhlAn.\n+\n+\n+.. image:: https://bytebucket.org/nsegata/metaphlan/wiki/hmptree13_nl_bb.png\n+ :height: 500\n+ :width: 600\n+\n+\n+\n+If you use this software, please cite :\n+=======================================\n+\n+**Metagenomic microbial community profiling using unique clade-specific marker genes.** Nicola Segata, Levi Waldron, Annalisa Ballarini, Vagheesh Narasimhan, Olivier Jousson, Curtis Huttenhower. Nature Methods, 8, 811\xe2\x80\x93814, 2012\n+\n+ </help>\n+</tool>\n'

diff -r 000000000000 -r 00912e6e974f metaphlan2_hutlab/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaphlan2_hutlab/tool_dependencies.xml Sun Apr 24 13:19:46 2016 -0400

@@ -0,0 +1,34 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="metaphlan2_hutlab" version="2.5.0">
+        <install version="1.0">
+            <actions>
+                <action type="shell_command">hg clone https://bitbucket.org/biobakery/metaphlan2</action>
+                <action type="move_directory_files">
+                    <source_directory>.</source_directory>
+                    <destination_directory>$INSTALL_DIR</destination_directory>
+                </action>
+                <action type="set_environment">
+                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR</environment_variable>
+                </action>
+                <action type="set_environment">
+                    <environment_variable name="METAPHLAN2_PATH" action="set_to">$INSTALL_DIR</environment_variable>
+                </action>
+
+            </actions>
+        </install>
+        <readme>
+These links provide information for the metaphlan2 package:
+http://huttenhower.sph.harvard.edu/metaphlan2
+https://groups.google.com/forum/#!forum/metaphlan-users
+        </readme>
+    </package>
+    <package name="numpy" version="1.7">
+        <repository changeset_revision="0c288abd2a1e" name="package_numpy_1_7" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="bowtie2" version="2.1.0">
+        <repository changeset_revision="017a00c265f1" name="package_bowtie2_2_1_0" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>
+
+