Repository 'metaphlan2_hutlab'
hg clone https://toolshed.g2.bx.psu.edu/repos/george-weingart/metaphlan2_hutlab

Changeset 0:00912e6e974f (2016-04-24)
Next changeset 1:085b26768dae (2016-04-24)
Commit message:
metaphlan2 Huttenhower Lab: Initial upload
added:
metaphlan2_hutlab/metaphlan2.py
metaphlan2_hutlab/metaphlan2.xml
metaphlan2_hutlab/tool_dependencies.xml
b
diff -r 000000000000 -r 00912e6e974f metaphlan2_hutlab/metaphlan2.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaphlan2_hutlab/metaphlan2.py Sun Apr 24 13:19:46 2016 -0400
[
b'@@ -0,0 +1,1283 @@\n+#!/usr/bin/env python\n+\n+from __future__ import with_statement \n+\n+# ==============================================================================\n+# MetaPhlAn v2.x: METAgenomic PHyLogenetic ANalysis for taxonomic classification\n+#                 of metagenomic data\n+#\n+# Authors: Nicola Segata (nicola.segata@unitn.it), \n+#          Duy Tin Truong (duytin.truong@unitn.it)\n+#\n+# Please type "./metaphlan2.py -h" for usage help\n+#\n+# ==============================================================================\n+\n+__author__ = \'Nicola Segata (nicola.segata@unitn.it), Duy Tin Truong (duytin.truong@unitn.it)\'\n+__version__ = \'2.5.0\'\n+__date__ = \'28 April 2015\'\n+\n+\n+import sys\n+import os\n+import stat\n+import re\n+from binascii import b2a_uu \n+\n+try:\n+    import numpy as np \n+except ImportError:\n+    sys.stderr.write("Error! numpy python library not detected!!\\n")\n+    sys.exit(1)\n+import tempfile as tf\n+import argparse as ap\n+import subprocess as subp\n+import multiprocessing as mp\n+from collections import defaultdict as defdict\n+import bz2 \n+import itertools\n+from distutils.version import LooseVersion\n+try:\n+    import cPickle as pickle\n+except:\n+    import pickle\n+\n+import cStringIO\n+\n+#*************************************************************\n+#*  Imports related to biom file generation                  *\n+#*************************************************************\n+try:\n+    import biom\n+    import biom.table\n+    import numpy as np\n+except ImportError:\n+    sys.stderr.write("Warning! Biom python library not detected!"\n+                     "\\n Exporting to biom format will not work!\\n")\n+try:\n+    import json\n+except ImportError:\n+    sys.stderr.write("Warning! json python library not detected!"\n+                     "\\n Exporting to biom format will not work!\\n")\n+\n+# This set contains the markers that after careful validation are found to have low precision or recall\n+# We esclude the markers here to avoid generating a new marker DB when changing just few markers\n+markers_to_exclude = \\\n+    set([\n+        \'NC_001782.1\',\'GeneID:17099689\',\'gi|419819595|ref|NZ_AJRE01000517.1|:1-118\',\n+        \'GeneID:10498696\', \'GeneID:10498710\', \'GeneID:10498726\', \'GeneID:10498735\',\n+        \'GeneID:10498757\', \'GeneID:10498760\', \'GeneID:10498761\', \'GeneID:10498763\',\n+        \'GeneID:11294465\', \'GeneID:14181982\', \'GeneID:14182132\', \'GeneID:14182146\',\n+        \'GeneID:14182148\', \'GeneID:14182328\', \'GeneID:14182639\', \'GeneID:14182647\',\n+        \'GeneID:14182650\', \'GeneID:14182663\', \'GeneID:14182683\', \'GeneID:14182684\',\n+        \'GeneID:14182691\', \'GeneID:14182803\', \'GeneID:14296322\', \'GeneID:1489077\',\n+        \'GeneID:1489080\', \'GeneID:1489081\', \'GeneID:1489084\', \'GeneID:1489085\',\n+        \'GeneID:1489088\', \'GeneID:1489089\', \'GeneID:1489090\', \'GeneID:1489528\',\n+        \'GeneID:1489530\', \'GeneID:1489531\', \'GeneID:1489735\', \'GeneID:1491873\',\n+        \'GeneID:1491889\', \'GeneID:1491962\', \'GeneID:1491963\', \'GeneID:1491964\',\n+        \'GeneID:1491965\', \'GeneID:17099689\', \'GeneID:1724732\', \'GeneID:17494231\',\n+        \'GeneID:2546403\', \'GeneID:2703374\', \'GeneID:2703375\', \'GeneID:2703498\',\n+        \'GeneID:2703531\', \'GeneID:2772983\', \'GeneID:2772989\', \'GeneID:2772991\',\n+        \'GeneID:2772993\', \'GeneID:2772995\', \'GeneID:2773037\', \'GeneID:2777387\',\n+        \'GeneID:2777399\', \'GeneID:2777400\', \'GeneID:2777439\', \'GeneID:2777493\',\n+        \'GeneID:2777494\', \'GeneID:3077424\', \'GeneID:3160801\', \'GeneID:3197323\',\n+        \'GeneID:3197355\', \'GeneID:3197400\', \'GeneID:3197428\', \'GeneID:3783722\',\n+        \'GeneID:3783750\', \'GeneID:3953004\', \'GeneID:3959334\', \'GeneID:3964368\',\n+        \'GeneID:3964370\', \'GeneID:4961452\', \'GeneID:5075645\', \'GeneID:5075646\',\n+        \'GeneID:5075647\', \'GeneID:5075648\', \'GeneID:5075649\', \'GeneID:5075650\',\n+        \'GeneID:5075651\', \'GeneID:5075652\', \'GeneID:5075653\', \'GeneID:5075654\',\n+        \'GeneID:5075655\', \'GeneID:5075656\', \'GeneID:5075657\', \'GeneID:5075658\',\n+        \'GeneID:5075659\', \'GeneID:5075660\', \''..b'[0].count("|")))  ): \n+                    outf.write( "\\t".join( [k,str(v)] ) + "\\n" )   \n+            else:\n+                outf.write( "unclassified\\t100.0\\n" )\n+            maybe_generate_biom_file(pars, outpred)\n+        elif pars[\'t\'] == \'rel_ab_w_read_stats\':\n+            cl2ab, rr = tree.relative_abundances( \n+                        pars[\'tax_lev\']+"__" if pars[\'tax_lev\'] != \'a\' else None )\n+            outpred = [(k,round(v*100.0,5)) for k,v in cl2ab.items() if v > 0.0]\n+            totl = 0\n+            if outpred:\n+                outf.write( "\\t".join( [    "#clade_name",\n+                                            "relative_abundance",\n+                                            "coverage",\n+                                            "average_genome_length_in_the_clade",\n+                                            "estimated_number_of_reads_from_the_clade" ]) +"\\n" )\n+\n+                for k,v in sorted(  outpred, reverse=True,\n+                                    key=lambda x:x[1]+(100.0*(8-x[0].count("|")))  ): \n+                    outf.write( "\\t".join( [    k,\n+                                                str(v),\n+                                                str(rr[k][0]) if k in rr else "-",\n+                                                str(rr[k][1]) if k in rr else "-",\n+                                                str(int(round(rr[k][2],0)) if k in rr else "-")   \n+                                                ] ) + "\\n" )   \n+                    if "|" not in k:\n+                        totl += (int(round(rr[k][2],0)) if k in rr else 0)\n+\n+                outf.write( "#estimated total number of reads from known clades: " + str(totl)+"\\n")\n+            else:\n+                outf.write( "unclassified\\t100.0\\n" )\n+            maybe_generate_biom_file(pars, outpred)\n+\n+        elif pars[\'t\'] == \'clade_profiles\':\n+            cl2pr = tree.clade_profiles( pars[\'tax_lev\']+"__" if pars[\'tax_lev\'] != \'a\' else None  )\n+            for c,p in cl2pr.items():\n+                mn,n = zip(*p)\n+                outf.write( "\\t".join( [""]+[str(s) for s in mn] ) + "\\n" )\n+                outf.write( "\\t".join( [c]+[str(s) for s in n] ) + "\\n" )\n+        elif pars[\'t\'] == \'marker_ab_table\':\n+            cl2pr = tree.clade_profiles( pars[\'tax_lev\']+"__" if pars[\'tax_lev\'] != \'a\' else None  )\n+            for v in cl2pr.values():\n+                outf.write( "\\n".join(["\\t".join([str(a),str(b/float(pars[\'nreads\'])) if pars[\'nreads\'] else str(b)]) \n+                                for a,b in v if b > 0.0]) + "\\n" )\n+        elif pars[\'t\'] == \'marker_pres_table\':\n+            cl2pr = tree.clade_profiles( pars[\'tax_lev\']+"__" if pars[\'tax_lev\'] != \'a\' else None  )\n+            for v in cl2pr.values():\n+                strout = ["\\t".join([str(a),"1"]) for a,b in v if b > pars[\'pres_th\']]\n+                if strout:\n+                    outf.write( "\\n".join(strout) + "\\n" )\n+\n+        elif pars[\'t\'] == \'marker_counts\':\n+            outf.write( "\\n".join( ["\\t".join([m,str(c)]) for m,c in tree.markers2counts().items() ]) +"\\n" )\n+\n+        elif pars[\'t\'] == \'clade_specific_strain_tracker\':\n+            cl2pr = tree.clade_profiles( None, get_all = True  )\n+            cl2ab, _ = tree.relative_abundances( None )\n+            strout = []\n+            for cl,v in cl2pr.items():\n+                if cl.endswith(pars[\'clade\']) and cl2ab[cl]*100.0 < pars[\'min_ab\']:\n+                    strout = []\n+                    break\n+                if pars[\'clade\'] in cl:\n+                    strout += ["\\t".join([str(a),str(int(b > pars[\'pres_th\']))]) for a,b in v]\n+            if strout:\n+                strout = sorted(strout,key=lambda x:x[0])\n+                outf.write( "\\n".join(strout) + "\\n" )\n+            else:\n+                sys.stderr.write("Clade "+pars[\'clade\']+" not present at an abundance >"+str(round(pars[\'min_ab\'],2))+"%, "\n+                                 "so no clade specific markers are reported\\n")\n'
b
diff -r 000000000000 -r 00912e6e974f metaphlan2_hutlab/metaphlan2.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaphlan2_hutlab/metaphlan2.xml Sun Apr 24 13:19:46 2016 -0400
[
b'@@ -0,0 +1,210 @@\n+<tool id="metaphlan2_hutlab" name="MetaPhlAn2" version="2.5.0">\n+  <requirements>\n+       <requirement type="package" version="2.5.0">metaphlan2_hutlab</requirement>\n+       <requirement type="package">bowtie2</requirement>\n+   </requirements>\n+\n+\t<description>metagenomic profiler V2</description>\n+\t<command interpreter="python">metaphlan2.py $input \n+\t\t--mpa_pkl  \\${METAPHLAN2_PATH}/db_v20/mpa_v20_m200.pkl    \n+\t\t--bowtie2db \\${METAPHLAN2_PATH}//db_v20/mpa_v20_m200 \n+\t\t--input_type  fastq\n+\t\t--no_map \n+\t\t--bt2_ps $PresetsForBowtie2  \n+\t\t#if $str($gchoice_post_mapping.global_choice_post_mapping)  == "1":\n+\t\t\t--tax_lev $gchoice_post_mapping.Taxonomic_Level\n+\t\t\t--min_cu_len $gchoice_post_mapping.min_cu_len\n+\n+\t\t\t#if $str($gchoice_post_mapping.Ignore_Viruses)  == "1":\n+\t\t\t\t--ignore_viruses\n+\t\t\t#end if\t\t\n+\t\t\t#if $str($gchoice_post_mapping.Ignore_Eukaryotes)  == "1":\n+\t\t\t\t--ignore_eukaryotes   \n+\t\t\t#end if\t\n+\t\t\t#if $str($gchoice_post_mapping.Ignore_Bacteria)  == "1":\n+\t\t\t\t--ignore_bacteria\n+\t\t\t#end if\t\n+\t\t\t#if $str($gchoice_post_mapping.Ignore_Archaea)  == "1":\n+\t\t\t\t--ignore_archaea\n+\t\t\t#end if\t\n+\n+\t\t\t--stat_q $gchoice_post_mapping.stat_q\n+\t\t#end if\n+ \n+\t\t#if $str($gchoice_additional_analysis_types.global_additional_analysis_types)  == "1":\n+ \t\t\t-t $gchoice_additional_analysis_types.Analysis_Type \n+\t\t\t#if  int($gchoice_additional_analysis_types.nreads) > 0:\n+\t\t\t\t--nreads $gchoice_additional_analysis_types.nreads\n+\t\t\t#end if\t\n+\t\t\t\n+\t\t\t#if  int($gchoice_additional_analysis_types.pres_th) > 0:\n+\t\t\t\t--pres_th  $gchoice_additional_analysis_types.pres_th\n+\t\t\t#end if\t\n+\t\t\t\n+\t\t\t#if $str($gchoice_additional_analysis_types.clade)  != " ":\n+\t\t\t\t--clade $gchoice_additional_analysis_types.clade \n+\t\t\t\t#if  int($gchoice_additional_analysis_types.min_ab) > 0:\n+\t\t\t\t\t\t --min_ab  $gchoice_additional_analysis_types.min_ab             \n+\t\t\t\t#end if\t\t\t\t\n+\t\t\t#end if\t\t\t\t\n+\t\t\t\n+\t\t#end if\n+ \t\t-o $output \n+\t\t\n+\t\t\t\n+\t\t#if $str($gchoice_biom.global_biom)  == "1":\n+\t\t\t--biom $biom_output\n+\t\t\t--mdelim $gchoice_biom.MetadataDelimiterChar\n+\t\t#end if\t\n+\t</command>\n+\t<inputs>\n+\t\t<param format="fastq" name="input" type="data" label="Input metagenome (fastq of metagenomic reads, loaded with the Get Data module )"></param>\n+\n+\t\t<param name="PresetsForBowtie2" type="select" format="text" >\n+\t\t\t<label>Sensitivity options for read-marker similarity (as described by BowTie2)</label>\n+\t\t\t\t<option value="very-sensitive">Very Sensitive</option>\n+\t\t\t\t<option value="sensitive">Sensitive</option>\n+\t\t\t\t<option value="very-sensitive-local">Very Sensitive  Local</option>\n+\t\t\t\t<option value="sensitive-local">Sensitive  Local</option>\n+\n+\t\t</param>\t\n+\t\t\n+\t\t\n+\t\t\n+\t<conditional name="gchoice_post_mapping">\n+        \t<param name="global_choice_post_mapping" type="select" label="Display Post Mapping Advanced Parameters"  multiple="False" help="Select Post Mapping advanced choices ">\n+        \t\t<option value="0" selected=\'True\'>No</option>\n+\t\t\t\t<option value="1">Yes</option>\n+        \t</param>\n+           \t<when value="0">\n+ \t\t\t\t<param name="min_cu_len" type="hidden"  value=""  />\n+\t\t\t</when>\n+\n+        \t<when value="1">\n+\t\t\t\t<param name="Taxonomic_Level" type="select" value="a" format="text" >\n+\t\t\t\t\t<label>Taxonomic Level</label>\n+\t\t\t\t\t\t<option value="a">All taxonomic levels</option>\n+\t\t\t\t\t\t<option value="k">Kingdoms (Bacteria and Archaea) only</option>\n+\t\t\t\t\t\t<option value="p">Phyla only</option>\n+\t\t\t\t\t\t<option value="o">Orders only</option>\n+\t\t\t\t\t\t<option value="f">Families only</option>\n+\t\t\t\t\t\t<option value="g">Genera only</option>\n+\t\t\t\t\t\t<option value="s">Species only</option>\n+\t\t\t\t</param>\n+\t\t\t\t<param name="min_cu_len" type="integer" size="4"   value="2000" label="Minimum total nucleotide length for the markers in a clade "/>\n+\n+\t\t\t\t<param name="Ignore_Viruses" type="select" label="Profile viral organisms" value="0" >\n+\t\t\t\t\t\t<option value="0">Yes</option>\n+\t\t\t\t\t\t<option value="1">No</option>\n+\t\t\t\t</param>\n+\t\t\t\t\n+\t\t\t\t<param name="Ignore_Eukaryotes" type="select" l'..b'>No</option>\n+\t\t\t\t<option value="1">Yes</option>\n+        \t</param>\n+           \t<when value="0">\n+ \t\t\t\t<param name="Analysis_Type" type="hidden"  value=""  />\n+\t\t\t</when>\n+\n+        \t<when value="1">\n+\t\t\t\t<param name="Analysis_Type" type="select" value="rel_ab" format="text" >\n+\t\t\t\t\t<label>Analysis Type: Type of Analysis to perform</label>\n+\t\t\t\t\t\t<option value="rel_ab">Profiling a metagenomes in terms of relative abundances</option>\n+\t\t\t\t\t\t<option value="reads_map">Mapping from reads to clades (only reads hitting a marker)</option>\n+\t\t\t\t\t\t<option value="clade_profiles">Normalized marker counts for clades with at least a non-null marker</option>\n+\t\t\t\t\t\t<option value="marker_ab_table">normalized marker counts (only when > 0.0 and normalized by metagenome size if --nreads is specified)</option>\n+\t\t\t\t\t\t<option value="marker_pres_table">list of markers present in the sample (threshold at 1.0 if not differently specified with --pres_th</option>\n+\t\t\t\t</param>\n+\t\t\t\t\n+\t\t\t\t<param name="nreads" type="integer" size="4"   value="0" label="The total number of reads in the original metagenome "/>\n+\t\t\t\t<param name="pres_th" type="integer" size="4"  value="0"   label="Threshold for calling a marker present"/>\n+\t\t\t\t<param name="clade" label="The clade for clade specific strain tracker analysis"  value=" " type="text" format="text"/>\n+\t\t\t\t<param name="min_ab" type="integer" size="4"  value="0" min="0" max="100"  label="The minimum percentage abundance for the clade in the clade specific strain tracker analysis"/>\t\n+\t\n+\t\t\t</when>\n+        </conditional>\n+\t\t\n+\t\t\n+\t\t\n+\t\t\n+\t\t<conditional name="gchoice_biom">\n+        \t<param name="global_biom" type="select" label="Display additional biom advanced parameters"  help="Select additional biom  choices ">\n+        \t\t<option value="0" selected=\'True\'>No</option>\n+\t\t\t\t<option value="1">Yes</option>\n+        \t</param>\n+           \t<when value="0">\n+\t\t\t</when>\n+\n+        \t<when value="1">\n+\t\t\t\t<param name="MetadataDelimiterChar" label="Delimiter for bug metadata if biom file requested - defaults to pipe. e.g. the pipe in k__Bacteria|p__Proteobacteria"  value="|" type="text" format="text"/>\n+\t\t\t</when>\n+        </conditional>\n+ \n+\n+\t</inputs>\n+\t<outputs>\n+\t\t<data name="output" format="tabular" />\n+        <data name="biom_output"  format="text">\n+            <filter>gchoice_biom["global_biom"] == "1"</filter>\n+        </data>\n+\t \n+\n+\t</outputs>\n+      \n+  <help>\n+**MetaPhlAn**  is a computational tool for profiling the composition of microbial communities *(Bacteria, Archaea, Eukaryotes and Viruses)* from metagenomic shotgun sequencing data with species level resolution. \n+\n+From version 2.0 MetaPhlAn is also able to identify specific strains (in the not-so-frequent cases in which the sample contains a previously sequenced strains) and to track strains across samples for all species.\n+\n+MetaPhlAn 2.0 relies on ~1M unique clade-specific marker genes identified from ~17,000 reference genomes (~13,500 bacterial and archaeal, ~3,500 viral, and ~110 eukaryotic), allowing:\n+\n+* Unambiguous taxonomic assignments \n+\n+* Accurate estimation of organismal relative abundance \n+\n+* Species-level resolution for bacteria, archaea, eukaryotes and viruses \n+\n+* Strain identification and tracking\n+\n+* Orders of magnitude speedups compared to existing methods.\n+\n+EXAMPLE\n+-------\n+\n+Here is an infographic of the application of the Human Microbiome Project results obtained applying MetaPhlAn on the 690 shotgun sequencing samples.  The image has been produced with GraPhlAn.\n+\n+\n+.. image::    https://bytebucket.org/nsegata/metaphlan/wiki/hmptree13_nl_bb.png\n+    :height: 500\n+    :width: 600\n+\n+\n+\n+If you use this software, please cite :\n+=======================================\n+\n+**Metagenomic microbial community profiling using unique clade-specific marker genes.** Nicola Segata, Levi Waldron, Annalisa Ballarini, Vagheesh Narasimhan, Olivier Jousson, Curtis Huttenhower. Nature Methods, 8, 811\xe2\x80\x93814, 2012\n+\n+  </help>\n+</tool>\n'
b
diff -r 000000000000 -r 00912e6e974f metaphlan2_hutlab/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaphlan2_hutlab/tool_dependencies.xml Sun Apr 24 13:19:46 2016 -0400
b
@@ -0,0 +1,34 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="metaphlan2_hutlab" version="2.5.0">
+        <install version="1.0">
+            <actions>
+                <action type="shell_command">hg clone https://bitbucket.org/biobakery/metaphlan2</action>
+                <action type="move_directory_files">
+                    <source_directory>.</source_directory>
+                    <destination_directory>$INSTALL_DIR</destination_directory>
+                </action>
+                <action type="set_environment">
+                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR</environment_variable>
+                </action>
+                <action type="set_environment">
+                    <environment_variable name="METAPHLAN2_PATH" action="set_to">$INSTALL_DIR</environment_variable>
+                </action>
+
+            </actions>
+        </install>
+        <readme>
+These links provide information for the metaphlan2 package:
+http://huttenhower.sph.harvard.edu/metaphlan2
+https://groups.google.com/forum/#!forum/metaphlan-users
+        </readme>
+    </package>
+    <package name="numpy" version="1.7">
+        <repository changeset_revision="0c288abd2a1e" name="package_numpy_1_7" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="bowtie2" version="2.1.0">
+        <repository changeset_revision="017a00c265f1" name="package_bowtie2_2_1_0" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>
+
+