Repository 'concoct_extract_fasta_bins'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/concoct_extract_fasta_bins

Changeset 0:1bc8fd1b3ed0 (2022-02-18)
Next changeset 1:a04028a8181d (2022-03-13)
Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
added:
extract_fasta_bins.py
extract_fasta_bins.xml
macros.xml
test-data/input1.fa.gz
test-data/input1.tabular
test-data/inputcluster1.tabular
test-data/process_log.txt
b
diff -r 000000000000 -r 1bc8fd1b3ed0 extract_fasta_bins.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_fasta_bins.py Fri Feb 18 14:17:48 2022 +0000
[
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+"""Write one fasta file per cluster listed in a CONCOCT clustering file."""
+
+import argparse
+import gzip
+import os
+import sys
+from collections import defaultdict
+from functools import partial
+
+import pandas as pd
+from Bio import SeqIO
+
+EXPECTED_HEADER = ['contig_id', 'cluster_id']
+
+
+def fail(message):
+    """Write message to stderr and exit with a non-zero status."""
+    sys.stderr.write(message + "\n")
+    sys.exit(-1)
+
+
+def main():
+    """Parse the command line and write <cluster_id>.fa for every cluster."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('--gzipped', action='store_true', dest='gzipped', help='Input files are gzipped')
+    parser.add_argument("--input_fasta", action="store", dest="input_fasta", help="Input Fasta file")
+    parser.add_argument("--input_cluster", action="store", dest="input_cluster", help="Concoct output cluster file")
+    parser.add_argument("--output_path", help="Output directory")
+    args = parser.parse_args()
+
+    # gzip.open defaults to binary mode; SeqIO's fasta parser needs text.
+    _open = partial(gzip.open, mode='rt') if args.gzipped else open
+    with _open(args.input_fasta) as fh:
+        all_seqs = {seq.id: seq for seq in SeqIO.parse(fh, "fasta")}
+
+    # Make sure we're reading the file as tabular!  Ids are read as text so
+    # numeric-looking contig ids still match the fasta record ids.
+    df = pd.read_csv(args.input_cluster, sep='\t', dtype=str)
+    # Compare explicitly instead of using assert: assert is stripped under
+    # "python -O", and indexing a one-column header raised IndexError.
+    if list(df.columns[:2]) != EXPECTED_HEADER:
+        fail("ERROR! Header line was not 'contig_id, cluster_id', please adjust your input file. Exiting!")
+
+    cluster_to_contigs = defaultdict(list)
+    for contig_id, cluster_id in zip(df['contig_id'], df['cluster_id']):
+        cluster_to_contigs[cluster_id].append(contig_id)
+
+    # Check every contig before writing anything so that a mismatched pair
+    # of inputs gives a readable message instead of a KeyError traceback.
+    missing = [contig_id for contig_id in df['contig_id'] if contig_id not in all_seqs]
+    if missing:
+        fail("ERROR! {0} contig(s) in the cluster file are not in the fasta file, e.g. '{1}'. Exiting!".format(len(missing), missing[0]))
+
+    for cluster_id, contig_ids in cluster_to_contigs.items():
+        output_file = os.path.join(args.output_path, "{0}.fa".format(cluster_id))
+        with open(output_file, 'w') as ofh:
+            SeqIO.write([all_seqs[contig_id] for contig_id in contig_ids], ofh, 'fasta')
+
+
+if __name__ == "__main__":
+    main()
b
diff -r 000000000000 -r 1bc8fd1b3ed0 extract_fasta_bins.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_fasta_bins.xml Fri Feb 18 14:17:48 2022 +0000
[
@@ -0,0 +1,58 @@
+<tool id="concoct_extract_fasta_bins" name="Extract a fasta file" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>for each cluster in a CONCOCT clustering file</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+mkdir outdir &&
+python '$__tool_directory__/extract_fasta_bins.py'
+#if $input_fasta.is_of_type('fasta.gz'):
+    --gzipped
+#end if
+--input_fasta '$input_fasta'
+--input_cluster '$input_cluster'
+--output_path 'outdir'
+    ]]></command>
+    <inputs>
+        <param argument="--input_fasta" type="data" format="fasta,fasta.gz" label="Fasta file"/>
+        <param argument="--input_cluster" type="data" format="tabular" label="CONCOCT clustering file"/>
+    </inputs>
+    <outputs>
+        <collection name="bins" type="list" label="${tool.name} on ${on_string} (bins)">
+            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.fa" format="fasta" directory="outdir"/>
+        </collection>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="input_fasta" value="input1.fa.gz" ftype="fasta.gz"/>
+            <param name="input_cluster" value="inputcluster1.tabular" ftype="tabular"/>
+            <output_collection name="bins" type="list" count="2">
+                <element name="77" ftype="fasta">
+                    <assert_contents>
+                        <has_size value="2194"/>
+                        <has_text text="NODE_1_length_2054_cov_17.474684"/>
+                        <has_n_lines n="37"/>
+                    </assert_contents>
+                </element>
+                <element name="93" ftype="fasta">
+                    <assert_contents>
+                        <has_size value="16531"/>
+                        <has_text text="NODE_2_length_16156_cov_8.219856"/>
+                        <has_n_lines n="272"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+Performs metagenomic binning of fasta contigs by extracting a fasta file for each cluster defined in a
+CONCOCT clustering file.
+
+The tool accepts two inputs: the fasta contigs file and the CONCOCT clustering file that was produced using
+the same fasta contigs input.  A collection of fasta files is produced.
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
b
diff -r 000000000000 -r 1bc8fd1b3ed0 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Fri Feb 18 14:17:48 2022 +0000
b
@@ -0,0 +1,17 @@
+<macros>
+    <token name="@TOOL_VERSION@">1.0.0</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">21.01</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">concoct</requirement>
+            <requirement type="package" version="0.19.2">pandas</requirement>
+            <requirement type="package" version="1.79">biopython</requirement>
+        </requirements>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1038/nmeth.3103</citation>
+        </citations>
+    </xml>
+</macros>
b
diff -r 000000000000 -r 1bc8fd1b3ed0 test-data/input1.fa.gz
b
Binary file test-data/input1.fa.gz has changed
b
diff -r 000000000000 -r 1bc8fd1b3ed0 test-data/input1.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input1.tabular Fri Feb 18 14:17:48 2022 +0000
b
b'@@ -0,0 +1,417 @@\n+NODE_100_length_1535_cov_6.691205\t1605\t8.261677\t0.249221\t6.652957\t0.123988\t0\t3.362618\t6.351409\t0\t0\t0.808100\t0.373832\t0.373832\t5.781307\t0\t0\t1.059190\n+NODE_101_length_153_cov_4.235294\t223\t0.896861\t0\t0.408072\t0\t0\t0\t0\t0\t0\t1.345292\t0.448430\t1.345293\t0.448430\t0\t1.345290\t3.587450\n+NODE_103_length_20202_cov_8.395357.0\t10000\t2.007700\t4.942400\t3.128200\t0.080000\t0\t0\t3.826100\t0\t4.206600\t1.528800\t0.420000\t1.268800\t4.233400\t4.407300\t3.177900\t2.606200\n+NODE_103_length_20202_cov_8.395357.1\t10272\t1.800233\t3.637852\t3.542055\t0.262851\t0\t0.097352\t3.745523\t0\t4.590639\t0.884151\t0.252823\t1.595307\t4.252823\t4.476445\t2.927082\t1.857087\n+NODE_104_length_823_cov_8.356014\t893\t10.070548\t0\t6.152291\t0.111982\t0\t4.245240\t6.038073\t0\t0\t0.671892\t0.559910\t0.895856\t5.597985\t0\t0\t2.349386\n+NODE_105_length_260_cov_4.142308\t330\t1.818181\t0.303030\t0.909091\t0\t0\t0\t0.303030\t0\t0\t2.121211\t1.212121\t0\t0.909090\t0\t0\t1.818183\n+NODE_106_length_2172_cov_16.183702\t2242\t3.384477\t9.701164\t5.793935\t0\t0\t0\t7.700716\t0\t8.591435\t2.094113\t0.758250\t2.096341\t6.768960\t10.165474\t5.038357\t3.557540\n+NODE_107_length_7609_cov_8.569195\t7679\t1.874722\t4.344835\t3.357731\t0\t0\t0\t4.048056\t0\t4.411123\t0.949603\t0.325564\t1.263185\t4.151191\t4.947649\t2.760774\t2.054826\n+NODE_108_length_1232_cov_16.167208\t1302\t2.833331\t7.956992\t4.980798\t0\t0\t0\t7.882490\t0\t7.116747\t2.607528\t0.691244\t3.064514\t7.667427\t7.364051\t6.023813\t2.526882\n+NODE_109_length_178_cov_16.084270\t248\t0\t0\t1.209678\t0\t0\t0\t0.403226\t0\t0\t0.806452\t0\t0\t0.403226\t0\t0\t0.403226\n+NODE_10_length_186_cov_4.327957\t256\t1.171875\t0.390625\t1.144532\t0\t0\t0\t0.390625\t0\t0.781250\t4.249997\t0\t1.171875\t0.390625\t0\t0.781250\t0.781250\n+NODE_111_length_80116_cov_8.871636.0\t10000\t2.747000\t5.603200\t3.788400\t0\t0\t0\t4.259100\t0\t5.404000\t2.697400\t0.779300\t1.608100\t4.958100\t4.486800\t3.538300\t3.677700\n+NOD
E_111_length_80116_cov_8.871636.1\t10000\t2.909500\t4.897200\t3.998600\t0\t0\t0\t4.777000\t0\t4.757700\t2.779000\t0.780000\t1.979700\t4.556000\t4.479100\t3.225600\t3.966300\n+NODE_111_length_80116_cov_8.871636.2\t10000\t2.425900\t4.784400\t3.617900\t0\t0\t0\t3.947500\t0\t4.744200\t2.039100\t0.490000\t1.326600\t5.249600\t4.419400\t3.065800\t2.829200\n+NODE_111_length_80116_cov_8.871636.3\t10000\t2.148700\t4.985700\t2.937300\t0\t0\t0\t3.676800\t0\t4.434200\t0.990000\t0.339900\t1.108600\t4.116000\t4.655600\t2.799300\t1.470000\n+NODE_111_length_80116_cov_8.871636.4\t10000\t1.959300\t4.606200\t3.178800\t0\t0\t0\t4.028800\t0\t4.507900\t0.909700\t0.410000\t1.059800\t4.422800\t4.746700\t2.597200\t1.729200\n+NODE_111_length_80116_cov_8.871636.5\t10000\t2.248400\t4.691800\t3.496100\t0\t0\t0\t4.134500\t0\t4.415500\t1.070000\t0.230000\t1.099600\t4.136500\t4.824700\t2.795400\t1.578100\n+NODE_111_length_80116_cov_8.871636.6\t10000\t1.747300\t4.446100\t3.118100\t0\t0\t0\t3.876600\t0\t4.505200\t1.130000\t0.260000\t1.169800\t4.025700\t4.704400\t2.567700\t1.458300\n+NODE_111_length_80116_cov_8.871636.7\t10186\t1.822990\t4.628609\t2.521696\t0\t0\t0\t4.021993\t0\t4.181820\t1.128413\t0.363243\t1.187120\t4.238861\t4.236105\t3.147359\t1.362556\n+NODE_112_length_846_cov_18.049644\t916\t4.366811\t9.022922\t4.908290\t0\t0\t0\t6.643011\t0\t6.979259\t1.965066\t0.545851\t2.719436\t10.689957\t7.525108\t5.786028\t2.183407\n+NODE_113_length_286_cov_12.695805\t356\t1.963482\t2.241571\t3.084266\t0\t0\t0\t2.528089\t0\t2.525277\t1.120788\t0\t1.120787\t1.117979\t3.362363\t1.957863\t1.398876\n+NODE_114_length_4896_cov_18.356821\t4966\t4.024768\t8.623427\t6.495364\t0\t0\t0\t8.872340\t0\t9.499799\t3.200765\t1.026984\t2.635322\t8.229552\t10.224925\t6.341319\t4.405350\n+NODE_115_length_78252_cov_8.512166.0\t10000\t1.645100\t4.396300\t2.918400\t0\t0\t0.060000\t3.955100\t0\t4.177900\t0.748800\t0.379900\t1.187400\t3.886300\t4.266200\t2.869100\t1.517900\n+NODE_115_length_78252_cov_8.512166.1\t10000\t1.787400\t3
.998500\t2.974900\t0\t0\t0\t3.878600\t0\t4.197900\t1.109100\t0.219100\t1.160000\t3.807500\t5.064600\t2.636800\t1.799000\n+NODE_115_length_78252_cov_8.512166.2\t10000\t1.738300\t4.779000\t2.947800\t0\t0\t0\t3.467900\t0\t4.514200\t1.169200\t0.360000\t1.229500\t4.267100\t5.474900\t2.589100\t1.589300\n+NODE_115_length_78252_cov_8.512166.3\t10000\t1.598400\t4.621700\t2.349000\t0\t0\t0\t4.087400\t0\t4.692600\t0.929400\t0.350000\t1.208900\t4.039300\t5.058200\t3.152200\t1.'..b'9\t1.499457\t5.103376\t2.714908\t0\t0\t0\t3.808488\t0\t5.549510\t0.652883\t0.544069\t0.761696\t2.829162\t5.099029\t3.253538\t2.378674\n+NODE_26_length_3431_cov_15.138152\t3501\t3.049701\t6.612392\t6.391893\t0\t0\t0\t7.302201\t0\t7.279065\t2.370750\t0.628393\t1.853471\t6.960295\t8.834908\t4.391602\t3.565265\n+NODE_270_length_151_cov_4.258278\t221\t1.809956\t0\t0\t0.452489\t0\t0\t0\t3.619902\t0\t0\t0.452489\t3.167421\t0\t0.452489\t1.357464\t0.904977\n+NODE_271_length_164_cov_15.048780\t234\t0.427350\t2.136753\t0.854701\t0\t0\t0\t0.427350\t0\t2.136750\t0\t0.427350\t0.427350\t0.854701\t1.709400\t0.854700\t0\n+NODE_272_length_154_cov_14.188312\t224\t0\t0.892857\t2.232143\t0\t0\t0\t0.892857\t0\t0.446429\t0\t0\t0.446429\t0.892857\t2.232140\t0.446429\t0.446429\n+NODE_274_length_173_cov_11.838150\t243\t0.411523\t1.234569\t2.057611\t0\t0\t0\t3.292181\t0\t4.938271\t0.823045\t0\t1.234569\t2.469137\t5.349804\t2.057616\t0\n+NODE_275_length_149_cov_4.000000\t219\t0.913242\t0.456621\t1.369863\t0\t0\t0\t0.456621\t0\t0.456621\t1.369863\t0.456621\t0\t1.826483\t0\t0.456621\t3.159820\n+NODE_277_length_146_cov_4.280822\t216\t0.462963\t0.462963\t0.462963\t0\t0\t0\t0\t0\t0.884258\t2.777777\t0.462963\t0.462963\t1.388889\t0\t0\t3.240744\n+NODE_27_length_35477_cov_8.181526.0\t10000\t1.918300\t4.347400\t2.546500\t0\t0\t0\t4.424700\t0\t4.586700\t0.889800\t0.220000\t1.417900\t3.819400\t5.105800\t3.167800\t1.730000\n+NODE_27_length_35477_cov_8.181526.1\t10000\t1.688200\t4.124600\t3.029500\t0\t0\t0\t3.928900\t0\t4.115200\t1.050000\t
0.229500\t1.118800\t3.576900\t4.612100\t2.677600\t1.789400\n+NODE_27_length_35477_cov_8.181526.2\t15547\t1.547439\t4.370681\t2.942946\t0\t0\t0\t3.870905\t0\t4.512256\t1.079823\t0.347334\t1.504536\t3.989711\t4.647326\t2.783237\t1.568791\n+NODE_283_length_165_cov_4.200000\t235\t0.851064\t1.276595\t0.851063\t0\t0\t0\t0\t0\t0.425532\t2.127663\t0.425532\t0.425532\t0.425532\t0\t0.425532\t2.127663\n+NODE_284_length_148_cov_4.236486\t218\t0.917431\t0.454128\t0\t0.458716\t0\t0\t0\t2.752293\t0.458716\t0.422018\t0\t2.293577\t0\t0.917431\t3.211007\t0.917431\n+NODE_285_length_433_cov_5.106236\t503\t1.590457\t2.582506\t3.165008\t0\t0\t0\t4.165009\t0\t2.783302\t0.994036\t0.198807\t1.192843\t2.972167\t3.976146\t1.391651\t0.984095\n+NODE_288_length_174_cov_4.316092\t244\t2.049180\t1.639345\t1.229508\t0\t0\t0\t2.036885\t0\t2.036885\t0.409836\t0.409836\t0.409836\t2.032786\t1.229508\t0.409836\t3.278688\n+NODE_28_length_34835_cov_8.108799.0\t10000\t1.908700\t4.387700\t2.947500\t0\t0\t0\t3.797100\t0\t4.693900\t0.987500\t0.349000\t1.328800\t4.478800\t5.105200\t2.888500\t1.397900\n+NODE_28_length_34835_cov_8.108799.1\t10000\t1.847500\t4.356800\t2.648600\t0\t0\t0\t3.735900\t0\t3.876700\t1.019300\t0.259600\t1.159900\t3.745800\t4.606000\t2.606500\t1.108800\n+NODE_28_length_34835_cov_8.108799.2\t14905\t1.775715\t4.499630\t2.730357\t0\t0\t0\t3.825425\t0\t4.424353\t0.944917\t0.315331\t1.307078\t3.888697\t4.907619\t2.850249\t1.540825\n+NODE_290_length_1962_cov_6.746177\t2032\t1.525591\t5.413389\t2.944881\t0\t0\t0\t4.079231\t0\t3.885336\t0.929626\t0.049213\t1.427166\t4.221458\t4.773624\t1.968506\t1.377954\n+NODE_293_length_832_cov_6.325721\t902\t3.436805\t4.988915\t4.101994\t0\t0\t0\t4.758316\t0\t7.283812\t1.108646\t0.442350\t3.092018\t5.518847\t4.317073\t4.529935\t1.771619\n+NODE_294_length_210_cov_4.066667\t280\t2.142857\t1.414286\t0.357143\t0\t0\t0\t0.357143\t0\t0.357143\t2.499999\t0.357143\t0\t1.071428\t0\t1.071428\t1.785713\n+NODE_295_length_196_cov_4.397959\t266\t1.879699\t0.375940\t1.127819
\t0\t0\t0\t0.751880\t0\t0.375940\t3.349628\t0\t0\t0\t0\t0.375940\t0.751880\n+NODE_29_length_44540_cov_9.429951.0\t10000\t2.027900\t5.076300\t3.653900\t0\t0\t0\t4.475300\t0\t4.737900\t2.168800\t0.700000\t1.458700\t4.274900\t4.467800\t3.147200\t3.525100\n+NODE_29_length_44540_cov_9.429951.1\t10000\t2.258300\t4.988600\t3.198400\t0\t0\t0\t3.565800\t0\t4.268500\t2.740000\t0.570000\t1.366900\t4.895400\t4.835200\t3.426900\t4.108300\n+NODE_29_length_44540_cov_9.429951.2\t10000\t2.716900\t5.218600\t3.607200\t0\t0\t0\t3.996700\t0\t4.797500\t3.179400\t0.770000\t1.508700\t4.924600\t4.254900\t3.564600\t4.146500\n+NODE_29_length_44540_cov_9.429951.3\t14610\t2.230117\t5.066739\t3.647501\t0\t0\t0\t4.351748\t0\t4.827725\t1.531828\t0.342231\t1.298221\t4.664888\t5.032990\t3.008352\t2.216154\n+NODE_2_length_16156_cov_8.219856\t16226\t1.545114\t4.323432\t2.912856\t0\t0\t0\t3.922715\t0\t4.290770\t0.992173\t0.388265\t1.182792\t4.050168\t5.099648\t2.813631\t1.411069\n'
b
diff -r 000000000000 -r 1bc8fd1b3ed0 test-data/inputcluster1.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/inputcluster1.tabular Fri Feb 18 14:17:48 2022 +0000
b
@@ -0,0 +1,3 @@
+contig_id cluster_id
+NODE_1_length_2054_cov_17.474684 77
+NODE_2_length_16156_cov_8.219856 93
b
diff -r 000000000000 -r 1bc8fd1b3ed0 test-data/process_log.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/process_log.txt Fri Feb 18 14:17:48 2022 +0000
b
@@ -0,0 +1,10 @@
+Results created at
+Successfully loaded composition data.
+Successfully loaded coverage data.
+Performed PCA, resulted in 154 dimensions
+Wrote PCA transformed file.
+Wrote PCA components file.
+PCA transformed data.
+Will call vbgmm with parameters: outdir/, 400, 1000, 1
+Wrote assign file.
+CONCOCT Finished