comparison concoct.xml @ 3:3842ef1b2f34 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 1a79c139165648b969d82530784cea3fc8f2d2c0"
author iuc
date Thu, 07 Jul 2022 08:33:35 +0000
parents 7a145c72d375
children 28e8d2bd6aba
comparison
equal deleted inserted replaced
2:7a145c72d375 3:3842ef1b2f34
1 <tool id="concoct" name="CONCOCT" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> 1 <tool id="concoct" name="CONCOCT" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description>metagenome binning</description> 2 <description>for metagenome binning</description>
3 <macros> 3 <macros>
4 <import>macros.xml</import> 4 <import>macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="requirements"/> 6 <expand macro="requirements"/>
7 <command detect_errors="exit_code"><![CDATA[ 7 <command detect_errors="exit_code"><![CDATA[
8 #set pca_components_file_name = 'PCA_components_data_gt' + str($advanced.length_threshold) + '.csv'
9 #set pca_transformed_file_name = 'PCA_transformed_data_gt' + str($advanced.length_threshold) + '.csv'
10 #set clustering_file_name = 'clustering_gt' + str($advanced.length_threshold) + '.csv'
11
12 ## CONCOCT doesn't handle gzipped files. 8 ## CONCOCT doesn't handle gzipped files.
13 #if $composition_file.ext.endswith(".gz") 9 #if $composition_file.ext.endswith(".gz")
14 gunzip -c '$composition_file' > composition_file.fa && 10 gunzip -c '$composition_file' > 'composition_file.fa' &&
15 #else: 11 #else:
16 ln -s '$composition_file' composition_file.fa && 12 ln -s '$composition_file' 'composition_file.fa' &&
17 #end if 13 #end if
18 14
19 mkdir outdir && 15 mkdir outdir &&
20 concoct 16 concoct
21 --coverage_file '$coverage_file' 17 --coverage_file '$coverage_file'
22 --composition_file composition_file.fa 18 --composition_file 'composition_file.fa'
23 --clusters $advanced.clusters 19 --clusters $advanced.clusters
24 --kmer_length $advanced.kmer_length 20 --kmer_length $advanced.kmer_length
25 --threads \${GALAXY_SLOTS:-4} 21 --threads \${GALAXY_SLOTS:-4}
26 --length_threshold $advanced.length_threshold 22 --length_threshold $advanced.length_threshold
27 --read_length $advanced.read_length 23 --read_length $advanced.read_length
28 --total_percentage_pca $advanced.total_percentage_pca 24 --total_percentage_pca $advanced.total_percentage_pca
29 --basename 'outdir/' 25 --basename 'outdir/'
30 --seed $advanced.seed 26 --seed $advanced.seed
31 --iterations $advanced.iterations 27 --iterations $advanced.iterations
32 --epsilon $advanced.epsilon 28 $advanced.no_cov_normalization
33 $advanced.no_cov_normalization 29 $output.no_total_coverage
34 $advanced.no_total_coverage 30 --no_original_data
35 --no_original_data 31 $output.converge_out
36 $advanced.converge_out
37
38 ## Convert all CONCOCT .csv outputs to tabular.
39 && sed 's/\("\([^"]*\)"\)\?,/\2\t/g' outdir/$pca_components_file_name > '$output_pca_components'
40 && sed 's/\("\([^"]*\)"\)\?,/\2\t/g' outdir/$pca_transformed_file_name > '$output_pca_transformed'
41 && sed 's/\("\([^"]*\)"\)\?,/\2\t/g' outdir/$clustering_file_name > '$output_clustering'
42 #if str($advanced.output_process_log) == 'yes':
43 && mv outdir/log.txt '$process_log'
44 #end if
45 ]]></command> 32 ]]></command>
46 <inputs> 33 <inputs>
47 <param argument="--coverage_file" type="data" format="tabular" label="Tabular coverage file" help="Columns correspond to samples and rows to contigs"/> 34 <param argument="--coverage_file" type="data" format="tabular" label="Coverage file" help="Table where each row corresponds to a contig, and each column corresponds to a sample. The values are the average coverage for this contig in that sample"/>
48 <param argument="--composition_file" type="data" format="fasta,fasta.gz" label="Fasta file" help="Used to calculate the kmer composition (the genomic signature) of each contig"/> 35 <param argument="--composition_file" type="data" format="fasta,fasta.gz" label="Composition file with sequences" help="It is named the composition file since it is used to calculate the kmer composition (the genomic signature) of each contig."/>
49 <section name="advanced" title="Advanced options"> 36 <section name="advanced" title="Advanced options">
50 <param argument="--clusters" type="integer" value="400" label="Maximum number of clusters for the Variational Gaussian Mixture Model algorithm"/> 37 <param argument="--clusters" type="integer" min="0" value="400" label="Maximum number of clusters for the Variational Gaussian Mixture Model (VGMM) algorithm"/>
51 <param argument="--kmer_length" type="integer" value="4" label="Kmer length"/> 38 <param argument="--kmer_length" type="integer" min="0" value="4" label="Kmer length"/>
52 <param argument="--length_threshold" type="integer" value="1000" label="Sequence length threshold" help="Contigs shorter than this value will not be included"/> 39 <param argument="--length_threshold" type="integer" min="0" value="1000" label="Sequence length threshold" help="Contigs shorter than this value will not be included"/>
53 <param argument="--read_length" type="integer" value="100" label="Read length for coverage"/> 40 <param argument="--read_length" type="integer" min="0" value="100" label="Read length for coverage"/>
54 <param argument="--total_percentage_pca" type="integer" value="100" label="Percentage of variance explained by the principal components for the combined data"/> 41 <param argument="--total_percentage_pca" type="integer" min="0" value="100" label="Percentage of variance explained by the principal components for the combined data"/>
55 <param argument="--seed" type="integer" min="0" value="1" label="Integer to use as seed for clustering" help="Zero value will use random seed"/> 42 <param argument="--seed" type="integer" min="0" value="1" label="Seed for clustering" help="Zero value will use random seed"/>
56 <param argument="--iterations" type="integer" value="500" label="Maximum number of iterations for the Variational Bayes Gaussian Mixture Models"/> 43 <param argument="--iterations" type="integer" min="0" value="500" label="Maximum number of iterations for the Variational Bayes Gaussian Mixture Models (VBGMM)"/>
57 <param argument="--epsilon" type="float" value="0.000001" label="Epsilon for the Variational Gaussian Mixture Model algorithm"/> 44 <param argument="--no_cov_normalization" type="boolean" truevalue="--no_cov_normalization" falsevalue="" checked="false" label="Skip normalization and only do log transform of the coverage?" help="By default, the coverage is normalized for samples, then normalized for contigs and finally log transformed. By setting this flag you skip the normalization and only do log transform of the coverage."/>
58 <param argument="--no_cov_normalization" type="boolean" truevalue="--no_cov_normalization" falsevalue="" checked="false" label="Skip normalization and only do log transform of the coverage?" help="By default, the coverage is normalized for samples, then normalized for contigs and finally log transformed"/> 45 </section>
59 <param argument="--no_total_coverage" type="boolean" truevalue="--no_total_coverage" falsevalue="" checked="false" label="Eliminate the total coverage column from the coverage data matrix?" help="By default, total coverage is included, independently of coverage normalization but previous to log transformation"/> 46 <section name="output" title="Output">
60 <param argument="--converge_out" type="boolean" truevalue="--converge_out" falsevalue="" checked="false" label="Output convergence information?"/> 47 <param argument="--no_total_coverage" type="boolean" truevalue="--no_total_coverage" falsevalue="" checked="false" label="Eliminate the total coverage column from the coverage data matrix?" help="By default, total coverage is included, independently of coverage normalization but previous to log transformation. Use this flag to disable this behaviour."/>
61 <param name="output_process_log" type="select" label="Output process log file?"> 48 <param argument="--converge_out" type="boolean" truevalue="--converge_out" falsevalue="" checked="false" label="Write convergence information to files?"/>
62 <option value="no" selected="true">No</option> 49 <param name="log" type="boolean" checked="false" label="Output process log file?"/>
63 <option value="yes">Yes</option>
64 </param>
65 </section> 50 </section>
66 </inputs> 51 </inputs>
67 <outputs> 52 <outputs>
68 <data name="process_log" format="txt" label="${tool.name} on ${on_string} (process log)"> 53 <data name="output_clustering" format="csv" from_work_dir="outdir/clustering_gt*" label="${tool.name} on ${on_string}: Clusters"/>
69 <filter>advanced['output_process_log'] == 'yes'</filter> 54 <data name="process_log" format="txt" from_work_dir="outdir/log.txt" label="${tool.name} on ${on_string}: Log">
55 <filter>output['log']</filter>
70 </data> 56 </data>
71 <data name="output_pca_components" format="tabular" label="${tool.name} on ${on_string} (PCA components)"/> 57 <data name="output_pca_components" format="csv" from_work_dir="outdir/PCA_components_data_gt*" label="${tool.name} on ${on_string}: PCA components"/>
72 <data name="output_pca_transformed" format="tabular" label="${tool.name} on ${on_string} (PCA transformed)"/> 58 <data name="output_pca_transformed" format="csv" from_work_dir="outdir/PCA_transformed_data_gt*" label="${tool.name} on ${on_string}: PCA transformed clusters"/>
73 <data name="output_clustering" format="tabular" label="${tool.name} on ${on_string} (Clusters)"/>
74 </outputs> 59 </outputs>
75 <tests> 60 <tests>
76 <test expect_num_outputs="4"> 61 <test expect_num_outputs="4">
77 <param name="coverage_file" value="input1.tabular" ftype="tabular"/> 62 <param name="coverage_file" value="coverage" ftype="tabular"/>
78 <param name="composition_file" value="input1.fa.gz" ftype="fasta.gz"/> 63 <param name="composition_file" value="composition.fa" ftype="fasta"/>
79 <param name="output_process_log" value="yes"/> 64 <section name="advanced">
80 <output name="process_log" file="process_log.txt" ftype="txt" compare="re_match"/> 65 <param name="clusters" value="400"/>
81 <output name="output_pca_components" ftype="tabular"> 66 <param name="kmer_length" value="4"/>
67 <param name="length_threshold" value="1000"/>
68 <param name="read_length" value="100"/>
69 <param name="total_percentage_pca" value="100"/>
70 <param name="seed" value="1"/>
71 <param name="iterations" value="500"/>
72 <param name="no_cov_normalization" value=""/>
73 </section>
74 <section name="output">
75 <param name="no_total_coverage" value=""/>
76 <param name="converge_out" value=""/>
77 <param name="log" value="true"/>
78 </section>
79 <output name="process_log" ftype="txt" compare="contains">
82 <assert_contents> 80 <assert_contents>
83 <has_size value="367636"/> 81 <has_size value="786"/>
84 <has_text text="7377051e-02"/> 82 <has_text text="CONCOCT Finished"/>
85 </assert_contents> 83 </assert_contents>
86 </output> 84 </output>
87 <output name="output_pca_transformed" ftype="tabular"> 85 <output name="output_pca_components" ftype="csv">
88 <assert_contents> 86 <assert_contents>
89 <has_size value="737926"/> 87 <has_size value="362924" delta="10"/>
90 <has_text text="NODE_103_length_20202_cov_8.395357.0"/> 88 <has_text text="-5.90697200e-02"/>
91 </assert_contents> 89 </assert_contents>
92 </output> 90 </output>
93 <output name="output_clustering" ftype="tabular"> 91 <output name="output_pca_transformed" ftype="csv">
94 <assert_contents> 92 <assert_contents>
95 <has_size value="12167"/> 93 <has_size value="834200" delta="10"/>
96 <has_text text="NODE_103_length_20202_cov_8.395357"/> 94 <has_text text="contig-21000001"/>
95 </assert_contents>
96 </output>
97 <output name="output_clustering" ftype="csv">
98 <assert_contents>
99 <has_size value="6923" delta="10"/>
100 <has_text text="contig-21000001,"/>
101 </assert_contents>
102 </output>
103 </test>
104 <test expect_num_outputs="3">
105 <param name="coverage_file" value="coverage" ftype="tabular"/>
106 <param name="composition_file" value="composition.fa.gz" ftype="fasta.gz"/>
107 <section name="advanced">
108 <param name="clusters" value="400"/>
109 <param name="kmer_length" value="4"/>
110 <param name="length_threshold" value="1000"/>
111 <param name="read_length" value="100"/>
112 <param name="total_percentage_pca" value="100"/>
113 <param name="seed" value="1"/>
114 <param name="iterations" value="500"/>
115 <param name="no_cov_normalization" value=""/>
116 </section>
117 <section name="output">
118 <param name="no_total_coverage" value=""/>
119 <param name="converge_out" value=""/>
120 <param name="log" value="false"/>
121 </section>
122 <output name="output_pca_components" ftype="csv">
123 <assert_contents>
124 <has_size value="362924" delta="10"/>
125 <has_text text="-5.90697200e-02"/>
126 </assert_contents>
127 </output>
128 <output name="output_pca_transformed" ftype="csv">
129 <assert_contents>
130 <has_size value="834200" delta="10"/>
131 <has_text text="contig-21000001"/>
132 </assert_contents>
133 </output>
134 <output name="output_clustering" ftype="csv">
135 <assert_contents>
136 <has_size value="6923" delta="10"/>
137 <has_text text="contig-21000001,"/>
97 </assert_contents> 138 </assert_contents>
98 </output> 139 </output>
99 </test> 140 </test>
100 </tests> 141 </tests>
101 <help><![CDATA[ 142 <help><![CDATA[
102 **What it does** 143 **What it does**
103 144
104 CONCOCT (Clustering cONtigs with COverage and ComposiTion) performs unsupervised binning of metagenomic contigs by 145 CONCOCT (Clustering cONtigs with COverage and ComposiTion) performs unsupervised binning of metagenomic contigs by
105 using nucleotide composition - kmer frequencies - and coverage data for multiple samples. CONCOCT can accurately 146 using nucleotide composition - kmer frequencies - and coverage data for multiple samples. CONCOCT can accurately
106 (up to species level) bin metagenomic contigs. 147 (up to species level) bin metagenomic contigs.
107 148
108 The tool accepts 2 inputs: a tabular file where each row corresponds to a contig and each column corresponds to a 149 The tool accepts 2 inputs: a tabular file where each row corresponds to a contig and each column corresponds to a
109 sample (the values are the average coverage for this contig in that sample) and a file containing sequences in 150 sample (the values are the average coverage for this contig in that sample) and a file containing sequences in
110 fasta format. 151 fasta format.
111 152