Mercurial > repos > iuc > concoct
comparison concoct.xml @ 3:3842ef1b2f34 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 1a79c139165648b969d82530784cea3fc8f2d2c0"
author | iuc |
---|---|
date | Thu, 07 Jul 2022 08:33:35 +0000 |
parents | 7a145c72d375 |
children | 28e8d2bd6aba |
comparison
equal
deleted
inserted
replaced
2:7a145c72d375 | 3:3842ef1b2f34 |
---|---|
1 <tool id="concoct" name="CONCOCT" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> | 1 <tool id="concoct" name="CONCOCT" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> |
2 <description>metagenome binning</description> | 2 <description>for metagenome binning</description> |
3 <macros> | 3 <macros> |
4 <import>macros.xml</import> | 4 <import>macros.xml</import> |
5 </macros> | 5 </macros> |
6 <expand macro="requirements"/> | 6 <expand macro="requirements"/> |
7 <command detect_errors="exit_code"><![CDATA[ | 7 <command detect_errors="exit_code"><![CDATA[ |
8 #set pca_components_file_name = 'PCA_components_data_gt' + str($advanced.length_threshold) + '.csv' | |
9 #set pca_transformed_file_name = 'PCA_transformed_data_gt' + str($advanced.length_threshold) + '.csv' | |
10 #set clustering_file_name = 'clustering_gt' + str($advanced.length_threshold) + '.csv' | |
11 | |
12 ## CONCOCT doesn't handle gzipped files. | 8 ## CONCOCT doesn't handle gzipped files. |
13 #if $composition_file.ext.endswith(".gz") | 9 #if $composition_file.ext.endswith(".gz") |
14 gunzip -c '$composition_file' > composition_file.fa && | 10 gunzip -c '$composition_file' > 'composition_file.fa' && |
15 #else: | 11 #else: |
16 ln -s '$composition_file' composition_file.fa && | 12 ln -s '$composition_file' 'composition_file.fa' && |
17 #end if | 13 #end if |
18 | 14 |
19 mkdir outdir && | 15 mkdir outdir && |
20 concoct | 16 concoct |
21 --coverage_file '$coverage_file' | 17 --coverage_file '$coverage_file' |
22 --composition_file composition_file.fa | 18 --composition_file 'composition_file.fa' |
23 --clusters $advanced.clusters | 19 --clusters $advanced.clusters |
24 --kmer_length $advanced.kmer_length | 20 --kmer_length $advanced.kmer_length |
25 --threads \${GALAXY_SLOTS:-4} | 21 --threads \${GALAXY_SLOTS:-4} |
26 --length_threshold $advanced.length_threshold | 22 --length_threshold $advanced.length_threshold |
27 --read_length $advanced.read_length | 23 --read_length $advanced.read_length |
28 --total_percentage_pca $advanced.total_percentage_pca | 24 --total_percentage_pca $advanced.total_percentage_pca |
29 --basename 'outdir/' | 25 --basename 'outdir/' |
30 --seed $advanced.seed | 26 --seed $advanced.seed |
31 --iterations $advanced.iterations | 27 --iterations $advanced.iterations |
32 --epsilon $advanced.epsilon | 28 $advanced.no_cov_normalization |
33 $advanced.no_cov_normalization | 29 $output.no_total_coverage |
34 $advanced.no_total_coverage | 30 --no_original_data |
35 --no_original_data | 31 $output.converge_out |
36 $advanced.converge_out | |
37 | |
38 ## Convert all CONCOCT .csv outputs to tabular. | |
39 && sed 's/\("\([^"]*\)"\)\?,/\2\t/g' outdir/$pca_components_file_name > '$output_pca_components' | |
40 && sed 's/\("\([^"]*\)"\)\?,/\2\t/g' outdir/$pca_transformed_file_name > '$output_pca_transformed' | |
41 && sed 's/\("\([^"]*\)"\)\?,/\2\t/g' outdir/$clustering_file_name > '$output_clustering' | |
42 #if str($advanced.output_process_log) == 'yes': | |
43 && mv outdir/log.txt '$process_log' | |
44 #end if | |
45 ]]></command> | 32 ]]></command> |
46 <inputs> | 33 <inputs> |
47 <param argument="--coverage_file" type="data" format="tabular" label="Tabular coverage file" help="Columns correspond to samples and rows to contigs"/> | 34 <param argument="--coverage_file" type="data" format="tabular" label="Coverage file" help="Table where each row corresponds to a contig, and each column corresponds to a sample. The values are the average coverage for this contig in that sample"/> |
48 <param argument="--composition_file" type="data" format="fasta,fasta.gz" label="Fasta file" help="Used to calculate the kmer composition (the genomic signature) of each contig"/> | 35 <param argument="--composition_file" type="data" format="fasta,fasta.gz" label="Composition file with sequences" help="It is named the composition file since it is used to calculate the kmer composition (the genomic signature) of each contig."/> |
49 <section name="advanced" title="Advanced options"> | 36 <section name="advanced" title="Advanced options"> |
50 <param argument="--clusters" type="integer" value="400" label="Maximum number of clusters for the Variational Gaussian Mixture Model algorithm"/> | 37 <param argument="--clusters" type="integer" min="0" value="400" label="Maximum number of clusters for the Variational Gaussian Mixture Model (VGMM) algorithm"/> |
51 <param argument="--kmer_length" type="integer" value="4" label="Kmer length"/> | 38 <param argument="--kmer_length" type="integer" min="0" value="4" label="Kmer length"/> |
52 <param argument="--length_threshold" type="integer" value="1000" label="Sequence length threshold" help="Contigs shorter than this value will not be included"/> | 39 <param argument="--length_threshold" type="integer" min="0" value="1000" label="Sequence length threshold" help="Contigs shorter than this value will not be included"/> |
53 <param argument="--read_length" type="integer" value="100" label="Read length for coverage"/> | 40 <param argument="--read_length" type="integer" min="0" value="100" label="Read length for coverage"/> |
54 <param argument="--total_percentage_pca" type="integer" value="100" label="Percentage of variance explained by the principal components for the combined data"/> | 41 <param argument="--total_percentage_pca" type="integer" min="0" value="100" label="Percentage of variance explained by the principal components for the combined data"/> |
55 <param argument="--seed" type="integer" min="0" value="1" label="Integer to use as seed for clustering" help="Zero value will use random seed"/> | 42 <param argument="--seed" type="integer" min="0" value="1" label="Seed for clustering" help="Zero value will use random seed"/> |
56 <param argument="--iterations" type="integer" value="500" label="Maximum number of iterations for the Variational Bayes Gaussian Mixture Models"/> | 43 <param argument="--iterations" type="integer" min="0" value="500" label="Maximum number of iterations for the Variational Bayes Gaussian Mixture Models (VBGMM)"/> |
57 <param argument="--epsilon" type="float" value="0.000001" label="Epsilon for the Variational Gaussian Mixture Model algorithm"/> | 44 <param argument="--no_cov_normalization" type="boolean" truevalue="--no_cov_normalization" falsevalue="" checked="false" label="Skip normalization and only do log transform of the coverage?" help="By default, the coverage is normalized for samples, then normalized for contigs and finally log transformed. By setting this flag you skip the normalization and only do log transform of the coverage."/> |
58 <param argument="--no_cov_normalization" type="boolean" truevalue="--no_cov_normalization" falsevalue="" checked="false" label="Skip normalization and only do log transform of the coverage?" help="By default, the coverage is normalized for samples, then normalized for contigs and finally log transformed"/> | 45 </section> |
59 <param argument="--no_total_coverage" type="boolean" truevalue="--no_total_coverage" falsevalue="" checked="false" label="Eliminate the total coverage column from the coverage data matrix?" help="By default, total coverage is included, independently of coverage normalization but previous to log transformation"/> | 46 <section name="output" title="Output"> |
60 <param argument="--converge_out" type="boolean" truevalue="--converge_out" falsevalue="" checked="false" label="Output convergence information?"/> | 47 <param argument="--no_total_coverage" type="boolean" truevalue="--no_total_coverage" falsevalue="" checked="false" label="Eliminate the total coverage column from the coverage data matrix?" help="By default, total coverage is included, independently of coverage normalization but previous to log transformation. Use this flag to disable this behaviour."/> |
61 <param name="output_process_log" type="select" label="Output process log file?"> | 48 <param argument="--converge_out" type="boolean" truevalue="--converge_out" falsevalue="" checked="false" label="Write convergence information to files?"/> |
62 <option value="no" selected="true">No</option> | 49 <param name="log" type="boolean" checked="false" label="Output process log file?"/> |
63 <option value="yes">Yes</option> | |
64 </param> | |
65 </section> | 50 </section> |
66 </inputs> | 51 </inputs> |
67 <outputs> | 52 <outputs> |
68 <data name="process_log" format="txt" label="${tool.name} on ${on_string} (process log)"> | 53 <data name="output_clustering" format="csv" from_work_dir="outdir/clustering_gt*" label="${tool.name} on ${on_string}: Clusters"/> |
69 <filter>advanced['output_process_log'] == 'yes'</filter> | 54 <data name="process_log" format="txt" from_work_dir="outdir/log.txt" label="${tool.name} on ${on_string}: Log"> |
55 <filter>output['log']</filter> | |
70 </data> | 56 </data> |
71 <data name="output_pca_components" format="tabular" label="${tool.name} on ${on_string} (PCA components)"/> | 57 <data name="output_pca_components" format="csv" from_work_dir="outdir/PCA_components_data_gt*" label="${tool.name} on ${on_string}: PCA components"/> |
72 <data name="output_pca_transformed" format="tabular" label="${tool.name} on ${on_string} (PCA transformed)"/> | 58 <data name="output_pca_transformed" format="csv" from_work_dir="outdir/PCA_transformed_data_gt*" label="${tool.name} on ${on_string}: PCA transformed clusters"/> |
73 <data name="output_clustering" format="tabular" label="${tool.name} on ${on_string} (Clusters)"/> | |
74 </outputs> | 59 </outputs> |
75 <tests> | 60 <tests> |
76 <test expect_num_outputs="4"> | 61 <test expect_num_outputs="4"> |
77 <param name="coverage_file" value="input1.tabular" ftype="tabular"/> | 62 <param name="coverage_file" value="coverage" ftype="tabular"/> |
78 <param name="composition_file" value="input1.fa.gz" ftype="fasta.gz"/> | 63 <param name="composition_file" value="composition.fa" ftype="fasta"/> |
79 <param name="output_process_log" value="yes"/> | 64 <section name="advanced"> |
80 <output name="process_log" file="process_log.txt" ftype="txt" compare="re_match"/> | 65 <param name="clusters" value="400"/> |
81 <output name="output_pca_components" ftype="tabular"> | 66 <param name="kmer_length" value="4"/> |
67 <param name="length_threshold" value="1000"/> | |
68 <param name="read_length" value="100"/> | |
69 <param name="total_percentage_pca" value="100"/> | |
70 <param name="seed" value="1"/> | |
71 <param name="iterations" value="500"/> | |
72 <param name="no_cov_normalization" value=""/> | |
73 </section> | |
74 <section name="output"> | |
75 <param name="no_total_coverage" value=""/> | |
76 <param name="converge_out" value=""/> | |
77 <param name="log" value="true"/> | |
78 </section> | |
79 <output name="process_log" ftype="txt" compare="contains"> | |
82 <assert_contents> | 80 <assert_contents> |
83 <has_size value="367636"/> | 81 <has_size value="786"/> |
84 <has_text text="7377051e-02"/> | 82 <has_text text="CONCOCT Finished"/> |
85 </assert_contents> | 83 </assert_contents> |
86 </output> | 84 </output> |
87 <output name="output_pca_transformed" ftype="tabular"> | 85 <output name="output_pca_components" ftype="csv"> |
88 <assert_contents> | 86 <assert_contents> |
89 <has_size value="737926"/> | 87 <has_size value="362924" delta="10"/> |
90 <has_text text="NODE_103_length_20202_cov_8.395357.0"/> | 88 <has_text text="-5.90697200e-02"/> |
91 </assert_contents> | 89 </assert_contents> |
92 </output> | 90 </output> |
93 <output name="output_clustering" ftype="tabular"> | 91 <output name="output_pca_transformed" ftype="csv"> |
94 <assert_contents> | 92 <assert_contents> |
95 <has_size value="12167"/> | 93 <has_size value="834200" delta="10"/> |
96 <has_text text="NODE_103_length_20202_cov_8.395357"/> | 94 <has_text text="contig-21000001"/> |
95 </assert_contents> | |
96 </output> | |
97 <output name="output_clustering" ftype="csv"> | |
98 <assert_contents> | |
99 <has_size value="6923" delta="10"/> | |
100 <has_text text="contig-21000001,"/> | |
101 </assert_contents> | |
102 </output> | |
103 </test> | |
104 <test expect_num_outputs="3"> | |
105 <param name="coverage_file" value="coverage" ftype="tabular"/> | |
106 <param name="composition_file" value="composition.fa.gz" ftype="fasta.gz"/> | |
107 <section name="advanced"> | |
108 <param name="clusters" value="400"/> | |
109 <param name="kmer_length" value="4"/> | |
110 <param name="length_threshold" value="1000"/> | |
111 <param name="read_length" value="100"/> | |
112 <param name="total_percentage_pca" value="100"/> | |
113 <param name="seed" value="1"/> | |
114 <param name="iterations" value="500"/> | |
115 <param name="no_cov_normalization" value=""/> | |
116 </section> | |
117 <section name="output"> | |
118 <param name="no_total_coverage" value=""/> | |
119 <param name="converge_out" value=""/> | |
120 <param name="log" value="false"/> | |
121 </section> | |
122 <output name="output_pca_components" ftype="csv"> | |
123 <assert_contents> | |
124 <has_size value="362924" delta="10"/> | |
125 <has_text text="-5.90697200e-02"/> | |
126 </assert_contents> | |
127 </output> | |
128 <output name="output_pca_transformed" ftype="csv"> | |
129 <assert_contents> | |
130 <has_size value="834200" delta="10"/> | |
131 <has_text text="contig-21000001"/> | |
132 </assert_contents> | |
133 </output> | |
134 <output name="output_clustering" ftype="csv"> | |
135 <assert_contents> | |
136 <has_size value="6923" delta="10"/> | |
137 <has_text text="contig-21000001,"/> | |
97 </assert_contents> | 138 </assert_contents> |
98 </output> | 139 </output> |
99 </test> | 140 </test> |
100 </tests> | 141 </tests> |
101 <help><![CDATA[ | 142 <help><![CDATA[ |
102 **What it does** | 143 **What it does** |
103 | 144 |
104 CONCOCT (Clustering cONtigs with COverage and ComposiTion) performs unsupervised binning of metagenomic contigs by | 145 CONCOCT (Clustering cONtigs with COverage and ComposiTion) performs unsupervised binning of metagenomic contigs by |
105 using nucleotide composition - kmer frequencies - and coverage data for multiple samples. CONCOCT can accurately | 146 using nucleotide composition - kmer frequencies - and coverage data for multiple samples. CONCOCT can accurately |
106 (up to species level) bin metagenomic contigs. | 147 (up to species level) bin metagenomic contigs. |
107 | 148 |
108 The tool accepts 2 inputs; a tabular file where each row corresponds to a contig and each column corresponds to a | 149 The tool accepts 2 inputs; a tabular file where each row corresponds to a contig and each column corresponds to a |
109 sample (the values are the average coverage for this contig in that sample) and a file containing sequences in | 150 sample (the values are the average coverage for this contig in that sample) and a file containing sequences in |
110 fasta format. | 151 fasta format. |
111 | 152 |