Mercurial > repos > iuc > genomescope
changeset 1:3169a38c2656 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/genomescope commit a0ba4e5bb9dd542bbf1395af64e59b9f72823fec"
author | iuc |
---|---|
date | Sat, 26 Jun 2021 14:17:47 +0000 |
parents | b2f674562a18 |
children | 01210c4e9144 |
files | genomescope.xml test-data/genomescope-out1-2.txt |
diffstat | 2 files changed, 155 insertions(+), 52 deletions(-) [+] |
line wrap: on
line diff
--- a/genomescope.xml Fri Apr 30 20:21:25 2021 +0000 +++ b/genomescope.xml Sat Jun 26 14:17:47 2021 +0000 @@ -1,7 +1,8 @@ -<tool id="genomescope" name="GenomeScope" version="@VERSION@" profile="20.01"> - <description>Analyze unassembled short reads</description> +<tool id="genomescope" name="GenomeScope" version="@VERSION@+galaxy@SUFFIX_VERSION@" profile="20.01"> + <description>reference-free genome profiling</description> <macros> <token name="@VERSION@">2.0</token> + <token name="@SUFFIX_VERSION@">1</token> </macros> <requirements> <requirement type="package" version="@VERSION@">genomescope2</requirement> @@ -12,7 +13,9 @@ --input '$input' --output . --kmer_length $kmer_length - $no_unique_sequence $testing $trace_flag + $output_options.no_unique_sequence + $advanced_options.testing + $advanced_options.trace_flag #if $ploidy: --ploidy $ploidy #end if @@ -22,44 +25,60 @@ #if $max_kmercov: --max_kmercov $max_kmercov #end if - #if $topology: - --topology $topology + #if $advanced_options.topology: + --topology $advanced_options.topology #end if - #if $initial_repetitiveness: - --initial_repetitiveness $initial_repetitiveness + #if $advanced_options.initial_repetitiveness: + --initial_repetitiveness $advanced_options.initial_repetitiveness #end if - #if $initial_heterozygosities: - --initial_heterozygosities $initial_heterozygosities + #if $advanced_options.initial_heterozygosities: + --initial_heterozygosities '${advanced_options.initial_heterozygosities}' #end if - #if $transform_exp: - --transform_exp $transform_exp + #if $advanced_options.transform_exp: + --transform_exp $advanced_options.transform_exp #end if - #if $true_params: - --true_params $true_params + #if $advanced_options.true_params: + --true_params '${advanced_options.true_params}' #end if - #if $num_rounds: - --num_rounds $num_rounds + #if $advanced_options.num_rounds: + --num_rounds $advanced_options.num_rounds #end if ]]> </command> <inputs> <param argument="--input" type="data" format="tabular" label="Input histogram file" help="This file is a two column tabular file for example generated with the histo function of Jellyfish."/> - <param name="model_output" type="boolean" label="Add the model parameters to your history"/> - <param name="summary_output" type="boolean" label="Output a summary of the analysis"/> - <param name="progress_output" type="boolean" label="Additional information for each optimization round"/> - <param argument="--ploidy" type="integer" optional="true" label="Ploidy for model to use" help="Default: 2"/> + <param argument="--ploidy" type="integer" min="1" max="6" optional="true" label="Ploidy for model to use" help="Default: 2"/> <param argument="--kmer_length" type="integer" value="21" optional="false" label="K-mer length used to calculate k-mer spectra"/> - <param argument="--lambda" type="integer" optional="true" label="Optional initial kmercov estimate for model to use"/> - <param argument="--max_kmercov" type="integer" optional="true" label="Optional maximum k-mer coverage threshold" help="K-mers with coverage greater than max_kmercov are ignored by the model"/> - <param argument="--no_unique_sequence" type="boolean" truevalue="--no_unique_sequence" falsevalue="" label="Turn off yellow unique sequence line in plots"/> - <param argument="--topology" type="integer" optional="true" label="Flag for topology for model to use"/> - <param argument="--initial_repetitiveness" type="integer" optional="true" label="Initial value for repetitiveness"/> - <param argument="--initial_heterozygosities" type="integer" optional="true" label="Initial values for nucleotide heterozygosity rates"/> - <param argument="--transform_exp" type="integer" optional="true" label="Parameter for the exponent when fitting a transformed (x**transform_exp*y vs. x) k-mer histogram" help="Default: 1"/> - <param argument="--testing" type="boolean" truevalue="--testing" falsevalue="" label="Create testing.tsv file with model parameters"/> - <param argument="--true_params" type="integer" optional="true" label="Flag to state true simulated parameters for testing mode"/> - <param argument="--trace_flag" type="boolean" truevalue="--trace_flag" falsevalue="" label="Turn on printing of iteration progress of nlsLM function"/> - <param argument="--num_rounds" type="integer" min="1" optional="true" label="Number of optimization rounds"/> + <param argument="--lambda" type="integer" optional="true" label="Initial k-mer coverage estimate for model to use" help="This parameter is optional"/> + <param argument="--max_kmercov" type="integer" optional="true" label="Maximum k-mer coverage threshold" help="K-mers with coverage greater than max_kmercov are ignored by the model. This parameter is optional"/> + <section name="output_options" title="Output options" expanded="true"> + <param name="output_files" type="select" multiple="true" display="checkboxes" label="Output files"> + <option value="model_output">Generate a file with the model parameters</option> + <option value="summary_output">Summary of the analysis</option> + <option value="progress_output">Additional information for each optimization round</option> + </param> + <param argument="--no_unique_sequence" type="boolean" truevalue="--no_unique_sequence" falsevalue="" label="Turn off yellow unique sequence line in plots"/> + </section> + <section name="advanced_options" title="Advanced options"> + <param argument="--topology" type="integer" optional="true" label="Topological relationships between the homologous chromosomes" help="Flag for topology for model to use. Topology refers to the similarities between distinct homologues. Each triploid topology consists of two nucleotide heterozygosity forms (e.g. aab and abc), while each tetraploid, pentaploid, and hexaploid topology consists of three, four, and five heterozygosity forms respectively."/> + <param argument="--initial_repetitiveness" type="float" min="0" max="1" optional="true" label="Initial value for repetitiveness"/> + <param argument="--initial_heterozygosities" type="text" optional="true" label="Initial values for nucleotide heterozygosity rates" help="Nucleotide hetegozygosity rates should be float values (separated by commas if necessary)"> + <sanitizer> + <valid initial="string.digits"><add value="," /><add value="." /></valid> + </sanitizer> + <validator type="regex">[0-9,.]+</validator> + </param> + <param argument="--transform_exp" type="integer" optional="true" label="Parameter for the exponent when fitting a transformed (x**transform_exp*y vs. x) k-mer histogram" help="Default: 1"/> + <param argument="--testing" type="boolean" truevalue="--testing" falsevalue="" label="Create testing.tsv file with model parameters"/> + <param argument="--true_params" type="text" optional="true" label="Set true simulated parameters for testing mode"> + <sanitizer> + <valid initial="string.digits"><add value="," /><add value="." /></valid> + </sanitizer> + <validator type="regex">[0-9,.]+</validator> + </param> + <param argument="--trace_flag" type="boolean" truevalue="--trace_flag" falsevalue="" label="Turn on printing of iteration progress of nlsLM function"/> + <param argument="--num_rounds" type="integer" min="1" optional="true" label="Number of optimization rounds"/> + </section> </inputs> <outputs> <data name="linear_plot" format="png" from_work_dir="linear_plot.png" label="${tool.name} on ${on_string} Linear plot"/> @@ -67,26 +86,28 @@ <data name="transformed_linear_plot" format="png" from_work_dir="transformed_linear_plot.png" label="${tool.name} on ${on_string} Transformed linear plot"/> <data name="transformed_log_plot" format="png" from_work_dir="transformed_log_plot.png" label="${tool.name} on ${on_string} Transformed log plot"/> <data name="model" format="txt" from_work_dir="model.txt" label="${tool.name} on ${on_string} Model"> - <filter>model_output</filter> + <filter>output_options['output_files'] and 'model_output' in output_options['output_files']</filter> </data> <data name="summary" format="txt" from_work_dir="summary.txt" label="${tool.name} on ${on_string} Summary"> - <filter>summary_output</filter> + <filter>output_options['output_files'] and 'summary_output' in output_options['output_files']</filter> </data> <data name="progress" format="txt" from_work_dir="progress.txt" label="${tool.name} on ${on_string} Progress"> - <filter>progress_output</filter> + <filter>output_options['output_files'] and 'progress_output' in output_options['output_files']</filter> </data> <data name="model_params" format="tabular" from_work_dir="SIMULATED_testing.tsv" label="${tool.name} on ${on_string} Model parameters"> - <filter>testing</filter> + <filter>advanced_options['testing']</filter> </data> </outputs> <tests> <test expect_num_outputs="8"> <param name="input" value="genomescope-in1.tab"/> <param name="kmer_length" value="21"/> - <param name="testing" value="true"/> - <param name="model_output" value="true"/> - <param name="summary_output" value="true"/> - <param name="progress_output" value="true"/> + <section name="output_options"> + <param name="output_files" value="model_output,summary_output,progress_output"/> + </section> + <section name="advanced_options"> + <param name="testing" value="true"/> + </section> <output name="linear_plot" file="genomescope-out1-1.png" ftype="png"/> <output name="log_plot" file="genomescope-out1-2.png" ftype="png"/> <output name="transformed_linear_plot" file="genomescope-out1-3.png" ftype="png"/> @@ -94,7 +115,82 @@ <output name="model" file="genomescope-out1-1.txt" ftype="txt" lines_diff="2"/> <output name="summary" file="genomescope-out1-2.txt" ftype="txt" lines_diff="2"/> <output name="progress" file="genomescope-out1-3.txt" ftype="txt" lines_diff="2"/> - <output name="testing" file="genomescope-out1-1.tab" ftype="tabular"/> + <output name="model_params" file="genomescope-out1-1.tab" ftype="tabular"/> + </test> + <!--Test initial_repetitiveness option--> + <test expect_num_outputs="5"> + <param name="input" value="genomescope-in1.tab"/> + <param name="kmer_length" value="21"/> + <section name="advanced_options"> + <param name="initial_repetitiveness" value="0.1"/> + </section> + <section name="output_options"> + <param name="output_files" value="summary_output"/> + </section> + <output name="linear_plot" ftype="png"> + <assert_contents> + <has_size value="223370" delta="300"/> + </assert_contents> + </output> + <output name="log_plot" ftype="png"> + <assert_contents> + <has_size value="201425" delta="300"/> + </assert_contents> + </output> + <output name="transformed_linear_plot" ftype="png"> + <assert_contents> + <has_size value="221442" delta="300"/> + </assert_contents> + </output> + <output name="transformed_log_plot" ftype="png"> + <assert_contents> + <has_size value="210889" delta="300"/> + </assert_contents> + </output> + <output name="summary" ftype="txt" lines_diff="2"> + <assert_contents> + <has_line line="initial repetitiveness = 0.1"/> + <has_text text="Homozygous (aa) 98.9538% 98.967%"/> + </assert_contents> + </output> + </test> + <!--Test initial_heterozigosity option--> + <test expect_num_outputs="5"> + <param name="input" value="genomescope-in1.tab"/> + <param name="kmer_length" value="21"/> + <param name="ploidy" value="3"/> + <section name="advanced_options"> + <param name="initial_heterozygosities" value="0.04,0.01"/> + </section> + <section name="output_options"> + <param name="output_files" value="summary_output"/> + </section> + <output name="linear_plot" ftype="png"> + <assert_contents> + <has_size value="213366" delta="300"/> + </assert_contents> + </output> + <output name="log_plot" ftype="png"> + <assert_contents> + <has_size value="218425" delta="300"/> + </assert_contents> + </output> + <output name="transformed_linear_plot" ftype="png"> + <assert_contents> + <has_size value="217280" delta="300"/> + </assert_contents> + </output> + <output name="transformed_log_plot" ftype="png"> + <assert_contents> + <has_size value="229021" delta="300"/> + </assert_contents> + </output> + <output name="summary" ftype="txt" lines_diff="2"> + <assert_contents> + <has_line line="initial heterozygosities = 0.04,0.01"/> + <has_text text="Homozygous (aaa) 6.03606% 100%"/> + </assert_contents> + </output> </test> </tests> <help><![CDATA[ @@ -102,18 +198,25 @@ GenomeScope 2.0: Reference-free profiling of polyploid genomes ============================================================== -GenomeScope 2.0 applies classical insights from combinatorial theory to establish -a detailed mathematical model of how k-mer frequencies will be distributed in -heterozygous and polyploid genomes. GenomeScope 2.0 employs a polyploid-aware -mixture model that, within seconds, accurately infers genome properties from -unassembled sequencing data. GenomeScope 2.0 uses the k-mer count distribution, -e.g. from KMC or Jellyfish, and produces a report and several informative plots -describing the genome properties. We validate the approach on simulated polyploid -data created using a generative model with parameters for genome size, heterozygosity, -repetitiveness, ploidy, and sequencing coverage, and find GenomeScope 2.0 retains -accuracy across a broad range of realistic and extreme parameter values. We also -validate GenomeScope 2.0 by analyzing genuine sequence data from 11 diverse -polyploid genomes with known genome characteristics. +GenomeScope 2.0 applies classical insights from combinatorial theory to establish a detailed mathematical model of how k-mer frequencies will be distributed in heterozygous and polyploid genomes.It employs a polyploid-aware mixture model that, within seconds, accurately infers genome properties from +unassembled sequencing data. + +GenomeScope 2.0 uses the k-mer count distribution, e.g. from KMC or Jellyfish, and produces a report and several informative plots describing the genome properties. We validate the approach on simulated polyploid data created using a generative model with parameters for genome size, heterozygosity, repetitiveness, ploidy, and sequencing coverage, and find GenomeScope 2.0 retains accuracy across a broad range of realistic and extreme parameter values. + +----- + +.. class:: infomark + +**Topological relationships** + +In the field of phylogenetics, the evolutionary relationships between species are often depicted in a branching diagram known as a phylogenetic tree. In this setting, the topology of the tree refers to the branching structure of the tree. We may also depict the similarities between homologous chromosomes in a branching diagram. In this case, a topology refers to the similarities between distinct homologues. + +For ploidies of 4 and greater, there are multiple possible topologies. For example, the two tetraploid topologies are: + + :: + + AAAA → AAAB → AABC → ABCD + AAAA → AABB → AABC → ABCD ]]></help> <citations>
--- a/test-data/genomescope-out1-2.txt Fri Apr 30 20:21:25 2021 +0000 +++ b/test-data/genomescope-out1-2.txt Sat Jun 26 14:17:47 2021 +0000 @@ -1,5 +1,5 @@ GenomeScope version 2.0 -input file = /tmp/tmp32yh4464/files/1/e/2/dataset_1e2ac011-2bc8-45cf-971a-75abc3cda6d6.dat +input file = /tmp/tmpiqlcd57_/files/3/b/b/dataset_3bb1f7de-024a-4b3c-aa68-e735f06791c1.dat output directory = . p = 2 k = 21