repeatexplorer_clustering: repex_full_clustering.xml comparison

comparison repex_full_clustering.xml @ 0:6eec21828dd4 draft default tip

planemo upload for repository https://github.com/galaxy-genome-annotation/galaxy-tools/tree/master/tools/repeatexplorer2 commit 3407a4e6a60ff89a0ab5eab87ab94b0d9a209500

author	gga
date	Thu, 02 Nov 2023 16:20:35 +0000
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:6eec21828dd4
+<tool id="repeatexplorer_clustering" name="RepeatExplorer (clustering)" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+<description>repeat discovery and characterization using graph-based sequence clustering</description>
+<macros>
+<import>macros.xml</import>
+</macros>
+<expand macro="creator"/>
+<expand macro="requirements"/>
+<command><![CDATA[
+## Galaxy exposes the job memory in MB; convert it to kB (falling back to 8192 MB when unset)
+## because the value is handed to seqclust through its max_memory option below.
+export GALAXY_MEMORY_KB=\$((\${GALAXY_MEMORY_MB:-8192}*1024))
+&&
+## NOTE(review): presumably fixes Python hash randomisation so that runs are reproducible - confirm
+export PYTHONHASHSEED=0
+&&
+## output will go here
+mkdir -p '${reportfile.extra_files_path}'
+&&
+/repex_tarean/seqclust
+--cpu \${GALAXY_SLOTS:-1}
+--max_memory \${GALAXY_MEMORY_KB}
+## The boolean renders either as the bare flag or as nothing at all. It must stay unquoted:
+## quoting it would hand seqclust an empty-string argument whenever paired mode is switched off.
+$paired
+## optional integer: only passed when the user filled in a read count
+#if $sample:
+--sample '${sample}'
+#end if
+--taxon '${taxon}'
+--output_dir='${reportfile.extra_files_path}'
+## optional float: only passed when a cluster size threshold was given
+#if $advanced.mincl:
+--mincl '${advanced.mincl}'
+#end if
+--assembly_min '${advanced.assembly_min}'
+#if $advanced.keep_names:
+--keep_names
+#end if
+'${fastafile}'
+&&
+## pick up the html index: the report dataset is collected from the working directory
+cp '${reportfile.extra_files_path}/index.html' ./index.html
+]]></command>
+<inputs>
+<!-- Reads are supplied as one FASTA dataset; for paired-end data both mates live in this single file. -->
+<param name="fastafile" label="NGS reads" type="data" format="fasta" help="Input file must contain FASTA-formatted NGS reads. Illumina paired-end reads are recommended."/>
+<!-- Renders as the command-line flag when checked and as an empty string otherwise. -->
+<param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" checked="true" label="Paired-end reads" help="If paired-end reads are used, they must be interleaved and all pairs must be complete. Example of the correct format is provided in the help below."/>
+<param argument="--sample" type="integer" min="2" optional="true" label="Subsample reads (number)" help="Use an integer &gt; 1 to select a specific number of reads to use. Leave this field blank to use the entire dataset."/>
+<!-- Single-choice select: exactly one option may be marked as selected, here the Viridiplantae 3.0 database. -->
+<param argument="--taxon" label="Select taxon and protein domain database version (REXdb)" type="select" help="Reference database of transposable element protein domains - REXdb - is used for annotation of repeats">
+<option value="VIRIDIPLANTAE3.0" selected="true">Viridiplantae version 3.0</option>
+<option value="VIRIDIPLANTAE2.2">Viridiplantae version 2.2</option>
+<option value="METAZOA3.0">Metazoa version 3.0</option>
+<option value="METAZOA2.0">Metazoa version 2.0</option>
+</param>
+<section name="advanced" title="Advanced options" expanded="false">
+<!-- Optional: when left empty the command template omits the option altogether. -->
+<param argument="--mincl" label="Cluster size threshold  for detailed analysis" type="float" value="" min="0.0001" max="100" optional="true" help="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed; clusters with less than 20 reads are not considered."/>
+<param argument="--assembly_min" type="integer" label="Minimal cluster size for assembly" value="5" min="2" max="100"/>
+<param argument="--keep_names" label="Keep original read names" type="boolean" checked="false" help="By default, reads are renamed using integers. Use this option to keep original names."/>
+</section>
+</inputs>
+<outputs>
+<!-- The command copies index.html out of this dataset's extra files directory into the working directory; from_work_dir collects that copy as the HTML report. -->
+<data name="reportfile" format="html" from_work_dir="index.html" label="RepeatExplorer - HTML report on ${on_string}"/>
+</outputs>
+<tests>
+<!-- All tests run on the same gzipped paired-end read set and check that the HTML report contains the summary heading. -->
+<!-- test1: basic function -->
+<test expect_num_outputs="1">
+<param name="fastafile" value="LAS_paired_10k.fa.gz" ftype="fasta.gz"/>
+<param name="paired" value="True"/>
+<param name="taxon" value="VIRIDIPLANTAE3.0"/>
+<output name="reportfile">
+<assert_contents>
+<has_text text="Clustering summary"/>
+</assert_contents>
+</output>
+</test>
+<!-- test2: read subsample -->
+<test expect_num_outputs="1">
+<param name="fastafile" value="LAS_paired_10k.fa.gz" ftype="fasta.gz"/>
+<param name="paired" value="True"/>
+<param name="sample" value="5000"/>
+<param name="taxon" value="VIRIDIPLANTAE3.0"/>
+<output name="reportfile">
+<assert_contents>
+<has_text text="Clustering summary"/>
+</assert_contents>
+</output>
+</test>
+<!-- test3: advanced params; they are nested in their section so the names resolve unambiguously -->
+<test expect_num_outputs="1">
+<param name="fastafile" value="LAS_paired_10k.fa.gz" ftype="fasta.gz"/>
+<param name="paired" value="True"/>
+<param name="taxon" value="VIRIDIPLANTAE3.0"/>
+<section name="advanced">
+<param name="mincl" value="0.01"/>
+<param name="keep_names" value="True"/>
+</section>
+<output name="reportfile">
+<assert_contents>
+<has_text text="Clustering summary"/>
+</assert_contents>
+</output>
+</test>
+</tests>
+<help><![CDATA[
+**HELP**
+RepeatExplorer2 clustering is a computational pipeline for unsupervised
+identification of repeats from unassembled sequence reads. The
+pipeline uses low-pass whole genome sequence reads and performs graph-based
+clustering. The resulting clusters, representing all types of repeats, are then
+examined in order to identify the repeats and classify them into repeat groups.
+**Input data**
+The analysis requires either **single** or **paired-end reads** generated
+by whole genome shotgun sequencing provided as a single fasta-formatted file.
+Generally, paired-end reads provide significantly better results than single
+reads. Reads should be of uniform length (optimal size range is 100-200 nt) and
+the number of analyzed reads should represent less than 1x genome equivalent
+(genome coverage of 0.01 - 0.50 x is recommended). Reads should be
+quality-filtered (recommended filtering : quality score >=10 over 95% of bases
+and no Ns allowed) and only **complete read pairs** should be submitted for
+analysis. When paired reads are used, the input data must be provided as an
+**interlaced** FASTA file.
+Example of the interlaced input format::
+>0001_f
+CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
+>0001_r
+GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
+>0002_f
+ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
+>0002_r
+TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
+>0003_f
+TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
+>0003_r
+TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
+...
+**Comparative analysis**
+For comparative analysis, sequence names must contain a code (prefix) identifying each group.
+The prefix in sequence names must be of fixed length.
+Example of labeling two groups, where the **group code length** is 2 and the prefixes AA and BB are used to distinguish the groups ::
+>AA0001_f
+CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
+>AA0001_r
+GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
+>AA0002_f
+ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
+>AA0002_r
+TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
+>BB0001_f
+TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
+>BB0001_r
+TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
+>BB0002_f
+TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
+>BB0002_r
+TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
+To prepare quality filtered and interlaced input fasta file from fastq
+files, use `Preprocessing of paired-reads`__  tool.
+.. __: tool_runner?tool_id=paired_fastq_filtering
+**Additional parameters**
+**Sample size** defines how many reads should be used in calculation.
+Default setting with 500,000 reads will enable detection of high copy
+repeats within several hours of computation time. For higher
+sensitivity the sample size can be set higher. Since sample size affects
+the memory usage, this parameter may be automatically adjusted to lower
+value during the run. Maximum sample size which can be processed depends on
+the repetitiveness of analyzed genome.
+**Select taxon and protein domain database version (REXdb)**. Classification
+of transposable elements is based on the similarity to our reference database
+of transposable element protein domains (**REXdb**). Standalone database for Viridiplantae species
+can be obtained on `repeatexplorer.org`__. Classification
+system used in REXdb is described in article `Systematic survey of plant
+LTR-retrotransposons elucidates phylogenetic relationships of their
+polyprotein domains and provides a reference for element classification`__
+Database for Metazoa species is still under development so use it with caution.
+.. __: http://repeatexplorer.org
+.. __: https://doi.org/10.1186/s13100-018-0144-1
+**Select parameters for protein domain search** REXdb is compared with
+sequence clusters using either the blastx or the diamond aligner. The diamond program
+is about three times faster than blastx with word size 3.
+**Similarity search options** By default sequence reads are compared using
+mgblast program. Default threshold is explicitly set to 90% sequence
+similarity spanning at least 55% of the read length (in the case of reads
+differing in length it applies to the longer one). Additionally, sequence
+overlap must be at least 55 nt. If you select option for shorter reads
+than 100 nt,  minimum overlap 55 nt is not required.
+By default,
+the mgblast search uses the DUST program to filter out
+low-complexity sequences. If you want
+to increase the sensitivity of detection of satellites with shorter monomers,
+use the option with '*no masking of low complexity repeats*'. Note that omitting
+DUST filtering will significantly increase running times.
+**Automatic filtering of abundant satellite repeats** performs clustering on
+a smaller dataset of sequence reads to detect abundant high confidence
+satellite repeats. If such satellites are detected, sequence reads derived
+from these satellites are depleted from the input dataset. This step enables more
+sensitive detection of less abundant repeats, as more reads can be used
+in the clustering step.
+**Use custom repeat database**. This option allows users to perform similarity
+comparison of identified repeats to their custom databases. The repeat class must
+be encoded in FASTA headers of database entries in order to allow correct
+parsing of similarity hits. Required format for custom database sequence name is: ::
+>repeatname#class/subclass
+**Output**
+List of clusters identified as putative satellite repeats, their genomic
+abundance and various cluster characteristics.
+Output includes a **HTML summary** with table listing of all analyzed
+clusters. More detailed information about clusters is provided in
+additional files and directories. All results are also provided as
+downloadable **zip archive**. Additionally a **log file** reporting
+the progress of the computational pipeline is provided.
+]]></help>
+<expand macro="citations"/>
+</tool>

Mercurial > repos > gga > repeatexplorer_clustering

comparison repex_full_clustering.xml @ 0:6eec21828dd4 draft default tip