Mercurial > repos > gga > repeatexplorer_clustering
comparison repex_full_clustering.xml @ 0:6eec21828dd4 draft default tip
planemo upload for repository https://github.com/galaxy-genome-annotation/galaxy-tools/tree/master/tools/repeatexplorer2 commit 3407a4e6a60ff89a0ab5eab87ab94b0d9a209500
author | gga |
---|---|
date | Thu, 02 Nov 2023 16:20:35 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6eec21828dd4 |
---|---|
1 <tool id="repeatexplorer_clustering" name="RepeatExplorer (clustering)" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> | |
2 <description>repeat discovery and characterization using graph-based sequence clustering</description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <expand macro="creator"/> | |
7 <expand macro="requirements"/> | |
8 <command><![CDATA[ | |
9 ## Convert the Galaxy memory allocation from MB to kB (8192 MB when GALAXY_MEMORY_MB is unset); the result is passed to the max_memory option below | |
10 export GALAXY_MEMORY_KB=\$((\${GALAXY_MEMORY_MB:-8192}*1024)) | |
11 && | |
12 ## NOTE(review): the Python hash seed is pinned, presumably so repeated runs are reproducible - confirm | |
13 export PYTHONHASHSEED=0 | |
14 && | |
15 | |
16 ## all pipeline output is written into the extra files directory of the HTML report dataset | |
17 mkdir -p '${reportfile.extra_files_path}' | |
18 && | |
19 ## the paired flag is emitted unquoted on purpose: it expands to nothing when unchecked, and quoting it would hand seqclust an empty positional argument | |
20 /repex_tarean/seqclust | |
21 --cpu \${GALAXY_SLOTS:-1} | |
22 --max_memory \${GALAXY_MEMORY_KB} | |
23 $paired | |
24 #if $sample: | |
25 --sample '${sample}' | |
26 #end if | |
27 --taxon '${taxon}' | |
28 --output_dir='${reportfile.extra_files_path}' | |
29 #if $advanced.mincl: | |
30 --mincl '${advanced.mincl}' | |
31 #end if | |
32 --assembly_min '${advanced.assembly_min}' | |
33 #if $advanced.keep_names: | |
34 --keep_names | |
35 #end if | |
36 '${fastafile}' | |
37 && | |
38 | |
39 ## copy the report index into the working directory, where the output's from_work_dir collects it | |
40 cp '${reportfile.extra_files_path}/index.html' ./index.html | |
41 | |
42 ]]></command> | |
43 <inputs> <!-- Exactly one REXdb option carries selected="true" so the default taxon (Viridiplantae 3.0) is unambiguous --> | |
44 <param name="fastafile" label="NGS reads" type="data" format="fasta" help="Input file must contain FASTA-formatted NGS reads. Illumina paired-end reads are recommended."/> | |
45 <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" checked="True" label="Paired-end reads" help="If paired-end reads are used, they must be interleaved and all pairs must be complete. Example of the correct format is provided in the help below."/> | |
46 <param argument="--sample" type="integer" min="2" optional="true" label="Subsample reads (number)" help="Use an integer > 1 to select a specific number of reads to use. Leave this field blank to use the entire dataset."/> | |
47 <param argument="--taxon" label="Select taxon and protein domain database version (REXdb)" type="select" help="Reference database of transposable element protein domains - REXdb - is used for annotation of repeats"> | |
48 <option value="VIRIDIPLANTAE3.0" selected="true">Viridiplantae version 3.0</option> | |
49 <option value="VIRIDIPLANTAE2.2">Viridiplantae version 2.2</option> | |
50 <option value="METAZOA3.0">Metazoa version 3.0</option> | |
51 <option value="METAZOA2.0">Metazoa version 2.0</option> | |
52 </param> | |
53 <section name="advanced" title="Advanced options" expanded="false"> | |
54 <param argument="--mincl" label="Cluster size threshold for detailed analysis" type="float" value="" min="0.0001" max="100" optional="true" help="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed; clusters with less than 20 reads are not considered."/> | |
55 <param argument="--assembly_min" type="integer" label="Minimal cluster size for assembly" value="5" min="2" max="100"/> | |
56 <param argument="--keep_names" label="Keep original read names" type="boolean" checked="false" help="By default, reads are renamed using integers. Use this option to keep original names."/> | |
57 </section> | |
58 </inputs> | |
59 <outputs> <!-- Single HTML report: the command copies index.html out of the report's extra_files_path into the working directory, and from_work_dir picks it up from there --> | |
60 <data name="reportfile" format="html" from_work_dir="index.html" label="RepeatExplorer - HTML report on ${on_string}"/> | |
61 </outputs> | |
62 <tests> | |
63 <!-- test1: basic function; paired mode with the Viridiplantae 3.0 database, asserting that the HTML report contains "Clustering summary" --> | |
64 <test expect_num_outputs="1"> | |
65 <param name="fastafile" value="LAS_paired_10k.fa.gz" ftype="fasta.gz"/> | |
66 <param name="paired" value="True"/> | |
67 <param name="taxon" value="VIRIDIPLANTAE3.0"/> | |
68 <output name="reportfile"> | |
69 <assert_contents> | |
70 <has_text text="Clustering summary"/> | |
71 </assert_contents> | |
72 </output> | |
73 </test> | |
74 <!-- test2: read subsample; same input as test1 with the sample parameter set to 5000 reads --> | |
75 <test expect_num_outputs="1"> | |
76 <param name="fastafile" value="LAS_paired_10k.fa.gz" ftype="fasta.gz"/> | |
77 <param name="paired" value="True"/> | |
78 <param name="sample" value="5000"/> | |
79 <param name="taxon" value="VIRIDIPLANTAE3.0"/> | |
80 <output name="reportfile"> | |
81 <assert_contents> | |
82 <has_text text="Clustering summary"/> | |
83 </assert_contents> | |
84 </output> | |
85 </test> | |
86 <!-- test3: advanced params; sets mincl to 0.01 and enables keep_names, both defined in the "advanced" section of the inputs --> | |
87 <test expect_num_outputs="1"> | |
88 <param name="fastafile" value="LAS_paired_10k.fa.gz" ftype="fasta.gz"/> | |
89 <param name="paired" value="True"/> | |
90 <param name="taxon" value="VIRIDIPLANTAE3.0"/> | |
91 <param name="mincl" value="0.01"/> | |
92 <param name="keep_names" value="True"/> | |
93 <output name="reportfile"> | |
94 <assert_contents> | |
95 <has_text text="Clustering summary"/> | |
96 </assert_contents> | |
97 </output> | |
98 </test> | |
99 </tests> | |
100 <help><![CDATA[ | |
101 **HELP** | |
102 | |
103 RepeatExplorer2 clustering is a computational pipeline for unsupervised | |
104 identification of repeats from unassembled sequence reads. The | |
105 pipeline uses low-pass whole genome sequence reads and performs graph-based | |
106 clustering. Resulting clusters, representing all types of repeats, are then | |
107 examined in order to identify and classify them into repeat groups. | |
108 | |
109 **Input data** | |
110 | |
111 The analysis requires either **single** or **paired-end reads** generated | |
112 by whole genome shotgun sequencing provided as a single fasta-formatted file. | |
113 Generally, paired-end reads provide significantly better results than single | |
114 reads. Reads should be of uniform length (optimal size range is 100-200 nt) and | |
115 the number of analyzed reads should represent less than 1x genome equivalent | |
116 (genome coverage of 0.01 - 0.50 x is recommended). Reads should be | |
117 quality-filtered (recommended filtering : quality score >=10 over 95% of bases | |
118 and no Ns allowed) and only **complete read pairs** should be submitted for | |
119 analysis. When paired reads are used, input data must be provided in **interlaced** | |
120 FASTA format: | |
121 | |
122 example of interlaced input format:: | |
123 | |
124 >0001_f | |
125 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG | |
126 >0001_r | |
127 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT | |
128 >0002_f | |
129 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG | |
130 >0002_r | |
131 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC | |
132 >0003_f | |
133 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT | |
134 >0003_r | |
135 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT | |
136 ... | |
137 | |
138 | |
139 **Comparative analysis** | |
140 | |
141 For comparative analysis sequence names must contain code (prefix) for each group. | |
142 Prefix in sequences names must be of fixed length. | |
143 | |
144 Example of labeling two groups where the **group code length** is 2 and is used to distinguish groups - AA and BB :: | |
145 | |
146 >AA0001_f | |
147 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG | |
148 >AA0001_r | |
149 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT | |
150 >AA0002_f | |
151 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG | |
152 >AA0002_r | |
153 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC | |
154 >BB0001_f | |
155 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT | |
156 >BB0001_r | |
157 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT | |
158 >BB0002_f | |
159 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT | |
160 >BB0002_r | |
161 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT | |
162 | |
163 | |
164 To prepare quality filtered and interlaced input fasta file from fastq | |
165 files, use `Preprocessing of paired-reads`__ tool. | |
166 | |
167 .. __: tool_runner?tool_id=paired_fastq_filtering | |
168 | |
169 | |
170 **Additional parameters** | |
171 | |
172 **Sample size** defines how many reads should be used in calculation. | |
173 Default setting with 500,000 reads will enable detection of high copy | |
174 repeats within several hours of computation time. For higher | |
175 sensitivity the sample size can be set higher. Since sample size affects | |
176 the memory usage, this parameter may be automatically adjusted to lower | |
177 value during the run. Maximum sample size which can be processed depends on | |
178 the repetitiveness of analyzed genome. | |
179 | |
180 | |
181 **Select taxon and protein domain database version (REXdb)**. Classification | |
182 of transposable elements is based on the similarity to our reference database | |
183 of transposable element protein domains (**REXdb**). Standalone database for Viridiplantae species | |
184 can be obtained on `repeatexplorer.org`__. Classification | |
185 system used in REXdb is described in article `Systematic survey of plant | |
186 LTR-retrotransposons elucidates phylogenetic relationships of their | |
187 polyprotein domains and provides a reference for element classification`__. | |
188 Database for Metazoa species is still under development so use it with caution. | |
189 | |
190 .. __: http://repeatexplorer.org | |
191 .. __: https://doi.org/10.1186/s13100-018-0144-1 | |
192 | |
193 **Select parameters for protein domain search** REXdb is compared with | |
194 sequence clusters using either the blastx or the diamond aligner. The diamond program | |
195 is about three times faster than blastx with word size 3. | |
196 | |
197 **Similarity search options** By default sequence reads are compared using | |
198 mgblast program. Default threshold is explicitly set to 90% sequence | |
199 similarity spanning at least 55% of the read length (in the case of reads | |
200 differing in length it applies to the longer one). Additionally, sequence | |
201 overlap must be at least 55 nt. If you select option for shorter reads | |
202 than 100 nt, minimum overlap 55 nt is not required. | |
203 | |
204 By default, | |
205 the mgblast search uses the DUST program to filter out | |
206 low-complexity sequences. If you want | |
207 to increase sensitivity of detection of satellites with shorter monomers, | |
208 use the option '*no masking of low complexity repeats*'. Note that omitting | |
209 DUST filtering will significantly increase running times. | |
210 | |
211 | |
212 **Automatic filtering of abundant satellite repeats** performs clustering on | |
213 a smaller dataset of sequence reads to detect abundant high-confidence | |
214 satellite repeats. If such satellites are detected, sequence reads derived | |
215 from these satellites are depleted from the input dataset. This step enables more | |
216 sensitive detection of less abundant repeats, as more reads can be used | |
217 in the clustering step. | |
218 | |
219 **Use custom repeat database**. This option allows users to perform similarity | |
220 comparison of identified repeats to their custom databases. The repeat class must | |
221 be encoded in FASTA headers of database entries in order to allow correct | |
222 parsing of similarity hits. Required format for custom database sequence name is: :: | |
223 | |
224 >repeatname#class/subclass | |
225 | |
226 | |
227 **Output** | |
228 | |
229 List of clusters identified as putative satellite repeats, their genomic | |
230 abundance and various cluster characteristics. | |
231 | |
232 Output includes a **HTML summary** with table listing of all analyzed | |
233 clusters. More detailed information about clusters is provided in | |
234 additional files and directories. All results are also provided as | |
235 downloadable **zip archive**. Additionally a **log file** reporting | |
236 the progress of the computational pipeline is provided. | |
237 | |
238 ]]></help> | |
239 <expand macro="citations"/> | |
240 </tool> |