comparison repex_full_clustering.xml @ 2:349b197133dc draft

Uploaded
author petr-novak
date Fri, 24 Jul 2020 07:26:59 -0400
parents
children d1f67a13b70f
comparison
equal deleted inserted replaced
1:422485508110 2:349b197133dc
1 <tool id="repeatexplorer2" name="RepeatExplorer2 clustering: " version="2.3.8" >
2 <stdio> <!-- Rules mapping stderr patterns and exit codes of the job to Galaxy job states -->
3 <regex match="lastdb: can't open file: NEAR" source="stderr" level="fatal" description="Version of last is too old, use ver 956 or higher\n" />
4 <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
5 <regex match="error" source="stderr" level="fatal" description="Unknown error" />
6 <regex match="Warning" source="stderr" level="warning" description="Warning reported on stderr" /> <!-- non-fatal rule, so it must not be labelled as an error -->
7 <exit_code range="1:" level="fatal" description="Error" /> <!-- any exit status of 1 or greater fails the job -->
8 </stdio>
9 <description>Improved version of repeat discovery and characterization using graph-based sequence clustering</description>
10 <requirements> <!-- Software dependencies of the tool; only some of them carry an explicit version pin -->
11 <requirement type="package" version="1.18.1">numpy</requirement>
12 <requirement type="package" version="0.8">logomaker</requirement>
13 <requirement type="package" version="1.0.3">pandas</requirement>
14 <requirement type="package" version="3.1.3">matplotlib</requirement>
15 <requirement type="package">last</requirement> <!-- the stdio rules of this tool expect last version 956 or higher -->
16 <requirement type="package">imagemagick</requirement>
17 <requirement type="package">mafft</requirement>
18 <requirement type="package">blast</requirement>
19 <requirement type="package" version="0.9.29" >diamond</requirement>
20 <requirement type="package">blast-legacy</requirement>
21 <requirement type="package">r-igraph</requirement>
22 <requirement type="package">r-data.tree</requirement>
23 <requirement type="package">r-stringr</requirement>
24 <requirement type="package">r-r2html</requirement>
25 <requirement type="package">r-hwriter</requirement>
26 <requirement type="package">r-dt</requirement>
27 <requirement type="package">r-scales</requirement>
28 <requirement type="package">r-plotrix</requirement>
29 <requirement type="package">r-png</requirement>
30 <requirement type="package">r-plyr</requirement>
31 <requirement type="package">r-dplyr</requirement>
32 <requirement type="package">r-optparse</requirement>
33 <requirement type="package">r-dbi</requirement>
34 <requirement type="package">r-rsqlite</requirement>
35 <requirement type="package">r-rserve</requirement>
36 <requirement type="package">bioconductor-biostrings</requirement>
37 <requirement type="package" version="2.3.8">repex_tarean</requirement> <!-- same version number as the version attribute of the tool element -->
38 <requirement type="set_environment">REPEX</requirement> <!-- the command template runs seqclust and stderr_filter.py from the directory named by this variable -->
39 <requirement type="set_environment">REPEX_VERSION</requirement> <!-- NOTE(review): not referenced by the command template in this file; confirm where it is consumed -->
40 <requirement type="package" version="0.9.1" >pyrserve</requirement>
41 </requirements>
42 <command >
43 export PYTHONHASHSEED=0;
44 \${REPEX}/seqclust --sample ${read_sampling.sample} --output_dir=tarean_output --logfile='${log}' --cleanup $paired --taxon $taxon
45 ## The seqclust invocation continues until the input file argument; the flags below are added only when advanced options are enabled.
46 #if $advanced_options.advanced:
47 --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering -D $advanced_options.blastx.options_blastx
48 --assembly_min $advanced_options.assembly_min_cluster_size
49 ## Prefix length is passed only for comparative analysis.
50 #if $advanced_options.comparative.options_comparative:
51 --prefix_length $advanced_options.comparative.prefix_length
52 #end if
53 ## Optional user supplied repeat library; the dataset path is quoted for the shell.
54 #if $advanced_options.custom_library.options_custom_library:
55 -d '$advanced_options.custom_library.library' extra_database
56 #end if
57 ## Similarity search preset chosen in the advanced section.
58 #if $advanced_options.options.options:
59 -opt $advanced_options.options.options
60 #end if
61 #end if
62 '${FastaFile}' >stdout.log 2> stderr.log ;
63 echo "STDOUT CONTENT:" >> '${log}' ;
64 cat stdout.log >> '${log}' ;
65 echo "STDERR CONTENT:" >> '${log}';
66 cat stderr.log >> '${log}' &amp;&amp;
67 \${REPEX}/stderr_filter.py stderr.log &amp;&amp;
68 cd tarean_output &amp;&amp;
69 zip -r '${ReportArchive}.zip' * &amp;&amp;
70 mv '${ReportArchive}.zip' '${ReportArchive}' &amp;&amp;
71 cp index.html '${ReportFile}' &amp;&amp;
72 mkdir -p '${ReportFile.files_path}' &amp;&amp;
73 cp -r --parents libdir '${ReportFile.files_path}' &amp;&amp;
74 cp -r --parents seqclust/clustering/superclusters '${ReportFile.files_path}' &amp;&amp;
75 cp -r --parents seqclust/clustering/clusters '${ReportFile.files_path}' &amp;&amp;
76 cp seqclust/clustering/hitsort.cls '${ReportFile.files_path}/seqclust/clustering/hitsort.cls' &amp;&amp;
77 cp *.png '${ReportFile.files_path}/' &amp;&amp;
78 cp *.csv '${ReportFile.files_path}/' &amp;&amp;
79 cp *.html '${ReportFile.files_path}/' &amp;&amp;
80 cp *.css '${ReportFile.files_path}/' &amp;&amp;
81 cp *.fasta '${ReportFile.files_path}/' 2>>'$log' &amp;&amp; rm -r ../tarean_output || :
82 ## mkdir -p above tolerates an already existing extra files directory, so the copy chain is not skipped; the final "|| :" forces exit status 0 for the chain.
83 </command>
84 <inputs> <!-- User facing parameters; the command template reads them by the names declared here -->
85 <param name="FastaFile" label="NGS reads" type="data" format="fasta"
86 help="Input file must contain FASTA-formatted NGS reads. Illumina paired-end reads are recommended."/>
87 <param name="paired" type="boolean" truevalue="--paired" falsevalue="" checked="True" label="Paired-end reads" help="If paired-end reads are used, left- and right-hand reads must be interlaced and all pairs must be complete. Example of the correct format is provided in the help below." />
88
89 <conditional name="read_sampling">
90 <param name="do_sampling" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Read sampling" help="Use this option if you want to analyze only a part of the reads" />
91 <when value="false">
92 <!-- sampling disabled: the hidden parameter below still supplies the value 0 for the sample argument of the command -->
93 <param name="sample" label="Sample size" hidden="True" type="integer" value="0" help="Number of analyzed reads"/>
94 </when>
95 <when value="true">
96 <param name="sample" label="Sample size" type="integer" value="500000" min="10000" help="Number of analyzed reads"/>
97 </when>
98 </conditional>
99
100
101 <param name="taxon" label="Select taxon and protein domain database version (REXdb)" type="select" help="Reference database of transposable element protein domains - REXdb - is used for annotation of repeats">
102 <option value="VIRIDIPLANTAE3.0" selected="true">Viridiplantae version 3.0 </option>
103 <option value="VIRIDIPLANTAE2.2" selected="false">Viridiplantae version 2.2</option>
104 <option value="METAZOA3.0" >Metazoa version 3.0</option>
105 <option value="METAZOA2.0" >Metazoa version 2.0</option>
106 <!-- Modify setting in config.py accordingly -->
107 </param>
108
109 <conditional name="advanced_options">
110 <param name="advanced" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Advanced options" />
111 <when value="false">
112 <!-- pass -->
113 </when>
114 <when value="true">
115 <conditional name="comparative">
116 <param name="options_comparative" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Perform comparative analysis" help="Use this option to analyze multiple samples simultaneously"/>
117 <when value="false">
118 <!-- do nothing here -->
119 </when>
120 <when value="true">
121 <param name="prefix_length" label="Group code length" type="integer" value="3" min="1" max="10" help="For comparative analysis, reads from different samples are distinguished by sample codes included as prefix to the read names. See example below."/>
122 </when>
123 </conditional>
124
125 <conditional name="blastx"> <!-- NOTE(review): no when branches; kept as a conditional because the command template addresses the value as advanced_options.blastx.options_blastx -->
126 <param name="options_blastx" type="select" label="Select parameters for protein domain search">
127 <option value="BLASTX_W2" selected="false">blastx with word size 2 (the most sensitive, slowest)</option>
128 <option value="BLASTX_W3" selected="true">blastx with word size 3 (default)</option>
129 <option value="DIAMOND" selected="false">diamond program (the least sensitive, fastest)</option>
130 </param>
131 </conditional>
132
133 <conditional name="options"> <!-- NOTE(review): no when branches; the command template addresses the value as advanced_options.options.options -->
134 <param name="options" type="select" label="Similarity search options">
135 <option value="ILLUMINA" selected="true">Default </option>
136 <option value="ILLUMINA_DUST_OFF" selected="false">Masking of low complexity repeats disabled </option>
137
138 <!-- <option value="ILLUMINA_SENSITIVE_MGBLAST" selected="false">Illumina reads, sensitive search (search parameters: mgblast, min PID 80, -W8) slow, experimental feature!</option> -->
139 <!-- <option value="ILLUMINA_SENSITIVE_BLASTPLUS" selected="false">Illumina reads, more sensitive search (search parameters: blastn, min PID 80, -W6) extremely slow, experimental feature!</option> -->
140 <!-- <option value="OXFORD_NANOPORE" selected="false"> -->
141 <!-- Pseudo short reads simulated from Oxford Nanopore data, experimental feature! -->
142 <!-- </option> -->
143 </param>
144 </conditional>
145
146 <conditional name="custom_library">
147 <param name="options_custom_library" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use custom repeat database"/>
148 <when value="false">
149 <!-- do nothing here -->
150 </when>
151 <when value="true">
152 <param name="library" format="fasta" type="data" label="Custom repeat database" help="The database should contain DNA sequences in FASTA format. The required format for sequence IDs is : '>repeatname#class/subclass'"/>
153 </when>
154 </conditional>
155 <param name="size_threshold" label="Cluster size threshold for detailed analysis" type="float" value="0.01" min="0.0001" max="100" help ="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed; clusters with less than 20 reads are not considered."/>
156 <param name="automatic_filtering" label="Perform automatic filtering of abundant satellite repeats" help="Automatic filtering identifies the most abundant tandem repeats and partially removes their reads from the analysis. This enables to analyze higher proportions of other less abundant repeats." type="boolean" truevalue="--automatic_filtering" falsevalue="" checked="false"/>
157 <param name="keep_names" label="Keep original read names" type="boolean" truevalue="--keep_names" falsevalue="" checked="false" help="By default, reads are renamed using integers. Use this option to keep original names."/>
158 <param name="assembly_min_cluster_size" type="integer" label="Minimal cluster size for assembly" value="5" min="2" max="100"/>
159 </when>
160 </conditional>
161
162
163
164 </inputs>
165 <outputs> <!-- Datasets produced by one run; each one is written by a step of the command template -->
166 <data name="log" format="txt" label="RepeatExplorer2 - log file"/> <!-- path passed to seqclust as its logfile argument; captured stdout and stderr are appended afterwards -->
167 <data name="ReportArchive" format="zip" label="RepeatExplorer2 - Archive with HTML report from data ${FastaFile.hid}"/> <!-- zip archive of the contents of the tarean_output directory -->
168 <data name="ReportFile" format="html" label="RepeatExplorer2 - HTML report from data ${FastaFile.hid}"/> <!-- copy of index.html; supporting files are copied into the files_path directory of this dataset -->
169 </outputs>
170
171 <help>
172 **HELP**
173
174 RepeatExplorer2 clustering is a computational pipeline for unsupervised
175 identification of repeats from unassembled sequence reads. The
176 pipeline uses low-pass whole genome sequence reads and performs graph-based
177 clustering. Resulting clusters, representing all types of repeats, are then
178 examined to identify repeats and classify them into repeat groups.
179
180 **Input data**
181
182 The analysis requires either **single** or **paired-end reads** generated
183 by whole genome shotgun sequencing provided as a single fasta-formatted file.
184 Generally, paired-end reads provide significantly better results than single
185 reads. Reads should be of uniform length (optimal size range is 100-200 nt) and
186 the number of analyzed reads should represent less than 1x genome equivalent
187 (genome coverage of 0.01 - 0.50 x is recommended). Reads should be
188 quality-filtered (recommended filtering : quality score >=10 over 95% of bases
189 and no Ns allowed) and only **complete read pairs** should be submitted for
190 analysis. When paired reads are used, input data must be in **interlaced** format
191 as a fasta file:
192
193 example of interlaced input format::
194
195 >0001_f
196 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
197 >0001_r
198 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
199 >0002_f
200 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
201 >0002_r
202 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
203 >0003_f
204 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
205 >0003_r
206 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
207 ...
208
209
210 **Comparative analysis**
211
212 For comparative analysis, sequence names must contain a code (prefix) for each group.
213 The prefix in sequence names must be of fixed length.
214
215 Example of labeling two groups where **group code length** is 2 and is used to distinguish groups - AA and BB ::
216
217 >AA0001_f
218 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
219 >AA0001_r
220 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
221 >AA0002_f
222 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
223 >AA0002_r
224 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
225 >BB0001_f
226 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
227 >BB0001_r
228 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
229 >BB0002_f
230 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
231 >BB0002_r
232 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
233
234
235 To prepare quality filtered and interlaced input fasta file from fastq
236 files, use `Preprocessing of paired-reads`__ tool.
237
238 .. __: tool_runner?tool_id=paired_fastq_filtering
239
240
241 **Additional parameters**
242
243 **Sample size** defines how many reads should be used in calculation.
244 Default setting with 500,000 reads will enable detection of high copy
245 repeats within several hours of computation time. For higher
246 sensitivity the sample size can be set higher. Since sample size affects
247 the memory usage, this parameter may be automatically adjusted to lower
248 value during the run. Maximum sample size which can be processed depends on
249 the repetitiveness of analyzed genome.
250
251
252 **Select taxon and protein domain database version (REXdb)**. Classification
253 of transposable elements is based on the similarity to our reference database
254 of transposable element protein domains (**REXdb**). Standalone database for Viridiplantae species
255 can be obtained on `repeatexplorer.org`__. Classification
256 system used in REXdb is described in article `Systematic survey of plant
257 LTR-retrotransposons elucidates phylogenetic relationships of their
258 polyprotein domains and provides a reference for element classification`__
259 Database for Metazoa species is still under development so use it with caution.
260
261 .. __: http://repeatexplorer.org
262 .. __: https://doi.org/10.1186/s13100-018-0144-1
263
264 **Select parameters for protein domain search** REXdb is compared with
265 sequence clusters either using blastx or diamond aligner. Diamond program
266 is about three times faster than blastx with word size 3.
267
268 **Similarity search options** By default sequence reads are compared using
269 mgblast program. Default threshold is explicitly set to 90% sequence
270 similarity spanning at least 55% of the read length (in the case of reads
271 differing in length it applies to the longer one). Additionally, sequence
272 overlap must be at least 55 nt. If you select option for shorter reads
273 than 100 nt, minimum overlap 55 nt is not required.
274
275 By default,
276 mgblast search uses the DUST program to filter out
277 low-complexity sequences. If you want
278 to increase sensitivity of detection of satellites with shorter monomers,
279 use the option '*Masking of low complexity repeats disabled*'. Note that omitting
280 DUST filtering will significantly increase running times.
281
282
283 **Automatic filtering of abundant satellite repeats** performs clustering on
284 a smaller dataset of sequence reads to detect abundant high confidence
285 satellite repeats. If such satellites are detected, sequence reads derived
286 from these satellites are depleted from input dataset. This step enables more
287 sensitive detection of less abundant repeats as more reads can be used
288 in clustering step.
289
290 **Use custom repeat database**. This option allows users to perform similarity
291 comparison of identified repeats to their custom databases. The repeat class must
292 be encoded in FASTA headers of database entries in order to allow correct
293 parsing of similarity hits. Required format for custom database sequence name is: ::
294
295 >repeatname#class/subclass
296
297
298 **Output**
299
300 List of clusters identified as putative satellite repeats, their genomic
301 abundance and various cluster characteristics.
302
303 Output includes a **HTML summary** with table listing of all analyzed
304 clusters. More detailed information about clusters is provided in
305 additional files and directories. All results are also provided as
306 downloadable **zip archive**. Additionally a **log file** reporting
307 the progress of the computational pipeline is provided.
308
309 </help>
310
311 </tool>