comparison repenrich2.xml @ 0:4905a332a094 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
author artbio
date Sat, 20 Apr 2024 11:56:53 +0000
parents
children 6d59fbca2db4
comparison
equal deleted inserted replaced
-1:000000000000 0:4905a332a094
1 <tool id="repenrich2" name="RepEnrich" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description>Repeat Element Profiling</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="repenrich_requirements"/>
7 <stdio> <!-- any exit code of 1 or above is reported as a fatal "Tool exception" -->
8 <exit_code range="1:" level="fatal" description="Tool exception" />
9 </stdio>
10 <command detect_errors="exit_code"><![CDATA[
11 #import re
12 ## Stage the reads under fixed names; fastq.gz / fastqsanger.gz are passed as-is since bowtie2 does not require uncompressed input
13 #if $seq_method.seq_method_list == "single-read":
14 ln -f -s '$seq_method.input_fastq' 'input.fastq' &&
15 #elif $seq_method.seq_method_list == 'paired_collection':
16 ln -f -s '$seq_method.input_fastq.forward' 'input.fastq' &&
17 ln -f -s '$seq_method.input_fastq.reverse' 'input_2.fastq' &&
18 #else:
19 ln -f -s '$seq_method.input_fastq' 'input.fastq' &&
20 ln -f -s '$seq_method.input2_fastq' 'input_2.fastq' &&
21 #end if
22 ## Reference genome: build a bowtie2 index from a history fasta, or reuse a built-in bowtie2 index and dump its sequences to genome.fa
23 #if $refGenomeSource.genomeSource == "history":
24 bowtie2-build --threads \${GALAXY_SLOTS:-4} -f '$refGenomeSource.genome' genome 1>/dev/null &&
25 ln -s -f '$refGenomeSource.genome' 'genome.fa' &&
26 #set index_path = 'genome'
27 #else:
28 #set index_path = $refGenomeSource.genome.fields.path
29 bowtie2-inspect '$index_path' > genome.fa &&
30 #end if
31 ## Run the setup script on the RepeatMasker annotation file and the genome fasta
32 python '$__tool_directory__/RepEnrich2_setup.py'
33 --annotation_file '$repeatmasker'
34 --genomefasta 'genome.fa'
35 --cpus "\${GALAXY_SLOTS:-4}" &&
36 ## Align the reads with bowtie2, write the alignments filtered with -q 38 to unique.bam and derive the multimap fastq file(s)
37 #if $seq_method.seq_method_list == "single-read":
38 bowtie2 -x '$index_path' -p \${GALAXY_SLOTS:-4} input.fastq
39 | samtools sort -@ "\${GALAXY_SLOTS:-4}" -T tmp -O bam -o aligned.bam 2>&1 &&
40 samtools view -@ "\${GALAXY_SLOTS:-4}" -F 4 -b -q 38 aligned.bam -o unique.bam &&
41 samtools view -@ "\${GALAXY_SLOTS:-4}" -h -F 4 -b aligned.bam
42 | samtools view -@ "\${GALAXY_SLOTS:-4}" -U -b -q 38 -
43 | bedtools bamtofastq -i /dev/stdin -fq multimap.fastq &&
44 #else:
45 bowtie2 -x '$index_path' -p \${GALAXY_SLOTS:-4} -1 input.fastq -2 input_2.fastq
46 | samtools sort -@ "\${GALAXY_SLOTS:-4}" -T tmp -O bam -o aligned.bam 2>&1 &&
47 samtools view -@ "\${GALAXY_SLOTS:-4}" -f 3 -b -q 38 aligned.bam -o unique.bam &&
48 samtools view -@ "\${GALAXY_SLOTS:-4}" -f 3 -b aligned.bam
49 | samtools view -@ "\${GALAXY_SLOTS:-4}" -U -b -q 38 -
50 | samtools sort -@ "\${GALAXY_SLOTS:-4}" -n - -
51 | bedtools bamtofastq -i /dev/stdin -fq multimap_1.fastq -fq2 multimap_2.fastq &&
52 #end if
53 samtools index unique.bam &&
54 ## NOTE(review): samtools view -U expects a file argument, so "-U -b" in the pipelines of this block presumably consumes "-b" as a file name - confirm the multimap reads are extracted as intended
55 ## Run RepEnrich2 on unique.bam and the multimap fastq file(s)
56 python '$__tool_directory__/RepEnrich2.py'
57 --annotation_file '$repeatmasker'
58 --alignment_bam unique.bam
59 --cpus "\${GALAXY_SLOTS:-4}"
60 #if $seq_method.seq_method_list == "single-read":
61 --fastqfile multimap.fastq
62 #else:
63 --fastqfile multimap_1.fastq
64 --fastqfile2 multimap_2.fastq
65 #end if
66 ]]></command>
67 <!-- basic error handling -->
68 <inputs>
69 <conditional name="seq_method"> <!-- selects how the reads are supplied; the command block branches on seq_method_list -->
70 <param name="seq_method_list" type="select" label="Sequencing method" help="Paired-end or single-read sequencing">
71 <option selected="True" value="single-read">Single-read sequencing</option>
72 <option value="paired-end">Paired-end sequencing</option>
73 <option value="paired_collection">Paired-end Dataset Collection</option>
74 </param>
75 <when value="single-read">
76 <param name="input_fastq" type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="Single-reads" help="accepted formats: fastq, fastqsanger (optionally gzipped)" />
77 </when>
78 <when value="paired-end">
79 <param name="input_fastq" type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="1st paired-end sequencing dataset" help="accepted formats: fastq, fastqsanger (optionally gzipped)" />
80 <param name="input2_fastq" type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="2nd paired-end sequencing dataset" help="accepted formats: fastq, fastqsanger (optionally gzipped)" />
81 </when>
82 <when value="paired_collection">
83 <param name="input_fastq" type="data_collection" collection_type="paired" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="Paired Collection" help="accepted formats: fastq, fastqsanger (optionally gzipped)" />
84 </when>
85 </conditional>
86 <conditional name="refGenomeSource"> <!-- reference genome: built-in bowtie2 index or fasta dataset from the history -->
87 <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
88 <option value="indexed">Use a built-in index</option>
89 <option value="history">Use one from the history</option>
90 </param>
91 <when value="indexed">
92 <param name="genome" type="select" label="Select a DNA reference index" help="if your genome of interest is not listed - contact instance administrator">
93 <options from_data_table="bowtie2_indexes" />
94 </param>
95 </when>
96 <when value="history">
97 <param name="genome" type="data" format="fasta" label="Select a fasta file, to serve as index reference" />
98 </when>
99 </conditional>
100
101 <param name="repeatmasker" type="data" format="txt" label="RepeatMasker description file" help="see help section"/>
102 </inputs>
103
104 <outputs> <!-- three tabular count tables collected from the job working directory -->
105 <data name="class_fraction_counts" format="tabular" from_work_dir="class_fraction_counts.tsv" label="RepEnrich on ${on_string}: class fraction counts" />
106 <data name="family_fraction_counts" format="tabular" from_work_dir="family_fraction_counts.tsv" label="RepEnrich on ${on_string}: family fraction counts" />
107 <data name="fraction_counts" format="tabular" from_work_dir="fraction_counts.tsv" label="RepEnrich on ${on_string}: fraction counts" />
108 </outputs>
109
110 <tests>
111 <test> <!-- single-read input, reference genome supplied as a fasta from the history -->
112 <param name="seq_method_list" value="single-read"/>
113 <param name="input_fastq" value="chrY-500k.R1.fastqsanger.gz" ftype="fastq.gz"/>
114 <param name="genomeSource" value="history"/>
115 <param name="genome" value="chrY-1-500k.fa" ftype="fasta"/>
116 <param name="repeatmasker" value="chrY-1-500k.fa.out" ftype="txt"/>
117 <output name="class_fraction_counts" file="chrY_single_class_fraction_counts.tab" ftype="tabular"/>
118 <output name="family_fraction_counts" file="chrY_single_family_fraction_counts.tab" ftype="tabular"/>
119 <output name="fraction_counts" file="chrY_single_fraction_counts.tab" ftype="tabular"/>
120 </test>
121 <test> <!-- paired-end input given as two separate datasets, reference genome from the history -->
122 <param name="seq_method_list" value="paired-end"/>
123 <param name="input_fastq" value="chrY-500k.R1.fastqsanger.gz" ftype="fastq.gz"/>
124 <param name="input2_fastq" value="chrY-500k.R2.fastqsanger.gz" ftype="fastq.gz"/>
125 <param name="genomeSource" value="history"/>
126 <param name="genome" value="chrY-1-500k.fa" ftype="fasta"/>
127 <param name="repeatmasker" value="chrY-1-500k.fa.out" ftype="txt"/>
128 <output name="class_fraction_counts" file="chrY_paired_class_fraction_counts.tab" ftype="tabular"/>
129 <output name="family_fraction_counts" file="chrY_paired_family_fraction_counts.tab" ftype="tabular"/>
130 <output name="fraction_counts" file="chrY_paired_fraction_counts.tab" ftype="tabular"/>
131 </test>
132 </tests>
133
134 <help>
135
136 **What it does**
137
138 Reads are mapped to the genome using the Bowtie2 aligner. Reads mapping uniquely to the
139 genome are assigned to subfamilies of repetitive elements based on their degree of overlap
140 to RepeatMasker annotated genomic instances of each repetitive element subfamily.
141
142 Reads mapping to multiple locations are separately mapped to repetitive element assemblies
143 – referred to as repetitive element pseudogenomes – built from RepeatMasker annotated
144 genomic instances of repetitive element subfamilies.
145
146 RepEnrich then returns tables of counts merged from both strategies, which can be further
147 processed in statistical analysis for differential expression. For information on the method,
148 see the `original publication`_.
149
150 .. _original publication: https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-15-583
151
152 **Inputs**
153
154 *Reference genome* : reference genome in fasta format
155
156 *Sequencing dataset*: Single-reads or Paired-end sequencing datasets in fastq format.
157
158 *RepeatMasker description file*: a txt repeatmasker file which can be downloaded from
159 https://www.repeatmasker.org/genomicDatasets/RMGenomicDatasets.html
160
161 This file looks like:
162
163 <![CDATA[
164
165 SW perc perc perc query position in query matching repeat position in repeat
166
167 score div. del. ins. sequence begin end (left) repeat class/family begin end (left) ID
168
169 16 20.2 5.9 0.0 chrM 1211 1261 (18263) + (TTTTA)n Simple_repeat 1 54 (0) 84486
170
171 13 23.9 2.2 2.2 chrM 2014 2059 (17465) + (TTA)n Simple_repeat 1 46 (0) 84487
172
173 24 18.8 5.3 2.6 chrM 3924 3999 (15525) + (TAT)n Simple_repeat 1 78 (0) 84488
174
175 18 4.5 0.0 0.0 chrM 5961 5983 (13541) + (AT)n Simple_repeat 1 23 (0) 84489
176
177 13 25.9 4.0 4.0 chrM 6247 6320 (13204) + (ATTTAT)n Simple_repeat 1 74 (0) 84490
178
179 11 14.6 7.5 2.4 chrM 8783 8822 (10702) + (CTAATT)n Simple_repeat 1 42 (0) 84491
180
181 17 19.0 0.0 8.6 chrM 9064 9126 (10398) + A-rich Low_complexity 1 58 (0) 84492
182
183 13 21.0 5.9 1.9 chrM 11723 11773 (7751) + (ATA)n Simple_repeat 1 53 (0) 84493
184
185 66 20.4 12.3 12.3 chrM 12823 13001 (6523) C LSU-rRNA_Cel rRNA (1) 2431 2253 84494
186
187 16 16.6 0.0 2.9 chrM 14361 14396 (5128) + (ATT)n Simple_repeat 1 35 (0) 84495
188
189 44 2.4 0.0 0.0 chrM 15966 16007 (3517) + (TA)n Simple_repeat 1 42 (0) 84496
190
191 35 5.3 0.0 0.0 chrM 16559 16597 (2927) + (AT)n Simple_repeat 1 39 (0) 84497
192
193 36 2.9 0.0 0.0 chrM 16922 16956 (2568) + (AT)n Simple_repeat 1 35 (0) 84498
194
195 37 0.0 0.0 0.0 chrM 17040 17071 (2453) + (TA)n Simple_repeat 1 32 (0) 84499
196
197 20 4.3 0.0 0.0 chrM 17417 17440 (2084) + (T)n Simple_repeat 1 24 (0) 84500
198
199 31 6.9 6.3 1.5 chrM 17451 17513 (2011) + (TA)n Simple_repeat 1 66 (0) 84501
200
201 26 17.0 0.0 0.0 chrM 19469 19514 (10) + A-rich Low_complexity 1 46 (0) 84502
202
203 ]]>
204
205 Users may filter this file so that it contains only desired items (for instance only satellites, repeats and transposons)
206
207 **Outputs**
208
209 (1) Fraction counts, (2) Family fraction counts and (3) Class fraction counts are returned
210 in tabular format for further statistical tests, differential expression analysis or graphics.
211
212 **RepEnrich2**
213
214 .. class:: warningmark
215
216 the repenrich2 Galaxy wrapper was derived from the repenrich Galaxy wrapper
217
218 repenrich2 uses bowtie2 for all alignment operations. We refer exclusively to our
219 `GitHub repository`_ for code review.
220
221 .. _GitHub repository: https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2
222
223 **Execution time**
224
225 .. class:: warningmark
226
227 This tool includes time-consuming steps to index the reference genome, index repeat
228 sequences and to align reads to these indexes.
229
230 </help>
231
232 <citations>
233 <citation type="doi">10.1186/1471-2164-15-583</citation>
234 </citations>
235 </tool>