comparison repenrich.xml @ 0:f6f0f1e5e940 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
author artbio
date Wed, 02 Aug 2017 05:17:29 -0400
parents
children 51b4590a972d
comparison
equal deleted inserted replaced
-1:000000000000 0:f6f0f1e5e940
1 <tool id="repenrich" name="RepEnrich" version="1.4.0">
2 <description>Repeat Element Profiling</description>
3 <requirements> <!-- pinned packages: bowtie, samtools and the python scripts are called in the command section -->
4 <requirement type="package" version="1.2.0">bowtie</requirement>
5 <requirement type="package" version="0.1.19">samtools</requirement>
6 <requirement type="package" version="2.20.1">bedtools</requirement>
7 <requirement type="package" version="1.69">biopython</requirement>
8 </requirements>
9 <stdio> <!-- exit codes of 1 and above are reported as fatal. NOTE(review): the command tag also sets detect_errors="exit_code"; confirm whether both are needed -->
10 <exit_code range="1:" level="fatal" description="Tool exception" />
11 </stdio>
12 <command detect_errors="exit_code"><![CDATA[
13 #import re
14 #set input_base = 'Sample'
15 #set baseReference = 'Genome'
16 ln -f -s '$genome' '${baseReference}.fa' &&
17 ln -f -s '$input_fastq' '${input_base}.fastq' &&
18 #if $seq_method.seq_method_list == "paired-end":
19 ln -f -s '$input2_fastq' '${input_base}_2.fastq' &&
20 #end if
21 bowtie-build '$genome' ${baseReference} &&
22 python '$__tool_directory__/RepEnrich_setup.py' '$repeatmasker' ${baseReference}.fa setup_folder_${baseReference} &&
23 #if $seq_method.seq_method_list == "single-read":
24 bowtie $baseReference -p \${GALAXY_SLOTS:-4} -t -m 1 -S --max ${input_base}_multimap.fastq ${input_base}.fastq ${input_base}_unique.sam 2>bowtie_alignments.txt &&
25 TOTAL=\$(grep 'reads processed:' bowtie_alignments.txt | cut -d ' ' -f 4) &&
26 NONALIGNED=\$(grep 'reads that failed to align:' bowtie_alignments.txt | cut -d ' ' -f 7) &&
27 echo \$((\$TOTAL-\$NONALIGNED)) > bowtie_aligned.numb &&
28 #else:
29 bowtie $baseReference -p \${GALAXY_SLOTS:-4} -t -m 1 -S --max ${input_base}_multimap.fastq -1 ${input_base}.fastq -2 ${input_base}_2.fastq ${input_base}_unique.sam 2>bowtie_alignments.txt &&
30 TOTAL=\$(grep 'reads processed:' bowtie_alignments.txt | cut -d ' ' -f 4) &&
31 NONALIGNED=\$(grep 'reads that failed to align:' bowtie_alignments.txt | cut -d ' ' -f 7) &&
32 echo \$((\$TOTAL-\$NONALIGNED)) > bowtie_aligned.numb &&
33 #end if
34 samtools view -bS ${input_base}_unique.sam > ${input_base}_unique.bam &&
35 samtools sort ${input_base}_unique.bam ${input_base}_unique_sorted &&
36 mv ${input_base}_unique_sorted.bam ${input_base}_unique.bam &&
37 samtools index ${input_base}_unique.bam &&
38 rm ${input_base}_unique.sam &&
39 #if $seq_method.seq_method_list == "single-read":
40 python '$__tool_directory__/RepEnrich.py' '$repeatmasker' ${input_base} ${input_base} setup_folder_${baseReference} ${input_base}_multimap.fastq ${input_base}_unique.bam --cpus "\${GALAXY_SLOTS:-4}" &&
41 #else:
42 python '$__tool_directory__/RepEnrich.py' '$repeatmasker' ${input_base} ${input_base} setup_folder_${baseReference} ${input_base}_multimap_1.fastq --fastqfile2 ${input_base}_multimap_2.fastq ${input_base}_unique.bam --cpus "\${GALAXY_SLOTS:-4}" --pairedend TRUE &&
43 #end if
44 cp $input_base/${input_base}_class_fraction_counts.txt class_fraction_counts.tabular &&
45 cp $input_base/${input_base}_family_fraction_counts.txt family_fraction_counts.tabular &&
46 cp $input_base/${input_base}_fraction_counts.txt fraction_counts.tabular
47 ]]></command>
48 <!-- Command summary: inputs are symlinked to fixed names (Genome.fa, Sample.fastq, Sample_2.fastq); a bowtie index is built from the genome; RepEnrich_setup.py prepares setup_folder_Genome; bowtie reports alignments under its -m 1 limit and writes reads over that limit to Sample_multimap*.fastq; the aligned read count (processed minus failed) goes to bowtie_aligned.numb; the SAM is converted to a sorted, indexed BAM; RepEnrich.py is run and its three count tables are copied to the output file names. Dataset and script paths are single-quoted so that unusual characters in paths cannot break the shell line. -->
49 <inputs> <!-- sequencing mode selector, read dataset(s), reference genome and RepeatMasker annotation -->
50 <conditional name="seq_method">
51 <param name="seq_method_list" type="select" label="Sequencing method" help="Paired-end or single-read sequencing">
52 <option value="single-read" selected="True">Single-read sequencing</option>
53 <option value="paired-end">Paired-end sequencing</option>
54 </param>
55 <when value="single-read">
56 <param name="input_fastq" type="data" format="fastq,fastqsanger" label="Single-reads" help="accepted formats: fastq, fastqsanger" />
57 </when>
58 <when value="paired-end">
59 <param name="input_fastq" type="data" format="fastq,fastqsanger" label="1st paired-end sequencing dataset" help="accepted formats: fastq, fastqsanger" />
60 <param name="input2_fastq" type="data" format="fastq,fastqsanger" label="2nd paired-end sequencing dataset" help="accepted formats: fastq, fastqsanger" />
61 </when>
62 </conditional>
63 <param name="genome" type="data" format="fasta" label="Reference genome in fasta format" />
64 <param name="repeatmasker" type="data" format="txt" label="RepeatMasker description file" help="see help section"/>
65 </inputs>
66
67 <outputs> <!-- each dataset is collected from a fixed file name written by the command -->
68 <data name="bowtie_alignments" format="tabular" from_work_dir="bowtie_aligned.numb" label="RepEnrich on ${on_string}: reads aligned">
69 </data>
70 <data name="class_fraction_counts" format="tabular" from_work_dir="class_fraction_counts.tabular" label="RepEnrich on ${on_string}: class fraction counts">
71 </data>
72 <data name="family_fraction_counts" format="tabular" from_work_dir="family_fraction_counts.tabular" label="RepEnrich on ${on_string}: family fraction counts">
73 </data>
74 <data name="fraction_counts" format="tabular" from_work_dir="fraction_counts.tabular" label="RepEnrich on ${on_string}: fraction counts">
75 </data>
76 </outputs>
77
78 <tests>
79 <test> <!-- single-read mode on the chrM test genome -->
80 <param name="seq_method_list" value="single-read"/>
81 <param name="input_fastq" ftype="fastq" value="Samp.fastq"/>
82 <param name="genome" ftype="fasta" value="chrM.fa"/>
83 <param name="repeatmasker" ftype="txt" value="chrM_repeatmasker.txt"/>
84 <output name="bowtie_alignments" ftype="tabular" file="aligned_reads.tab"/>
85 <output name="class_fraction_counts" ftype="tabular" file="Samp_class_fraction_counts.tabular"/>
86 <output name="family_fraction_counts" ftype="tabular" file="Samp_family_fraction_counts.tabular"/>
87 <output name="fraction_counts" ftype="tabular" file="Samp_fraction_counts.tabular"/>
88 </test>
89 <test> <!-- paired-end mode on the chrM test genome -->
90 <param name="seq_method_list" value="paired-end"/>
91 <param name="input_fastq" ftype="fastq" value="Samp_L.fastq"/>
92 <param name="input2_fastq" ftype="fastq" value="Samp_R.fastq"/>
93 <param name="genome" ftype="fasta" value="chrM.fa"/>
94 <param name="repeatmasker" ftype="txt" value="chrM_repeatmasker.txt"/>
95 <output name="bowtie_alignments" ftype="tabular" file="paired-aligned_reads.tab"/>
96 <output name="class_fraction_counts" ftype="tabular" file="Samp-paired_class_fraction_counts.tab"/>
97 <output name="family_fraction_counts" ftype="tabular" file="Samp-paired_family_fraction_counts.tab"/>
98 <output name="fraction_counts" ftype="tabular" file="Samp-paired_fraction_counts.tab"/>
99 </test>
100 </tests>
101
102 <help>
103
104 **What it does**
105
106 Reads are mapped to the genome using the Bowtie1 aligner. Reads mapping uniquely to the genome are assigned to subfamilies of repetitive elements based on their degree of overlap to RepeatMasker annotated genomic instances of each repetitive element subfamily. Reads mapping to multiple locations are separately mapped to repetitive element assemblies – referred to as repetitive element pseudogenomes – built from RepeatMasker annotated genomic instances of repetitive element subfamilies. RepEnrich then returns tables of counts merged from both strategies, which can be further processed in statistical analyses of differential expression. For detailed information see the `original publication`_.
107
108 .. _original publication: https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-15-583
109
110 **Inputs**
111
112 *Reference genome* : reference genome in fasta format
113
114 *Sequencing dataset*: Single-reads or Paired-end sequencing datasets in fastq format.
115
116 *RepeatMasker description file*: a txt repeatmasker file which can be downloaded from http://www.repeatmasker.org/genomicDatasets/RMGenomicDatasets.html
117
118 This file looks like:
119
120 <![CDATA[
121
122 SW perc perc perc query position in query matching repeat position in repeat
123
124 score div. del. ins. sequence begin end (left) repeat class/family begin end (left) ID
125
126 16 20.2 5.9 0.0 chrM 1211 1261 (18263) + (TTTTA)n Simple_repeat 1 54 (0) 84486
127
128 13 23.9 2.2 2.2 chrM 2014 2059 (17465) + (TTA)n Simple_repeat 1 46 (0) 84487
129
130 24 18.8 5.3 2.6 chrM 3924 3999 (15525) + (TAT)n Simple_repeat 1 78 (0) 84488
131
132 18 4.5 0.0 0.0 chrM 5961 5983 (13541) + (AT)n Simple_repeat 1 23 (0) 84489
133
134 13 25.9 4.0 4.0 chrM 6247 6320 (13204) + (ATTTAT)n Simple_repeat 1 74 (0) 84490
135
136 11 14.6 7.5 2.4 chrM 8783 8822 (10702) + (CTAATT)n Simple_repeat 1 42 (0) 84491
137
138 17 19.0 0.0 8.6 chrM 9064 9126 (10398) + A-rich Low_complexity 1 58 (0) 84492
139
140 13 21.0 5.9 1.9 chrM 11723 11773 (7751) + (ATA)n Simple_repeat 1 53 (0) 84493
141
142 66 20.4 12.3 12.3 chrM 12823 13001 (6523) C LSU-rRNA_Cel rRNA (1) 2431 2253 84494
143
144 16 16.6 0.0 2.9 chrM 14361 14396 (5128) + (ATT)n Simple_repeat 1 35 (0) 84495
145
146 44 2.4 0.0 0.0 chrM 15966 16007 (3517) + (TA)n Simple_repeat 1 42 (0) 84496
147
148 35 5.3 0.0 0.0 chrM 16559 16597 (2927) + (AT)n Simple_repeat 1 39 (0) 84497
149
150 36 2.9 0.0 0.0 chrM 16922 16956 (2568) + (AT)n Simple_repeat 1 35 (0) 84498
151
152 37 0.0 0.0 0.0 chrM 17040 17071 (2453) + (TA)n Simple_repeat 1 32 (0) 84499
153
154 20 4.3 0.0 0.0 chrM 17417 17440 (2084) + (T)n Simple_repeat 1 24 (0) 84500
155
156 31 6.9 6.3 1.5 chrM 17451 17513 (2011) + (TA)n Simple_repeat 1 66 (0) 84501
157
158 26 17.0 0.0 0.0 chrM 19469 19514 (10) + A-rich Low_complexity 1 46 (0) 84502
159
160 ]]>
161
162 Users may filter this file so that it contains only desired items (for instance only satellites, repeats and transposons).
163
164 **Outputs**
165
166 (1) Fraction counts, (2) Family fraction counts and (3) Class fraction counts are returned in tabular format, for further statistical tests, differential expression analysis or graphics. The number of reads aligned by Bowtie is returned as a separate dataset.
167
168 **RepEnrich**
169
170 This Galaxy tool is a wrapper of the RepEnrich tool by steven_criscione@brown.edu et al. whose code and manual are available in `GitHub`_.
171
172 .. _GitHub: https://github.com/nskvir/RepEnrich
173
174 Python scripts RepEnrich.py and RepEnrich_setup.py have been adapted to Python 3. Note that the sorting of Fraction counts, Family fraction counts and Class fraction counts differs between this Galaxy wrapper and RepEnrich as found in the `RepEnrich code repository`_. However, this different sorting does not affect subsequent statistical analyses.
175
176 .. _RepEnrich code repository: https://github.com/nskvir/RepEnrich
177
178 **Execution time**
179
180 .. class:: warningmark
181
182 This tool includes steps to index the reference genome, index repeat sequences and align reads to these indexes. Therefore the run time may be **long to very long**.
183
184 .. class:: infomark
185
186 For more information on the tools, please visit our `code repository`_.
187
188 If you would like to give us feedback or you run into any trouble, please send an email to artbio.ibps@gmail.com
189
190 This tool wrapper is developed by the `ARTbio team`_ at the `Institut de Biologie Paris Seine (IBPS)`_.
191
192 .. _code repository: https://github.com/ARTbio/tools-artbio/tree/master/tools/
193 .. _ARTbio team: http://artbio.fr
194 .. _Institut de Biologie Paris Seine (IBPS): http://www.ibps.upmc.fr/en/core-facilities/bioinformatics
195
196 </help>
197
198 <citations> <!-- same DOI as the "original publication" link in the help section -->
199 <citation type="doi">10.1186/1471-2164-15-583</citation>
200 </citations>
201 </tool>