diff repenrich2.xml @ 0:4905a332a094 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
author artbio
date Sat, 20 Apr 2024 11:56:53 +0000
parents
children 6d59fbca2db4
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/repenrich2.xml	Sat Apr 20 11:56:53 2024 +0000
@@ -0,0 +1,235 @@
+<tool id="repenrich2" name="RepEnrich" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>Repeat Element Profiling</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="repenrich_requirements"/>
+    <stdio>
+        <exit_code range="1:" level="fatal" description="Tool exception" />
+    </stdio>
+    <command detect_errors="exit_code"><![CDATA[
+        #import re
+        ## uncompress fastq.gz or fastqsanger.gz is not required with bowtie2
+        #if $seq_method.seq_method_list == "single-read":
+            ln -f -s '$seq_method.input_fastq' 'input.fastq' &&
+        #elif $seq_method.seq_method_list == 'paired_collection':
+            ln -f -s '$seq_method.input_fastq.forward' 'input.fastq' &&
+            ln -f -s '$seq_method.input_fastq.reverse' 'input_2.fastq' &&
+        #else:
+            ln -f -s '$seq_method.input_fastq' 'input.fastq' &&
+            ln -f -s '$seq_method.input2_fastq' 'input_2.fastq' &&
+        #end if
+
+        #if $refGenomeSource.genomeSource == "history":
+            bowtie2-build --threads \${GALAXY_SLOTS:-4} -f $refGenomeSource.genome genome 1>/dev/null &&
+            ln -s -f '$refGenomeSource.genome' 'genome.fa' &&
+            #set index_path = 'genome'
+        #else:
+            #set index_path = $refGenomeSource.index.fields.path
+            bowtie-inspect $index_path > genome.fa &&
+        #end if
+
+        python $__tool_directory__/RepEnrich2_setup.py
+            --annotation_file '$repeatmasker'
+            --genomefasta 'genome.fa'
+            --cpus "\${GALAXY_SLOTS:-4}" &&
+
+        #if $seq_method.seq_method_list == "single-read":
+            bowtie2 -x $index_path -p \${GALAXY_SLOTS:-4} input.fastq
+                | samtools sort -@ "\${GALAXY_SLOTS:-4}" -T tmp -O bam -o aligned.bam 2>&1 &&
+            samtools view -@ "\${GALAXY_SLOTS:-4}" -F 4 -b -q 38 aligned.bam -o unique.bam &&
+            samtools view -@ "\${GALAXY_SLOTS:-4}" -h -F 4 -b aligned.bam \
+                | samtools view -@ "\${GALAXY_SLOTS:-4}" -U -b -q 38 - \
+                | bedtools bamtofastq -i /dev/stdin -fq multimap.fastq &&
+        #else:
+            bowtie2 -x $index_path -p \${GALAXY_SLOTS:-4} -1 input.fastq -2 input_2.fastq
+                | samtools sort -@ "\${GALAXY_SLOTS:-4}" -T tmp -O bam -o aligned.bam 2>&1 &&
+            samtools view -@ "\${GALAXY_SLOTS:-4}" -f 3 -b -q 38 aligned.bam -o unique.bam &&
+            samtools view -@ "\${GALAXY_SLOTS:-4}" -f 3 -b aligned.bam \
+                | samtools view -@ "\${GALAXY_SLOTS:-4}" -U -b -q 38 - \
+                | samtools sort -@ "\${GALAXY_SLOTS:-4}" -n - -
+                | bedtools bamtofastq -i /dev/stdin -fq multimap_1.fastq -fq2 multimap_2.fastq &&
+        #end if
+        samtools index unique.bam &&
+
+
+        python $__tool_directory__/RepEnrich2.py
+            --annotation_file $repeatmasker
+            --alignment_bam unique.bam
+            --cpus "\${GALAXY_SLOTS:-4}"
+        #if $seq_method.seq_method_list == "single-read":
+            --fastqfile multimap.fastq
+        #else:
+            --fastqfile multimap_1.fastq
+            --fastqfile2 multimap_2.fastq
+        #end if
+    ]]></command>
+    <!-- basic error handling -->
+    <inputs>
+    <conditional name="seq_method">
+        <param help="Paired-end or single-read sequencing" label="Sequencing method" name="seq_method_list" type="select">
+            <option selected="True" value="single-read">Single-read sequencing</option>
+            <option value="paired-end">Paired-end sequencing</option>
+            <option value="paired_collection">Paired-end Dataset Collection</option>
+        </param>
+        <when value="single-read">
+            <param format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="Single-reads" name="input_fastq" type="data" help="accepted formats: fastq, fastqsanger" />
+        </when>
+        <when value="paired-end">
+            <param format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="1st paired-end sequencing dataset" name="input_fastq" type="data" help="accepted formats: fastq, fastqsanger" />
+            <param format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="2nd paired-end sequencing dataset" name="input2_fastq" type="data" help="accepted formats: fastq, fastqsanger" />
+        </when>
+        <when value="paired_collection">
+            <param name="input_fastq" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" type="data_collection" collection_type="paired" label="Paired Collection" help="Must be of datatype &quot;fastqsanger&quot; or &quot;fasta&quot;" />
+        </when>
+    </conditional>
+    <conditional name="refGenomeSource">
+        <param help="Built-ins were indexed using default options" label="Will you select a reference genome from your history or use a built-in index?" name="genomeSource" type="select">
+            <option value="indexed">Use a built-in index</option>
+            <option value="history">Use one from the history</option>
+        </param>
+        <when value="indexed">
+            <param help="if your genome of interest is not listed - contact instance administrator" label="Select a DNA reference index" name="genome" type="select">
+                <options from_data_table="bowtie2_indexes" />
+            </param>
+        </when>
+        <when value="history">
+            <param format="fasta" label="Select a fasta file, to serve as index reference" name="genome" type="data" />
+        </when>
+    </conditional>
+
+    <param format="txt" label="RepeatMasker description file" name="repeatmasker" type="data" help="see help section"/>
+    </inputs>
+
+    <outputs>
+        <data format="tabular" name="class_fraction_counts" label="RepEnrich on ${on_string}: class fraction counts" from_work_dir="class_fraction_counts.tsv" />
+        <data format="tabular" name="family_fraction_counts" label="RepEnrich on ${on_string}: family fraction counts" from_work_dir="family_fraction_counts.tsv" />
+        <data format="tabular" name="fraction_counts" label="RepEnrich on ${on_string}: fraction counts" from_work_dir="fraction_counts.tsv" />
+   </outputs>
+
+    <tests>
+        <test>
+            <param name="seq_method_list" value="single-read"/>
+            <param name="input_fastq" value="chrY-500k.R1.fastqsanger.gz" ftype="fastq.gz"/>
+            <param name="genomeSource" value="history"/>
+            <param name="genome" value="chrY-1-500k.fa" ftype="fasta"/>
+            <param name="repeatmasker" value="chrY-1-500k.fa.out" ftype="txt"/>
+            <output name="class_fraction_counts" file="chrY_single_class_fraction_counts.tab" ftype="tabular"/>
+            <output name="family_fraction_counts" file="chrY_single_family_fraction_counts.tab" ftype="tabular"/>
+            <output name="fraction_counts" file="chrY_single_fraction_counts.tab" ftype="tabular"/>
+        </test>
+         <test>
+            <param name="seq_method_list" value="paired-end"/>
+            <param name="input_fastq" value="chrY-500k.R1.fastqsanger.gz" ftype="fastq.gz"/>
+            <param name="input2_fastq" value="chrY-500k.R2.fastqsanger.gz" ftype="fastq.gz"/>
+            <param name="genomeSource" value="history"/>
+            <param name="genome" value="chrY-1-500k.fa" ftype="fasta"/>
+            <param name="repeatmasker" value="chrY-1-500k.fa.out" ftype="txt"/>
+            <output name="class_fraction_counts" file="chrY_paired_class_fraction_counts.tab" ftype="tabular"/>
+            <output name="family_fraction_counts" file="chrY_paired_family_fraction_counts.tab" ftype="tabular"/>
+            <output name="fraction_counts" file="chrY_paired_fraction_counts.tab" ftype="tabular"/>
+        </test>
+  </tests>
+
+    <help>
+
+**What it does**
+
+Reads are mapped to the genome using the Bowtie2 aligner. Reads mapping uniquely to the
+genome are assigned to subfamilies of repetitive elements based on their degree of overlap
+to RepeatMasker annotated genomic instances of each repetitive element subfamily.
+
+Reads mapping to multiple locations are separately mapped to repetitive element assemblies
+– referred to as repetitive element psuedogenomes – built from RepeatMasker annotated
+genomic instances of repetitive element subfamilies.
+
+RepEnrich then return tables of counts merged from both strategies, that can be further
+processed in statistical analysis for differential expression. For information on the method,
+see the `original publication`_.
+
+.. _original publication: https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-15-583
+
+**Inputs**
+
+*Reference genome* : reference genome in fasta format
+
+*Sequencing dataset*: Single-reads or Paired-end sequencing datasets in fastq format.
+
+*RepeatMasker description file*: a txt repeatmasker file which can be downloaded from
+https://www.repeatmasker.org/genomicDatasets/RMGenomicDatasets.html
+
+This file looks like:
+
+<![CDATA[
+
+SW  perc perc perc  query      position in query           matching       repeat              position in  repeat
+
+score  div. del. ins.  sequence    begin     end    (left)    repeat         class/family         begin  end (left)   ID
+
+16  20.2  5.9  0.0  chrM         1211    1261   (18263) +  (TTTTA)n       Simple_repeat            1   54    (0)  84486
+
+13  23.9  2.2  2.2  chrM         2014    2059   (17465) +  (TTA)n         Simple_repeat            1   46    (0)  84487
+
+24  18.8  5.3  2.6  chrM         3924    3999   (15525) +  (TAT)n         Simple_repeat            1   78    (0)  84488
+
+18   4.5  0.0  0.0  chrM         5961    5983   (13541) +  (AT)n          Simple_repeat            1   23    (0)  84489
+
+13  25.9  4.0  4.0  chrM         6247    6320   (13204) +  (ATTTAT)n      Simple_repeat            1   74    (0)  84490
+
+11  14.6  7.5  2.4  chrM         8783    8822   (10702) +  (CTAATT)n      Simple_repeat            1   42    (0)  84491
+
+17  19.0  0.0  8.6  chrM         9064    9126   (10398) +  A-rich         Low_complexity           1   58    (0)  84492
+
+13  21.0  5.9  1.9  chrM        11723   11773    (7751) +  (ATA)n         Simple_repeat            1   53    (0)  84493
+
+66  20.4 12.3 12.3  chrM        12823   13001    (6523) C  LSU-rRNA_Cel   rRNA                   (1) 2431   2253  84494
+
+16  16.6  0.0  2.9  chrM        14361   14396    (5128) +  (ATT)n         Simple_repeat            1   35    (0)  84495
+
+44   2.4  0.0  0.0  chrM        15966   16007    (3517) +  (TA)n          Simple_repeat            1   42    (0)  84496
+
+35   5.3  0.0  0.0  chrM        16559   16597    (2927) +  (AT)n          Simple_repeat            1   39    (0)  84497
+
+36   2.9  0.0  0.0  chrM        16922   16956    (2568) +  (AT)n          Simple_repeat            1   35    (0)  84498
+
+37   0.0  0.0  0.0  chrM        17040   17071    (2453) +  (TA)n          Simple_repeat            1   32    (0)  84499
+
+20   4.3  0.0  0.0  chrM        17417   17440    (2084) +  (T)n           Simple_repeat            1   24    (0)  84500
+
+31   6.9  6.3  1.5  chrM        17451   17513    (2011) +  (TA)n          Simple_repeat            1   66    (0)  84501
+
+26  17.0  0.0  0.0  chrM        19469   19514      (10) +  A-rich         Low_complexity           1   46    (0)  84502
+
+]]>
+
+Users may filter this file so that it contains only desired items (for instance only satellites, repeats and transposons)
+ 
+**Outputs**
+
+(1) Fraction counts, (2) Family fraction counts and (3) Class fraction counts are returned
+in tabular format for further statistical tests, differential expression analysis or graphics.
+
+**RepEnrich2**
+
+.. class:: warningmark
+
+the repenrich2 Galaxy wrapper was derived from the repenrich Galaxy wrapper
+
+repenrich2 uses bowtie2 for all alignment operations. We refer exclusively to our
+`GitHub repository`_ for code review.
+
+.. _GitHub repository: https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2
+
+**Execution time**
+
+.. class:: warningmark
+
+This tool includes time-consuming steps to index the reference genome, index repeat
+sequences and to align reads to these indexes.
+
+    </help>
+
+    <citations>
+    <citation type="doi">10.1186/1471-2164-15-583</citation>
+  </citations>
+</tool>