Mercurial > repos > artbio > repenrich2
diff repenrich2.xml @ 0:4905a332a094 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 commit 73721d980c1f422dc880d80f61e44d270992e537
author | artbio |
---|---|
date | Sat, 20 Apr 2024 11:56:53 +0000 |
parents | |
children | 6d59fbca2db4 |
line wrap: on
line diff
<!-- Galaxy wrapper for RepEnrich2, reconstructed from the added-file hunk (/dev/null -> b/repenrich2.xml, Sat Apr 20 11:56:53 2024 +0000). -->
<tool id="repenrich2" name="RepEnrich" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Repeat Element Profiling</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="repenrich_requirements"/>
    <!-- basic error handling: any non-zero exit code is fatal -->
    <stdio>
        <exit_code range="1:" level="fatal" description="Tool exception" />
    </stdio>
    <command detect_errors="exit_code"><![CDATA[
        ## Step 1: expose the input reads under fixed names in the job directory.
        ## uncompress fastq.gz or fastqsanger.gz is not required with bowtie2
        #if $seq_method.seq_method_list == "single-read":
            ln -f -s '$seq_method.input_fastq' 'input.fastq' &&
        #elif $seq_method.seq_method_list == 'paired_collection':
            ln -f -s '$seq_method.input_fastq.forward' 'input.fastq' &&
            ln -f -s '$seq_method.input_fastq.reverse' 'input_2.fastq' &&
        #else:
            ln -f -s '$seq_method.input_fastq' 'input.fastq' &&
            ln -f -s '$seq_method.input2_fastq' 'input_2.fastq' &&
        #end if

        ## Step 2: obtain a bowtie2 index (index_path) and the matching genome.fa.
        #if $refGenomeSource.genomeSource == "history":
            bowtie2-build --threads \${GALAXY_SLOTS:-4} -f '$refGenomeSource.genome' genome 1>/dev/null &&
            ln -s -f '$refGenomeSource.genome' 'genome.fa' &&
            #set index_path = 'genome'
        #else:
            ## The built-in index select parameter is named "genome" (see the inputs section).
            #set index_path = $refGenomeSource.genome.fields.path
            ## The data table holds bowtie2 indexes, so the fasta is rebuilt with bowtie2-inspect.
            bowtie2-inspect '$index_path' > genome.fa &&
        #end if

        ## Step 3: RepEnrich2 setup from the RepeatMasker annotation and genome.fa.
        python '$__tool_directory__/RepEnrich2_setup.py'
            --annotation_file '$repeatmasker'
            --genomefasta 'genome.fa'
            --cpus "\${GALAXY_SLOTS:-4}" &&

        ## Step 4: align the reads, then split them into unique.bam (MAPQ filter -q 38)
        ## and fastq file(s) of the remaining mapped reads.
        ## NOTE(review): "samtools view -U" takes a FILE argument, so "-U -b" may not do what
        ## was intended; the pipelines below are kept as published - confirm before changing.
        #if $seq_method.seq_method_list == "single-read":
            bowtie2 -x '$index_path' -p \${GALAXY_SLOTS:-4} input.fastq
                | samtools sort -@ "\${GALAXY_SLOTS:-4}" -T tmp -O bam -o aligned.bam 2>&1 &&
            samtools view -@ "\${GALAXY_SLOTS:-4}" -F 4 -b -q 38 aligned.bam -o unique.bam &&
            samtools view -@ "\${GALAXY_SLOTS:-4}" -h -F 4 -b aligned.bam \
                | samtools view -@ "\${GALAXY_SLOTS:-4}" -U -b -q 38 - \
                | bedtools bamtofastq -i /dev/stdin -fq multimap.fastq &&
        #else:
            bowtie2 -x '$index_path' -p \${GALAXY_SLOTS:-4} -1 input.fastq -2 input_2.fastq
                | samtools sort -@ "\${GALAXY_SLOTS:-4}" -T tmp -O bam -o aligned.bam 2>&1 &&
            samtools view -@ "\${GALAXY_SLOTS:-4}" -f 3 -b -q 38 aligned.bam -o unique.bam &&
            samtools view -@ "\${GALAXY_SLOTS:-4}" -f 3 -b aligned.bam \
                | samtools view -@ "\${GALAXY_SLOTS:-4}" -U -b -q 38 - \
                | samtools sort -@ "\${GALAXY_SLOTS:-4}" -n - -
                | bedtools bamtofastq -i /dev/stdin -fq multimap_1.fastq -fq2 multimap_2.fastq &&
        #end if
        samtools index unique.bam &&

        ## Step 5: RepEnrich2 counting on unique.bam and the multimap fastq file(s).
        python '$__tool_directory__/RepEnrich2.py'
            --annotation_file '$repeatmasker'
            --alignment_bam unique.bam
            --cpus "\${GALAXY_SLOTS:-4}"
        #if $seq_method.seq_method_list == "single-read":
            --fastqfile multimap.fastq
        #else:
            --fastqfile multimap_1.fastq
            --fastqfile2 multimap_2.fastq
        #end if
    ]]></command>
    <inputs>
        <!-- Read layout: one dataset, two datasets, or one paired collection -->
        <conditional name="seq_method">
            <param help="Paired-end or single-read sequencing" label="Sequencing method" name="seq_method_list" type="select">
                <option selected="True" value="single-read">Single-read sequencing</option>
                <option value="paired-end">Paired-end sequencing</option>
                <option value="paired_collection">Paired-end Dataset Collection</option>
            </param>
            <when value="single-read">
                <param format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="Single-reads" name="input_fastq" type="data" help="accepted formats: fastq, fastqsanger" />
            </when>
            <when value="paired-end">
                <param format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="1st paired-end sequencing dataset" name="input_fastq" type="data" help="accepted formats: fastq, fastqsanger" />
                <param format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" label="2nd paired-end sequencing dataset" name="input2_fastq" type="data" help="accepted formats: fastq, fastqsanger" />
            </when>
            <when value="paired_collection">
                <param name="input_fastq" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz" type="data_collection" collection_type="paired" label="Paired Collection" help="accepted formats: fastq, fastqsanger" />
            </when>
        </conditional>
        <!-- Reference genome: a built-in bowtie2 index or a fasta dataset from the history -->
        <conditional name="refGenomeSource">
            <param help="Built-ins were indexed using default options" label="Will you select a reference genome from your history or use a built-in index?" name="genomeSource" type="select">
                <option value="indexed">Use a built-in index</option>
                <option value="history">Use one from the history</option>
            </param>
            <when value="indexed">
                <param help="if your genome of interest is not listed - contact instance administrator" label="Select a DNA reference index" name="genome" type="select">
                    <options from_data_table="bowtie2_indexes" />
                </param>
            </when>
            <when value="history">
                <param format="fasta" label="Select a fasta file, to serve as index reference" name="genome" type="data" />
            </when>
        </conditional>

        <param format="txt" label="RepeatMasker description file" name="repeatmasker" type="data" help="see help section"/>
    </inputs>

    <outputs>
        <data format="tabular" name="class_fraction_counts" label="RepEnrich on ${on_string}: class fraction counts" from_work_dir="class_fraction_counts.tsv" />
        <data format="tabular" name="family_fraction_counts" label="RepEnrich on ${on_string}: family fraction counts" from_work_dir="family_fraction_counts.tsv" />
        <data format="tabular" name="fraction_counts" label="RepEnrich on ${on_string}: fraction counts" from_work_dir="fraction_counts.tsv" />
    </outputs>

    <tests>
        <!-- single-read input, genome taken from the history -->
        <test>
            <param name="seq_method_list" value="single-read"/>
            <param name="input_fastq" value="chrY-500k.R1.fastqsanger.gz" ftype="fastq.gz"/>
            <param name="genomeSource" value="history"/>
            <param name="genome" value="chrY-1-500k.fa" ftype="fasta"/>
            <param name="repeatmasker" value="chrY-1-500k.fa.out" ftype="txt"/>
            <output name="class_fraction_counts" file="chrY_single_class_fraction_counts.tab" ftype="tabular"/>
            <output name="family_fraction_counts" file="chrY_single_family_fraction_counts.tab" ftype="tabular"/>
            <output name="fraction_counts" file="chrY_single_fraction_counts.tab" ftype="tabular"/>
        </test>
        <!-- paired-end input as two datasets, genome taken from the history -->
        <test>
            <param name="seq_method_list" value="paired-end"/>
            <param name="input_fastq" value="chrY-500k.R1.fastqsanger.gz" ftype="fastq.gz"/>
            <param name="input2_fastq" value="chrY-500k.R2.fastqsanger.gz" ftype="fastq.gz"/>
            <param name="genomeSource" value="history"/>
            <param name="genome" value="chrY-1-500k.fa" ftype="fasta"/>
            <param name="repeatmasker" value="chrY-1-500k.fa.out" ftype="txt"/>
            <output name="class_fraction_counts" file="chrY_paired_class_fraction_counts.tab" ftype="tabular"/>
            <output name="family_fraction_counts" file="chrY_paired_family_fraction_counts.tab" ftype="tabular"/>
            <output name="fraction_counts" file="chrY_paired_fraction_counts.tab" ftype="tabular"/>
        </test>
    </tests>

    <help><![CDATA[

**What it does**

Reads are mapped to the genome using the Bowtie2 aligner. Reads mapping uniquely to the
genome are assigned to subfamilies of repetitive elements based on their degree of overlap
to RepeatMasker annotated genomic instances of each repetitive element subfamily.

Reads mapping to multiple locations are separately mapped to repetitive element assemblies
– referred to as repetitive element pseudogenomes – built from RepeatMasker annotated
genomic instances of repetitive element subfamilies.

RepEnrich then returns tables of counts merged from both strategies, that can be further
processed in statistical analysis for differential expression. For information on the method,
see the `original publication`_.

.. _original publication: https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-15-583

**Inputs**

*Reference genome* : reference genome in fasta format

*Sequencing dataset*: Single-reads or Paired-end sequencing datasets in fastq format.

*RepeatMasker description file*: a txt repeatmasker file which can be downloaded from
https://www.repeatmasker.org/genomicDatasets/RMGenomicDatasets.html

This file looks like::

       SW  perc  perc  perc  query     position in query          matching      repeat          position in repeat
    score  div.  del.  ins.  sequence   begin    end   (left)     repeat        class/family    begin   end (left)     ID

       16  20.2   5.9   0.0  chrM        1211   1261  (18263)  +  (TTTTA)n      Simple_repeat       1    54    (0)  84486
       13  23.9   2.2   2.2  chrM        2014   2059  (17465)  +  (TTA)n        Simple_repeat       1    46    (0)  84487
       24  18.8   5.3   2.6  chrM        3924   3999  (15525)  +  (TAT)n        Simple_repeat       1    78    (0)  84488
       18   4.5   0.0   0.0  chrM        5961   5983  (13541)  +  (AT)n         Simple_repeat       1    23    (0)  84489
       13  25.9   4.0   4.0  chrM        6247   6320  (13204)  +  (ATTTAT)n     Simple_repeat       1    74    (0)  84490
       11  14.6   7.5   2.4  chrM        8783   8822  (10702)  +  (CTAATT)n     Simple_repeat       1    42    (0)  84491
       17  19.0   0.0   8.6  chrM        9064   9126  (10398)  +  A-rich        Low_complexity      1    58    (0)  84492
       13  21.0   5.9   1.9  chrM       11723  11773   (7751)  +  (ATA)n        Simple_repeat       1    53    (0)  84493
       66  20.4  12.3  12.3  chrM       12823  13001   (6523)  C  LSU-rRNA_Cel  rRNA              (1)  2431   2253  84494
       16  16.6   0.0   2.9  chrM       14361  14396   (5128)  +  (ATT)n        Simple_repeat       1    35    (0)  84495
       44   2.4   0.0   0.0  chrM       15966  16007   (3517)  +  (TA)n         Simple_repeat       1    42    (0)  84496
       35   5.3   0.0   0.0  chrM       16559  16597   (2927)  +  (AT)n         Simple_repeat       1    39    (0)  84497
       36   2.9   0.0   0.0  chrM       16922  16956   (2568)  +  (AT)n         Simple_repeat       1    35    (0)  84498
       37   0.0   0.0   0.0  chrM       17040  17071   (2453)  +  (TA)n         Simple_repeat       1    32    (0)  84499
       20   4.3   0.0   0.0  chrM       17417  17440   (2084)  +  (T)n          Simple_repeat       1    24    (0)  84500
       31   6.9   6.3   1.5  chrM       17451  17513   (2011)  +  (TA)n         Simple_repeat       1    66    (0)  84501
       26  17.0   0.0   0.0  chrM       19469  19514     (10)  +  A-rich        Low_complexity      1    46    (0)  84502

Users may filter this file so that it contains only desired items (for instance only satellites, repeats and transposons)

**Outputs**

(1) Fraction counts, (2) Family fraction counts and (3) Class fraction counts are returned
in tabular format for further statistical tests, differential expression analysis or graphics.

**RepEnrich2**

.. class:: warningmark

the repenrich2 Galaxy wrapper was derived from the repenrich Galaxy wrapper

repenrich2 uses bowtie2 for all alignment operations. We refer exclusively to our
`GitHub repository`_ for code review.

.. _GitHub repository: https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2

**Execution time**

.. class:: warningmark

This tool includes time-consuming steps to index the reference genome, index repeat
sequences and to align reads to these indexes.

    ]]></help>

    <citations>
        <citation type="doi">10.1186/1471-2164-15-583</citation>
    </citations>
</tool>