<!--
view sr_bowtie_dataset_annotation.xml @ 10:fd4a60fc3fca draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/sr_bowtie_dataset_annotation commit cf30b4fab90cf7910bb2900a05b85ca3344aae59
author artbio
date Tue, 15 Nov 2022 00:45:36 +0000
parents 6bf9de09aa74
children
line wrap: on
line source
-->

<tool id="sr_bowtie_dataset_annotation" name="Annotate smRNA dataset" version="2.8">
  <description>by iterative alignments with sRbowtie</description>
  <!-- Packages required at run time: bowtie provides the bowtie-build and
       bowtie executables called by the command template; the R packages are
       presumably the dependencies of barplot.r, which is run through Rscript
       at the end of the command (verify against barplot.r). -->
  <requirements>
        <requirement type="package" version="1.3.1">bowtie</requirement>
        <requirement type="package" version="1.7.1">r-optparse</requirement>
        <requirement type="package" version="3.3.5">r-ggplot2</requirement>
        <requirement type="package" version="0.9.1">r-ggrepel</requirement>
  </requirements>
  <command detect_errors="exit_code"><![CDATA[
        ## Cascade annotation of small RNA reads.
        ##   1. Align every input sample to the primary ("genome") index.
        ##   2. Align the genome-matched reads to the first additional
        ##      reference, then the reads left unmatched by each step to the
        ##      next additional reference.
        ##   3. Append one tabular line per (sample, reference) to the output,
        ##      plus a final "Not classified" line per sample.
        ##   4. Export the still-unclassified reads of each sample as FASTA in
        ##      unmatched_dir, then draw the summary plot with barplot.r.
        ## Every Galaxy-provided path or name is single-quoted so that spaces
        ## or shell metacharacters in dataset names, index paths or collection
        ## identifiers cannot split or alter the command line.

        ## Primary index: built from a history FASTA, or taken from the
        ## bowtie_indexes data table.
        #if $refGenomeSource1.genomeSource == "history":
            bowtie-build --threads \${GALAXY_SLOTS:-4} -f '$refGenomeSource1.ownFile' genome 1>/dev/null &&
            #set index_path = 'genome'
        #else:
            #set index_path = $refGenomeSource1.index.fields.path
        #end if

        ## One index per additional reference; the dataset name is used as the
        ## index basename and as the label written to the output table.
        #for $i in $AdditionalQueries:
            bowtie-build --threads \${GALAXY_SLOTS:-4} -f '$i.ownFile' '$i.ownFile.name' 1>/dev/null &&
        #end for

        ## Alignment options shared by every bowtie call; the read format flag
        ## is derived from the first input dataset.
        #set method_prefix = "-v %s -k 1 --best" % str($mismatches)
        #if $input[0].is_of_type('fasta'):
            #set format = "-f"
        #elif $input[0].is_of_type('fastq'):
            #set format = "-q"
        #end if

        mkdir unmatched_dir &&

        #for $file in $input:
            #set sample=$file.element_identifier
            ## Start each sample from empty read files so that a sample with no
            ## genome-aligned (or no unaligned) reads can neither fail on a
            ## missing file nor reuse the files left by the previous sample.
            rm -f matched.fa unmatched.fa &&
            touch matched.fa unmatched.fa &&
            bowtie -p \${GALAXY_SLOTS:-4}
                   $method_prefix
                   --al matched.fa
                   --un unmatched.fa
                   --suppress 6,7,8
                   '$index_path' $format '$file' > tabular_bowtie_output.tab &&
            ## NOTE(review): read counts are computed as line count / 2. With
            ## FASTQ input the --al/--un files appear to be FASTQ as well (see
            ## the sed conversion below), which would make these counts twice
            ## the number of reads. Confirm against the expected test outputs
            ## before changing the divisor.
            genome_aligned=\$(wc -l < matched.fa) &&
            genome_aligned=\$(( \$genome_aligned/2)) &&
            #set counter = 0
            #for $i in $AdditionalQueries:
                #set $counter += 1
                ## First additional reference: use the genome-matched reads.
                ## Later references: use what the previous step left unmatched.
                #if $counter != 1:
                    #set to_align = "class_unmatched.fa"
                #else:
                    #set to_align = "matched.fa"
                #end if
                touch tmp_class_matched.fa tmp_class_unmatched.fa &&
                bowtie -p \${GALAXY_SLOTS:-4}
                    $method_prefix
                    --al tmp_class_matched.fa
                    --un tmp_class_unmatched.fa
                    --suppress 6,7,8
                    '$i.ownFile.name' $format '$to_align' > tabular_bowtie_output.tab &&
                class_aligned=\$(( \$(wc -l < tmp_class_matched.fa)/2)) &&
                class_unaligned=\$(( \$(wc -l < tmp_class_unmatched.fa)/2)) &&
                echo -e "$sample\t$i.ownFile.name\t\$class_aligned\t\${genome_aligned}" >> '$output' &&
                mv tmp_class_unmatched.fa class_unmatched.fa &&
                rm tmp_class_matched.fa &&
            #end for
            ## Reads that matched the genome but none of the additional references.
            remaining=\$(( \$(wc -l < class_unmatched.fa)/2)) &&
            echo -e "$sample\tNot classified\t\${remaining}\t\${genome_aligned}" >> '$output' &&
            cp class_unmatched.fa 'unmatched_dir/${sample}_unmatched.fasta' &&
            #if $format == '-q':
                ## FASTQ input: keep only header and sequence lines, and turn
                ## the leading "@" of each header into ">" to obtain FASTA.
                mv 'unmatched_dir/${sample}_unmatched.fasta' 'unmatched_dir/${sample}_unmatched.fastq' &&
                sed -n '1~4s/^@/>/p;2~4p' 'unmatched_dir/${sample}_unmatched.fastq' > 'unmatched_dir/${sample}_unmatched.fasta' &&
                rm 'unmatched_dir/${sample}_unmatched.fastq' &&
            #end if
        #end for
        ls -la unmatched_dir &&
        Rscript '$__tool_directory__/barplot.r' --input '$output' --barplot '$barplot'
        ]]></command>
  <inputs>
    <!-- Read datasets to annotate; each selected dataset is processed as one sample -->
    <param name="input" type="data" multiple="True" format="fasta,fastq" label="Input file: reads clipped from their adapter" help="Only with clipped, raw fasta or fastq files"/>
    <!-- Passed to bowtie as the -v value for every alignment step -->
    <param name="mismatches" type="select" label="Number of mismatches allowed" help="specify the number of mismatches allowed during alignments">
        <option value="0">0</option>
        <option value="1" selected="true">1</option>
        <option value="2">2</option>
        <option value="3">3</option>
    </param>
    <!-- First bowtie index selection: built-in index or FASTA from the history -->
    <conditional name="refGenomeSource1">
      <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Bowtie Built-ins were indexed using default options">
        <option value="indexed">Use a built-in index</option>
        <option value="history">Use one from the history</option>
      </param>
      <when value="indexed">
        <param name="index" type="select" label="Select a DNA reference index" help="if your genome of interest is not listed - contact instance administrator">
          <options from_data_table="bowtie_indexes"/>
        </param>
      </when>
      <when value="history">
        <param name="ownFile" type="data" format="fasta" label="Select a fasta file, to serve as index reference" />
      </when>
    </conditional>
    <!-- End of first bowtie index selection -->
    <!-- Other bowtie index selections from FASTA datasets in the history.
         At least one is mandatory (min="1"): the command reads the unmatched
         reads file produced by the last additional alignment step, which
         does not exist when no additional step is run. -->
    <repeat name="AdditionalQueries" title="Additional Alignment Step" min="1">
        <param name="ownFile" type="data" format="fasta" label="Select a fasta file, to serve as index reference" />
    </repeat>
    <!-- End of other bowtie index selections -->
   </inputs>
   <outputs>
       <!-- One FASTA dataset per sample, discovered from the files the
            command writes to unmatched_dir -->
       <collection name="unmatched" type="list" format="fasta" label="Annotate smRNAs: Unmatched reads">
           <discover_datasets pattern="__name_and_ext__" directory="unmatched_dir" />
       </collection>
       <!-- Table appended to by the command: one line per sample and
            reference, plus one "Not classified" line per sample -->
       <data format="tabular" name="output" label="Cascade Annotation Analysis">
           <actions>
               <!-- Column names are comma-separated; no space after the comma,
                    so that no name starts with a blank -->
               <action name="column_names" type="metadata" default="Sample,Reference Index,Number of reads,Total reads" />
           </actions>
       </data>
       <!-- PDF written by barplot.r from the table above -->
       <data name="barplot" format="pdf" label="barplot from ${on_string}" />
   </outputs>
    <tests>
        <!-- Five FASTA samples, history genome, two additional references -->
        <test>
            <param name="input" value="sample5.fa,sample4.fa,sample3.fa,sample2.fa,sample1.fa" ftype="fasta"/>
            <param name="genomeSource" value="history"/>
            <param name="ownFile" value="2L-tail.fa" ftype="fasta"/>
            <param name="AdditionalQueries_0|ownFile" value="dme_miR21_hairpin.fa" ftype="fasta"/>
            <param name="AdditionalQueries_1|ownFile" value="Ensembl_transposon_set.fa" ftype="fasta"/>
            <output name="output" ftype="tabular" file="multisample5_output.tab"/>
            <output name="barplot" ftype="pdf" file="multisample5_output.pdf" compare="sim_size" delta="500"/>
            <output_collection name="unmatched" type="list" count="5">
                <element name="sample5.fa_unmatched" file="unmatched_5.fa" ftype="fasta" compare="sim_size" delta="0"/>
                <element name="sample4.fa_unmatched" file="unmatched_4.fa" ftype="fasta" compare="sim_size" delta="0"/>
                <element name="sample3.fa_unmatched" file="unmatched_3.fa" ftype="fasta" compare="sim_size" delta="0"/>
                <element name="sample2.fa_unmatched" file="unmatched_2.fa" ftype="fasta" compare="sim_size" delta="0"/>
                <element name="sample1.fa_unmatched" file="unmatched_1.fa" ftype="fasta" compare="sim_size" delta="0"/>
            </output_collection>
        </test>
        <!-- Single FASTA sample, same references -->
        <test>
            <param name="input" value="sample1.fa" ftype="fasta"/>
            <param name="genomeSource" value="history"/>
            <param name="ownFile" value="2L-tail.fa" ftype="fasta"/>
            <param name="AdditionalQueries_0|ownFile" value="dme_miR21_hairpin.fa" ftype="fasta"/>
            <param name="AdditionalQueries_1|ownFile" value="Ensembl_transposon_set.fa" ftype="fasta"/>
            <output name="output" ftype="tabular" file="sample1_output.tab"/>
            <output name="barplot" ftype="pdf" file="sample1_output.pdf" compare="sim_size" delta="500"/>
            <output_collection name="unmatched" type="list">
                <element name="sample1.fa_unmatched" file="unmatched_6.fa" ftype="fasta" compare="sim_size" delta="0"/>
            </output_collection>
        </test>
        <!-- Single FASTQ sample, same references; unmatched reads are expected as FASTA -->
        <test>
            <param name="input" value="sample.fastq" ftype="fastq"/>
            <param name="genomeSource" value="history"/>
            <param name="ownFile" value="2L-tail.fa" ftype="fasta"/>
            <param name="AdditionalQueries_0|ownFile" value="dme_miR21_hairpin.fa" ftype="fasta"/>
            <param name="AdditionalQueries_1|ownFile" value="Ensembl_transposon_set.fa" ftype="fasta"/>
            <output name="output" ftype="tabular" file="sample_output.tab"/>
            <output name="barplot" ftype="pdf" file="sample_output.pdf" compare="sim_size" delta="500"/>
            <output_collection name="unmatched" type="list">
                <element name="sample.fastq_unmatched" file="unmatched_fastq.fa" ftype="fasta"/>
            </output_collection>
        </test>
    </tests>
  <help>

**Introduction**

Bowtie_ is a short read aligner designed to be ultrafast and memory-efficient.
A generic "Map with Bowtie for Illumina" Galaxy tool is available in the main Galaxy distribution.

Here, the sRbowtie wrapper works specifically with short-read FASTA or FASTQ inputs
(-v bowtie mode, with -k 1), which must be clipped of their adapters before alignment.

.. _Bowtie: http://bowtie-bio.sourceforge.net/index.shtml


------

**What it does**

.. class:: infomark

This script uses the sRbowtie wrapper to iteratively match reads against a series of reference indexes.
Reads that aligned to the first reference are realigned to the second reference.
From this point, unaligned reads are taken as input for alignment to the third reference, etc.


Reads are matched on DNA references (both strands) as fast as possible, without taking care of mapping issues

*-v [0,1,2,3] -k 1 --best -p 12 --suppress 6,7,8*

unaligned reads at step N are used as input for sRbowtie at step N+1

-----

**Input formats**

.. class:: warningmark

*Reads must be clipped from their adapter and provided in a FASTA or FASTQ format*

-----

**OUTPUTS**

**- Annotation table in a tabular format**

**- Pie Charts of class abundances**

**- Unmatched reads in fasta format**

  </help>

  <!-- Publication to cite for this tool, referenced by its DOI -->
  <citations>
    <citation type="doi">10.1038/nature11416</citation>
  </citations>

</tool>