view repeatmasker.xml @ 8:ade773848c3d draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/repeat_masker commit b0aaae4488b95b54eaf85010f999b2638dfadd3e"
author iuc
date Thu, 23 Jan 2020 04:40:29 -0500
parents d7540a923c7b
children 438f65cb1d14
line wrap: on
line source

<tool id="repeatmasker_wrapper" name="RepeatMasker" version="4.0.7+galaxy2" profile="17.01">
  <description>screen DNA sequences for interspersed repeats and low complexity regions</description>

  <!-- Software the tool needs at run time: the repeatmasker package, version 4.0.7. -->
  <requirements>
    <requirement version="4.0.7" type="package">repeatmasker</requirement>
  </requirements>

  <command detect_errors="exit_code"><![CDATA[
    ## Locate the RepeatMasker executable. The "|| true" keeps the command chain
    ## alive when the lookup fails, so that the explicit emptiness check on the
    ## next line is reachable and can print its diagnostic before exiting.
    RM_PATH=\$(which RepeatMasker || true) &&
    if [ -z "\$RM_PATH" ] ; then echo "Failed to find RepeatMasker in PATH (\$PATH)" >&2 ; exit 1 ; fi &&
    ## Build a job-local library directory (lib) and symlink every entry of the
    ## Libraries directory found next to the executable into it.
    RM_LIB_PATH=\$(dirname \$RM_PATH)/../share/RepeatMasker/Libraries &&
    mkdir lib &&
    export REPEATMASKER_LIB_DIR=\$(pwd)/lib &&
      for file in \$(ls \$RM_LIB_PATH) ; do  ln -s \$RM_LIB_PATH/\$file lib/\$file ; done &&
    ## In RepBase mode the uploaded EMBL file is copied into the job-local library
    ## directory under the name given by the hidden repbase_file_name parameter.
    #if $repeat_source.source_type == "repbase":
      cp '${repeat_source.repbase_file}' 'lib/${repeat_source.repbase_file_name}' &&
    #end if
    ## A fixed input name gives predictable output file names (rm_input.fasta.*).
    ln -s '${input_fasta}' rm_input.fasta &&
    RepeatMasker -dir \$(pwd)
    #if $repeat_source.source_type == "library":
      -lib '${repeat_source.repeat_lib}'
      -cutoff '${repeat_source.cutoff}'
    #else if $repeat_source.source_type == "repbase":
      #if $repeat_source.species_source.species_from_list == 'yes':
        ## The select option values already contain the "-species" flag.
        $repeat_source.species_source.species_list
      #else
        -species '${repeat_source.species_source.species_name}'
      #end if
    #end if
    -parallel \${GALAXY_SLOTS:-1}
    ## Boolean and select parameters below expand to their flag text or to nothing.
    ${gff}
    ${excln}
    ${advanced.is_only}
    ${advanced.is_clip}
    ${advanced.no_is}
    ${advanced.rodspec}
    ${advanced.primspec}
    ${advanced.nolow}
    ${advanced.noint}
    ${advanced.norna}
    ${advanced.alu}
    ${advanced.div}
    ${advanced.search_speed}
    -frag ${advanced.frag}
    ## -maxsize ${advanced.maxsize}
    #if str($advanced.gc):
      -gc ${advanced.gc}
    #end if
    ${advanced.gccalc}
    ${advanced.nocut}
    ${advanced.keep_alignments}
    ${advanced.invert_alignments}
    ${advanced.xout}
    ${advanced.xsmall}
    ${advanced.poly}
    rm_input.fasta &&
    ## Collect results. Nothing but the repeat catalogue is collected when
    ## -is_only was requested.
    #if $advanced.is_only != '-is_only':
      mv rm_input.fasta.masked '${output_masked_genome}' &&
      ## Convert the .out report to tab-separated text and replace its first two
      ## lines with a single header row.
      sed -E 's/^ *// ; s/ *$//; s/\+ //; s/ +/\t/g ;  1,2c SW score\t% div.\t% del.\t% ins.\tquery sequence\tpos in  query: begin\tend\t(left)\trepeat\tclass/family\tpos in repeat: begin\tend\t(left)\tID' rm_input.fasta.out >'${output_log}' &&
      mv rm_input.fasta.tbl '${output_table}' &&
      #if $gff == '-gff':
        mv rm_input.fasta.out.gff '${output_gff}' &&
      #end if
      #if $advanced.keep_alignments == '-ali':
        mv rm_input.fasta.align '${output_alignment}' &&
      #end if
      #if $advanced.poly == '-poly':
        sed -E 's/^ *// ; s/ *$//; s/\+ //; s/ +/\t/g' rm_input.fasta.polyout >'${output_polymorphic}' &&
      #end if
    #end if
    ## The catalogue may be written gzip-compressed or plain; handle both.
    if [ -f 'rm_input.fasta.cat.gz' ]; then
      zcat 'rm_input.fasta.cat.gz' > '${output_repeat_catalog}';
    else
      mv rm_input.fasta.cat '${output_repeat_catalog}';
    fi
    ]]>
  </command>

  <inputs>
    <!-- The sequence to be screened; the command links it into the job directory as rm_input.fasta. -->
    <param name="input_fasta" label="Genomic DNA" type="data" format="fasta" />
    <!-- Chooses where the repeat library comes from: an uploaded RepBase EMBL file or a custom FASTA library. -->
    <conditional name="repeat_source">
      <param label="Repeat library source" name="source_type" type="select">
        <option selected="true" value="repbase">RepBase</option>
        <option value="library">Custom library of repeats</option>
      </param>
      <when value="repbase">
        <param name="repbase_file" type="data" format="embl" label="RepBase (RMRBSeqs.embl) file" />
        <param name="repbase_file_name" type="hidden" value="RMRBSeqs.embl"/> <!-- This is an ugly hack to allow testing with a fake repbase -->
        <conditional name="species_source">
          <param label="Select species name from a list?" name="species_from_list" type="select">
            <option value="yes" selected="true">Yes</option>
            <option value="no">No</option>
          </param>
          <when value="yes">
            <!-- Option values carry the complete "-species ..." argument and are placed on the command line as-is. -->
            <!-- Exactly one option is marked as selected: a single-choice select cannot have two defaults. -->
            <!-- NOTE(review): "fungi" was previously also marked selected; confirm that "anopheles" is the intended default. -->
            <param name="species_list" type="select" label="Species">
              <option value="-species anopheles" selected="true">anopheles</option>
              <option value="-species arabidopsis">arabidopsis</option>
              <option value="-species artiodactyl">artiodactyl</option>
              <option value="-species aspergillus">aspergillus</option>
              <option value="-species carnivore">carnivore</option>
              <option value="-species cat">cat</option>
              <option value="-species chicken">chicken</option>
              <option value="-species 'ciona intestinalis'">ciona intestinalis</option>
              <option value="-species 'ciona savignyi'">ciona savignyi</option>
              <option value="-species cow">cow</option>
              <option value="-species danio">danio</option>
              <!-- NOTE(review): the value ("diatoaea") and the label ("diatomea") are spelled differently; confirm which spelling RepeatMasker accepts. -->
              <option value="-species diatoaea">diatomea</option>
              <option value="-species dog">dog</option>
              <option value="-species drosophila">drosophila</option>
              <option value="-species elegans">elegans</option>
              <option value="-species fugu">fugu</option>
              <option value="-species fungi">fungi</option>
              <option value="-species human">human</option>
              <option value="-species maize">maize</option>
              <option value="-species mammal">mammal</option>
              <option value="-species mouse">mouse</option>
              <option value="-species pig">pig</option>
              <option value="-species rat">rat</option>
              <option value="-species rice">rice</option>
              <option value="-species rodentia">rodentia</option>
              <option value="-species ruminantia">ruminantia</option>
              <option value="-species wheat">wheat</option>
            </param>
          </when>
          <when value="no">
            <!-- Free-text species or clade name; the command passes it as the quoted argument of -species. -->
            <param name="species_name" type="text" value="homo sapiens" label="Repeat source species" help="Source species (or clade name) used to select repeats from RepBase" />
          </when>
        </conditional>
      </when>
      <when value="library">
        <param name="repeat_lib" type="data" format="fasta" label="Custom library of repeats" />
        <param name="cutoff" type="integer" argument="-cutoff" value="225" label="Cutoff score for masking repeats" />
      </when>
    </conditional>
    <!-- Top-level switches: optional GFF annotation output (off by default) and exclusion of N runs from the statistics (on by default). -->
    <param argument="-gff" type="boolean" truevalue="-gff" falsevalue="" checked="false" label="Output annotation of repeats in GFF format" />
    <param argument="-excln" type="boolean" checked="true" truevalue="-excln" falsevalue="" label="Ignore stretches of Ns when computing statistics" help="Scaffolds are sometimes joined with stretches of 25 or more Ns. This option ignores them when calculating repeat statistics" />
    <section name="advanced" title="Advanced options" expanded="false">
      <!-- Each boolean below puts its truevalue on the RepeatMasker command line when checked and an empty string otherwise. -->
      <param argument="-is_only" type="boolean" truevalue="-is_only" falsevalue="" checked="false" label="Only clip E coli insertion elements" />
      <param argument="-is_clip" type="boolean" truevalue="-is_clip" falsevalue="" checked="false" label="Clip IS elements before analysis" help="Normally RepeatMasker will report on IS element, with this option selected it will clip them before analysis" />
      <param argument="-no_is" type="boolean" truevalue="-no_is" falsevalue="" checked="false" label="Skip bacterial insertion element check" />
      <param argument="-rodspec" type="boolean" truevalue="-rodspec" falsevalue="" checked="false" label="Only check for rodent specific repeats" help="If this option is selected a check for rodent specific repeats is done instead of a full RepeatMasker run" />
      <param argument="-primspec" type="boolean" truevalue="-primspec" falsevalue="" checked="false" label="Only check for primate specific repeats" help="If this option is selected a check for primate specific repeats is done instead of a full RepeatMasker run" />
      <param argument="-nolow" type="boolean" truevalue="-nolow" falsevalue="" checked="false" label="No low complexity masking" help="Skip masking of simple tandem repeats and low complexity regions." />
      <param argument="-noint" type="boolean" truevalue="-noint" falsevalue="" checked="false" label="No interspersed repeat masking" help="Only mask simple repeats, skip masking of interspersed repeats." />
      <param argument="-norna" type="boolean" truevalue="-norna" falsevalue="" checked="false" label="No repeat-like-RNA masking" help="Skip masking of small pol III transcribed RNA (these are masked by default because they resemble SINEs)" />
      <param argument="-alu" type="boolean" truevalue="-alu" falsevalue="" checked="false" label="Limit masking to (primate) Alu repeats" />
      <!-- NOTE(review): RepeatMasker appears to document -div as taking a numeric divergence threshold; passed as a bare flag it may consume the following argument. Confirm against the RepeatMasker help before relying on this option. -->
      <param argument="-div" type="boolean" truevalue="-div" falsevalue="" checked="false" label="Limit masking to less diverged (younger) repeats" />
      <param type="select" name="search_speed" label="Search speed vs sensitivity trade-off">
        <option value="">Default</option>
        <option value="-q">Quick (5-10% less sensitive, 3-4 times speedup)</option>
        <option value="-qq">Rush (10% less sensitive)</option>
        <option value="-s">Slow (0-5% more sensitive, 2.5 times slowdown)</option>
      </param>
      <param type="integer" argument="-frag" value="40000" label="Maximum contiguous sequence searched" help="Maximum length of sequence that is searched without fragmenting" />
      <!-- -maxsize option is in the help, but not in the code of repeatmasker-->
      <!--param type="integer" argument="-maxsize" value="4000000" label="Maximum length for IS or repeat clipped sequences" /-->
      <!-- Optional: the command only adds -gc when a value has been entered. -->
      <param type="integer" argument="-gc" optional="true" label="Select matrices for this GC%" help="Valid values are a percentage or -1 to choose the default" />
      <!-- The flag emitted must match the option name exactly (-gccalc). -->
      <param type="boolean" argument="-gccalc" truevalue="-gccalc" falsevalue="" checked="false" label="Calculate GC % for all sequences" help="By default RepeatMasker skips calculating GC % for small sequences" />
      <param type="boolean" argument="-nocut" truevalue="-nocut" falsevalue="" checked="false" label="Skips cutting of repeats" />
      <param name="xout" type="boolean" argument="-x" truevalue="-x" falsevalue="" checked="false" label="Mask with X instead of N characters" />
      <param name="keep_alignments" type="boolean" argument="-ali" truevalue="-ali" falsevalue="" checked="false" label="Output alignments file" />
      <param name="invert_alignments" type="boolean" argument="-inv" truevalue="-inv" falsevalue="" checked="false" label="Invert alignments in alignment file" help="Show alignments in the orientation of the repeat sequence, not the query sequence" />
      <param type="boolean" argument="-xsmall" truevalue="-xsmall" falsevalue="" checked="false" label="Output repetitive regions as lowercase, non-repetitive regions as uppercase" />
      <param type="boolean" argument="-poly" truevalue="-poly" falsevalue="" checked="false" label="Output list of potentially polymorphic microsatellites" />
    </section>
  </inputs>
  <outputs>
    <!-- Only the repeat catalogue has no filter; every other dataset is dropped when -is_only is selected. -->
    <data format="fasta" name="output_masked_genome" label="RepeatMasker masked sequence on ${on_string}">
      <filter>not advanced['is_only']</filter>
    </data>
    <data format="tabular" name="output_log" label="RepeatMasker output log on ${on_string}">
      <filter>not advanced['is_only']</filter>
    </data>
    <data format="txt" name="output_table" label="RepeatMasker repeat statistics on ${on_string}">
      <filter>not advanced['is_only']</filter>
    </data>
    <data format="txt" name="output_repeat_catalog" label="RepeatMasker repeat catalogue on ${on_string}" />
    <!-- The remaining datasets additionally require their own option to be switched on. -->
    <data format="txt" name="output_alignment" label="RepeatMasker alignment on ${on_string}">
      <filter>not advanced['is_only'] and advanced['keep_alignments']</filter>
    </data>
    <data format="tabular" name="output_polymorphic" label="RepeatMasker possible polymorphic repeats on ${on_string}">
      <filter>not advanced['is_only'] and advanced['poly']</filter>
    </data>
    <data format="gff" name="output_gff" label="RepeatMasker repeat annotation on ${on_string}">
      <filter>not advanced['is_only'] and gff is True</filter>
    </data>
  </outputs>
  <tests>
    <!-- Custom repeat library with default options: masked sequence, statistics table, catalogue and log are expected. -->
    <test expect_num_outputs="4">
      <param name="input_fasta" value="small.fasta" ftype="fasta" />
      <param name="source_type" value="library" />
      <param name="repeat_lib" value="repeats.fasta" ftype="fasta" />
      <output name="output_masked_genome" file="small.fasta.masked" />
      <output name="output_table" file="small.fasta.stats" lines_diff="6" />
      <output name="output_repeat_catalog" file="small.fasta.cat" lines_diff="2" />
      <output name="output_log" file="small.fasta.log" lines_diff="2"/>
    </test>
    <!-- Custom repeat library with GFF, alignment and polymorphic outputs switched on: all seven datasets are expected. -->
    <test expect_num_outputs="7">
      <param name="input_fasta" value="small.fasta" ftype="fasta" />
      <param name="source_type" value="library" />
      <param name="gff" value="-gff" />
      <!-- <param name="show" value="yes" /> -->
      <param name="keep_alignments" value="-ali" />
      <param name="poly" value="-poly" />
      <param name="repeat_lib" value="repeats.fasta" ftype="fasta" />
      <output name="output_masked_genome" file="small.fasta.masked" />
      <output name="output_table" file="small.fasta.stats" lines_diff="6" />
      <output name="output_repeat_catalog" file="small.fasta.cat" lines_diff="2" />
      <output name="output_log" file="small.fasta.log" lines_diff="2"/>
      <output name="output_alignment" file="small.fasta.align" />
      <output name="output_polymorphic" file="small.fasta.poly" />
      <output name="output_gff" file="small.fasta.gff" lines_diff="4" />
    </test>
    <!-- RepBase mode: a stand-in EMBL file is supplied and the hidden repbase_file_name parameter is overridden so it is copied into lib/ as fake.embl. -->
    <test expect_num_outputs="4">
      <param name="input_fasta" value="small.fasta" ftype="fasta" />
      <param name="source_type" value="repbase" />
      <param name="repbase_file" value="fake_repbase.embl" />
      <param name="repbase_file_name" value="fake.embl" />
      <param name="species_list" value="anopheles" />
      <output name="output_masked_genome" file="small.fasta.masked" />
      <output name="output_table" file="small_repbase.fasta.stats" lines_diff="2" />
      <output name="output_repeat_catalog" file="small.fasta.cat" lines_diff="2" />
      <output name="output_log" file="small_repbase.fasta.log" lines_diff="2"/>
    </test>
  </tests>
  <help><![CDATA[
RepeatMasker is a program that screens DNA for interspersed repeats and low
complexity DNA sequences. The database of repeats to screen for can be
provided as a FASTA file or downloaded from RepBase_. If the RepBase option is
chosen, the RepBaseRepeatMaskerEdition file should be downloaded and
unpacked, and the enclosed EMBL format file ('RMRBSeqs.embl') should
be uploaded to Galaxy for use with this tool.

Further documentation is available on the RepeatMasker homepage_.

.. _RepBase: http://www.girinst.org/repbase/
.. _homepage: http://www.repeatmasker.org/webrepeatmaskerhelp.html
    ]]>
  </help>
  <citations>
    <!-- BibTeX entry for the RepeatMasker software itself (RepeatMasker Open-4.0). -->
    <citation type="bibtex">
      @misc{RepeatMasker,
        title = {RepeatMasker Open-4.0},
        howpublished = {\url{http://www.repeatmasker.org}},
        author = {Smit, AFA and Hubley, R and Green, P.},
        year = {2013-2015}}
    </citation>
  </citations>
</tool>