Mercurial > repos > galaxyp > retrieve_ensembl_bed

<tool id="retrieve_ensembl_bed" name="Retrieve Ensembl features in BED format" version="0.1.0">
    <description>using Ensembl REST API</description>
    <!-- Imports shared macro definitions; the macros expanded in this file
         are presumably defined in macros.xml (not visible here). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Tool dependencies are declared through two macros instead of being listed inline. -->
    <requirements>
        <expand macro="ensembl_requirements" />
        <expand macro="bedutil_requirements" />
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
        ## Run the retrieval script for the chosen species; the output dataset
        ## path is passed as the final positional argument.
        python '$__tool_directory__/retrieve_ensembl_bed.py'
            --species '$species'
            #if $extended_bed
                --extended_bed
            #end if
            ## Boolean parameter: expands to its truevalue flag or to nothing.
            $ucsc_chrom_names
            #if $biotypes
                --biotypes '$biotypes'
            #end if
            #if $regions
                --regions '$regions'
            #end if
            #if $interval_file
                ## Pick the interval parser from the extension of the supplied dataset.
                #set $interval_ext = $interval_file.ext
                #if 'bed' in $interval_ext
                    #set $interval_format = 'bed'
                #elif $interval_ext in ('gff', 'gtf', 'gff3')
                    #set $interval_format = 'gff'
                #else
                    #set $interval_format = 'interval'
                #end if
                --interval_format $interval_format
                --interval_file '$interval_file'
            #end if
            '$transcript_bed'
    ]]></command>
    <inputs>
        <!-- Species to query; the selectable options come from the species_options macro. -->
        <param name="species" type="text" value="" label="Ensembl species" >
            <help>Ensembl species name, e.g. human or mouse</help>
            <expand macro="species_options" />
            <validator type="regex" message="Enter an Ensembl organism">^\w+.*$</validator>
        </param>
        <!-- The truevalue is the comma-prefixed list of extra column names. The command
             section only tests this parameter for truthiness to add the extended_bed flag,
             while the outputs section appends the value to the column_names metadata. -->
        <param name="extended_bed" type="boolean" truevalue=",second_name,cds_start_status,cds_end_status,exon_frames,type,gene_name,second_gene_name,gene_type" falsevalue="" checked="true"
               label="Keep extra columns from ensembl BED"/>
        <!-- Inserted verbatim into the command line: the flag when checked, nothing otherwise. -->
        <param name="ucsc_chrom_names" type="boolean" truevalue="--ucsc_chrom_names" falsevalue="" checked="false"
               label="Use the UCSC names for Chromosomes"/>
        <param name="biotypes" type="text" value="" optional="true" label="Restrict Feature retrieval to these biotypes" >
            <expand macro="biotypes_help" />
        </param>
        <!-- The regex accepts an empty value or a comma-separated list of chr, chr:pos or chr:from-to.
             NOTE(review): the usage text in the help section shows a strand suffix (3:100-500+),
             which this regex rejects; confirm whether a strand suffix should be allowed here. -->
        <param name="regions" type="text" value="" optional="true" label="Restrict Feature retrieval to comma-separated list of regions" >
            <help>Each region is specified as: chr or chr:pos or chr:from-to</help>
            <validator type="regex" message="Enter a comma-separated list of regions, each given as chr, chr:pos or chr:from-to">^(\w+(:\d+(-\d+)?)?(,\w+(:\d+(-\d+)?)?)*)?$</validator>
        </param>
        <!-- Optional dataset of intervals; the command section chooses the interval format from its extension. -->
        <param name="interval_file" type="data" format="bed,gff,interval" label="Retrieve the intervals from this file" optional="true"/>
    </inputs>
    <outputs>
        <!-- BED dataset whose path is handed to the script as its final positional argument;
             the history label includes the chosen species. -->
        <data name="transcript_bed" format="bed" label="Ensembl ${species} transcripts.bed">
            <actions>
                <!-- Sets the column_names metadata to the 12 standard BED column names followed by
                     the value of the extended_bed parameter: its truevalue is a comma-prefixed list
                     of extra column names and its falsevalue is empty. -->
                <action name="column_names" type="metadata"
                 default="chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts${extended_bed}"/>
            </actions>
        </data>
    </outputs>
    <tests>
        <!-- NOTE(review): both tests presumably need network access to the Ensembl REST API; confirm
             before running them in an offline CI environment. -->
        <!-- Region-restricted query: human protein_coding features within 1:51194990-51275150.
             Passes when the output contains a line matching chromosome 1 (with or without a
             "chr" prefix), two numeric columns and a name starting with ENST. -->
        <test>
            <param name="species" value="human"/>
            <param name="biotypes" value="protein_coding"/>
            <param name="regions" value="1:51194990-51275150"/>
            <output name="transcript_bed">
                <assert_contents>
                    <has_text_matching expression="(chr)?1\t\d+\t\d+\tENST" />
                </assert_contents>
            </output>
        </test>
        <!-- Interval-file query: mouse protein_coding features for the intervals in test.bed,
             supplied with the bed datatype. Passes when a matching line has a name starting
             with ENSMUST. -->
        <test>
            <param name="species" value="mouse"/>
            <param name="biotypes" value="protein_coding"/>
            <param name="interval_file" ftype="bed" value="test.bed"/>
            <output name="transcript_bed">
                <assert_contents>
                    <has_text_matching expression="(chr)?1\t\d+\t\d+\tENSMUST" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
Retrieve Ensembl transcript features in BED format using the Ensembl REST API

usage: retrieve_ensembl_bed.py [-h] [-s SPECIES] [-R REGIONS]
                               [-i INTERVAL_FILE] [-f {bed,gff,interval}]
                               [-B BIOTYPES] [-X] [-U] [-t] [-v] [-d]
                               output

positional arguments:
  output                   Output BED filepath, or for stdout: "-"

optional arguments:
  -h, --help               show this help message and exit
  -s SPECIES, --species SPECIES
                           Ensembl Species to retrieve
  -R REGIONS, --regions REGIONS
                           Restrict Ensembl retrieval to regions e.g.
                           X,2:20000-25000,3:100-500+
  -i INTERVAL_FILE, --interval_file INTERVAL_FILE
                           Regions from a bed, gff, or interval file

  -f {bed,gff,interval},  --interval_format {bed,gff,interval}
                           Interval format has TAB-separated
                           columns: Seq, Start, End, Strand

  -B BIOTYPES, --biotypes BIOTYPES
                           Restrict Ensembl biotypes to retrieve
  -X, --extended_bed       Include the extended columns returned from Ensembl
  -U, --ucsc_chrom_names
                           Use the UCSC names for Chromosomes
  -t, --toplevel           Print Ensembl toplevel for species
  -v, --verbose            Verbose
  -d, --debug              Debug


**Output**

Ensembl REST API returns an extended BED format with these additional columns::

  second_name, cds_start_status, cds_end_status, exon_frames, type, gene_name, second_gene_name, gene_type

    ]]></help>
    <!-- Publications to cite when using this tool, referenced by DOI. -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/btu613</citation>
        <citation type="doi">10.1093/nar/gku1010</citation>
    </citations>
</tool>
author	galaxyp
date	Mon, 07 Oct 2019 16:14:39 -0400
parents	da1b538b87e5
children