Mercurial > repos > iuc > gemini_load

<tool id="gemini_@BINARY@" name="GEMINI @BINARY@" version="@VERSION@+galaxy2">
    <description>Loading a VCF file into GEMINI</description>
    <!-- Shared definitions are imported from gemini_macros.xml. The BINARY
         token defined here is substituted into the tool id, the tool name
         and the gemini subcommand on the command line. -->
    <macros>
        <import>gemini_macros.xml</import>
        <token name="@BINARY@">load</token>
    </macros>
    <!-- NOTE(review): the three macros expanded below are not defined in this
         file; presumably they come from gemini_macros.xml - confirm there. -->
    <expand macro="requirements" />
    <expand macro="stdio" />
    <expand macro="version_command" />
    <command>
<![CDATA[
        ## NOTE(review): the token on the next line is not defined in this
        ## file; presumably it comes from gemini_macros.xml and makes the
        ## selected annotation data available to the job - confirm there.
        @PROVIDE_ANNO_DATA@

        ## gemini load expects a bgzipped and tabixed vcf as input.
        ## A plain vcf dataset is compressed and indexed here; for any other
        ## accepted input the dataset and its tabix index metadata file are
        ## only symlinked under the expected names.
        #set $tabixed_file = 'input.vcf.gz'
        #if $infile.ext == 'vcf':
            bgzip -c '$infile' > '$tabixed_file' &&
            tabix -p vcf '$tabixed_file' &&
        #else:
            ln -s '$infile' '$tabixed_file' &&
            ln -s '$infile.metadata.tabix_index' '${tabixed_file}.tbi' &&
        #end if

        ## Split the multi-select into its individual values once, so the
        ## checks below are exact matches rather than substring tests on the
        ## comma-joined string. An empty selection renders as the single
        ## value 'None', which matches none of the option values.
        #set $selected_content = str($opt_content).split(',')

        gemini
            @BINARY@
            -v '$tabixed_file'
            #if str( $annotation_type ) != "None":
                -t $annotation_type
            #end if

            ## Renders as --no-genotypes when the box is unchecked, else empty.
            $has_genotypes

            #if $ped:
                -p '$ped'
            #end if

            ## Content that is loaded by default is switched off with a skip
            ## flag whenever its checkbox is NOT selected ...
            #if 'gerp_bp' not in $selected_content:
                --skip-gerp-bp
            #end if
            #if 'cadd' not in $selected_content:
                --skip-cadd
            #end if
            #if 'gene_tables' not in $selected_content:
                --skip-gene-tables
            #end if
            #if 'genotypes' not in $selected_content:
                --no-load-genotypes
            #end if
            #if 'gt_pl' not in $selected_content:
                --skip-pls
            #end if
            ## ... while opt-in behaviour is enabled only when selected.
            #if 'passonly' in $selected_content:
                --passonly
            #end if
            #if 'info_string' in $selected_content:
                --save-info-string
            #end if

            ## The escaped dollar sign leaves the expansion to the shell at
            ## job run time; 4 cores are used if GALAXY_SLOTS is unset.
            --cores \${GALAXY_SLOTS:-4}

            '$outfile'
]]>
    </command>
    <inputs>
        <!-- Variant input: plain or bgzip-compressed VCF. -->
        <param name="infile" type="data" format="vcf,vcf_bgzip"
               label="VCF dataset to be loaded in the GEMINI database"
               help="Only build 37 (aka hg19) of the human genome is supported.">
            <options>
                <filter type="add_value" value="hg19"/>
                <filter type="add_value" value="Homo_sapiens_nuHg19_mtrCRS"/>
                <filter type="add_value" value="hg_g1k_v37"/>
            </options>
        </param>

        <!-- Annotation flavour; the value "None" suppresses the -t option
             on the command line. -->
        <param name="annotation_type" argument="-t" type="select"
               label="The variants in this input are"
               help="GEMINI can parse and use annotations generated with either snpEff (both 'EFF'- and 'ANN'-style annotations are supported) or VEP. You can also load unannotated variants, but most of GEMINI's functionality will not be available or not be very useful without annotations.">
            <option value="snpEff" selected="True">annotated with snpEff</option>
            <option value="VEP">annotated with VEP</option>
            <option value="None">not annotated (not recommended)</option>
        </param>

        <!-- Inverted boolean: unchecking the box emits the no-genotypes
             flag, checking it emits nothing. -->
        <param name="has_genotypes" argument="--no-genotypes" type="boolean"
               truevalue="" falsevalue="--no-genotypes" checked="True"
               label="This input comes with genotype calls for its samples"
               help="This is usually the case, but some published datasets, like some 1000G VCFs, are missing genotype information."/>

        <expand macro="annotation_dir"/>

        <!-- Optional pedigree dataset, passed with -p when provided. -->
        <param name="ped" argument="-p" type="data" format="tabular" optional="True"
               label="Sample and family information in PED format"
               help="The pedigree dataset is optional, but several GEMINI tools require the relationship between samples (i.e., the family structure) and/or the sample phenotype to be defined. The PED format is a simple tabular format (see the tool help below for details). If you choose to not provide sample information now, but later find that you need it for your analysis, you can also add it to an existing GEMINI database by using the GEMINI amend tool."/>

        <!-- Checkbox list that the command template translates into skip
             flags (for deselected default content) and opt-in flags. -->
        <param name="opt_content" type="select" display="checkboxes"
               multiple="true" optional="true"
               label="Load the following optional content into the database"
               help="The preselected defaults should be ok for most use cases (feel free to enable CADD scores for non-commercial use). If you are not interested in certain annotations, you can speed up database creation and decrease the resulting database size slightly by not loading them into the database. Note: GERP and CADD scores are optional parts of the annotation source and can only be loaded if available.">
            <option value="gerp_bp" selected="true">GERP scores</option>
            <option value="cadd" selected="false">CADD scores (non-commercial use only; see licensing note below)</option>
            <option value="gene_tables" selected="true">Gene tables</option>
            <option value="genotypes" selected="true">Sample genotypes</option>
            <option value="gt_pl" selected="true">Genotype likelihoods (sample PLs)</option>
            <option value="passonly" selected="false">only variants that passed all filters</option>
            <option value="info_string" selected="false">variant INFO field</option>
        </param>
    </inputs>
    <outputs>
        <!-- The GEMINI database; its path is passed to gemini as the final
             positional argument ('$outfile') of the command line. -->
        <data name="outfile" format="gemini.sqlite" />
    </outputs>
    <tests>
        <!-- 1: plain VCF with GERP and CADD deselected; only the two
             corresponding skip flags may appear on the command line. -->
        <test>
            <param name="annotation_databases" value="1999-01-01"/>
            <param name="infile" value="gemini_load_input.vcf" ftype="vcf" dbkey="hg19"/>
            <param name="opt_content" value="gene_tables,genotypes,gt_pl"/>
            <output name="outfile" ftype="gemini.sqlite"
                    file="gemini_load_result1.db"
                    compare="sim_size" delta="1000"/>
            <assert_command>
                <has_text text="--skip-gerp-bp"/>
                <has_text text="--skip-cadd"/>
                <not_has_text text="--skip-gene-tables"/>
                <not_has_text text="--skip-pls"/>
                <not_has_text text="--no-load-genotypes"/>
                <not_has_text text="--passonly"/>
                <not_has_text text="--save-info-string"/>
                <not_has_text text="--no-genotypes"/>
            </assert_command>
        </test>

        <!-- 2: GERP and CADD selected; no skip flags are passed and the
             tool is expected to report on stderr that both annotation
             files could not be found. -->
        <test>
            <param name="annotation_databases" value="1999-01-01"/>
            <param name="infile" value="gemini_load_input.vcf" ftype="vcf" dbkey="hg19"/>
            <param name="opt_content" value="gerp_bp,cadd,gene_tables,genotypes,gt_pl"/>
            <param name="has_genotypes" value="True"/>
            <output name="outfile" ftype="gemini.sqlite"
                    file="gemini_load_result1.db"
                    compare="sim_size" delta="1000"/>
            <assert_stderr>
                <has_text text="CADD scores are not being loaded because the annotation file could not be found."/>
                <has_text text="GERP per bp is not being loaded because the annotation file could not be found."/>
            </assert_stderr>
            <assert_command>
                <not_has_text text="--skip-gerp-bp"/>
                <not_has_text text="--skip-cadd"/>
                <not_has_text text="--skip-gene-tables"/>
                <not_has_text text="--skip-pls"/>
                <not_has_text text="--no-load-genotypes"/>
                <not_has_text text="--passonly"/>
                <not_has_text text="--save-info-string"/>
                <not_has_text text="--no-genotypes"/>
            </assert_command>
        </test>

        <!-- 3: input declared as lacking genotype calls and gene tables
             deselected; the no-genotypes and skip-gene-tables flags must
             both be present. -->
        <test>
            <param name="annotation_databases" value="1999-01-01"/>
            <param name="infile" value="gemini_load_input.vcf" ftype="vcf" dbkey="hg19"/>
            <param name="opt_content" value="genotypes,gt_pl"/>
            <param name="has_genotypes" value="False"/>
            <output name="outfile" ftype="gemini.sqlite"
                    file="gemini_load_result2.db"
                    compare="sim_size" delta="1000"/>
            <assert_command>
                <has_text text="--skip-gerp-bp"/>
                <has_text text="--skip-cadd"/>
                <has_text text="--skip-gene-tables"/>
                <not_has_text text="--skip-pls"/>
                <not_has_text text="--no-load-genotypes"/>
                <not_has_text text="--passonly"/>
                <not_has_text text="--save-info-string"/>
                <has_text text="--no-genotypes"/>
            </assert_command>
        </test>

        <!-- 4: plain VCF loaded together with a PED dataset. -->
        <test>
            <param name="annotation_databases" value="1999-01-01"/>
            <param name="infile" value="gemini_amend.vcf" ftype="vcf" dbkey="hg19"/>
            <param name="opt_content" value="gene_tables,genotypes,gt_pl"/>
            <param name="has_genotypes" value="True"/>
            <param name="ped" value="gemini_amend.ped" ftype="tabular"/>
            <output name="outfile" ftype="gemini.sqlite"
                    file="gemini_auto_rec_input.db"
                    compare="sim_size" delta="1000"/>
        </test>

        <!-- 5: same as test 4, but with the bgzip-compressed VCF as input;
             the expected database is the same file. -->
        <test>
            <param name="annotation_databases" value="1999-01-01"/>
            <param name="infile" value="gemini_amend.vcf.gz" ftype="vcf_bgzip" dbkey="hg19"/>
            <param name="opt_content" value="gene_tables,genotypes,gt_pl"/>
            <param name="has_genotypes" value="True"/>
            <param name="ped" value="gemini_amend.ped" ftype="tabular"/>
            <output name="outfile" ftype="gemini.sqlite"
                    file="gemini_auto_rec_input.db"
                    compare="sim_size" delta="1000"/>
        </test>
    </tests>
    <help><![CDATA[
.. class:: Warning mark

**CADD scores licensing**

CADD scores are freely available for non-commercial applications only. Make
sure you `contact the developers <https://cadd.gs.washington.edu/contact>`__
before using them in any commercial application.

-----

**What it does**

Before we can use GEMINI to explore genetic variation, we must first load the
variant information stored in VCF format into the GEMINI database framework.

To fully leverage the power of GEMINI, you should first **annotate your VCF
dataset** with the functional consequences of the variants using either *VEP*
or *snpEff*.

.. class:: Warning mark

    To avoid problems during annotation, but also during later variant queries with
    GEMINI tools, it is good practice to preprocess your VCF dataset even before
    annotation to split records with multiple alternate alleles, and to left-align
    and trim indels. The authors of GEMINI recommend the tool *vt* for this purpose;
    an equally good option is *bcftools norm*. Galaxy wrappers exist for both
    tools.

In addition, you are encouraged to provide **family and sample phenotype
information in PED format**, if you are planning to use GEMINI for any kind of
variant identification based on inheritance patterns.

A PED file is simply a tabular text file (columns can be separated by either
spaces or TABs, but not a mixture of the two within the same file) with the
header::

  #family_id    name     paternal_id    maternal_id    sex    phenotype

and optional additional columns. The actual column names in the header are not
fixed, but there have to be at least six columns that are interpreted as
detailed next.

Subsequent lines describe one sample from the VCF input dataset each, where

- *family_id* is an alphanumeric identifier of a family

  If the family, to which the sample belongs, is unknown, a placeholder of
  ``0``, ``-9`` or ``None`` can be used to indicate this fact.

- *name* is the identifier of the sample described by the line

- *paternal_id* is the identifier of the sample's father

  If the sample's father is not available in the VCF, a placeholder of
  ``0``, ``-9`` or ``None`` can be used to indicate this fact.

- *maternal_id* is the identifier of the sample's mother

  If the sample's mother is not available in the VCF, a placeholder of
  ``0``, ``-9`` or ``None`` can be used to indicate this fact.

- *sex* is a numeric code for the sample's sex
  (1=male, 2=female, any other number=unknown sex)

- *phenotype* is a numeric code for the sample's phenotypic affection status
  (1=unaffected, 2=affected)

  If the sample's phenotype is unknown, a placeholder of ``0`` or ``-9`` can be
  used to indicate this fact.

- Optional additional columns can have any column name you like, and accept any
  per-sample value. The data from such extra columns will be added to the
  samples table of the GEMINI database so you can use them in queries. Extra
  columns can be used, *e.g.*, to describe additional phenotypes.

- If no extra columns are present in a PED file, then the header line is
  optional.

Here are two examples of valid PED file contents::

  #family_id    name    paternal_id    maternal_id    sex    phenotype    hair_color
  1             M10475  -9             -9             1      1            brown
  1             M10478  M10475         M10500         2      2            brown
  1             M10500  -9             -9             2      2            black
  1             M128215 M10475         M10500         1      1            blue

This describes a family with two kids, in which mother and daughter, but not
father and son, are phenotypically affected. The file also stores the hair color
of all family members.

::

  #family_id    name         paternal_id    maternal_id    sex    phenotype
  0             M10475       0              0              -1     1
  0             M10478       0              0              -1     2
  0             M10500       0              0              -1     2
  0             M128215      0              0              -1     1

This describes the same samples as above, but without recording family
structure, sex or additional traits. Only the sample phenotypes are provided.
In this case (no extra columns), the header line could be omitted.

    ]]></help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Fri, 24 Jan 2020 17:33:33 -0500
parents	b2c25142267e
children	2270a8b83c12