view snpSift_extractFields.xml @ 3:20c7d583fec1 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpsift/snpsift commit fbc18d9128669e461e76ed13307ee88dd774afa5
author iuc
date Mon, 12 Jun 2017 10:25:32 -0400
parents bf8c1526871b
children 09d6806c609e
line wrap: on
line source

<tool id="snpSift_extractFields" name="SnpSift Extract Fields" version="@WRAPPER_VERSION@.0">
    <!-- Switches off Galaxy's default sanitization of parameter values for this tool, so text
         inputs reach the command template unmodified.
         NOTE(review): presumably needed so characters such as [ ] * in field expressions
         survive; confirm. The command template is then responsible for shell quoting. -->
    <options sanitize="False" />
    <!-- Short text shown after the tool name in the Galaxy tool panel. -->
    <description>from a VCF file into a tabular file</description>
    <!-- Shared macro file; presumably defines the "requirements", "stdio", "version_command"
         and "citations" macros expanded in this tool, plus the @...@ tokens used in it
         (the file is not visible from here; verify against snpSift_macros.xml). -->
    <macros>
        <import>snpSift_macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <expand macro="stdio" />
    <expand macro="version_command" />
    <!--
        Cheetah template for the shell pipeline:
          1. stream the input VCF with cat;
          2. optionally pipe it through vcfEffOnePerLine.pl (one effect per output line);
          3. pipe it into "SnpSift extractFields", reading the VCF from stdin ("-"),
             and write the resulting table to the output dataset.
        Parameter sanitization is disabled for this tool, so every user-supplied text value
        is wrapped in single quotes and any embedded single quote is rewritten as '\''
        before it is placed on the command line. This prevents shell expansion of the value.
    -->
    <command><![CDATA[
@CONDA_SNPSIFT_JAR_PATH@ &&
cat '$input'
#if $one_effect_per_line:
    | "\$SNPSIFT_JAR_PATH/scripts/vcfEffOnePerLine.pl"
#end if
| SnpSift -Xmx6G extractFields
#if $separator:
    ## Escape embedded single quotes so the value cannot terminate its own quoting.
    #set $separator_quoted = str($separator).replace("'", "'\\''")
    -s '$separator_quoted'
#end if
#if $empty_text:
    ## Same escaping as for the separator.
    #set $empty_text_quoted = str($empty_text).replace("'", "'\\''")
    -e '$empty_text_quoted'
#end if
-
## One single-quoted shell word per whitespace-separated field expression.
## Single quotes (not double quotes) keep the shell from expanding dollar signs,
## backticks or glob characters such as [*] inside a field expression.
#echo ' '.join(["'%s'" % x.replace("'", "'\\''") for x in str($extract).split()])
> '$output'
    ]]></command>
    <inputs>
        <!-- VCF dataset that is streamed into SnpSift extractFields. -->
        <param name="input" type="data" format="vcf" label="Variant input file in VCF format"/>
        <!-- Whitespace-separated field expressions; each one becomes a separate argument.
             An empty value is rejected up front because the command would otherwise run
             without any field names. -->
        <param name="extract" type="text" label="Extract" help="Need help? See below a few examples.">
            <validator type="empty_field" message="Enter at least one field name to extract." />
        </param>
        <!-- When enabled, the VCF is first piped through vcfEffOnePerLine.pl. -->
        <param name="one_effect_per_line" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="One effect per line" help="When variants have more than one effect, lists one effect per line, while all other parameters in the line are repeated across multiple lines" />
        <!-- Optional; passed to extractFields as -s when non-empty. -->
        <param name="separator" type="text" value="" label="multiple field separator" help="Separate multiple fields in one column with this character, e.g. a comma, rather than a column for each of the multiple values" />
        <!-- Optional; passed to extractFields as -e when non-empty. -->
        <param name="empty_text" type="text" value="" label="empty field text" help="Represent empty fields with this value, rather than leaving them blank" />
    </inputs>
    <outputs>
        <!-- Tabular dataset that receives the redirected stdout of SnpSift extractFields. -->
        <data format="tabular" name="output"/>
    </outputs>
    <tests>
        <!-- No separator given: the output must mention INTRAGENIC but must not contain
             the comma-joined list of effects. -->
        <test>
            <param name="input" value="test_rmInfo.vcf" ftype="vcf"/>
            <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/>
            <output name="output">
                <assert_contents>
                    <has_text text="INTRAGENIC"/>
                    <not_has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME"/>
                </assert_contents>
            </output>
        </test>

        <!-- Separator set to a comma: the comma-joined list of effects must appear. -->
        <test>
            <param name="input" value="test_rmInfo.vcf" ftype="vcf"/>
            <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/>
            <param name="separator" value=","/>
            <output name="output">
                <assert_contents>
                    <has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
**SnpSift Extract Fields**

Extract fields from a VCF file into a tab-separated text format that you can easily load into R, Excel, etc.

http://snpeff.sourceforge.net/SnpSift.html#Extract

You can also use sub-fields and genotype fields / sub-fields such as::

    Standard VCF fields:
        CHROM
        POS
        ID
        REF
        ALT
        FILTER
    INFO fields:
        AF
        AC
        DP
        MQ
        etc. (any info field available)
    SnpEff 'ANN' fields:
        "ANN[*].ALLELE" (alias GENOTYPE)
        "ANN[*].EFFECT" (alias ANNOTATION): Effect in Sequence ontology terms (e.g. 'missense_variant', 'synonymous_variant', 'stop_gained', etc.)
        "ANN[*].IMPACT" { HIGH, MODERATE, LOW, MODIFIER }
        "ANN[*].GENE" Gene name (e.g. 'PSD3')
        "ANN[*].GENEID" Gene ID
        "ANN[*].FEATURE"
        "ANN[*].FEATUREID" (alias TRID: Transcript ID)
        "ANN[*].BIOTYPE" Biotype, as described by the annotations (e.g. 'protein_coding')
        "ANN[*].RANK" Exon or Intron rank (i.e. exon number in a transcript)
        "ANN[*].HGVS_C" (alias HGVS_DNA, CODON): Variant in HGVS (DNA) notation
        "ANN[*].HGVS_P" (alias HGVS, HGVS_PROT, AA): Variant in HGVS (protein) notation
        "ANN[*].CDNA_POS" (alias POS_CDNA)
        "ANN[*].CDNA_LEN" (alias LEN_CDNA)
        "ANN[*].CDS_POS" (alias POS_CDS)
        "ANN[*].CDS_LEN" (alias LEN_CDS)
        "ANN[*].AA_POS" (alias POS_AA)
        "ANN[*].AA_LEN" (alias LEN_AA)
        "ANN[*].DISTANCE"
        "ANN[*].ERRORS" (alias WARNING, INFOS)
    SnpEff 'EFF' fields (this is for older SnpEff/SnpSift versions; newer versions use the 'ANN' field):
        "EFF[*].EFFECT"
        "EFF[*].IMPACT"
        "EFF[*].FUNCLASS"
        "EFF[*].CODON"
        "EFF[*].AA"
        "EFF[*].AA_LEN"
        "EFF[*].GENE"
        "EFF[*].BIOTYPE"
        "EFF[*].CODING"
        "EFF[*].TRID"
        "EFF[*].RANK"
    SnpEff 'LOF' fields:
        "LOF[*].GENE"
        "LOF[*].GENEID"
        "LOF[*].NUMTR"
        "LOF[*].PERC"
    SnpEff 'NMD' fields:
        "NMD[*].GENE"
        "NMD[*].GENEID"
        "NMD[*].NUMTR"
        "NMD[*].PERC"

Some examples:

- *Extracting chromosome, position, ID and allele frequency from a VCF file*:

  **CHROM POS ID AF**

  The result will look something like::

      #CHROM        POS        ID            AF
      1             69134                    0.086
      1             69496      rs150690004   0.001

- *Extracting genotype fields*:

  **CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT**

  This means to extract:

  - CHROM POS ID: regular fields (as in the previous example)
  - THETA : This one is from INFO
  - GEN[0].GL[1] : Second likelihood from first genotype
  - GEN[1].GL : The whole GL field (all entries without separating them)
  - GEN[3].GL[*] : All likelihoods from genotype 3 (this time they will be tab separated, as opposed to the previous one).
  - GEN[*].GT : Genotype subfields (GT) from ALL samples (tab separated).

  The result will look something like::

      #CHROM  POS     ID              THETA   GEN[0].GL[1]    GEN[1].GL               GEN[3].GL[*]            GEN[*].GT
      1       10583   rs58108140      0.0046  -0.47           -0.24,-0.44,-1.16       -0.48   -0.48   -0.48   0|0     0|0     0|0     0|1     0|0     0|1     0|0     0|0     0|1
      1       10611   rs189107123     0.0077  -0.48           -0.24,-0.44,-1.16       -0.48   -0.48   -0.48   0|0     0|1     0|0     0|0     0|0     0|0     0|0     0|0     0|0
      1       13302   rs180734498     0.0048  -0.58           -2.45,-0.00,-5.00       -0.48   -0.48   -0.48   0|0     0|1     0|0     0|0     0|0     1|0     0|0     0|1     0|0

- *Extracting fields with multiple values*:
  (notice that there are multiple effect columns per line because there are multiple effects per variant)

  **CHROM POS REF ALT ANN[*].EFFECT**

  The result will look something like::

      #CHROM	POS	REF	ALT	ANN[*].EFFECT
      22	17071756	T	C	3_prime_UTR_variant	downstream_gene_variant
      22	17072035	C	T	missense_variant	downstream_gene_variant
      22	17072258	C	A	missense_variant	downstream_gene_variant

- *Extracting fields with multiple values using a comma as a multiple field separator:*

  **CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P**

  The result will look something like::

      #CHROM	POS	REF	ALT	ANN[*].EFFECT	ANN[*].HGVS_P
      22	17071756	T	C	3_prime_UTR_variant,downstream_gene_variant	.,.
      22	17072035	C	T	missense_variant,downstream_gene_variant	p.Gly469Glu,.
      22	17072258	C	A	missense_variant,downstream_gene_variant	p.Gly395Cys,.

- *Extracting fields with multiple values, one effect per line:*

  **CHROM POS REF ALT ANN[*].EFFECT**

  The result will look something like::

      #CHROM	POS	REF	ALT	ANN[*].EFFECT
      22	17071756	T	C	3_prime_UTR_variant
      22	17071756	T	C	downstream_gene_variant
      22	17072035	C	T	missense_variant
      22	17072035	C	T	downstream_gene_variant
      22	17072258	C	A	missense_variant
      22	17072258	C	A	downstream_gene_variant

@EXTERNAL_DOCUMENTATION@
- http://snpeff.sourceforge.net/SnpSift.html#Extract
    ]]></help>
    <expand macro="citations" />
</tool>