view vcf_filter.xml @ 23:5db0545b9004 draft

update to v0.1.7.3
author wolma
date Thu, 21 Jul 2016 03:55:49 -0400
parents c46406466625
children
line wrap: on
line source

<tool id="vcf_filter" name="VCF Filter" version="0.1.7.3">
  <!-- Short summary of what the tool does. -->
  <description>Extracts lines from a vcf variant file based on field-specific filters</description>
  <!-- Import macro definitions from toolshed_macros.xml (file not visible here). -->
  <macros>
    <import>toolshed_macros.xml</import>
  </macros>
  <!-- Expand the "requirements" macro; presumably defined in the imported
       macro file - confirm there. -->
  <expand macro="requirements" />
  <!-- Command whose output reports the version of the wrapped MiModD package. -->
  <version_command>python3 -m MiModD version -q</version_command>
  <!-- Cheetah template that builds the "MiModD vcf-filter" command line from
       the tool parameters. It is wrapped in CDATA so that quotes, and any
       '<' or '&' added later, need no XML escaping. -->
  <command><![CDATA[
    python3 -m MiModD vcf-filter
    "$inputfile"
    -o "$outputfile"
    ## Sample-specific filters: each option below receives one value per
    ## "datasets" repeat, listed in the same order as the names after -s.
    #if len($datasets):
    -s
    #for $i in $datasets
        "$i.sample"
    #end for
    --gt
    #for $i in $datasets
        ## remove all whitespace from the free-text genotype list;
        ## an empty field is passed on as ANY
        "#echo ("".join($i.GT.split()) or "ANY")#"
    #end for
    --dp
    #for $i in $datasets
        "$i.DP"
    #end for
    --gq
    #for $i in $datasets
        "$i.GQ"
    #end for
    --af
    #for $i in $datasets
        ## an empty allelic fraction field is passed on as ::
        "#echo ($i.AF or "::")#"
    #end for
    #end if
    ## Region filters: CHROM:START-STOP, or CHROM:START when no end is given.
    #if len($regions):
    -r
    #for $i in $regions
        #if $i.stop:
        "$i.chrom:$i.start-$i.stop"
        #else:
        "$i.chrom:$i.start"
        #end if
    #end for
    #end if
    ## Sample-name filter: every comma-separated name becomes its own quoted
    ## argument. Whitespace around each name is trimmed, so that an entry
    ## such as "a, b" selects samples a and b instead of a and " b".
    #if $vfilter:
    --vfilter
    #for $sample_name in $vfilter.split(',')
        "$sample_name.strip()"
    #end for
    #end if
    ## The selected option value is itself the command line switch (or empty).
    $vartype
  ]]></command>
  
  <inputs>
    <!-- VCF dataset to be filtered. -->
    <param name="inputfile" type="data" format="vcf" label="VCF input file"/>

    <!-- Zero or more per-sample filters. The command template passes the
         fields of each entry to -s, -gt (long form), -dp, -gq and -af style
         options in matching order; see the command section for the exact
         switches. -->
    <repeat name="datasets" title="Sample-specific Filter" min="0" default="0">
      <param name="sample"
             type="text"
             label="sample"
             help="name of a sample as it appears in the VCF input file and that indicates the sample that this filter should be applied to."/>
      <param name="GT"
             type="text"
             label="genotype pattern(s) for the inclusion of variants"
             help="keep only variants for which the genotype of the sample matches the specified pattern; format: x/x where x = 0 is wildtype and x = 1 is mutant. Multiple genotypes can be specified as a comma-separated list."/>
      <param name="DP"
             type="integer"
             value="0"
             label="depth of coverage for the sample at the variant site"
             help="keep only variants with at least this sample-specific coverage at the variant site"/>
      <param name="GQ"
             type="integer"
             value="0"
             label="genotype quality for the variant in the sample"
             help="keep only variants for which the genotype prediction for the sample has at least this quality"/>
      <param name="AF"
             type="text"
             label="allelic fraction filter"
             help="expected format: [allele number]:[minimal fraction]:[maximal fraction]; keep only variants for which the fraction of sample-specific reads supporting a given allele number is between minimal and maximal fraction; if allele number is omitted, the filter operates on the most frequent non-reference allele instead"/>
    </repeat>

    <!-- Zero or more genomic regions; an empty "Region End" makes the
         command template omit the -STOP part of the region argument. -->
    <repeat name="regions"
            title="Region Filter"
            min="0"
            default="0"
            help="Filter variant sites by their position in the genome. If multiple Region Filters are specified, all variants that fall in ONE of the regions are reported.">
      <param name="chrom" type="text" label="Chromosome"/>
      <param name="start" type="text" label="Region Start"/>
      <param name="stop" type="text" label="Region End"/>
    </repeat>

    <!-- Each option value is inserted verbatim into the command line. -->
    <param name="vartype"
           type="select"
           label="Select the types of variants to include in the output">
      <option value="">all types of variants</option>
      <option value="--no-indels">exclude indels</option>
      <option value="--indels-only">only indels</option>
    </param>

    <!-- Comma-separated sample names handed to the vfilter option. -->
    <param name="vfilter"
           type="text"
           label="sample"
           help="Filter output by sample name; only the sample-specific columns with their sample name matching any of the comma separated filters will be retained in the output."/>
  </inputs>
  
  <outputs>
    <!-- Filtered VCF; the command template passes this dataset to -o. -->
    <data name="outputfile" format="vcf"/>
  </outputs>

  <help>
.. class:: infomark

   **What it does**

The tool filters a variant file in VCF format to generate a new VCF file with only a subset of the original variants.

The following types of variant filters can be set up:

1) Sample-specific filters:
   
   Filter variants based on their characteristics in the sequenced reads of a specific sample. Multiple sample-specific filters are combined by logical AND, i.e., only variants that pass ALL sample-specific filters are kept.
   
2) Region filters:
   
   Filter variants based on the genomic region they affect. Multiple region filters are combined by logical OR, i.e., variants passing ANY region filter are kept.
   
3) Variant type filter:

   Filter variants by their type, i.e., whether they are single nucleotide variations (SNVs) or indels.
   
In addition, the *sample* filter can be used to reduce the samples encoded in a multi-sample VCF file to just those specified by the filter.
The *sample* filter is included mainly for compatibility reasons: if an external tool cannot deal with the multi-sample file format, but instead looks only at the first sample-specific column of the file, you can use the filter to turn the multi-sample file into a single-sample file. Besides, the filter can also be used to change the order of the samples since it will sort the samples in the order specified in the filter field.

**Examples of sample-specific filters:**

*Simple genotype pattern*

genotype pattern: 1/1 ==&gt; keep all variants in the vcf input file for which the specified sample's genotype is homozygous mutant

*Complex genotype pattern*

genotype pattern: 0/1, 0/0 ==&gt; keep all variants for which the sample's genotype is either heterozygous or homozygous wildtype

*Multiple sample-specific filters*

Filter 1: genotype pattern: 0/0, Filter 2: genotype pattern: 1/1
==&gt; keep all variants for which the first sample's genotype is homozygous wildtype **and** the second sample's genotype is homozygous mutant

*Combining sample-specific filter criteria*

genotype pattern: 1/1, depth of coverage: 3, genotype quality: 9
==&gt; keep variants for which the sample's genotype is homozygous mutant **and** for which this genotype assignment is corroborated by a genotype quality score of at least 9
**and** at least three reads from the sample cover the variant site

**TIP:**

As in the example above, genotype quality is typically most useful in combination with a genotype pattern.
It acts then, effectively, to make the genotype filter more stringent.



  </help>
</tool>