diff vcf_filter.xml @ 0:f0f2795de2c7 draft

planemo upload for repository https://github.com/wm75/mimodd_galaxy_wrappers commit 528bcf3b769c7c73f119b2a176d19071f9ef5312
author wolma
date Tue, 19 Dec 2017 04:54:04 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf_filter.xml	Tue Dec 19 04:54:04 2017 -0500
@@ -0,0 +1,205 @@
+<tool id="mimodd_vcf_filter" name="MiModD VCF Filter" version="@MIMODD_WRAPPER_VERSION@">
+    <description>
+    extracts lines from a vcf variant file based on field-specific filters
+    </description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="stdio" />
+    <expand macro="version_command" />
+    <command><![CDATA[
+	mimodd vcf-filter
+	    '$inputfile' -o '$outputfile'
+	  #if len($datasets):
+	    -s
+	    #for $i in $datasets
+		  '$i.sample'
+	    #end for
+	    --gt
+	    #for $i in $datasets
+	      ## remove whitespace from free-text input
+	      '#echo ("".join($i.GT.split()) or "ANY")#'
+	      #echo " "
+	    #end for
+	    --dp
+	    #for $i in $datasets
+	      $i.DP
+	    #end for
+	    --gq
+	    #for $i in $datasets
+	      $i.GQ
+	    #end for
+	    --af
+	    #for $i in $datasets
+	      '#echo ($i.AF or "::")#'
+	    #end for
+	  #end if
+	  #if len($regions):
+	    -r
+	    #for $i in $regions
+	      #if $i.stop:
+	        '$i.chrom:$i.start-$i.stop'
+	      #else:
+	        '$i.chrom:$i.start'
+	      #end if
+	    #end for
+	  #end if
+	  #if $vfilter:
+	    --vfilter
+	    ## remove ',' and replace with ' '
+	    '#echo ('" "'.join($vfilter.split(',')))#'
+	  #end if
+	  $vartype
+    ]]></command>
+  
+    <inputs>
+        <param name="inputfile" type="data" format="vcf" label="VCF input file" />
+        <repeat name="datasets" title="Sample-specific Filter" default="0" min="0">
+            <param name="sample" type="text" label="sample"
+            help="name of a sample as it appears in the VCF input file and that indicates the sample that this filter should be applied to.">
+                <expand macro="lex_sam_header" message="Non-ASCII characters are not valid in sample names." />
+            </param>
+	        <param name="GT" type="text" 
+	        label="genotype pattern(s) for the inclusion of variants"
+	        help="keep only variants for which the genotype of the sample matches the specified pattern; format: x/x where x = 0 is wildtype and x = 1 is mutant. Multiple genotypes can be specified as a comma-separated list.">
+	            <validator type="expression" message="Malformed genotype pattern">not value or all(c.isdigit() or c in './|' for token in value.split(',') for c in token.strip(' '))</validator>
+	        </param>
+	        <param name="DP" type="integer" value="0" 
+	        label="depth of coverage for the sample at the variant site"
+	        help="keep only variants with at least this sample-specific coverage at the variant site" />
+        	<param name="GQ" type="integer" value="0"
+        	label="genotype quality for the variant in the sample"
+        	help="keep only variants for which the genotype prediction for the sample has at least this quality" />
+	        <param name="AF" type="text"
+	        label="allelic fraction filter"
+	        help="expected format: [allele number]:[minimal fraction]:[maximal fraction]; keep only variants for which the fraction of sample-specific reads supporting a given allele number is between minimal and maximal fraction; if allele number is omitted, the filter operates on the most frequent non-reference allele instead">
+	            <validator type="expression" message="Malformed allelic fraction filter">not value or all(c.isdigit() or c in '.:' for c in value)</validator>
+	        </param>
+        </repeat>
+        <repeat name="regions" title="Region Filter" default="0" min="0"
+        help="Filter variant sites by their position in the genome. If multiple Region Filters are specified, all variants that fall in ONE of the regions are reported.">
+            <param name="chrom" type="text" label="Chromosome">
+                <expand macro="lex_sam_header" message="Non-ASCII characters are not valid in chromosome names." />
+            </param>
+            <param name="start" type="text" label="Region Start">
+                <validator type="expression" message="an integer number is required">not value or value.isdigit()</validator>
+            </param>
+            <param name="stop" type="text" label="Region End">
+                <validator type="expression" message="an integer number is required">not value or value.isdigit()</validator>
+            </param>
+        </repeat>
+        <param name="vartype" type="select" 
+        label="Select the types of variants to include in the output">
+            <option value="">all types of variants</option>
+            <option value="--no-indels">exclude indels</option>
+            <option value="--indels-only">only indels</option>
+        </param>
+        <param name="vfilter" type="text" label="sample"
+        help="Filter output by sample name; only the sample-specific columns with their sample name matching any of the comma separated filters will be retained in the output.">
+            <expand macro="lex_sam_header" message="Non-ASCII characters are not valid in sample names." />
+        </param>
+    </inputs>
+  
+    <outputs>
+        <data name="outputfile" format="vcf" />
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="inputfile" value="a.vcf" />
+            <repeat name="datasets">
+                <param name="sample" value="N2" />
+                <param name="GT" value="0/0" />
+            </repeat>
+            <output name="outputfile" ftype="vcf" compare="diff">
+                <assert_contents>
+                    <has_text text="GT:PL:DP:DPR:GQ&#009;0/0" />
+                    <not_has_text text="GT:PL:DP:DPR:GQ&#009;1/1" />
+                    <not_has_text text="GT:PL:DP:DPR:GQ&#009;0/1" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="inputfile" value="a.vcf" />
+            <repeat name="regions">
+                <param name="chrom" value="chrX" />
+            </repeat>
+            <output name="outputfile" ftype="vcf">
+                <assert_contents>
+                    <has_text text="chrX&#009;" />
+                    <not_has_text text="chrI&#009;" />
+                    <not_has_text text="chrII&#009;" />
+                    <not_has_text text="chrIII&#009;" />
+                    <not_has_text text="chrIV&#009;" />
+                    <not_has_text text="chrV&#009;" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="inputfile" value="a.vcf" />
+            <param name="vartype" value="--no-indels" />
+            <param name="vfilter" value="ot266" />
+            <output name="outputfile" ftype="vcf">
+                <assert_contents>
+                    <not_has_text text="INDEL;" />
+                    <has_line line="#CHROM&#009;POS&#009;ID&#009;REF&#009;ALT&#009;QUAL&#009;FILTER&#009;INFO&#009;FORMAT&#009;ot266" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    
+    <help><![CDATA[
+.. class:: infomark
+
+   **What it does**
+
+The tool filters a variant file in VCF format to generate a new VCF file with only a subset of the original variants.
+
+The following types of variant filters can be set up:
+
+1) Sample-specific filters:
+   
+   Filter variants based on their characteristics in the sequenced reads of a specific sample. Multiple sample-specific filters are combined by logical AND, i.e., only variants that pass ALL sample-specific filters are kept.
+   
+2) Region filters:
+   
+   Filter variants based on the genomic region they affect. Multiple region filters are combined by logical OR, i.e., variants passing ANY region filter are kept.
+   
+3) Variant type filter:
+
+   Filter variants by their type, i.e. whether they are single nucleotide variations (SNVs) or indels
+   
+In addition, the *sample* filter can be used to reduce the samples encoded in a multi-sample VCF file to just those specified by the filter.
+The *sample* filter is included mainly for compatibility reasons: if an external tool cannot deal with the multisample file format, but instead looks only at the first sample-specific column of the file, you can use the filter to turn the multi-sample file into a single-sample file. Besides, the filter can also be used to change the order of the samples since it will sort the samples in the order specified in the filter field.
+
+**Examples of sample-specific filters:**
+
+*Simple genotype pattern*
+
+genotype pattern: 1/1 ==> keep all variants in the vcf input file for which the specified sample's genotype is homozygous mutant
+
+*Complex genotype pattern*
+
+genotype pattern: 0/1, 0/0 ==> keep all variants for which the sample's genotype is either heterozygous or homozygous wildtype
+
+*Multiple sample-specific filters*
+
+Filter 1: genotype pattern: 0/0, Filter 2: genotype pattern 1/1:
+==> keep all variants for which the first sample's gentoype is homozygous wildtype **and** the second sample's genotype is homozygous mutant
+
+*Combining sample-specific filter criteria*
+
+genotype pattern: 1/1, depth of coverage: 3, genotype quality: 9
+==> keep variants for which the sample's genotype is homozygous mutant **and** for which this genotype assignment is corroborated by a genotype quality score of at least 9
+**and** at least three reads from the sample cover the variant site
+
+**TIP:**
+
+As in the example above, genotype quality is typically most useful in combination with a genotype pattern.
+It acts then, effectively, to make the genotype filter more stringent.
+
+@HELP_FOOTER@
+    ]]></help>
+    <expand macro="citations" />
+</tool>