diff TV-vs-background.xml @ 0:1209f18a5a83 draft

Uploaded
author saskia-hiltemann
date Mon, 03 Aug 2015 05:01:15 -0400
parents
children 885ba15c2564
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TV-vs-background.xml	Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,101 @@
+<tool id="t-vs-vnormal" name="Virtual Normal Correction SmallVars" version="1.6">
+	<description> Filter small variants based on presence in Virtual Normal set  </description>
+
+	<requirements> <!-- external dependency declared for this tool: the cgatools package, version 1.7 -->
+		<requirement version="1.7" type="package">cgatools</requirement>
+	</requirements>
+	
+	<command interpreter="bash">
+	## Every substituted value is single-quoted so that a path or text value containing spaces or shell metacharacters stays one argument.
+	TV-vs-background.sh
+		--variants '$variants'
+		--reference '${reference.fields.reference_crr_cgatools}'
+		#if $virtnorm.VNset == "diversity":
+			--VN_varfiles '${reference.fields.VN_genomes_varfiles_list}'
+		#else
+			--VN_varfiles '${reference.fields.VN_genomes_varfiles_list_1000G}'
+		#end if
+		--threshold '$threshold' --thresholdhc '$thresholdhc'
+		--outputfile_all '$output_all'
+		--outputfile_filtered '$output_filtered'
+	</command>
+	
+	<inputs>
+		<param name="variants" type="data" format="tabular" label="List of Variants as produced by Listvariants program or VCF-2-LV conversion program"/>
+		<!-- Select build: choices come from the virtual_normal_correction data table. The dbkey filter must be a child of the options element to be applied. Assumes column 0 of the table holds the dbkey (TODO confirm against the .loc file). -->
+		<param name="reference" type="select" label="Select Build">
+			<options from_data_table="virtual_normal_correction">
+				<filter type="data_meta" ref="variants" key="dbkey" column="0" />
+			</options>
+		</param>
+		<conditional name="virtnorm" >
+			<param name="VNset" type="select" label="Select Virtual Normal set to use" help="1000Genomes set can only be used for hg19 samples, for hg18 54 genomes will be used.">
+				<option value="diversity" > CG Diversity Panel and trios (54 Genomes) </option>
+				<option value="thousand" > CG 1000G project genomes (433 Genomes) (hg19 only) </option>
+			</param> <!-- the selected value is read by the command template to pick the varfiles list -->
+		</conditional>
+		<param name="threshold" type="integer" value="1" label="Threshold: Filter variants if present in at least this number of the background genomes"/> <!-- thresholds are genome counts, hence integer -->
+		<param name="thresholdhc" type="integer" value="10" label="High Confidence Threshold: Label a somatic variant as high-confidence if locus was fully called in at least this many normal genomes" help="Please adjust according to number of normals used and desired stringency. "/>
+		<param name="fname" type="text" value="" label="Prefix for your output file" help="Optional. For example sample name."/>
+		<!--<param name="debug" type="select" label="individual level annotations?" help="get a columns per normal sample whether variant was present (only available for fully public normal samples)">
+			<option value="N" > No  </option>
+			<option value="Y" > Yes </option>
+		</param>
+                -->
+	</inputs>
+
+  <outputs> <!-- output_all and output_filtered are handed to the script through its outputfile_all / outputfile_filtered options; the high-confidence table is collected from the job working directory -->
+    <data name="output_all" format="tabular" label="${fname} All variants for ${tool.name} on ${on_string}"/>
+    <data name="output_filtered" format="tabular" label="${fname} Filtered variants for ${tool.name} on  ${on_string}"/>
+    <data name="output_filtered_highconf" format="tabular" from_work_dir="output_filtered_highconf.tsv" label="${fname} High Confidence Filtered variants for ${tool.name} on  ${on_string}"/>
+    <!-- Disabled: expanded annotation output; its filter depends on the "debug" input, which is itself commented out in the inputs section.
+	<data format="tabular" name="output_expanded" from_work_dir="output_expanded" label="${fname} expanded annotation for ${tool.name} on  ${on_string}">
+		<filter> $debug == "Y" </filter>
+	</data>
+    -->
+  </outputs>
+
+	<help> 
+**What it does**
+
+This tool compares a list of variants to a set of normal genomes. Each variant will be annotated with the number of normal samples it appears in.
+The tool also reports how often the variant was found on one or both alleles (01 or 11), and distinguishes between the variant being absent from the normal (00),
+the location being no-called in the normal (NN), and the location being half-called (0N, 1N), etc.
+
+This may take quite some time depending on the number of input variants and the number of normal genomes.
+
+**Input Files**  
+
+This program takes as input a list of variants as produced by the ListVariants tool, or the vcf-to-LV preprocessing tool. Input must be a tab-separated file of the following format::
+
+ variantID - chromosome - begin  -  end  -  varType - reference - alleleSeq - xRef
+ 1034	     chr1	  972803    972804  snp       T	          C	      dbsnp:rs31238120
+
+Valid entries in the varType column are: snp, sub, ins, del.
+
+Chromosome coordinates must be zero-based half-open. 
+
+Column names must match the ones given above.
+
+
+**Output Files**
+
+1) Original input file annotated with presence (or lack thereof) in background genomes
+
+2) Filtered version of output 1: variants are removed when they are present in at least *threshold* of the background normal genomes (default: 1). This filters on column 9 of the output file.
+
+3) High-confidence filtered version of output 2: of the variants labelled somatic, those not fully called in at least *high confidence threshold* normal genomes are removed (default: 10). This filters on column 11 of the output file.
+
+Example output format::
+ 
+ variantId chromosome	begin	end	varType	reference alleleSeq	xRef	           VN_occurrences	VN_frequency	VN_fullycalled_count	VN_fullycalled_frequency  VN_00	VN_01	VN_11	VN_0N	VN_1N	VN_NN	VN_0	VN_1	VN_N
+ 34	   chr1	        46661	46662	snp	T	  C	        dbsnp.100:rs2691309	26	        0.472727	33	                0.6	                  7	19	7	1	0	20	0	0	0
+ 35	   chr1	        46850	46850	ins		  A	        	                0	        0	        10	                0.181818	          10	0	0	5	0	39	0	0	0 
+ 36	   chr1	        46895	46896	snp	T	  C	        dbsnp.100:rs2691311	8	        0.145455	40	                0.727273	          33	7	0	2	1	11	0	0	0
+ 37	   chr1	        46926	46927	snp	G	  A	        dbsnp.100:rs2548884	7	        0.127273	43	                0.781818	          36	7	0	2	0	9	0	0	0
+
+	</help>
+
+</tool>
+
+