changeset 0:1209f18a5a83 draft

Uploaded
author saskia-hiltemann
date Mon, 03 Aug 2015 05:01:15 -0400
parents
children 1c6710924e80
files JunctionDiff-vs-background.sh JunctionDiff-vs-background.xml README.txt TV-vs-background.sh TV-vs-background.xml VN_genomes_locations.txt tool-data/virtual_normal_correction.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml vcf2lv.sh vcf2lv.xml
diffstat 11 files changed, 594 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/JunctionDiff-vs-background.sh	Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,75 @@
+#!/bin/bash
+
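+#JunctionDiff-vs-background.sh --variants $variants --reference <crr file> --VN_junctions <file listing the VN junction files> --cgatools_binary cgatools --outputfile_filtered $output_filtered [--outputfile_report $output_report] --scoreThresholdA $scoreThresholdA --scoreThresholdB $scoreThresholdB --distance $distance --minlength $minlength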
+
+# usage: print the accepted options to stderr and abort (called on any option-parsing error)
+usage() { echo "usage: $0 --variants FILE --reference CRR --VN_junctions LISTFILE --cgatools_binary BIN --outputfile_filtered FILE [--outputfile_report FILE] --scoreThresholdA N --scoreThresholdB N --distance N --minlength N" >&2; exit 1; }
+output_report="output_reports.tsv"   # default, used when --outputfile_report is not given
+[ $# -eq 0 ] && usage
+# getopt is checked on its own: in "set -- `getopt ...` || usage" the status tested is that of set, which is always 0
+parsed_args=`getopt -n$0 -u -a --longoptions="variants: reference: VN_junctions: cgatools_binary: outputfile_filtered: outputfile_report: scoreThresholdA: scoreThresholdB: distance: minlength: " "h:" "$@"` || usage
+set -- $parsed_args   # unquoted on purpose: getopt -u emits plain space-separated words
+while [ $# -gt 0 ]; do
+	case "$1" in
+		--variants)				variants=$2;shift;;
+		--reference)			crr=$2;shift;;
+		--VN_junctions)			VN_junctionfiles_list=$2;shift;;
+		--cgatools_binary)		cgatools_binary=$2;shift;;				#cgatools binary to use
+		--outputfile_filtered)	output_filtered=$2;shift;;
+		--outputfile_report)	output_report=$2;shift;;
+		--scoreThresholdA)		scoreThresholdA=$2;shift;;
+		--scoreThresholdB)		scoreThresholdB=$2;shift;;
+		--distance)				distance=$2;shift;;
+		--minlength)			minlength=$2;shift;;
+		-h)						shift;;		# declared as "h:", so its argument is skipped too
+		--)						shift;break;;
+		-*)						usage;;
+		*)						break;;
+	esac
+	shift
+done
+
+
+# Work on a private copy of the input junctions: it is replaced after every
+# junctiondiff pass by the junctions that survived that pass.
+junctions="junctions.tsv"
+cp -- "$variants" "$junctions" || { echo "cannot copy input junctions file: $variants" >&2; exit 1; }
+
+###  run JunctionDiff against all of the VN junctionfiles
+echo "running JunctionDiff against each of the VN genomes"
+
+# One junctiondiff run per non-empty line of the VN list. read (without IFS=)
+# trims surrounding blanks, so whitespace-only lines count as empty; the
+# "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
+count=0
+while read -r line || [[ -n "$line" ]]
+do
+	if [[ -n "$line" ]]
+	then
+		count=$((count+1))
+		# Abort on failure: carrying on would replace the working copy below with
+		# a missing diff file and leave nothing to filter in the next pass.
+		"${cgatools_binary}" junctiondiff \
+		--beta \
+		--statout \
+		--reference "$crr" \
+		--junctionsA "$junctions" \
+		--junctionsB "$line" \
+		--scoreThresholdA "$scoreThresholdA" \
+		--scoreThresholdB "$scoreThresholdB" \
+		--distance "$distance" \
+		--minlength "$minlength" || { echo "junctiondiff failed on $line" >&2; exit 1; }
+
+		# append the statistics of this run (report.tsv, presumably written because of --statout) to the combined report
+		printf 'report of run %s:\n----------------------\n' "$count" >> "$output_report"
+		cat report.tsv >> "$output_report"
+		echo "" >> "$output_report"
+
+		# the diff output becomes the junctions file of the next iteration
+		mv -f -- "diff-$junctions" "$junctions" || { echo "junctiondiff produced no diff-$junctions" >&2; exit 1; }
+	fi
+done < "$VN_junctionfiles_list"
+cp -- "$junctions" "$output_filtered"
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/JunctionDiff-vs-background.xml	Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,72 @@
+<tool id="t-vs-vnormal_junctions" name="Virtual Normal Correction SVs" version="1.6">
+	<description> Filter SVs based on presence in VN set </description>
+	
+	<requirements>		
+		<requirement type="package" version="1.7">cgatools</requirement>
+	</requirements>
+	
+	<command interpreter="bash">
+	JunctionDiff-vs-background.sh 
+		--variants $variants 
+		--reference ${reference.fields.reference_crr_cgatools} 
+		#if $virtnorm.VNset == "diversity"
+			--VN_junctions ${reference.fields.VN_genomes_junctionfile_list} 
+		#else
+			--VN_junctions ${reference.fields.VN_genomes_junctionfile_list_1000G} 
+		#end if
+		--cgatools_binary cgatools
+		--outputfile_filtered $output_filtered 
+		--scoreThresholdA $scoreThresholdA 
+		--scoreThresholdB $scoreThresholdB 
+		--distance $distance 
+		--minlength $minlength					
+	</command>
+	
+	<inputs>
+		<!--select build-->
+		<param name="reference" type="select" label="Select Build">
+			<options from_data_table="virtual_normal_correction" />
+			<filter type="data_meta" ref="variants" key="dbkey" column="0" />
+		</param>
+		<conditional name="virtnorm" >
+		<param name="VNset" type="select" label="Select Virtual Normal set to use" help="1000Genomes set can only be used for hg19 samples">
+			<option value="diversity" selected="true"> CG Diversity Panel and trios (54 Genomes) (hg18/hg19) </option>
+			<option value="thousand" > CG 1000G project genomes (433 Genomes) (hg19 only) </option>
+		</param>
+		</conditional>
+		<param name="variants" type="data" format="tabular" label="CG Junctions file"/>
+		<param name="scoreThresholdA" type="text" value="10" label="scoreThreshold" help="The minimum number of discordant mate pair alignments supporting the junction from input genome"/>
+		<param name="scoreThresholdB" type="text" value="10" label="scoreThreshold" help="The minimum number of discordant mate pair alignments supporting the junction from background genomes"/>
+		<param name="distance" type="text" value="200" label="Maximum distance between coordinates of potentially compatible junctions."/>		
+		<param name="minlength" type="text" value="500" label="Minimum deletion junctions length to be included into the difference file."/>
+		<param name="report" type="select"  label="Generate report file?">
+			<option value="N" selected="true"> No </option>
+			<option value="Y"> Yes </option>
+		</param>
+		<param name="fname" type="text" value="" label="Prefix for your output file" help="Optional"/>		
+	</inputs>
+
+  <outputs>
+       <data format="tabular" name="output_filtered" label="${fname} Filtered junctions for ${tool.name} on  ${on_string}"/>		
+       <data format="tabular" name="output_report" from_work_dir= "output_reports.tsv" label="${fname} report for ${tool.name} on  ${on_string}">
+       	<filter> report == "Y" </filter>
+       </data>			  
+  </outputs>
+
+	<help> 
+**What it does**
+
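+Runs cgatools junctiondiff on the input junctions file against each genome of the selected Virtual Normal set in turn, so that only junctions not reported as shared with any of the normal genomes remain.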
+
+**Input Files**
+Complete Genomics Junctions file
+
+**Output Files**
+Junctions remaining after filtering
+
+
+	</help>
+
+</tool>
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.txt	Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,32 @@
+Installs VirtualNormal Correction Tool
+
+After installing this tool via admin panel, manually configure the following:
+
+1) edit virtual_normal_correction.loc file
+
+   - change "/path/to/hg18.crr" to the location of the Complete Genomics reference crr file on your system
+     (can be downloaded from ftp://ftp.completegenomics.com/ReferenceFiles/ )
+     
+   - change "/path/to/VN_genomes_varfiles_hg18.txt" to the location of the file containing the locations of all the Complete Genomics
+     varfiles to be used as a virtual normal. This file should contain 1 file location per line, e.g.
+     
+	    /path/to/normal-varfile-1
+		/path/to/normal-varfile-2
+		/path/to/normal-varfile-3
+		/path/to/normal-varfile-4
+		/path/to/normal-varfile-5
+		/path/to/normal-varfile-6
+		/path/to/normal-varfile-7
+		/path/to/normal-varfile-8
+   			...
+   			
+   	 Varfiles can be in compressed or uncompressed form. For example, Complete Genomics' Diversity panel can be used.
+     (can be downloaded from ftp://ftp2.completegenomics.com/)		
+   
+   - change	"/path/to/VN_genomes_junctionfiles_hg18.txt" to the location of the file containing the locations of all the Complete Genomics
+     junctionfiles to be used as a virtual normal. This file should contain 1 file location per line.  For example, Complete Genomics' 
+	 Diversity panel can be used. (can be downloaded from ftp://ftp2.completegenomics.com/)
+     
+ 2) restart Galaxy for changes to take effect
+ 
+ After this initial setup, additional normals can be added to the lists without having to restart Galaxy. 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TV-vs-background.sh	Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,139 @@
+#!/bin/bash
+
+#TV-vs-background.sh --variants $variants --reference <crr file> --VN_varfiles <file listing the VN varfiles> --threshold $threshold --thresholdhc $thresholdhc --outputfile_all $output_all --outputfile_filtered $output_filtered
+
+echo "$@"
+# usage: print the accepted options to stderr and abort (called on any option-parsing error)
+usage() { echo "usage: $0 --variants FILE --reference CRR --VN_varfiles LISTFILE --outputfile_filtered FILE --outputfile_all FILE --threshold N --thresholdhc N" >&2; exit 1; }
+[ $# -eq 0 ] && usage
+parsed_args=`getopt -n$0 -u -a --longoptions="variants: reference: VN_varfiles: outputfile_filtered: outputfile_all: threshold: thresholdhc:" "h:" "$@"` || usage
+set -- $parsed_args   # separate step: after "set -- `getopt ...`" the status is that of set (always 0), so "|| usage" never ran
+while [ $# -gt 0 ]; do
+	case "$1" in
+		--variants)				variants=$2;shift;;
+		--reference)			crr=$2;shift;;
+		--VN_varfiles)			VN_varfiles_list=$2;shift;;
+		--outputfile_filtered)	output_filtered=$2;shift;;
+		--outputfile_all)		output_all=$2;shift;;
+		--threshold)			threshold=$2;shift;;
+		--thresholdhc)			thresholdhc=$2;shift;;
+		-h)						shift;;		# declared as "h:", so its argument is skipped too
+		--)						shift;break;;
+		-*)						usage;;
+		*)						break;;
+	esac
+	shift
+done
+
+# Flatten the one-path-per-line list into a single space-separated line, the
+# form in which the varfiles are handed to testvariants after --variants
+tr '\n' ' ' < "$VN_varfiles_list" > VN_varfiles.txt
+
+# Number of normals = non-blank lines of the list. It is computed here, before
+# it is reported (it used to be echoed while still unset), and it does not
+# depend on blank lines or on a trailing newline the way "wc -l" does.
+VNsetsize=$(grep -c '[^[:space:]]' "$VN_varfiles_list")
+
+###  run TestVariants against the Virtual Normal set
+echo "number of normals: $VNsetsize"
+echo "list of normals: ($VN_varfiles_list)"
+cat VN_varfiles.txt
+
+echo "running TV against Virtual Normal set"
+echo "command: cgatools testvariants\
+	--beta \
+	--reference $crr \
+	--input	$variants \
+	--output $output_all \
+	--variants `cat VN_varfiles.txt`"
+
+# the list expansion stays unquoted so that every path becomes its own argument
+cgatools testvariants \
+	--beta \
+	--reference "$crr" \
+	--input	"$variants" \
+	--output "$output_all" \
+	--variants $(cat VN_varfiles.txt) || { echo "cgatools testvariants failed" >&2; exit 1; }
+
+
+
+### keep raw copies of the testvariants output before it is rewritten below
+cp -- "$output_all" "$output_filtered"
+cp -- "$output_all" output_expanded
+
+### condense file to columns with counts for all background genomes
+echo "Counting..."
+awk -v totalnormals="$VNsetsize" 'BEGIN{
+		FS="\t";
+		OFS="\t";
+		totalnormals+=0   # force numeric; an empty or unset set size becomes 0
+		count["00"]="0";
+		count["01"]="0";
+		count["11"]="0";
+		count["0N"]="0";
+		count["1N"]="0";
+		count["NN"]="0";
+		count["0"]="0";
+		count["1"]="0";
+		count["N"]="0";
+	}{
+		if(FNR==1)  # header
+			print $1,$2,$3,$4,$5,$6,$7,$8,"VN_occurrences","VN_frequency","VN_fullycalled_count","VN_fullycalled_frequency","VN_00","VN_01","VN_11","VN_0N","VN_1N","VN_NN","VN_0","VN_1","VN_N"
+		else{
+			# reset the tallies, then count the call found in each genome column (9..NF)
+			for (c in count)
+				count[c]=0;
+			for (i=9; i<=NF; i++){
+				count[$i]++;
+			}
+			occurrences=count["11"]+count["01"]+count["1N"]+count["1"]
+			fullycalled=count["11"]+count["01"]+count["00"]+count["1"]+count["0"]
+			# frequencies are reported as 0 when the set size is unknown, instead of aborting on a division by zero
+			occfreq=(totalnormals > 0) ? occurrences/totalnormals : 0
+			fullfreq=(totalnormals > 0) ? fullycalled/totalnormals : 0
+			print $1,$2,$3,$4,$5,$6,$7,$8,occurrences,occfreq,fullycalled,fullfreq,count["00"],count["01"],count["11"],count["0N"],count["1N"],count["NN"],count["0"],count["1"],count["N"]
+		}
+	}' "$output_all" > "${output_all}-counted"
+
+
+# this counted file is the final output file
+rm -- "$output_all"
+mv -- "${output_all}-counted" "$output_all"
+
+
+
+### filter out variants occurring in at least <threshold> of the background genomes
+# column 9 (VN_occurrences) is the total of the genomes with a 1-call (01,11,1N,1);
+# a variant is kept only while that total is below the threshold.
+# Both sides are forced numeric (+0): against a string constant awk orders the
+# values as text, so that 10 sorts before 5 and such variants were wrongly kept.
+awk -v threshold="$threshold" 'BEGIN{
+		FS="\t";
+		OFS="\t";
+		threshold+=0
+	}
+	FNR==1 || $9+0 < threshold {   # header line, or occurrences below the threshold
+		print $0
+	}' "$output_all" > "$output_filtered"
+
+
+### high-confidence subset: keep the filtered variants whose locus was fully
+### called (column 11, VN_fullycalled_count) in at least <thresholdhc> normals
+awk -v threshold="${thresholdhc}" 'BEGIN{
+        FS="\t";
+        OFS="\t";
+        threshold+=0
+    }{
+        if(FNR==1)
+            print $0
+        else if($11+0 >= threshold)
+            print $0
+    }' "$output_filtered" > "output_filtered_highconf.tsv"
+
+
+
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TV-vs-background.xml	Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,101 @@
+<tool id="t-vs-vnormal" name="Virtual Normal Correction SmallVars" version="1.6">
+	<description> Filter small variants based on presence in Virtual Normal set  </description>
+
+	<requirements>		
+		<requirement type="package" version="1.7">cgatools</requirement>
+	</requirements>
+	
+	<command interpreter="bash">
+	TV-vs-background.sh 
+		--variants $variants
+		--reference ${reference.fields.reference_crr_cgatools}
+		#if $virtnorm.VNset == "diversity":
+			--VN_varfiles ${reference.fields.VN_genomes_varfiles_list} 
+		#else
+			--VN_varfiles ${reference.fields.VN_genomes_varfiles_list_1000G} 
+		#end if
+		--threshold $threshold
+                --thresholdhc $thresholdhc
+		--outputfile_all $output_all
+		--outputfile_filtered $output_filtered		
+	</command>
+	
+	<inputs>
+		<param name="variants" type="data" format="tabular" label="List of Variants as produced by Listvariants program or VCF-2-LV conversion program"/>
+		<!--select build-->
+		<param name="reference" type="select" label="Select Build">
+			<options from_data_table="virtual_normal_correction" />
+			<filter type="data_meta" ref="variants" key="dbkey" column="0" />
+		</param>
+		<conditional name="virtnorm" >
+		<param name="VNset" type="select" label="Select Virtual Normal set to use" help="1000Genomes set can only be used for hg19 samples, for hg18 54 genomes will be used.">
+			<option value="diversity" > CG Diversity Panel and trios (54 Genomes) </option>
+			<option value="thousand" > CG 1000G project genomes (433 Genomes) (hg19 only) </option>
+		</param> 
+		</conditional>
+				
+		<param name="threshold" type="text" value="1" label="Threshold: Filter variants if present in at least this number of the background genomes"/>
+                <param name="thresholdhc" type="text" value="10" label="High Confidence Threshold: Label a somatic variant as high-confidence if locus was fully called in at least this many normal genomes" help="Please adjust according to number of normals used and desired stringency. "/>
+		<param name="fname" type="text" value="" label="Prefix for your output file" help="Optional. For example sample name."/>
+		<!--<param name="debug" type="select" label="individual level annotations?" help="get a columns per normal sample whether variant was present (only available for fully public normal samples)">
+			<option value="N" > No  </option>
+			<option value="Y" > Yes </option>
+		</param>
+                -->
+	</inputs>
+
+  <outputs>
+    <data format="tabular" name="output_all" label="${fname} All variants for ${tool.name} on ${on_string}"/>		
+    <data format="tabular" name="output_filtered" label="${fname} Filtered variants for ${tool.name} on  ${on_string}"/>
+    <data format="tabular" name="output_filtered_highconf" label="${fname} High Confidence Filtered variants for ${tool.name} on  ${on_string}" from_work_dir="output_filtered_highconf.tsv"/>
+    <!--<data format="tabular" name="output_filtered" label="${fname} Filtered variants for ${tool.name} on  ${on_string}"/>
+	<data format="tabular" name="output_expanded" from_work_dir="output_expanded" label="${fname} expanded annotation for ${tool.name} on  ${on_string}">
+		<filter> $debug == "Y" </filter>
+	</data>	
+    -->
+  </outputs>
+
+	<help> 
+**What it does**
+
+This tool compares a list of variants to a set of normal genomes. Each variant will be annotated with the number of normal samples it appears in.
+The tool will also output how often the variant was found in one or both alleles (01 or 11), and distinguish between a variant not being present in the normal (00) 
+or the location being no-called in the normal (NN) or half-called (0N,1N) etc. 
+
+This may take quite some time depending on the number of input variants and the number of normal genomes.
+
+**Input Files**  
+
+This program takes as input a list of variants as produced by the ListVariants tool, or the vcf-to-LV preprocessing tool. Input must be a tab-separated file of the following format::
+
+ variantID - chromosome - begin  -  end  -  varType - reference - alleleSeq - xRef
+ 1034	     chr1	  972803    972804  snp       T	          C	      dbsnp:rs31238120
+
+valid entries in varType column are: snp,sub,ins,del. 
+
+Chromosome coordinates must be zero-based half-open. 
+
+Column names must match the ones given above.
+
+
+**Output Files**
+
+1) Original input file annotated with presence (or lack thereof) in background genomes
+
+2) Filtered version of output 1, variants are removed when present in at least *threshold* of the background normal genomes (default: 1) (filters on column 9 of output file)
+
+3) High Confidence filtered version of output 2. Of all the variants labelled somatic, filter out any variants not fully called in at least *high confidence threshold* normals. (filter on column 11 of output file)
+
+Example output format::
+ 
+ variantId chromosome	begin	end	varType	reference alleleSeq	xRef	           VN_occurrences	VN_frequency	VN_fullycalled_count	VN_fullycalled_frequency  VN_00	VN_01	VN_11	VN_0N	VN_1N	VN_NN	VN_0	VN_1	VN_N
+ 34	   chr1	        46661	46662	snp	T	  C	        dbsnp.100:rs2691309	26	        0.472727	33	                0.6	                  7	19	7	1	0	20	0	0	0
+ 35	   chr1	        46850	46850	ins		  A	        	                0	        0	        10	                0.181818	          10	0	0	5	0	39	0	0	0 
+ 36	   chr1	        46895	46896	snp	T	  C	        dbsnp.100:rs2691311	8	        0.145455	40	                0.727273	          33	7	0	2	1	11	0	0	0
+ 37	   chr1	        46926	46927	snp	G	  A	        dbsnp.100:rs2548884	7	        0.127273	43	                0.781818	          36	7	0	2	0	9	0	0	0
+
+	</help>
+
+</tool>
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VN_genomes_locations.txt	Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,8 @@
+/path/to/normal-varfile-1
+/path/to/normal-varfile-2
+/path/to/normal-varfile-3
+/path/to/normal-varfile-4
+/path/to/normal-varfile-5
+/path/to/normal-varfile-6
+/path/to/normal-varfile-7
+/path/to/normal-varfile-8
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/virtual_normal_correction.loc.sample	Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,6 @@
+#loc file for the Virtual Normal Correction tools
+
+# <columns>value, dbkey, name, VN_genomes_varfiles_list, VN_genomes_junctionfile_list, reference_crr_cgatools</columns>
+
+hg18	hg18	Virtual Normal hg18	/mnt/galaxyIndices/VirtualNormal/VN_genomes_varfiles_hg18.txt	/mnt/galaxyIndices/VirtualNormal/VN_genomes_junctionfiles_hg18.txt	/mnt/galaxyIndices/cgatools/build36.crr
+hg19	hg19	Virtual Normal hg19	/mnt/galaxyIndices/VirtualNormal/VN_genomes_varfiles_hg19.txt	/mnt/galaxyIndices/VirtualNormal/VN_genomes_junctionfiles_hg19.txt	/mnt/galaxyIndices/cgatools/build37.crr
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,7 @@
+<!-- VN correction -->
+<tables>
+<table name="virtual_normal_correction" comment_char="#">
+<columns>value, dbkey, name, VN_genomes_varfiles_list, VN_genomes_junctionfile_list, reference_crr_cgatools</columns>
+<file path="tool-data/virtual_normal_correction.loc" /> 
+</table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<tool_dependency>
+	<package name="cgatools" version="1.7"> 
+        <install version="1.0">
+            <actions>                				
+                <action type="download_by_url">http://sourceforge.net/projects/cgatools/files/1.7.1/cgatools-1.7.1.5-linux_binary-x86_64.tar.gz</action>
+				<action type="shell_command"> chmod a+x bin/cgatools</action>
+                <action type="move_file">
+                	<source>bin/cgatools</source>
+                	<destination>$INSTALL_DIR/bin</destination>
+                </action>	    
+				<action type="set_environment">
+                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+                    <environment_variable name="PATH" action="prepend_to">$REPOSITORY_INSTALL_DIR</environment_variable>
+                </action>            	               			
+            </actions>
+        </install>
+        <readme>
+			Downloads and installs the cgatools binary. 
+        </readme>
+    </package>      
+</tool_dependency>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf2lv.sh	Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+vcffile=$1
+outputfile=$2
+
+# Convert the small variants of a VCF file into a Complete Genomics style
+# variant list (tab-separated, one line per ALT allele).
+# Usage: vcf2lv.sh <input.vcf> <output.tsv>
+
+# vcf columns: CHROM-POS-ID-REF-ALT
+# LV columns: variantId-chromosome-begin-end-varType-reference-alleleSeq-xRef
+
+# Conversion steps, per ALT allele of every variant record:
+# - add chr prefix if not present
+# - determine varType (snp, ins, del, sub)
+# - convert coordinates to 0-based half-open
+# - calculate end coordinate from position and reference length
+# - remove the leading (anchor) reference base from non-SNP variants, update position
+# NOTE(review): symbolic ALT alleles such as <DEL> are not recognised; they are
+# converted as if they were base sequences
+
+awk 'BEGIN{
+		FS="\t";
+		OFS="\t";
+		count=0;
+
+		#output new header
+		print "variantId", "chromosome", "begin", "end", "varType", "reference", "alleleSeq", "xRef"
+	}{
+
+		if(substr($0,1,1)!="#" && $5 != "."){ #skip header or nonvariant entries (period in ALT column)
+
+			pos=$2
+			ref=$4
+			reflen=length(ref)
+
+			# add chr prefix if needed
+			if(substr($1,1,3)!="chr")
+				chromosome="chr"$1
+			else
+				chromosome=$1
+
+			# split ALT column in case of multiple variant alleles; walk the
+			# alleles by index because "for (i in array)" has no defined order,
+			# which would make the variantId numbering unpredictable
+			nalleles=split($5,alleles,",");
+
+			for (i=1; i<=nalleles; i++) {
+				alt=alleles[i]
+
+				# determine varType from the allele lengths
+				if(reflen == 1 && length(alt) == 1)
+					varType="snp"
+				else if (reflen == 1)
+					varType="ins"
+				else if (length(alt) == 1)
+					varType="del"
+				else
+					varType="sub"
+
+				# determine 0-based half-open coordinates and the output sequences
+				if (varType=="snp"){
+					# single base at 1-based pos -> [pos-1,pos)
+					start=pos-1
+					end=pos
+					reference=ref
+					alleleSeq=alt
+				}
+				else if (substr(ref,1,1)==substr(alt,1,1)){
+					# anchored: drop the shared leading base, so the event starts
+					# one base later (1-based pos+1 = 0-based pos)
+					start=pos
+					end=pos+(reflen-1)
+					reference=substr(ref,2)
+					alleleSeq=substr(alt,2)
+				}
+				else{
+					# no shared leading base: nothing is trimmed, so all REF bases
+					# are replaced, starting at pos itself (0-based pos-1)
+					varType="sub"
+					start=pos-1
+					end=pos-1+reflen
+					reference=ref
+					alleleSeq=alt
+				}
+
+				#print output variant(s); chrM entries are not printed but still consume an id
+				if(chromosome != "chrM")
+					print count, chromosome, start, end, varType, reference, alleleSeq, ""
+
+				count+=1
+			}
+		}
+	}' "$vcffile" > "$outputfile"
+	
+	
+	
+#from 1000 Genomes site:
+
+#CHROM chromosome: an identifier from the reference genome. All entries for a specific CHROM should form a contiguous block within the VCF file.(Alphanumeric String, Required)
+#POS position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. (Integer, Required)
+#ID semi-colon separated list of unique identifiers where available. If this is a dbSNP variant it is encouraged to use the rs number(s). No identifier should be present in more than one data record. If there is no identifier available, then the missing value should be used. (Alphanumeric String)
+#REF reference base(s): Each base must be one of A,C,G,T,N. Bases should be in uppercase. Multiple bases are permitted. The value in the POS field refers to the position of the first base in the String. For InDels, the reference String must include the base before the event (which must be reflected in the POS field). (String, Required).
+#ALT comma separated list of alternate non-reference alleles called on at least one of the samples. Options are base Strings made up of the bases A,C,G,T,N, or an angle-bracketed ID String (”<ID>”). If there are no alternative alleles, then the missing value should be used. Bases should be in uppercase. (Alphanumeric String; no whitespace, commas, or angle-brackets are permitted in the ID String itself)	
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf2lv.xml	Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,27 @@
+<tool id="vcf2lv" name="VCF-2-VariantList" version="1">
+ 	<description> convert VCF file to CG-compatible variant list </description>
+  
+  	<command interpreter="bash"> 
+  		vcf2lv.sh $vcffile $outputfile	
+  	</command>
+
+  	<inputs>
+  		<param name="vcffile" type="data" label="Select vcf file" />
+  		<param name="fname" type="text" value="" label="Prefix for your output file" help="Optional"/>	
+  	</inputs>
+
+	<outputs>
+  		<data format="tabular" name="outputfile" label="${fname} VCF-2-VariantList conversion"/>
+  	</outputs>
+  	
+  	<help>  
+**what it does**
+
+Converts a VCF file containing small variants (SNVs, indels and substitutions) to a Complete Genomics type variantlist.
+		
+After conversion, the file can be used as input to the Virtual Normal filtering pipeline. 
+
+
+
+	</help>
+</tool>