Mercurial > repos > saskia-hiltemann > virtual_normal_analysis
changeset 0:1209f18a5a83 draft
Uploaded
author | saskia-hiltemann |
---|---|
date | Mon, 03 Aug 2015 05:01:15 -0400 |
parents | |
children | 1c6710924e80 |
files | JunctionDiff-vs-background.sh JunctionDiff-vs-background.xml README.txt TV-vs-background.sh TV-vs-background.xml VN_genomes_locations.txt tool-data/virtual_normal_correction.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml vcf2lv.sh vcf2lv.xml |
diffstat | 11 files changed, 594 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/bin/bash
#
# JunctionDiff-vs-background.sh
#
# Filters a Complete Genomics junctions file against a list of background
# ("virtual normal") junction files. `cgatools junctiondiff` is run once per
# background file; the difference file produced by one run is the input of
# the next, so the final file only holds junctions absent from every
# background file.
#
# Files written in the current working directory:
#   junctions.tsv       working copy of the input (rewritten on every run)
#   output_reports.tsv  concatenated per-run reports (unless overridden with
#                       --outputfile_report)

# Print the command-line synopsis to stderr.
usage() {
  cat >&2 <<EOF
Usage: ${0##*/} --variants <junctions.tsv> --reference <reference.crr>
         --VN_junctions <list-of-junction-files.txt>
         --outputfile_filtered <filtered.tsv>
         [--cgatools_binary <cgatools>] [--outputfile_report <report.tsv>]
         [--scoreThresholdA <n>] [--scoreThresholdB <n>]
         [--distance <n>] [--minlength <n>]
EOF
}

# Print an error message to stderr.
err() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
}

#######################################
# Entry point.
# Arguments: the command-line options listed in usage().
# Outputs:   progress messages on stdout, diagnostics on stderr.
# Returns:   0 on success, 2 on usage errors, 1 on runtime failures.
#######################################
main() {
  local variants="" crr="" VN_junctionfiles_list="" output_filtered=""
  local cgatools_binary="cgatools"
  local output_report="output_reports.tsv"
  local scoreThresholdA="" scoreThresholdB="" distance="" minlength=""
  local junctions="junctions.tsv"

  if (( $# == 0 )); then
    usage
    return 2
  fi

  # Quoted getopt output + eval keeps arguments containing whitespace intact
  # (the unquoted `getopt -u` form splits them).
  local parsed
  parsed=$(getopt -n "$0" -a -o h \
    --longoptions 'variants:,reference:,VN_junctions:,cgatools_binary:,outputfile_filtered:,outputfile_report:,scoreThresholdA:,scoreThresholdB:,distance:,minlength:' \
    -- "$@") || { usage; return 2; }
  eval set -- "$parsed"

  while (( $# > 0 )); do
    case "$1" in
      --variants) variants=$2; shift 2 ;;
      --reference) crr=$2; shift 2 ;;
      --VN_junctions) VN_junctionfiles_list=$2; shift 2 ;;
      --cgatools_binary) cgatools_binary=$2; shift 2 ;;
      --outputfile_filtered) output_filtered=$2; shift 2 ;;
      --outputfile_report) output_report=$2; shift 2 ;;
      --scoreThresholdA) scoreThresholdA=$2; shift 2 ;;
      --scoreThresholdB) scoreThresholdB=$2; shift 2 ;;
      --distance) distance=$2; shift 2 ;;
      --minlength) minlength=$2; shift 2 ;;
      -h) usage; return 0 ;;
      --) shift; break ;;
      *) usage; return 2 ;;
    esac
  done

  if [[ -z "$variants" || -z "$crr" || -z "$VN_junctionfiles_list" \
        || -z "$output_filtered" ]]; then
    err "--variants, --reference, --VN_junctions and --outputfile_filtered are required"
    usage
    return 2
  fi
  if [[ ! -r "$VN_junctionfiles_list" ]]; then
    err "cannot read list of background junction files: $VN_junctionfiles_list"
    return 1
  fi

  # Only forward the tuning options that were actually supplied.
  local -a tuning=()
  if [[ -n "$scoreThresholdA" ]]; then tuning+=(--scoreThresholdA "$scoreThresholdA"); fi
  if [[ -n "$scoreThresholdB" ]]; then tuning+=(--scoreThresholdB "$scoreThresholdB"); fi
  if [[ -n "$distance" ]]; then tuning+=(--distance "$distance"); fi
  if [[ -n "$minlength" ]]; then tuning+=(--minlength "$minlength"); fi

  # Work on a copy of the input junctions file, as this file is replaced
  # after every run.
  cp -- "$variants" "$junctions" || { err "cannot copy $variants"; return 1; }

  echo "running JunctionDiff against each of the VN genomes"

  local count=0 line
  # `|| [[ -n $line ]]` keeps the last entry when the list lacks a trailing
  # newline. Default IFS is kept on purpose so that surrounding blanks are
  # trimmed and whitespace-only lines count as empty.
  while read -r line || [[ -n "$line" ]]; do
    line=${line%$'\r'}
    if [[ -z "$line" ]]; then
      continue
    fi
    count=$((count + 1))

    # stdin is redirected so the tool can never consume the remaining list.
    if ! "$cgatools_binary" junctiondiff \
        --beta \
        --statout \
        --reference "$crr" \
        --junctionsA "$junctions" \
        --junctionsB "$line" \
        "${tuning[@]}" < /dev/null; then
      err "junctiondiff failed for background file: $line"
      return 1
    fi

    # Concatenate all reports.
    {
      printf 'report of run %s:\n----------------------\n' "$count"
      if [[ -f report.tsv ]]; then
        cat report.tsv
      fi
      echo ""
    } >> "$output_report"

    # The difference file becomes the junctions file of the next iteration.
    if [[ ! -f "diff-$junctions" ]]; then
      err "junctiondiff did not produce diff-$junctions for: $line"
      return 1
    fi
    mv -f -- "diff-$junctions" "$junctions" || return 1
  done < "$VN_junctionfiles_list"

  cp -- "$junctions" "$output_filtered"
}

main "$@"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/JunctionDiff-vs-background.xml Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,72 @@ +<tool id="t-vs-vnormal_junctions" name="Virtual Normal Correction SVs" version="1.6"> + <description> Filter SVs based on presence in VN set </description> + + <requirements> + <requirement type="package" version="1.7">cgatools</requirement> + </requirements> + + <command interpreter="bash"> + JunctionDiff-vs-background.sh + --variants $variants + --reference ${reference.fields.reference_crr_cgatools} + #if $virtnorm.VNset == "diversity" + --VN_junctions ${reference.fields.VN_genomes_junctionfile_list} + #else + --VN_junctions ${reference.fields.VN_genomes_junctionfile_list_1000G} + #end if + --cgatools_binary cgatools + --outputfile_filtered $output_filtered + --scoreThresholdA $scoreThresholdA + --scoreThresholdB $scoreThresholdB + --distance $distance + --minlength $minlength + </command> + + <inputs> + <!--select build--> + <param name="reference" type="select" label="Select Build"> + <options from_data_table="virtual_normal_correction" /> + <filter type="data_meta" ref="variants" key="dbkey" column="0" /> + </param> + <conditional name="virtnorm" > + <param name="VNset" type="select" label="Select Virtual Normal set to use" help="1000Genomes set can only be used for hg19 samples"> + <option value="diversity" selected="true"> CG Diversity Panel and trios (54 Genomes) (hg18/hg19) </option> + <option value="thousand" > CG 1000G project genomes (433 Genomes) (hg19 only) </option> + </param> + </conditional> + <param name="variants" type="data" format="tabular" label="CG Junctions file"/> + <param name="scoreThresholdA" type="text" value="10" label="scoreThreshold" help="The minimum number of discordant mate pair alignments supporting the junction from input genome"/> + <param name="scoreThresholdB" type="text" value="10" label="scoreThreshold" help="The minimum number of discordant mate pair alignments supporting the junction from 
background genomes"/> + <param name="distance" type="text" value="200" label="Maximum distance between coordinates of potentially compatible junctions."/> + <param name="minlength" type="text" value="500" label="Minimum deletion junctions length to be included into the difference file."/> + <param name="report" type="select" label="Generate report file?"> + <option value="N" selected="true"> No </option> + <option value="Y"> Yes </option> + </param> + <param name="fname" type="text" value="" label="Prefix for your output file" help="Optional"/> + </inputs> + + <outputs> + <data format="tabular" name="output_filtered" label="${fname} Filtered junctions for ${tool.name} on ${on_string}"/> + <data format="tabular" name="output_report" from_work_dir= "output_reports.tsv" label="${fname} report for ${tool.name} on ${on_string}"> + <filter> report == "Y" </filter> + </data> + </outputs> + + <help> +**What it does** + + + +**Input Files** +Complete Genomics Junctions file + +**Output Files** +Junctions remaining after filtering + + + </help> + +</tool> + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,32 @@ +Installs VirtualNormal Correction Tool + +After installing this tool via admin panel, manually configure the following: + +1) edit virtual_normal_correction.loc file + + - change "/path/to/hg18.crr" to the location of the Complete Genomics reference crr file on your system + (can be downloaded from ftp://ftp.completegenomics.com/ReferenceFiles/ ) + + - change "/path/to/VN_genomes_varfiles_hg18.txt" to the location of the file containing the locations of all the Complete Genomics + varfiles to be used as a virtual normal. This file should contain 1 file location per line, e.g. + + /path/to/normal-varfile-1 + /path/to/normal-varfile-2 + /path/to/normal-varfile-3 + /path/to/normal-varfile-4 + /path/to/normal-varfile-5 + /path/to/normal-varfile-6 + /path/to/normal-varfile-7 + /path/to/normal-varfile-8 + ... + + Varfiles can be in compressed or uncompressed form. For example, Complete Genomics' Diversity panel can be used. + (can be downloaded from ftp://ftp2.completegenomics.com/) + + - change "/path/to/VN_genomes_junctionfiles_hg18.txt" to the location of the file containing the locations of all the Complete Genomics + junctionfiles to be used as a virtual normal. This file should contain 1 file location per line. For example, Complete Genomics' + Diversity panel can be used. (can be downloaded from ftp://ftp2.completegenomics.com/) + + 2) restart Galaxy for changes to take effect + + After this initial setup, additional normals can be added to the lists without having to restart Galaxy.
#!/bin/bash
#
# TV-vs-background.sh
#
# Runs `cgatools testvariants` for a list of variants against a set of
# background ("virtual normal") varfiles, condenses the per-genome columns
# into counts, and writes filtered versions of the result.
#
# Outputs:
#   --outputfile_all       every input variant, annotated with the counts
#   --outputfile_filtered  variants whose VN_occurrences (column 9) is below
#                          --threshold
#   output_filtered_highconf.tsv (current directory)
#                          filtered variants whose VN_fullycalled_count
#                          (column 11) is at least --thresholdhc
#   output_expanded (current directory)
#                          copy of the raw testvariants output

# Print the command-line synopsis to stderr.
usage() {
  cat >&2 <<EOF
Usage: ${0##*/} --variants <variants.tsv> --reference <reference.crr>
         --VN_varfiles <list-of-varfiles.txt>
         --outputfile_all <all.tsv> --outputfile_filtered <filtered.tsv>
         --threshold <n> [--thresholdhc <n>]
EOF
}

# Print an error message to stderr.
err() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
}

#######################################
# Entry point.
# Arguments: the command-line options listed in usage().
# Outputs:   progress messages on stdout, diagnostics on stderr.
# Returns:   0 on success, 2 on usage errors, 1 on runtime failures.
#######################################
main() {
  local variants="" crr="" VN_varfiles_list=""
  local output_filtered="" output_all=""
  local threshold="" thresholdhc=0

  if (( $# == 0 )); then
    usage
    return 2
  fi
  printf '%s\n' "$*"

  # Quoted getopt output + eval keeps arguments containing whitespace intact.
  local parsed
  parsed=$(getopt -n "$0" -a -o h \
    --longoptions 'variants:,reference:,VN_varfiles:,outputfile_filtered:,outputfile_all:,threshold:,thresholdhc:' \
    -- "$@") || { usage; return 2; }
  eval set -- "$parsed"

  while (( $# > 0 )); do
    case "$1" in
      --variants) variants=$2; shift 2 ;;
      --reference) crr=$2; shift 2 ;;
      --VN_varfiles) VN_varfiles_list=$2; shift 2 ;;
      --outputfile_filtered) output_filtered=$2; shift 2 ;;
      --outputfile_all) output_all=$2; shift 2 ;;
      --threshold) threshold=$2; shift 2 ;;
      --thresholdhc) thresholdhc=$2; shift 2 ;;
      -h) usage; return 0 ;;
      --) shift; break ;;
      *) usage; return 2 ;;
    esac
  done

  if [[ -z "$variants" || -z "$crr" || -z "$VN_varfiles_list" \
        || -z "$output_all" || -z "$output_filtered" || -z "$threshold" ]]; then
    err "--variants, --reference, --VN_varfiles, --outputfile_all, --outputfile_filtered and --threshold are required"
    usage
    return 2
  fi
  if [[ ! -r "$VN_varfiles_list" ]]; then
    err "cannot read list of background varfiles: $VN_varfiles_list"
    return 1
  fi

  # Read one varfile path per line. Blank lines are ignored and a final line
  # without a trailing newline is still picked up, so the number of normals
  # always matches the number of files handed to testvariants.
  local -a normals=()
  local line
  while read -r line || [[ -n "$line" ]]; do
    line=${line%$'\r'}
    if [[ -n "$line" ]]; then
      normals+=("$line")
    fi
  done < "$VN_varfiles_list"

  local VNsetsize=${#normals[@]}
  if (( VNsetsize == 0 )); then
    err "no background varfiles listed in $VN_varfiles_list"
    return 1
  fi

  ### run TestVariants against the Virtual Normal set
  echo "number of normals: $VNsetsize"
  echo "list of normals: ($VN_varfiles_list)"
  printf '%s\n' "${normals[@]}"

  echo "running TV against Virtual Normal set"
  echo "command: cgatools testvariants --beta --reference $crr --input $variants --output $output_all --variants ${normals[*]}"

  if ! cgatools testvariants \
      --beta \
      --reference "$crr" \
      --input "$variants" \
      --output "$output_all" \
      --variants "${normals[@]}"; then
    err "cgatools testvariants failed"
    return 1
  fi

  # Keep the raw per-genome table next to the condensed one.
  cp -- "$output_all" output_expanded || return 1

  ### condense file to columns with counts for all background genomes
  echo "Counting..."
  awk -v totalnormals="$VNsetsize" '
    BEGIN {
      FS = "\t"
      OFS = "\t"
      totalnormals += 0
      # per-genome codes that get their own output column
      ncodes = split("00 01 11 0N 1N NN 0 1 N", codes, " ")
    }
    FNR == 1 {
      # header
      print $1, $2, $3, $4, $5, $6, $7, $8, "VN_occurrences", "VN_frequency", "VN_fullycalled_count", "VN_fullycalled_frequency", "VN_00", "VN_01", "VN_11", "VN_0N", "VN_1N", "VN_NN", "VN_0", "VN_1", "VN_N"
      next
    }
    {
      # count entries in the background genomes (column 9 onwards)
      for (k = 1; k <= ncodes; k++)
        count[codes[k]] = 0
      for (i = 9; i <= NF; i++)
        count[$i]++

      occurrences = count["11"] + count["01"] + count["1N"] + count["1"]
      fullycalled = count["11"] + count["01"] + count["00"] + count["1"] + count["0"]
      print $1, $2, $3, $4, $5, $6, $7, $8, occurrences, occurrences / totalnormals, fullycalled, fullycalled / totalnormals, count["00"], count["01"], count["11"], count["0N"], count["1N"], count["NN"], count["0"], count["1"], count["N"]
    }
  ' "$output_all" > "${output_all}-counted" || return 1

  # this counted file is the final output file
  mv -f -- "${output_all}-counted" "$output_all" || return 1

  ### filter out variants occurring in at least <threshold> background genomes
  # Both sides are forced to numbers: comparing a field against a quoted
  # constant is a string comparison in awk ("10" < "2" is true).
  awk -v threshold="$threshold" '
    BEGIN { FS = "\t"; OFS = "\t"; threshold += 0 }
    FNR == 1 || ($9 + 0) < threshold
  ' "$output_all" > "$output_filtered" || return 1

  ### keep only variants fully called in at least <thresholdhc> genomes
  awk -v threshold="$thresholdhc" '
    BEGIN { FS = "\t"; OFS = "\t"; threshold += 0 }
    FNR == 1 || ($11 + 0) >= threshold
  ' "$output_filtered" > "output_filtered_highconf.tsv"
}

main "$@"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TV-vs-background.xml Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,101 @@ +<tool id="t-vs-vnormal" name="Virtual Normal Correction SmallVars" version="1.6"> + <description> Filter small variants based on presence in Virtual Normal set </description> + + <requirements> + <requirement type="package" version="1.7">cgatools</requirement> + </requirements> + + <command interpreter="bash"> + TV-vs-background.sh + --variants $variants + --reference ${reference.fields.reference_crr_cgatools} + #if $virtnorm.VNset == "diversity": + --VN_varfiles ${reference.fields.VN_genomes_varfiles_list} + #else + --VN_varfiles ${reference.fields.VN_genomes_varfiles_list_1000G} + #end if + --threshold $threshold + --thresholdhc $thresholdhc + --outputfile_all $output_all + --outputfile_filtered $output_filtered + </command> + + <inputs> + <param name="variants" type="data" format="tabular" label="List of Variants as produced by Listvariants program or VCF-2-LV conversion program"/> + <!--select build--> + <param name="reference" type="select" label="Select Build"> + <options from_data_table="virtual_normal_correction" /> + <filter type="data_meta" ref="variants" key="dbkey" column="0" /> + </param> + <conditional name="virtnorm" > + <param name="VNset" type="select" label="Select Virtual Normal set to use" help="1000Genomes set can only be used for hg19 samples, for hg18 54 genomes will be used."> + <option value="diversity" > CG Diversity Panel and trios (54 Genomes) </option> + <option value="thousand" > CG 1000G project genomes (433 Genomes) (hg19 only) </option> + </param> + </conditional> + + <param name="threshold" type="text" value="1" label="Threshold: Filter variants if present in at least this number of the background genomes"/> + <param name="thresholdhc" type="text" value="10" label="High Confidence Threshold: Label a somatic variant as high-confidence if locus was fully called in at least this many normal genomes" help="Please 
adjust according to number of normals used and desired stringency. "/> + <param name="fname" type="text" value="" label="Prefix for your output file" help="Optional. For example sample name."/> + <!--<param name="debug" type="select" label="individual level annotations?" help="get a columns per normal sample whether variant was present (only available for fully public normal samples)"> + <option value="N" > No </option> + <option value="Y" > Yes </option> + </param> + --> + </inputs> + + <outputs> + <data format="tabular" name="output_all" label="${fname} All variants for ${tool.name} on ${on_string}"/> + <data format="tabular" name="output_filtered" label="${fname} Filtered variants for ${tool.name} on ${on_string}"/> + <data format="tabular" name="output_filtered_highconf" label="${fname} High Confidence Filtered variants for ${tool.name} on ${on_string}" from_work_dir="output_filtered_highconf.tsv"/> + <!--<data format="tabular" name="output_filtered" label="${fname} Filtered variants for ${tool.name} on ${on_string}"/> + <data format="tabular" name="output_expanded" from_work_dir="output_expanded" label="${fname} expanded annotation for ${tool.name} on ${on_string}"> + <filter> $debug == "Y" </filter> + </data> + --> + </outputs> + + <help> +**What it does** + +This tool compares a list of variants to a set of normal genomes. Each variant will be annotated with the number of normal samples it appears in. +The tool will also output how often the variant was found in one or both alleles (01 or 11), and distinguish between a variant not being present in the normal (00) +or the location being no-called in the normal (NN) or half-called (0N,1N) etc. + +This may take quite some time depending on the number of input variants and the number of normal genomes. + +**Input Files** + +This program takes as input a list of variants as produced by the ListVariants tool, or the vcf-to-LV preprocessing tool. 
Input must be a tab-separated file of the following format:: + + variantID - chromosome - begin - end - varType - reference - alleleSeq - xRef + 1034 chr1 972803 972804 snp T C dbsnp:rs31238120 + +valid entries in varType column are: snp,sub,ins,del. + +Chromosome coordinates must be zero-based half-open. + +Column names must match the ones given above. + + +**Output Files** + +1) Original input file annotated with presence (or lack thereof) in background genomes + +2) Filtered version of output 1, variants are removed when present in at least *threshold* of the background normal genomes (default: 1) (filters on column 9 of output file) + +3) High Confidence filtered version of output 2. Of all the variants labelled somatic, filter out any variants not fully called in at least *high confidence threshold* normals. (filter on column 11 of output file) + +Example output format:: + + variantId chromosome begin end varType reference alleleSeq xRef VN_occurrences VN_frequency VN_fullycalled_count VN_fullycalled_frequency VN_00 VN_01 VN_11 VN_0N VN_1N VN_NN VN_0 VN_1 VN_N + 34 chr1 46661 46662 snp T C dbsnp.100:rs2691309 26 0.472727 33 0.6 7 19 7 1 0 20 0 0 0 + 35 chr1 46850 46850 ins A 0 0 10 0.181818 10 0 0 5 0 39 0 0 0 + 36 chr1 46895 46896 snp T C dbsnp.100:rs2691311 8 0.145455 40 0.727273 33 7 0 2 1 11 0 0 0 + 37 chr1 46926 46927 snp G A dbsnp.100:rs2548884 7 0.127273 43 0.781818 36 7 0 2 0 9 0 0 0 + + </help> + +</tool> + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VN_genomes_locations.txt Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,8 @@ +/path/to/normal-varfile-1 +/path/to/normal-varfile-2 +/path/to/normal-varfile-3 +/path/to/normal-varfile-4 +/path/to/normal-varfile-5 +/path/to/normal-varfile-6 +/path/to/normal-varfile-7 +/path/to/normal-varfile-8
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/virtual_normal_correction.loc.sample Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,6 @@ +#loc file for annovar tool + +# <columns>value, dbkey, name, VN_genomes_varfiles_list, VN_genomes_junctionfile_list, reference_crr_cgatools</columns> + +hg18 hg18 Virtual Normal hg18 /mnt/galaxyIndices/VirtualNormal/VN_genomes_varfiles_hg18.txt /mnt/galaxyIndices/VirtualNormal/VN_genomes_junctionfiles_hg18.txt /mnt/galaxyIndices/cgatools/build36.crr +hg19 hg19 Virtual Normal hg19 /mnt/galaxyIndices/VirtualNormal/VN_genomes_varfiles_hg19.txt /mnt/galaxyIndices/VirtualNormal/VN_genomes_junctionfiles_hg19.txt /mnt/galaxyIndices/cgatools/build37.crr
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,7 @@ +<!-- VN correction --> +<tables> +<table name="virtual_normal_correction" comment_char="#"> +<columns>value, dbkey, name, VN_genomes_varfiles_list, VN_genomes_junctionfile_list, reference_crr_cgatools</columns> +<file path="tool-data/virtual_normal_correction.loc" /> +</table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,23 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="cgatools" version="1.7"> + <install version="1.0"> + <actions> + <action type="download_by_url">http://sourceforge.net/projects/cgatools/files/1.7.1/cgatools-1.7.1.5-linux_binary-x86_64.tar.gz</action> + <action type="shell_command"> chmod a+x bin/cgatools</action> + <action type="move_file"> + <source>bin/cgatools</source> + <destination>$INSTALL_DIR/bin</destination> + </action> + <action type="set_environment"> + <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable> + <environment_variable name="PATH" action="prepend_to">$REPOSITORY_INSTALL_DIR</environment_variable> + </action> + </actions> + </install> + <readme> + Downloads and installs the cgatools binary. + </readme> + </package> +</tool_dependency> +
#!/bin/bash
#
# vcf2lv.sh <input.vcf> <output.tsv>
#
# Converts the small variants of a VCF file into a Complete Genomics style
# variant list.
#
# vcf columns used: CHROM-POS-ID-REF-ALT
# LV columns:       variantId-chromosome-begin-end-varType-reference-alleleSeq-xRef
#
# Per ALT allele the conversion
#   - adds the "chr" prefix to the chromosome if not present
#   - determines varType (snp, ins, del, sub)
#   - removes the leading reference base shared by REF and ALT of non-SNPs
#   - converts coordinates to 0-based half-open
#   - skips records on chrM, records without ALT allele (".") and symbolic
#     ALT alleles ("<ID>", "*"), which carry no sequence
#
# Notes from the 1000 Genomes description of the VCF columns:
#   CHROM  identifier from the reference genome
#   POS    reference position, with the 1st base having position 1
#   REF    reference base(s); for InDels the reference string must include
#          the base before the event (which must be reflected in POS)
#   ALT    comma separated list of alternate non-reference alleles; either
#          base strings or an angle-bracketed ID string ("<ID>"); the missing
#          value is used when there are no alternative alleles

# Print the command-line synopsis to stderr.
usage() {
  printf 'Usage: %s <input.vcf> <output.tsv>\n' "${0##*/}" >&2
}

#######################################
# Convert a VCF file to a variant list.
# Arguments: $1 - VCF file to read
#            $2 - tab-separated variant list to write
# Returns:   exit status of awk
#######################################
vcf2lv() {
  local vcffile=$1
  local outputfile=$2

  awk '
    BEGIN {
      FS = "\t"
      OFS = "\t"
      count = 0

      # output new header
      print "variantId", "chromosome", "begin", "end", "varType", "reference", "alleleSeq", "xRef"
    }

    # skip header lines and nonvariant entries (period in ALT column)
    /^#/ || $5 == "." { next }

    {
      pos = $2
      ref = $4

      # add chr prefix if needed
      chromosome = (substr($1, 1, 3) == "chr") ? $1 : "chr" $1

      # split ALT column in case of multiple variant alleles; a numeric loop
      # keeps the alleles (and therefore the variant ids) in file order
      nalleles = split($5, alleles, ",")
      for (i = 1; i <= nalleles; i++) {
        alt = alleles[i]

        # symbolic alleles have no sequence to convert; the id is still
        # consumed so the numbering of the other alleles is unaffected
        if (alt ~ /^</ || alt == "*") {
          count++
          continue
        }

        # a shared leading base is the anchor base before the event
        anchored = (substr(ref, 1, 1) == substr(alt, 1, 1))

        # determine varType
        if (length(ref) == 1 && length(alt) == 1) {
          varType = "snp"
          anchored = 0        # single bases are never stripped
        }
        else if (!anchored)
          varType = "sub"     # REF is replaced from its first base onwards
        else if (length(ref) == 1)
          varType = "ins"
        else if (length(alt) == 1)
          varType = "del"
        else
          varType = "sub"

        # remove the anchor base and determine the 0-based half-open
        # interval covered by the remaining reference sequence
        if (anchored) {
          reference = substr(ref, 2)
          alleleSeq = substr(alt, 2)
          start = pos         # 0-based position of the base after the anchor
        }
        else {
          reference = ref
          alleleSeq = alt
          start = pos - 1     # 0-based position of the first REF base
        }
        stop = start + length(reference)

        # print output variant
        if (chromosome != "chrM")
          print count, chromosome, start, stop, varType, reference, alleleSeq, ""

        count++
      }
    }
  ' "$vcffile" > "$outputfile"
}

# Entry point: expects the input VCF and the output file as arguments.
main() {
  if (( $# < 2 )); then
    usage
    return 2
  fi
  vcf2lv "$1" "$2"
}

main "$@"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcf2lv.xml Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,27 @@ +<tool id="vcf2lv" name="VCF-2-VariantList" version="1"> + <description> convert VCF file to CG-compatible variant list </description> + + <command interpreter="bash"> + vcf2lv.sh $vcffile $outputfile + </command> + + <inputs> + <param name="vcffile" type="data" label="Select vcf file" /> + <param name="fname" type="text" value="" label="Prefix for your output file" help="Optional"/> + </inputs> + + <outputs> + <data format="tabular" name="outputfile" label="${fname} VCF-2-VariantList conversion"/> + </outputs> + + <help> +**what it does** + +Converts a VCF file containing small variants (SNVs, indels and substitutions) to a Complete Genomics type variantlist. + +After conversion, the file can be used as input to the Virtual Normal filtering pipeline. + + + + </help> +</tool>