Mercurial > repos > saskia-hiltemann > virtual_normal_analysis
diff TV-vs-background.sh @ 0:1209f18a5a83 draft
Uploaded
author | saskia-hiltemann |
---|---|
date | Mon, 03 Aug 2015 05:01:15 -0400 |
parents | |
children | 885ba15c2564 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TV-vs-background.sh Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,139 @@ +#!/bin/bash + +#TV-vs-background.sh $variants $genomes ${reference.fields.crr_path} ${reference.fields.31G_var_paths} ${reference.54G_var_paths} $threshold $output_all $output_filtered + +echo $@ + +set -- `getopt -n$0 -u -a --longoptions="variants: reference: VN_varfiles: outputfile_filtered: outputfile_all: threshold: thresholdhc:" "h:" "$@"` || usage +[ $# -eq 0 ] && usage + +while [ $# -gt 0 ] +do + case "$1" in + --variants) variants=$2;shift;; + --reference) crr=$2;shift;; + --VN_varfiles) VN_varfiles_list=$2;shift;; + --outputfile_filtered) output_filtered=$2;shift;; + --outputfile_all) output_all=$2;shift;; + --threshold) threshold=$2;shift;; + --thresholdhc) thresholdhc=$2;shift;; + -h) shift;; + --) shift;break;; + -*) usage;; + *) break;; + esac + shift +done + +# replace newline chars with spaces for input to testvariants +tr '\n' ' ' < $VN_varfiles_list > VN_varfiles.txt + + +### run TestVariants against 31G, 54G or 85G + +echo "number of normals: $VNsetsize" +echo "list of normals: ($VN_varfiles_list)" +cat VN_varfiles.txt + + +echo "running TV against Virtual Normal set" +echo "command: cgatools testvariants\ + --beta \ + --reference $crr \ + --input $variants \ + --output $output_all \ + --variants `cat VN_varfiles.txt`" + +cgatools testvariants \ + --beta \ + --reference $crr \ + --input $variants \ + --output $output_all \ + --variants `cat VN_varfiles.txt` + + + +VNsetsize=`cat $VN_varfiles_list | wc -l` + + + +### filter file based on occurrence in background genomes +cp $output_all $output_filtered +cp $output_all output_expanded + +### condens file to columns with counts for all background genomes +echo "Counting..." +awk 'BEGIN{ + FS="\t"; + OFS="\t"; + totalnormals="'"$VNsetsize"'"+0 + count["00"]="0"; + count["01"]="0"; + count["11"]="0"; + count["0N"]="0"; + count["1N"]="0"; + count["NN"]="0"; + count["0"]="0"; + count["1"]="0"; + count["N"]="0"; + }{ + if(FNR==1) # header + print $1,$2,$3,$4,$5,$6,$7,$8,"VN_occurrences","VN_frequency","VN_fullycalled_count","VN_fullycalled_frequency","VN_00","VN_01","VN_11","VN_0N","VN_1N","VN_NN","VN_0","VN_1","VN_N" + else{ + #count entries in reference genomes + for (c in count) + count[c]=0; + for (i=9; i<=NF; i++){ + count[$i]++; + } + occurrences=count["11"]+count["01"]+count["1N"]+count["1"] + fullycalled=count["11"]+count["01"]+count["00"]+count["1"]+count["0"] + print $1,$2,$3,$4,$5,$6,$7,$8,occurrences,occurrences/totalnormals,fullycalled,fullycalled/totalnormals,count["00"],count["01"],count["11"],count["0N"],count["1N"],count["NN"],count["0"],count["1"],count["N"] + } + }END{ + + + }' $output_all > "${output_all}-counted" + + +# this counted file is the final output file +rm $output_all +mv "${output_all}-counted" $output_all + + + +### filter out variants occurring in more than <threshold> of the background genomes +# if total of columns containing a 1 (01,11,1N,1) is >= threshold +awk 'BEGIN{ + FS="\t"; + OFS="\t"; + }{ + if(FNR==1){ + print $0 + } + if(FNR>1){ + if($9 < "'"$threshold"'" ) + print $0 + } + }END{}' $output_all > $output_filtered + + +awk 'BEGIN{ + FS="\t"; + OFS="\t"; + threshold="'"${thresholdhc}"'"+0 + }{ + if(FNR==1) + print $0 + else if($11 >= threshold) + print $0 + + }END{}' $output_filtered > "output_filtered_highconf.tsv" + + + + + + + +