diff TV-vs-background.sh @ 0:1209f18a5a83 draft

Uploaded
author saskia-hiltemann
date Mon, 03 Aug 2015 05:01:15 -0400
parents
children 885ba15c2564
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TV-vs-background.sh	Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,139 @@
+#!/bin/bash
+
+#TV-vs-background.sh $variants $genomes ${reference.fields.crr_path} ${reference.fields.31G_var_paths} ${reference.54G_var_paths} $threshold $output_all $output_filtered
+
+echo $@
+
+set -- `getopt -n$0 -u -a --longoptions="variants: reference: VN_varfiles: outputfile_filtered: outputfile_all: threshold: thresholdhc:" "h:" "$@"` || usage
+[ $# -eq 0 ] && usage
+
+while [ $# -gt 0 ]
+do 
+    case "$1" in
+       	--variants) 			variants=$2;shift;;
+		--reference)			crr=$2;shift;;
+		--VN_varfiles)			VN_varfiles_list=$2;shift;;  			
+		--outputfile_filtered) 	output_filtered=$2;shift;;  
+		--outputfile_all) 		output_all=$2;shift;;  
+		--threshold) 			threshold=$2;shift;;  		
+		--thresholdhc)                 thresholdhc=$2;shift;;
+        -h)        		shift;;
+		--)        		shift;break;;
+        -*)        		usage;;
+        *)         		break;;            
+    esac
+    shift
+done
+
+# replace newline chars with spaces for input to testvariants
+tr '\n' ' ' < $VN_varfiles_list > VN_varfiles.txt
+
+
+###  run TestVariants against 31G, 54G or 85G
+
+echo "number of normals: $VNsetsize"
+echo "list of normals: ($VN_varfiles_list)"
+cat VN_varfiles.txt
+
+
+echo "running TV against Virtual Normal set"
+echo "command: cgatools testvariants\
+	--beta \
+	--reference $crr \
+	--input	$variants \
+	--output $output_all \
+	--variants `cat VN_varfiles.txt`"
+
+cgatools testvariants \
+	--beta \
+	--reference $crr \
+	--input	$variants \
+	--output $output_all \
+	--variants `cat VN_varfiles.txt`
+
+
+
+VNsetsize=`cat $VN_varfiles_list | wc -l`
+
+
+
+### filter file based on occurrence in background genomes
+cp $output_all $output_filtered
+cp $output_all output_expanded
+
+### condens file to columns with counts for all background genomes 
+echo "Counting..."
+awk 'BEGIN{
+		FS="\t";
+		OFS="\t";
+		totalnormals="'"$VNsetsize"'"+0
+		count["00"]="0";
+		count["01"]="0";
+		count["11"]="0";
+		count["0N"]="0";
+		count["1N"]="0";		
+		count["NN"]="0";
+		count["0"]="0";
+		count["1"]="0";
+		count["N"]="0";
+	}{
+		if(FNR==1)  # header
+			print $1,$2,$3,$4,$5,$6,$7,$8,"VN_occurrences","VN_frequency","VN_fullycalled_count","VN_fullycalled_frequency","VN_00","VN_01","VN_11","VN_0N","VN_1N","VN_NN","VN_0","VN_1","VN_N"
+		else{ 
+			#count entries in reference genomes
+			for (c in count)
+				count[c]=0;
+			for (i=9; i<=NF; i++){
+				count[$i]++;
+			}
+			occurrences=count["11"]+count["01"]+count["1N"]+count["1"]
+			fullycalled=count["11"]+count["01"]+count["00"]+count["1"]+count["0"]
+			print $1,$2,$3,$4,$5,$6,$7,$8,occurrences,occurrences/totalnormals,fullycalled,fullycalled/totalnormals,count["00"],count["01"],count["11"],count["0N"],count["1N"],count["NN"],count["0"],count["1"],count["N"]
+		}
+	}END{
+
+
+	}' $output_all > "${output_all}-counted"
+
+
+# this counted file is the final output file
+rm $output_all
+mv "${output_all}-counted" $output_all
+
+
+
+### filter out variants occurring in more than <threshold> of the background genomes
+# if total of columns containing a 1 (01,11,1N,1) is >= threshold
+awk 'BEGIN{
+		FS="\t";
+		OFS="\t";		
+	}{
+		if(FNR==1){
+			print $0 			
+		}
+		if(FNR>1){
+			if($9 < "'"$threshold"'" )
+				print $0 			
+		}
+	}END{}' $output_all > $output_filtered 
+
+
+awk 'BEGIN{
+        FS="\t";
+        OFS="\t";
+        threshold="'"${thresholdhc}"'"+0
+    }{
+        if(FNR==1)
+            print $0
+        else if($11 >= threshold)
+            print $0
+    
+    }END{}' $output_filtered > "output_filtered_highconf.tsv"
+
+
+
+
+
+
+
+