view TV-vs-background.sh @ 0:1209f18a5a83 draft

Uploaded
author saskia-hiltemann
date Mon, 03 Aug 2015 05:01:15 -0400
parents
children 885ba15c2564
line wrap: on
line source

#!/bin/bash

#TV-vs-background.sh $variants $genomes ${reference.fields.crr_path} ${reference.fields.31G_var_paths} ${reference.54G_var_paths} $threshold $output_all $output_filtered

echo $@

set -- `getopt -n$0 -u -a --longoptions="variants: reference: VN_varfiles: outputfile_filtered: outputfile_all: threshold: thresholdhc:" "h:" "$@"` || usage
[ $# -eq 0 ] && usage

while [ $# -gt 0 ]
do 
    case "$1" in
       	--variants) 			variants=$2;shift;;
		--reference)			crr=$2;shift;;
		--VN_varfiles)			VN_varfiles_list=$2;shift;;  			
		--outputfile_filtered) 	output_filtered=$2;shift;;  
		--outputfile_all) 		output_all=$2;shift;;  
		--threshold) 			threshold=$2;shift;;  		
		--thresholdhc)                 thresholdhc=$2;shift;;
        -h)        		shift;;
		--)        		shift;break;;
        -*)        		usage;;
        *)         		break;;            
    esac
    shift
done

# replace newline chars with spaces for input to testvariants
tr '\n' ' ' < $VN_varfiles_list > VN_varfiles.txt


###  run TestVariants against 31G, 54G or 85G

echo "number of normals: $VNsetsize"
echo "list of normals: ($VN_varfiles_list)"
cat VN_varfiles.txt


echo "running TV against Virtual Normal set"
echo "command: cgatools testvariants\
	--beta \
	--reference $crr \
	--input	$variants \
	--output $output_all \
	--variants `cat VN_varfiles.txt`"

cgatools testvariants \
	--beta \
	--reference $crr \
	--input	$variants \
	--output $output_all \
	--variants `cat VN_varfiles.txt`



VNsetsize=`cat $VN_varfiles_list | wc -l`



### filter file based on occurrence in background genomes
cp $output_all $output_filtered
cp $output_all output_expanded

### condens file to columns with counts for all background genomes 
echo "Counting..."
awk 'BEGIN{
		FS="\t";
		OFS="\t";
		totalnormals="'"$VNsetsize"'"+0
		count["00"]="0";
		count["01"]="0";
		count["11"]="0";
		count["0N"]="0";
		count["1N"]="0";		
		count["NN"]="0";
		count["0"]="0";
		count["1"]="0";
		count["N"]="0";
	}{
		if(FNR==1)  # header
			print $1,$2,$3,$4,$5,$6,$7,$8,"VN_occurrences","VN_frequency","VN_fullycalled_count","VN_fullycalled_frequency","VN_00","VN_01","VN_11","VN_0N","VN_1N","VN_NN","VN_0","VN_1","VN_N"
		else{ 
			#count entries in reference genomes
			for (c in count)
				count[c]=0;
			for (i=9; i<=NF; i++){
				count[$i]++;
			}
			occurrences=count["11"]+count["01"]+count["1N"]+count["1"]
			fullycalled=count["11"]+count["01"]+count["00"]+count["1"]+count["0"]
			print $1,$2,$3,$4,$5,$6,$7,$8,occurrences,occurrences/totalnormals,fullycalled,fullycalled/totalnormals,count["00"],count["01"],count["11"],count["0N"],count["1N"],count["NN"],count["0"],count["1"],count["N"]
		}
	}END{


	}' $output_all > "${output_all}-counted"


# this counted file is the final output file
rm $output_all
mv "${output_all}-counted" $output_all



### filter out variants occurring in more than <threshold> of the background genomes
# if total of columns containing a 1 (01,11,1N,1) is >= threshold
awk 'BEGIN{
		FS="\t";
		OFS="\t";		
	}{
		if(FNR==1){
			print $0 			
		}
		if(FNR>1){
			if($9 < "'"$threshold"'" )
				print $0 			
		}
	}END{}' $output_all > $output_filtered 


awk 'BEGIN{
        FS="\t";
        OFS="\t";
        threshold="'"${thresholdhc}"'"+0
    }{
        if(FNR==1)
            print $0
        else if($11 >= threshold)
            print $0
    
    }END{}' $output_filtered > "output_filtered_highconf.tsv"