view filter_columns.sh @ 0:e77c9484b2d0 draft default tip

Uploaded
author saskia-hiltemann
date Thu, 22 Oct 2015 09:18:30 -0400
parents
children
line wrap: on
line source

#!/bin/bash

# Print a one-line usage summary on stderr and abort the script.
usage() {
    printf 'Usage: %s --infile FILE --outfile_rem FILE --outfile_ret FILE [--filter COLUMN,OP,VALUE]...\n' "$0" >&2
    exit 1
}

# Parse the command line into the globals read by the rest of the script:
#   infile       input table
#   outfile_rem  destination for the removed rows
#   outfile_ret  destination for the retained rows
#   filter[1..n] one "COLUMN,OP,VALUE" string per --filter, in command-line order
#   count        index of the next free slot in filter (number of filters + 1)
# Long options may be given with one or two dashes (getopt -a).
# -h takes an argument (optstring "h:") and is accepted but ignored.
parse_args() {
    local parsed

    # log the raw command line on stdout
    printf '%s\n' "$*"

    filter=()
    count=1

    # Capture getopt's output first so its exit status can actually be tested;
    # the default quoted output mode keeps values containing whitespace intact.
    parsed=$(getopt -n "$0" -a \
        --longoptions "infile:,outfile_rem:,outfile_ret:,filter:" \
        -o "h:" -- "$@") || usage
    # Standard getopt(1) idiom: the output is shell-quoted by getopt itself.
    eval set -- "$parsed"

    # getopt always emits at least "--", so an empty command line is not
    # rejected here; every recognised option below carries exactly one value.
    while (( $# > 0 )); do
        case "$1" in
            --infile)
                infile=$2
                shift 2
                ;;
            --outfile_rem)
                outfile_rem=$2
                shift 2
                ;;
            --outfile_ret)
                outfile_ret=$2
                shift 2
                ;;
            --filter)
                filter[count]=$2
                count=$((count + 1))
                shift 2
                ;;
            -h)
                shift 2
                ;;
            --)
                shift
                break
                ;;
            -*)
                usage
                ;;
            *)
                break
                ;;
        esac
    done
}

parse_args "$@"




# Print a preview of the first ten data rows (input lines 2-11, i.e. without
# the header line) on stdout, then create the working copy "inputfile" that
# the filter passes read and overwrite. The working copy keeps the header.
# Reads the global "infile". Returns the exit status of the copy.
prepare_input() {
    head -n 11 -- "$infile" | tail -n +2
    cp -- "$infile" inputfile
}

prepare_input

# restart the filter numbering used in the log output
count=1

# apply_filter COLUMN OP VALUE INPUT REMOVED RETAINED HEADER_TO_REMOVED
#
# Run one filter pass over the tab-separated file INPUT. Data rows whose
# field number COLUMN satisfies "OP VALUE" are appended to REMOVED, all other
# data rows are appended to RETAINED.
#   OP: equals, nequals, contains, ncontains (string tests against VALUE),
#       empty, nonempty (VALUE unused), lt, le, gt, ge (numeric tests).
#       An unknown OP removes nothing.
# The first line, empty lines and lines starting with "#", "<" or ">" are
# never filtered: they are always appended to RETAINED, and to REMOVED as well
# when HEADER_TO_REMOVED is 1.
apply_filter() {
    local column=$1 op=$2 value=$3 input=$4 removed=$5 retained=$6 header_to_removed=$7

    # VALUE travels through the environment: it is then taken literally,
    # whereas -v would expand backslash escapes and splicing it into the
    # program text would let quotes in the value break the awk code.
    FILTER_VALUE="$value" awk \
        -v col="$column" -v op="$op" \
        -v removed="$removed" -v retained="$retained" \
        -v header_to_removed="$header_to_removed" '
        BEGIN {
            FS = "\t"
            OFS = "\t"
            # appending "" forces string comparison for equals/nequals
            text = ENVIRON["FILTER_VALUE"] ""
            number = text + 0
        }

        # header and comment-like lines bypass the filter entirely
        FNR == 1 || index($0, "#") == 1 || index($0, "<") == 1 || index($0, ">") == 1 || NF == 0 {
            if (header_to_removed == 1)
                print >> removed
            print >> retained
            next
        }

        {
            cell = $col
            if      (op == "equals")    drop = (cell == text)
            else if (op == "nequals")   drop = (cell != text)
            else if (op == "contains")  drop = (index(cell, text) != 0)
            else if (op == "ncontains") drop = (index(cell, text) == 0)
            else if (op == "empty")     drop = (cell == "")
            else if (op == "nonempty")  drop = (cell != "")
            else if (op == "lt")        drop = (cell + 0 <  number)
            else if (op == "le")        drop = (cell + 0 <= number)
            else if (op == "gt")        drop = (cell + 0 >  number)
            else if (op == "ge")        drop = (cell + 0 >= number)
            else                        drop = 0

            if (drop)
                print >> removed
            else
                print >> retained
        }
    ' "$input"
}

# Apply every filter in turn. Each pass reads "inputfile", appends the rows it
# removes to "outfile_removed" and leaves the surviving rows in "inputfile"
# for the next pass.
first_pass=1
for f in "${filter[@]}"
do
    echo "filter $count: $f"
    count=$((count + 1))

    # a filter is "COLUMN,OP,VALUE"
    IFS=',' read -r -a filt <<< "$f"
    column=${filt[0]}
    op=${filt[1]}
    value=${filt[2]}

    echo "column: $column"
    echo "op : $op"
    echo "value: $value"

    # start from empty files so leftovers from an earlier run are not appended to
    if (( first_pass )); then
        : > outfile_removed
    fi
    : > outfile_retained

    # Header lines go to outfile_removed on the first pass only; later passes
    # see the same header lines again and would otherwise repeat them mid-file.
    apply_filter "$column" "$op" "$value" inputfile outfile_removed outfile_retained "$first_pass"
    first_pass=0

    # next iteration only runs on the retained lines
    mv -f outfile_retained inputfile
done

# Write the final results, collapsing runs of identical adjacent lines:
#   outfile_removed -> "$outfile_rem"
#   inputfile       -> "$outfile_ret"   (what is left after the last filter)
# If outfile_removed does not exist, "$outfile_rem" is created empty instead
# of failing with a missing-file error.
write_outputs() {
    local removed_src=outfile_removed

    [[ -e "$removed_src" ]] || removed_src=/dev/null

    uniq -- "$removed_src" > "$outfile_rem"
    uniq -- inputfile > "$outfile_ret"
}

write_outputs

#awk ' !x[$0]++' outfile_removed > $outfile_rem 
#awk ' !x[$0]++' outfile_retained > $outfile_ret