diff filter_columns.sh @ 0:e77c9484b2d0 draft default tip

Uploaded
author saskia-hiltemann
date Thu, 22 Oct 2015 09:18:30 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_columns.sh	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,116 @@
+#!/bin/bash
+# Print a usage summary to stderr and abort (exit status 1).
+usage() {
+    echo "Usage: $0 --infile FILE --outfile_rem FILE --outfile_ret FILE [--filter COLUMN,OP,VALUE]..." >&2
+    exit 1
+}
+
+echo "$@"    # log the raw invocation
+[ $# -eq 0 ] && usage
+# Quoted getopt output re-read through eval keeps option values that contain
+# spaces intact (the unquoted -u mode split them); -a still accepts -infile etc.
+args=$(getopt -n "$0" -a -o "h" --longoptions "infile:,outfile_rem:,outfile_ret:,filter:" -- "$@") || usage
+eval set -- "$args"
+
+filter=()    # one COLUMN,OP,VALUE spec per --filter, in command-line order
+while [ $# -gt 0 ]
+do
+    case "$1" in
+    --infile)       infile=$2; shift 2;;
+    --outfile_rem)  outfile_rem=$2; shift 2;;
+    --outfile_ret)  outfile_ret=$2; shift 2;;
+    --filter)       filter+=("$2"); shift 2;;
+    -h)             usage;;
+    --)             shift; break;;
+    *)              break;;
+    esac
+done
+[ -n "$infile" ] && [ -n "$outfile_rem" ] && [ -n "$outfile_ret" ] || usage
+
+# Log a preview of the first data rows (header line skipped), read straight
+# from the input so no scratch copy is needed for it.
+tail -n +2 -- "$infile" | head
+
+# Stage the complete input, header included, as the working file that the
+# filter loop reads and then replaces with the retained rows after each pass.
+count=1
+cp -- "$infile" inputfile
+
+# Apply the filters one after another. Rows that match a filter are appended
+# to outfile_removed; the remaining rows become the input of the next filter.
+# Header-like lines (the first line, lines starting with "#", "<" or ">", and
+# empty lines) are never filtered: they stay in the retained stream on every
+# pass and are copied to the removed stream on the first pass only, so each
+# output file receives them exactly once.
+
+# awk appends to outfile_removed, so start from an empty file.
+: > outfile_removed
+pass=0
+for f in "${filter[@]}"
+do
+	echo "filter $count: $f"
+	count=$((count+1))
+	pass=$((pass+1))
+
+	# A filter spec has the form COLUMN,OP,VALUE.
+	IFS=',' read -r -a filt <<< "$f"
+	column=${filt[0]}
+	op=${filt[1]}
+	value=${filt[2]}
+
+	echo "column: $column"
+	echo "op : $op"
+	echo "value: $value"
+
+	# Make sure this pass leaves a retained file behind even when no row survives.
+	: > outfile_retained
+
+	# The spec is passed with -v instead of being spliced into the program text,
+	# so quotes or awk syntax inside VALUE cannot change the program.
+	awk -v col="$column" -v op="$op" -v val="$value" -v pass="$pass" '
+		BEGIN {
+			FS = "\t"
+			OFS = "\t"
+			text = val ""      # string form, so equals/nequals compare as text
+			number = val + 0   # numeric form for lt/le/gt/ge
+		}
+
+		# Header-like lines bypass the filter.
+		FNR == 1 || /^[#<>]/ || NF == 0 {
+			if (pass == 1)
+				print >> "outfile_removed"
+			print >> "outfile_retained"
+			next
+		}
+
+		# Data rows: a row matching the spec is removed, any other row is kept.
+		{
+			cell = $col
+			if      (op == "equals")    hit = (cell == text)
+			else if (op == "nequals")   hit = (cell != text)
+			else if (op == "contains")  hit = (index(cell, text) != 0)
+			else if (op == "ncontains") hit = (index(cell, text) == 0)
+			else if (op == "empty")     hit = (cell == "")
+			else if (op == "nonempty")  hit = (cell != "")
+			else if (op == "lt")        hit = (cell + 0 <  number)
+			else if (op == "le")        hit = (cell + 0 <= number)
+			else if (op == "gt")        hit = (cell + 0 >  number)
+			else if (op == "ge")        hit = (cell + 0 >= number)
+			else                        hit = 0    # unknown operator keeps the row
+
+			if (hit)
+				print >> "outfile_removed"
+			else
+				print >> "outfile_retained"
+		}
+	' inputfile
+
+	# next iteration only runs on the retained lines
+	mv -f outfile_retained inputfile
+done
+
+# Publish the results as-is. No uniq pass: header lines are written exactly
+# once above, and uniq would also drop genuinely repeated adjacent data rows.
+cp -- outfile_removed "$outfile_rem"
+cp -- inputfile "$outfile_ret"
+