Mercurial > repos > saskia-hiltemann > file_manipulation
diff filter_columns.sh @ 0:e77c9484b2d0 draft default tip
Uploaded
author | saskia-hiltemann |
---|---|
date | Thu, 22 Oct 2015 09:18:30 -0400 |
parents | |
children |
line wrap: on
line diff
#!/bin/bash
#
# filter_columns.sh - split a tab-separated file into "removed" and "retained"
# rows according to one or more column filters.
#
# Usage:
#   filter_columns.sh --infile IN --outfile_rem REMOVED --outfile_ret RETAINED \
#                     --filter COLUMN,OP,VALUE [--filter COLUMN,OP,VALUE ...]
#
# Every --filter is a "COLUMN,OP,VALUE" triple:
#   COLUMN  1-based column number of the tab-separated input
#   OP      equals | nequals | contains | ncontains | empty | nonempty |
#           lt | le | gt | ge
#   VALUE   operand; may be empty and may itself contain commas
#
# Filters are applied in the order given, each one only to the rows that all
# previous filters retained. A row matching a filter goes to REMOVED, all
# other rows end up in RETAINED. An unknown OP matches nothing.
#
# The first line of the input and every line that is empty or starts with
# '#', '<' or '>' is treated as header/comment: it is never filtered and is
# copied exactly once to both output files.
#
# Exit status: 0 on success, 1 on a runtime error, 2 on a usage error.

#######################################
# Print the command line synopsis to stdout.
#######################################
usage() {
  cat <<EOF
Usage: ${0##*/} --infile FILE --outfile_rem FILE --outfile_ret FILE
       --filter COLUMN,OP,VALUE [--filter COLUMN,OP,VALUE ...]

  --infile FILE        tab-separated input file
  --outfile_rem FILE   receives the rows matched by a filter
  --outfile_ret FILE   receives the rows matched by no filter
  --filter C,OP,VALUE  C: 1-based column number
                       OP: equals, nequals, contains, ncontains, empty,
                           nonempty, lt, le, gt, ge
  -h                   show this help
EOF
}

#######################################
# Print an error message to stderr and exit with status 1.
# Only called from inside main, whose body is a subshell, so the exit ends
# that subshell and becomes the return status of main.
# Arguments: $* - message text
#######################################
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

#######################################
# Run one filter over a file.
# Arguments:
#   $1 - column number          $2 - operator        $3 - operand value
#   $4 - input file             $5 - file that matched rows are appended to
#   $6 - file that receives the retained rows (overwritten)
#   $7 - 1 if this is the first pass (header lines are then also copied to
#        the removed file), 0 otherwise
# Returns: exit status of awk
#######################################
apply_filter() {
  local column=$1 op=$2 value=$3 src=$4 removed=$5 retained=$6 first_pass=$7

  # The operand and the output path travel through the environment: unlike
  # text spliced into the program (or -v assignments) ENVIRON values are not
  # subject to awk escape processing, so quotes and backslashes stay literal.
  FILTER_VALUE=$value FILTER_REMOVED=$removed \
    awk -F '\t' -v col="$column" -v op="$op" -v first_pass="$first_pass" '
      # Header/comment lines are exempt from filtering.
      function is_header() {
        return (NR == 1 || NF == 0 || index($0, "#") == 1 ||
                index($0, "<") == 1 || index($0, ">") == 1)
      }

      # 1 if the field satisfies the active filter, 0 otherwise.
      function is_match(field) {
        if (op == "equals")    return (field == value)
        if (op == "nequals")   return (field != value)
        if (op == "contains")  return (index(field, value) != 0)
        if (op == "ncontains") return (index(field, value) == 0)
        if (op == "empty")     return (field == "")
        if (op == "nonempty")  return (field != "")
        if (op == "lt")        return (field + 0 <  value + 0)
        if (op == "le")        return (field + 0 <= value + 0)
        if (op == "gt")        return (field + 0 >  value + 0)
        if (op == "ge")        return (field + 0 >= value + 0)
        return 0
      }

      BEGIN {
        # Appending "" makes the operand a plain string, so equals/nequals
        # compare textually even when field and operand both look numeric.
        value = ENVIRON["FILTER_VALUE"] ""
        removed = ENVIRON["FILTER_REMOVED"]
        col = col + 0
        first_pass = first_pass + 0
      }

      is_header() {
        print
        # Later passes re-read the retained header; copying it again would
        # scatter header lines through the removed rows.
        if (first_pass) print >> removed
        next
      }

      is_match($col) { print >> removed; next }

      { print }
    ' < "$src" > "$retained"
}

#######################################
# Parse the command line and run all filters.
# The body is a subshell: variables and the cleanup trap stay private, and
# "exit" only ends the subshell, turning into the return status of main.
# Arguments: the script arguments
# Outputs:   a short log on stdout, diagnostics on stderr
# Returns:   0 on success, 1 on runtime error, 2 on usage error
#######################################
main() (
  if (($# == 0)); then
    usage >&2
    exit 2
  fi

  printf '%s\n' "$*"

  # Enhanced getopt in its default quoting mode keeps whitespace inside
  # option values intact; -a additionally accepts -infile style options.
  parsed=$(getopt -n "${0##*/}" -a -o h \
    --longoptions 'infile:,outfile_rem:,outfile_ret:,filter:' -- "$@") || {
    usage >&2
    exit 2
  }
  # eval is the documented way to consume getopt's shell-quoted output.
  eval set -- "$parsed"

  infile=''
  outfile_rem=''
  outfile_ret=''
  filters=()
  while (($# > 0)); do
    case "$1" in
      --infile)
        infile=$2
        shift 2
        ;;
      --outfile_rem)
        outfile_rem=$2
        shift 2
        ;;
      --outfile_ret)
        outfile_ret=$2
        shift 2
        ;;
      --filter)
        filters+=("$2")
        shift 2
        ;;
      -h)
        usage
        exit 0
        ;;
      --)
        shift
        break
        ;;
      *)
        usage >&2
        exit 2
        ;;
    esac
  done

  [[ -n "$infile" ]] || die "--infile is required"
  [[ -n "$outfile_rem" ]] || die "--outfile_rem is required"
  [[ -n "$outfile_ret" ]] || die "--outfile_ret is required"
  [[ -r "$infile" ]] || die "cannot read input file: $infile"

  # Private scratch directory: no fixed file names in the working directory,
  # so stale files of an earlier run can never leak into the output.
  workdir=$(mktemp -d) || die "cannot create temporary directory"
  trap 'rm -rf -- "$workdir"' EXIT

  current=$workdir/current
  retained=$workdir/retained
  removed=$workdir/removed
  cat < "$infile" > "$current" || die "cannot copy input file: $infile"
  : > "$removed"

  count=0
  for f in "${filters[@]}"; do
    count=$((count + 1))
    # The last name receives the remainder, so VALUE may contain commas.
    IFS=',' read -r column op value <<< "$f"

    printf 'filter %d: %s\n' "$count" "$f"
    printf 'column: %s\n' "$column"
    printf 'op: %s\n' "$op"
    printf 'value: %s\n' "$value"

    first_pass=0
    ((count == 1)) && first_pass=1
    apply_filter "$column" "$op" "$value" \
      "$current" "$removed" "$retained" "$first_pass" \
      || die "filter $count failed: $f"

    # The next filter only sees the rows retained so far.
    mv -- "$retained" "$current" || die "cannot update intermediate file"
  done

  cat < "$removed" > "$outfile_rem" || die "cannot write $outfile_rem"
  cat < "$current" > "$outfile_ret" || die "cannot write $outfile_ret"
  exit 0
)

main "$@"