comparison filter_columns.sh @ 0:e77c9484b2d0 draft default tip

Uploaded
author saskia-hiltemann
date Thu, 22 Oct 2015 09:18:30 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e77c9484b2d0
#!/bin/bash
#
# filter_columns.sh - split a tab-separated file into "removed" and
# "retained" rows according to one or more column filters.
#
# Usage:
#   filter_columns.sh --infile FILE --outfile_rem FILE --outfile_ret FILE
#                     [--filter COLUMN,OP[,VALUE]]...
#
# Filters are applied in the order given; each filter only sees the rows
# retained by the previous ones, so a row is removed as soon as it matches
# any filter. The first line of the input, blank lines and lines starting
# with '#', '<' or '>' are treated as header/comment lines: they are never
# evaluated and are copied once to both output files. Adjacent identical
# lines in each output file are collapsed (uniq).

# Scratch directory used by main; kept global so the EXIT trap can see it
# after main has returned.
workdir=''

#######################################
# Print the command-line help to stderr.
# Returns: 2, so callers can propagate a "bad usage" status.
#######################################
usage() {
  cat >&2 <<EOF
Usage: ${0##*/} --infile FILE --outfile_rem FILE --outfile_ret FILE
       [--filter COLUMN,OP[,VALUE]]...

  --infile FILE       tab-separated input file (first line is the header)
  --outfile_rem FILE  receives the header plus every row matched by a filter
  --outfile_ret FILE  receives the header plus every row matched by no filter
  --filter SPEC       COLUMN,OP,VALUE; may be repeated. COLUMN is a column
                      number, VALUE is taken literally and may contain commas.
                      OP is one of: equals nequals contains ncontains
                      empty nonempty lt le gt ge
  -h ARG              accepted for backward compatibility and ignored
EOF
  return 2
}

#######################################
# Print an error message to stderr and abort with status 1.
# Arguments: message words
#######################################
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

#######################################
# EXIT trap: remove the scratch directory if one was created.
# Globals: workdir (read)
#######################################
cleanup() {
  if [[ -n "${workdir}" ]]; then
    rm -rf -- "${workdir}"
  fi
}

#######################################
# Run one filter over a file.
# Arguments:
#   $1 - input file
#   $2 - file to (over)write with the retained lines
#   $3 - file to append the removed lines to
#   $4 - 1 if this is the first filter pass, else 0; header/comment lines
#        are copied to the removed file on the first pass only, so they do
#        not reappear there once per filter
#   $5 - column number, $6 - operator, $7 - comparison value
# Returns: awk's exit status.
#######################################
apply_filter() {
  local input=$1 retained=$2 removed=$3 first_pass=$4
  local column=$5 op=$6 value=$7

  # The value and the output path travel through the environment rather
  # than the program text or -v, so quotes and backslashes in them are
  # taken literally and can neither break nor inject awk code.
  FILTER_VALUE="$value" REMOVED_FILE="$removed" \
    awk -v col="$column" -v op="$op" -v first_pass="$first_pass" '
    BEGIN {
      FS = OFS = "\t"
      # Concatenating "" forces string type, so equals/nequals always
      # compare as strings even when the value looks like a number.
      value = ENVIRON["FILTER_VALUE"] ""
      removed = ENVIRON["REMOVED_FILE"]
    }
    # Header and comment lines: pass through, never evaluate.
    FNR == 1 || /^[#<>]/ || NF == 0 {
      if (first_pass + 0) print >> removed
      print
      next
    }
    {
      field = $col
      if      (op == "equals")    hit = (field == value)
      else if (op == "nequals")   hit = (field != value)
      else if (op == "contains")  hit = (index(field, value) != 0)
      else if (op == "ncontains") hit = (index(field, value) == 0)
      else if (op == "empty")     hit = (field == "")
      else if (op == "nonempty")  hit = (field != "")
      else if (op == "lt")        hit = (field + 0 <  value + 0)
      else if (op == "le")        hit = (field + 0 <= value + 0)
      else if (op == "gt")        hit = (field + 0 >  value + 0)
      else if (op == "ge")        hit = (field + 0 >= value + 0)
      else                        hit = 0

      if (hit) print >> removed
      else     print
    }' "$input" > "$retained"
}

#######################################
# Parse the command line, apply every filter and write both output files.
# Globals:   workdir (written)
# Arguments: the script's command-line arguments
# Outputs:   progress information on stdout, errors on stderr
# Returns:   0 on success; exits 1 on errors and 2 on bad usage
#######################################
main() {
  local infile='' outfile_rem='' outfile_ret=''
  local -a filters=()
  local parsed

  printf '%s\n' "$*"

  # Quoted getopt output + eval keeps arguments containing whitespace or
  # glob characters intact; a getopt failure is detected on getopt itself.
  parsed=$(getopt -n "$0" -a \
    --longoptions 'infile:,outfile_rem:,outfile_ret:,filter:' \
    -o 'h:' -- "$@") || { usage; exit 2; }
  eval set -- "$parsed"

  while (( $# > 0 )); do
    case "$1" in
      --infile)      infile=$2;       shift 2 ;;
      --outfile_rem) outfile_rem=$2;  shift 2 ;;
      --outfile_ret) outfile_ret=$2;  shift 2 ;;
      --filter)      filters+=("$2"); shift 2 ;;
      -h)            shift 2 ;;
      --)            shift; break ;;
      *)             usage; exit 2 ;;
    esac
  done

  if [[ -z "$infile" || -z "$outfile_rem" || -z "$outfile_ret" ]]; then
    usage
    exit 2
  fi
  [[ -r "$infile" ]] || die "cannot read input file: $infile"

  # Log a preview of the first ten data rows (everything after the header).
  sed -n '2,11p' "$infile"

  workdir=$(mktemp -d) || die "cannot create temporary directory"
  trap cleanup EXIT

  local current="$workdir/current"
  local retained="$workdir/retained"
  local removed="$workdir/removed"
  cp -- "$infile" "$current" || die "cannot copy input file: $infile"
  : > "$removed" || die "cannot create scratch file"

  local spec column op value
  local count=0
  for spec in "${filters[@]}"; do
    count=$((count + 1))
    printf 'filter %d: %s\n' "$count" "$spec"

    # The last variable receives the rest of the spec, so a value may
    # itself contain commas.
    IFS=',' read -r column op value <<< "$spec"

    printf 'column: %s\n' "$column"
    printf 'op : %s\n' "$op"
    printf 'value: %s\n' "$value"

    [[ "$column" =~ ^[0-9]+$ ]] \
      || die "filter $count: column must be a number, got '$column'"
    case "$op" in
      equals | nequals | contains | ncontains | empty | nonempty) ;;
      lt | le | gt | ge) ;;
      *) die "filter $count: unknown operator '$op'" ;;
    esac

    apply_filter "$current" "$retained" "$removed" "$((count == 1))" \
      "$column" "$op" "$value" || die "filter $count: awk failed"

    # The next filter only runs on the lines retained so far.
    mv -- "$retained" "$current" || die "cannot update scratch file"
  done

  # Collapse adjacent duplicate lines in the output files.
  uniq -- "$removed" > "$outfile_rem" || die "cannot write $outfile_rem"
  uniq -- "$current" > "$outfile_ret" || die "cannot write $outfile_ret"
}

# Without arguments only the usage text is shown; the script then ends with
# the non-zero status returned by usage.
if (( $# == 0 )); then
  usage
else
  main "$@"
fi