#!/bin/bash
#
# Split a tab-separated table into "removed" and "retained" rows.
#
# Usage:
#   <script> --infile FILE --outfile_rem FILE --outfile_ret FILE \
#            [--filter COLUMN,OP[,VALUE]]...
#
# Filters run in the order given. Rows matching a filter are written to the
# --outfile_rem file; each later filter only sees the rows every earlier
# filter retained. Whatever is left after the last filter is written to the
# --outfile_ret file.
#
# Header-like lines (the first line, empty lines, and lines starting with
# "#", "<" or ">") are never tested against a filter and are copied to both
# output files.
#
# OP is one of:
#   equals nequals contains ncontains empty nonempty   (string tests)
#   lt le gt ge                                        (numeric tests)
#
# Runs of identical adjacent lines in either output are collapsed by uniq.
#
# Errors are handled with explicit status checks rather than `set -e`, so
# main always returns a status to its caller instead of terminating the shell.

# Print the command-line synopsis to stderr.
usage() {
  printf 'Usage: %s --infile FILE --outfile_rem FILE --outfile_ret FILE [--filter COLUMN,OP[,VALUE]]...\n' \
    "${0##*/}" >&2
}

# Print an error message, prefixed with the script name, to stderr.
err() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
}

#######################################
# Run a single filter pass over a table.
# Arguments:
#   $1 - input table
#   $2 - column number to test
#   $3 - operator name
#   $4 - value to compare against (may be empty)
#   $5 - file that matching rows are appended to
#   $6 - file that receives the non-matching rows (overwritten)
#   $7 - 1 to copy header-like lines to the removed file as well, else 0
# Returns:
#   exit status of awk
#######################################
apply_filter() {
  local input=$1 column=$2 op=$3 value=$4 removed=$5 retained=$6 first_pass=$7

  # The value and the output path travel through the environment, not through
  # the program text or -v, so quotes and backslashes in them stay literal.
  FILTER_VALUE=$value FILTER_REMOVED=$removed \
    awk -v col="$column" -v op="$op" -v first_pass="$first_pass" '
    BEGIN {
      FS = "\t"
      OFS = "\t"
      col = col + 0
      first_pass = first_pass + 0
      # Appending "" forces string type, so equals/nequals stay string
      # comparisons even when both sides look numeric ("1" vs "1.0").
      value = ENVIRON["FILTER_VALUE"] ""
      removed = ENVIRON["FILTER_REMOVED"]
    }

    # Header-like lines bypass the filter entirely.
    FNR == 1 || NF == 0 || $0 ~ /^[#<>]/ {
      print $0
      if (first_pass) {
        print $0 >> removed
      }
      next
    }

    {
      field = $col
      if (op == "equals") {
        hit = (field == value)
      } else if (op == "nequals") {
        hit = (field != value)
      } else if (op == "contains") {
        hit = (index(field, value) != 0)
      } else if (op == "ncontains") {
        hit = (index(field, value) == 0)
      } else if (op == "empty") {
        hit = (field == "")
      } else if (op == "nonempty") {
        hit = (field != "")
      } else if (op == "lt") {
        hit = ((field + 0) < (value + 0))
      } else if (op == "le") {
        hit = ((field + 0) <= (value + 0))
      } else if (op == "gt") {
        hit = ((field + 0) > (value + 0))
      } else if (op == "ge") {
        hit = ((field + 0) >= (value + 0))
      } else {
        hit = 0
      }

      if (hit) {
        print $0 >> removed
      } else {
        print $0
      }
    }
  ' "$input" > "$retained"
}

#######################################
# Apply every filter in turn and write the two result files.
# The body is a subshell so the EXIT trap that deletes the scratch directory
# is scoped to this function and also fires on error paths.
# Arguments:
#   $1 - input table
#   $2 - output file for removed rows
#   $3 - output file for retained rows
#   $4... - filter specifications, COLUMN,OP[,VALUE]
# Outputs:
#   progress information on stdout, errors on stderr
# Returns:
#   0 on success, 1 on failure (output files are only written on success)
#######################################
split_table() (
  infile=$1
  outfile_rem=$2
  outfile_ret=$3
  shift 3

  workdir=$(mktemp -d) || { err "cannot create temporary directory"; exit 1; }
  trap 'rm -rf -- "$workdir"' EXIT

  current=$workdir/input
  retained=$workdir/retained
  removed=$workdir/removed

  cp -- "$infile" "$current" || exit 1
  : > "$removed" || exit 1

  # Diagnostic preview: the first ten lines after the header line.
  sed -n '2,11p' < "$infile"

  pass=0
  for spec in "$@"; do
    pass=$((pass + 1))
    printf 'filter %s: %s\n' "$pass" "$spec"

    # Only the first two commas split; the value keeps any further commas.
    IFS=',' read -r column op value <<< "$spec"

    printf 'column: %s\n' "$column"
    printf 'op : %s\n' "$op"
    printf 'value: %s\n' "$value"

    if ! [[ "$column" =~ ^[0-9]+$ ]]; then
      err "filter $pass: column must be a non-negative integer: $spec"
      exit 1
    fi
    case "$op" in
      equals | nequals | contains | ncontains | empty | nonempty) ;;
      lt | le | gt | ge) ;;
      *)
        err "filter $pass: unknown operator '$op': $spec"
        exit 1
        ;;
    esac

    # Header-like lines reach the removed file on the first pass only;
    # otherwise they would be repeated there once per filter.
    first_pass=0
    if ((pass == 1)); then
      first_pass=1
    fi

    if ! apply_filter "$current" "$column" "$op" "$value" \
      "$removed" "$retained" "$first_pass"; then
      err "filter $pass failed: $spec"
      exit 1
    fi

    # The next filter only sees the rows this one retained.
    mv -- "$retained" "$current" || exit 1
  done

  uniq < "$removed" > "$outfile_rem" || exit 1
  uniq < "$current" > "$outfile_ret" || exit 1
)

#######################################
# Parse the command line and run the filters.
# Arguments:
#   the script's command-line arguments
# Returns:
#   0 on success, 1 on a processing error, 2 on a usage error
#######################################
main() {
  local infile='' outfile_rem='' outfile_ret='' parsed
  local -a filters=()

  if (($# == 0)); then
    usage
    return 2
  fi

  # Echo the invocation, as a diagnostic.
  printf '%s\n' "$*"

  # Quoted getopt output plus `eval set --` keeps whitespace inside option
  # values intact. -a additionally accepts long options with a single dash.
  # -h takes (and ignores) an argument, as it always has.
  if ! parsed=$(getopt -n "$0" -a -o 'h:' \
    --longoptions 'infile:,outfile_rem:,outfile_ret:,filter:' -- "$@"); then
    usage
    return 2
  fi
  eval set -- "$parsed"

  while (($# > 0)); do
    case "$1" in
      --infile)
        infile=$2
        shift 2
        ;;
      --outfile_rem)
        outfile_rem=$2
        shift 2
        ;;
      --outfile_ret)
        outfile_ret=$2
        shift 2
        ;;
      --filter)
        filters+=("$2")
        shift 2
        ;;
      -h)
        shift 2
        ;;
      --)
        shift
        break
        ;;
      *)
        usage
        return 2
        ;;
    esac
  done

  if [[ -z "$infile" || -z "$outfile_rem" || -z "$outfile_ret" ]]; then
    err "--infile, --outfile_rem and --outfile_ret are all required"
    usage
    return 2
  fi
  if [[ ! -r "$infile" ]]; then
    err "cannot read input file: $infile"
    return 1
  fi

  split_table "$infile" "$outfile_rem" "$outfile_ret" "${filters[@]}"
}

main "$@"
39 for f in ${filter[@]}
|
|
40 do
|
|
41 echo "filter $count: $f"
|
|
42 count=$[$count+1]
|
|
43
|
|
44 IFS=',' read -a filt <<< "$f"
|
|
45
|
|
46 #filt=split(${f//,/ })
|
|
47 column=${filt[0]}
|
|
48 op=${filt[1]}
|
|
49 value=${filt[2]}
|
|
50
|
|
51 echo "column: $column"
|
|
52 echo "op : $op"
|
|
53 echo "value: $value"
|
|
54
|
|
55 #perform filtering
|
|
56
|
|
57 awk 'BEGIN{
|
|
58 FS="\t"
|
|
59 OFS="\t"
|
|
60 op="'"$op"'"
|
|
61 numeric_value="'"$value"'"+0
|
|
62
|
|
63 }{
|
|
64 # keep header in both output files
|
|
65 if(FNR==1 || index($0,"#")==1 || index($0,"<")==1 || index($0,">")==1 || NF==0){
|
|
66 print $0 >> "outfile_removed"
|
|
67 print $0 >> "outfile_retained"
|
|
68 }
|
|
69
|
|
70 if ( "'"$op"'"== "equals" && $"'"$column"'" == "'"$value"'"){
|
|
71 print $0 >> "outfile_removed"
|
|
72 }
|
|
73 else if ( "'"$op"'"== "nequals" && $"'"$column"'" != "'"$value"'"){
|
|
74 print $0 >> "outfile_removed"
|
|
75 }
|
|
76 else if ( "'"$op"'"== "contains" && index($"'"$column"'", "'"$value"'") != 0){
|
|
77 print $0 >> "outfile_removed"
|
|
78 }
|
|
79 else if ( "'"$op"'"== "ncontains" && index($"'"$column"'", "'"$value"'") == 0){
|
|
80 print $0 >> "outfile_removed"
|
|
81 }
|
|
82 else if ( "'"$op"'"== "empty" && $"'"$column"'" == ""){
|
|
83 print $0 >> "outfile_removed"
|
|
84 }
|
|
85 else if ( "'"$op"'"== "nonempty" && $"'"$column"'" != ""){
|
|
86 print $0 >> "outfile_removed"
|
|
87 }
|
|
88 else if ( "'"$op"'"== "lt" && $"'"$column"'"+0 < "'"$value"'"+0){
|
|
89 print $0 >> "outfile_removed"
|
|
90 }
|
|
91 else if ( "'"$op"'"== "le" && $"'"$column"'"+0 <= "'"$value"'"+0){
|
|
92 print $0 >> "outfile_removed"
|
|
93 }
|
|
94 else if ( "'"$op"'"== "gt" && $"'"$column"'"+0 > "'"$value"'"+0){
|
|
95 print $0 >> "outfile_removed"
|
|
96 }
|
|
97 else if ( "'"$op"'"== "ge" && $"'"$column"'"+0 >= "'"$value"'"+0){
|
|
98 print $0 >> "outfile_removed"
|
|
99 }
|
|
100 else
|
|
101 print $0 >> "outfile_retained"
|
|
102
|
|
103 }END{}' inputfile
|
|
104
|
|
105 #next iteration only run on retained lines
|
|
106 cp outfile_retained inputfile
|
|
107 rm outfile_retained
|
|
108 done
|
|
109
|
|
110 #remove duplicate lines in outputfiles
|
|
111 cat outfile_removed | uniq > $outfile_rem
|
|
112 cat inputfile | uniq > $outfile_ret
|
|
113
|
|
114 #awk ' !x[$0]++' outfile_removed > $outfile_rem
|
|
115 #awk ' !x[$0]++' outfile_retained > $outfile_ret
|
|
116
|