annotate filters.py @ 20:ab27c4bd14b9 draft

Uploaded
author jjohnson
date Fri, 14 Jul 2017 11:39:27 -0400
parents
children bed5018e7ae3
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
20
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
1 #!/usr/bin/env python
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
2
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
3 from __future__ import print_function
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
4
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
5 import re
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
6 import sys
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
7
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
8
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
class LineFilter(object):
    """Iterator that applies a single line-level filter to a source of lines.

    ``source`` is any iterable of text lines (an open file, another
    ``LineFilter``, a list, ...).  ``filter_dict`` selects the filter through
    its ``'filter'`` key; the remaining keys depend on the filter:

    * ``regex``: ``pattern`` and ``action`` (``exclude_match``,
      ``include_match``, ``exclude_find`` or ``include_find``)
    * ``select_columns``: ``columns`` (1-based column numbers)
    * ``replace``: ``pattern``, ``replace`` and ``column`` (1-based)
    * ``prepend_line_num`` / ``append_line_num``: no extra keys
    * ``prepend_text`` / ``append_text``: ``column_text``
    * ``skip``: ``count`` (number of leading source lines to drop)
    * ``normalize``: ``columns`` (1-based) and ``separator``

    With an empty or ``None`` ``filter_dict`` (or an unrecognised filter or
    regex action) the default filter is used, which only strips the trailing
    line ending.  A filter result that is falsy (``None`` or an empty string)
    is dropped; a list result contributes one output line per element.
    """

    def __init__(self, source, filter_dict):
        self.source = source
        # Created lazily in get_lines() so construction never touches source.
        self._source_iter = None
        self.filter_dict = filter_dict
        # Default filter: strip the line ending only.
        self.func = lambda i, l: l.rstrip('\r\n') if l else None
        self.src_lines = []
        self.src_line_cnt = 0
        if not filter_dict:
            return
        kind = filter_dict['filter']
        if kind == 'regex':
            rgx = re.compile(filter_dict['pattern'])
            action = filter_dict['action']
            if action == 'exclude_match':
                self.func = lambda i, l: l if not rgx.match(l) else None
            elif action == 'include_match':
                self.func = lambda i, l: l if rgx.match(l) else None
            elif action == 'exclude_find':
                self.func = lambda i, l: l if not rgx.search(l) else None
            elif action == 'include_find':
                self.func = lambda i, l: l if rgx.search(l) else None
        elif kind == 'select_columns':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            self.func = lambda i, l: self.select_columns(l, cols)
        elif kind == 'replace':
            # Compile once instead of handing the pattern to re.sub per line.
            sub_rgx = re.compile(filter_dict['pattern'])
            r = filter_dict['replace']
            c = int(filter_dict['column']) - 1
            self.func = lambda i, l: '\t'.join(
                [x if j != c else sub_rgx.sub(r, x)
                 for j, x in enumerate(l.split('\t'))])
        elif kind == 'prepend_line_num':
            self.func = lambda i, l: '%d\t%s' % (i, l)
        elif kind == 'append_line_num':
            self.func = lambda i, l: '%s\t%d' % (l.rstrip('\r\n'), i)
        elif kind == 'prepend_text':
            s = filter_dict['column_text']
            self.func = lambda i, l: '%s\t%s' % (s, l)
        elif kind == 'append_text':
            s = filter_dict['column_text']
            self.func = lambda i, l: '%s\t%s' % (l.rstrip('\r\n'), s)
        elif kind == 'skip':
            # The count may arrive as a string; an int/str comparison raises
            # TypeError on Python 3 and is always False on Python 2.
            cnt = int(filter_dict['count'])
            self.func = lambda i, l: l if i > cnt else None
        elif kind == 'normalize':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            sep = filter_dict['separator']
            self.func = lambda i, l: self.normalize(l, cols, sep)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next filtered line; raise StopIteration when done."""
        if not self.src_lines:
            self.get_lines()
        if self.src_lines:
            return self.src_lines.pop(0)
        raise StopIteration

    # Python 2 iterator protocol.  Aliasing (rather than having __next__ call
    # the builtin next(self)) avoids infinite recursion on Python 3.
    next = __next__

    def select_columns(self, line, cols):
        """Return the 0-based columns ``cols`` of ``line``, tab-joined.

        The line ending is stripped first so that moving the last column to
        another position cannot embed a newline inside the record.
        Raises IndexError when a requested column is not present.
        """
        fields = line.rstrip('\r\n').split('\t')
        return '\t'.join([fields[x] for x in cols])

    def normalize(self, line, split_cols, sep):
        """Expand multi-valued columns of ``line`` into one line per value.

        Each 0-based column in ``split_cols`` that exists in the line is
        split on ``sep``.  The result is a list with as many lines as the
        longest split; a split column that has run out of values is emitted
        as an empty string, and all other columns are repeated unchanged.
        When none of ``split_cols`` exists in the line, the result is a
        one-element list holding the line without its line ending.
        """
        fields = line.rstrip('\r\n').split('\t')
        split_fields = {}
        for c in split_cols:
            if c < len(fields):
                split_fields[c] = fields[c].split(sep)
        if not split_fields:
            return ['\t'.join(fields)]
        cnt = max(len(parts) for parts in split_fields.values())
        lines = []
        for n in range(cnt):
            flds = []
            for c, x in enumerate(fields):
                parts = split_fields.get(c)
                if parts is None:
                    flds.append(x)
                else:
                    flds.append(parts[n] if n < len(parts) else '')
            lines.append('\t'.join(flds))
        return lines

    def get_lines(self):
        """Read source lines until one yields output, buffering that output.

        Every source line read increments ``src_line_cnt`` (1-based), which
        is the line number handed to the filter function.  Returns with
        ``src_lines`` still empty once the source is exhausted.
        """
        if self._source_iter is None:
            # Keep one shared iterator so a re-iterable source (e.g. a list)
            # is consumed progressively instead of restarting on every call.
            self._source_iter = iter(self.source)
        for next_line in self._source_iter:
            self.src_line_cnt += 1
            line = self.func(self.src_line_cnt, next_line)
            if line:
                if isinstance(line, list):
                    self.src_lines.extend(line)
                else:
                    self.src_lines.append(line)
                return
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
101
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
102
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
class TabularReader(object):
    """
    Tabular file iterator. Returns a list of column values for each line.

    ``input_file`` is either an already-open file-like object (anything with
    a ``readline`` method) or a path that is opened for reading.  The first
    ``skip`` lines are discarded, then lines matching ``comment_char`` at
    their start are excluded, then each filter dict in ``filters`` is applied
    in order through a chain of ``LineFilter`` objects.  When ``col_idx`` is
    given, only those 0-based columns are returned, in that order.

    NOTE: ``comment_char`` is handed to the regex filter as a pattern, so
    regex metacharacters in it are interpreted, not matched literally.
    """
    def __init__(self, input_file, skip=0, comment_char=None, col_idx=None,
                 filters=None):
        self.skip = skip
        self.comment_char = comment_char
        self.col_idx = col_idx
        self.filters = filters
        # Duck-type the check: the ``file`` builtin exists only on Python 2,
        # so isinstance(input_file, file) raises NameError on Python 3.
        self.tsv_file = \
            input_file if hasattr(input_file, 'readline') else open(input_file)
        if skip and skip > 0:
            for _ in range(skip):
                # Stop early when the file has fewer lines than ``skip``.
                if not self.tsv_file.readline():
                    break
        # The innermost filter only strips line endings.
        source = LineFilter(self.tsv_file, None)
        if comment_char:
            source = LineFilter(source,
                                {"filter": "regex", "pattern": comment_char,
                                 "action": "exclude_match"})
        if filters:
            for f in filters:
                source = LineFilter(source, f)
        self.source = source

    def __iter__(self):
        return self

    def __next__(self):
        ''' Iteration: return the fields of the next line that passes. '''
        # The builtin next() propagates StopIteration once the chain is done.
        line = next(self.source)
        fields = line.rstrip('\r\n').split('\t')
        if self.col_idx:
            fields = [fields[idx] for idx in self.col_idx]
        return fields

    # Python 2 iterator protocol.  Aliasing (rather than having __next__ call
    # the builtin next(self)) avoids infinite recursion on Python 3.
    next = __next__
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
143
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
144
ab27c4bd14b9 Uploaded
jjohnson
parents:
diff changeset
def filter_file(input_file, output, skip=0, comment_char='#', filters=None):
    """Copy ``input_file`` to ``output`` as tab-separated lines after filtering.

    ``input_file`` is a path or an open file-like object and ``output`` is
    any object with a ``write`` method.  ``skip``, ``comment_char`` and
    ``filters`` are passed through to ``TabularReader``.

    A failure while writing one line is reported on stderr and processing
    continues with the next line.  Any other failure is reported on stderr
    and the process exits with status 1.
    """
    tr = None
    try:
        tr = TabularReader(input_file, skip=skip, comment_char=comment_char,
                           filters=filters)
        for linenum, fields in enumerate(tr):
            try:
                output.write('%s\n' % '\t'.join(fields))
            except Exception as e:
                print('Failed at line: %d err: %s' % (linenum, e),
                      file=sys.stderr)
    except Exception as e:
        print('Failed: %s' % (e), file=sys.stderr)
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is missing under ``python -S`` or when frozen.
        sys.exit(1)
    finally:
        # Close the file only when the reader opened it itself; a handle
        # supplied by the caller remains the caller's responsibility.
        if tr is not None and tr.tsv_file is not input_file:
            tr.tsv_file.close()