20
|
1 #!/usr/bin/env python
|
|
2
|
|
3 from __future__ import print_function
|
|
4
|
|
5 import re
|
|
6 import sys
|
|
7
|
|
8
|
|
class LineFilter(object):
    """Iterator that applies a single filter to the lines of ``source``.

    ``source`` is any iterable of text lines (an open file, another
    ``LineFilter``, a list, ...).  ``filter_dict`` selects the filter through
    its ``'filter'`` key; a false value (``None`` or ``{}``) selects the
    default behaviour of stripping the trailing line terminator.

    Supported filters and the extra keys they read:

    * ``regex``: ``pattern``, ``action`` (``exclude_match``,
      ``include_match``, ``exclude_find`` or ``include_find``)
    * ``select_columns``: ``columns`` (1-based)
    * ``replace``: ``pattern``, ``replace``, ``column`` (1-based)
    * ``prepend_line_num`` / ``append_line_num``
    * ``prepend_text`` / ``append_text``: ``column_text``
    * ``skip``: ``count``
    * ``normalize``: ``columns`` (1-based), ``separator``

    An unrecognised filter name or regex action leaves the default
    behaviour in place.  Lines for which the filter returns a false value
    (``None`` or an empty string) are dropped.  Line numbers handed to the
    filter start at 1 and count the lines this instance has read.
    """

    def __init__(self, source, filter_dict):
        self.source = source
        self.filter_dict = filter_dict
        # Default filter: strip the line terminator, drop empty input.
        self.func = lambda i, l: l.rstrip('\r\n') if l else None
        # Filtered lines waiting to be handed out by next().
        self.src_lines = []
        # Number of lines read from the source so far.
        self.src_line_cnt = 0
        # Created lazily by get_lines(); keeping one iterator means a
        # re-iterable source such as a list is consumed exactly once.
        self._source_iter = None
        if not filter_dict:
            return
        if filter_dict['filter'] == 'regex':
            rgx = re.compile(filter_dict['pattern'])
            if filter_dict['action'] == 'exclude_match':
                self.func = lambda i, l: l if not rgx.match(l) else None
            elif filter_dict['action'] == 'include_match':
                self.func = lambda i, l: l if rgx.match(l) else None
            elif filter_dict['action'] == 'exclude_find':
                self.func = lambda i, l: l if not rgx.search(l) else None
            elif filter_dict['action'] == 'include_find':
                self.func = lambda i, l: l if rgx.search(l) else None
        elif filter_dict['filter'] == 'select_columns':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            self.func = lambda i, l: self.select_columns(l, cols)
        elif filter_dict['filter'] == 'replace':
            # Compile once instead of handing the raw pattern to re.sub()
            # for every field of every line.
            sub_rgx = re.compile(filter_dict['pattern'])
            r = filter_dict['replace']
            c = int(filter_dict['column']) - 1
            self.func = lambda i, l: '\t'.join(
                [sub_rgx.sub(r, x) if j == c else x
                 for j, x in enumerate(l.split('\t'))])
        elif filter_dict['filter'] == 'prepend_line_num':
            self.func = lambda i, l: '%d\t%s' % (i, l)
        elif filter_dict['filter'] == 'append_line_num':
            self.func = lambda i, l: '%s\t%d' % (l.rstrip('\r\n'), i)
        elif filter_dict['filter'] == 'prepend_text':
            s = filter_dict['column_text']
            self.func = lambda i, l: '%s\t%s' % (s, l)
        elif filter_dict['filter'] == 'append_text':
            s = filter_dict['column_text']
            self.func = lambda i, l: '%s\t%s' % (l.rstrip('\r\n'), s)
        elif filter_dict['filter'] == 'skip':
            # Coerce so a count supplied as a string compares numerically
            # (Python 3 refuses to order int against str).
            cnt = int(filter_dict['count'])
            self.func = lambda i, l: l if i > cnt else None
        elif filter_dict['filter'] == 'normalize':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            sep = filter_dict['separator']
            self.func = lambda i, l: self.normalize(l, cols, sep)

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 entry point.  Must not call next(self): on Python 3 that
        # dispatches straight back to __next__ and recurses forever.
        return self.next()

    def next(self):
        """Return the next filtered line or raise ``StopIteration``."""
        if not self.src_lines:
            self.get_lines()
        if self.src_lines:
            return self.src_lines.pop(0)
        raise StopIteration

    def select_columns(self, line, cols):
        """Return the tab-separated fields of ``line`` at 0-based ``cols``.

        Raises ``IndexError`` when ``line`` has too few fields.
        """
        fields = line.split('\t')
        return '\t'.join([fields[x] for x in cols])

    def normalize(self, line, split_cols, sep):
        """Expand multi-valued fields of ``line`` into one line per value.

        Each field whose 0-based index is in ``split_cols`` is split on
        ``sep``.  The result is a list with as many lines as the longest
        split; shorter splits are padded with empty strings and all other
        fields are repeated unchanged.  Indexes beyond the end of the line
        are ignored; if none applies the line is returned as a one-element
        list.
        """
        lines = []
        fields = line.rstrip('\r\n').split('\t')
        split_fields = dict()
        cnt = 0
        for c in split_cols:
            if c < len(fields):
                split_fields[c] = fields[c].split(sep)
                cnt = max(cnt, len(split_fields[c]))
        if cnt == 0:
            lines.append('\t'.join(fields))
        else:
            for n in range(cnt):
                flds = []
                for c, x in enumerate(fields):
                    if c not in split_fields:
                        flds.append(x)
                    elif n < len(split_fields[c]):
                        flds.append(split_fields[c][n])
                    else:
                        flds.append('')
                lines.append('\t'.join(flds))
        return lines

    def get_lines(self):
        """Read from the source until one line survives the filter.

        The surviving output (a string, or a list of strings from
        ``normalize``) is queued on ``self.src_lines``.  Nothing is queued
        when the source runs out first.
        """
        if self._source_iter is None:
            self._source_iter = iter(self.source)
        for next_line in self._source_iter:
            self.src_line_cnt += 1
            line = self.func(self.src_line_cnt, next_line)
            if line:
                if isinstance(line, list):
                    self.src_lines.extend(line)
                else:
                    self.src_lines.append(line)
                return
101
|
|
102
|
|
class TabularReader(object):
    """
    Tabular file iterator. Returns a list

    Each iteration yields the tab-separated fields of the next line that
    survives the filter chain, as a list of strings.

    input_file   -- an open file-like object (anything with ``readline``)
                    or a path that is opened for reading.
    skip         -- number of leading lines to discard before filtering.
    comment_char -- if set, used as a regular expression; lines matching
                    it at their start are dropped.
    col_idx      -- optional 0-based indexes selecting (and ordering) the
                    fields returned for each line.
    filters      -- optional list of filter dicts, each wrapped in a
                    ``LineFilter`` and applied in order.
    """
    def __init__(self, input_file, skip=0, comment_char=None, col_idx=None,
                 filters=None):
        self.skip = skip
        self.comment_char = comment_char
        self.col_idx = col_idx
        self.filters = filters
        # Duck-type the input: the Python 2 only ``file`` builtin does not
        # exist on Python 3, so test for ``readline`` instead.
        self._owns_file = not hasattr(input_file, 'readline')
        self.tsv_file = open(input_file) if self._owns_file else input_file
        if skip and skip > 0:
            for _ in range(skip):
                # Stop early if the file has fewer lines than ``skip``.
                if not self.tsv_file.readline():
                    break
        # The innermost filter only strips line terminators.
        source = LineFilter(self.tsv_file, None)
        if comment_char:
            source = LineFilter(source,
                                {"filter": "regex", "pattern": comment_char,
                                 "action": "exclude_match"})
        if filters:
            for f in filters:
                source = LineFilter(source, f)
        self.source = source

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 entry point.  Must not call next(self): on Python 3 that
        # dispatches straight back to __next__ and recurses forever.
        return self.next()

    def next(self):
        ''' Iteration '''
        # Propagates StopIteration once the filter chain is exhausted.
        line = next(self.source)
        fields = line.rstrip('\r\n').split('\t')
        if self.col_idx:
            fields = [fields[idx] for idx in self.col_idx]
        return fields

    def close(self):
        ''' Close the underlying file if this reader opened it '''
        if self._owns_file:
            self.tsv_file.close()
143
|
|
144
|
|
def filter_file(input_file, output, skip=0, comment_char='#', filters=None):
    """Write the filtered lines of ``input_file`` to ``output``.

    ``input_file``, ``skip``, ``comment_char`` and ``filters`` are passed
    to ``TabularReader``.  Every row it yields is written to ``output``
    as a tab-separated line terminated by a newline.

    A failure while writing a single row is reported on stderr and
    processing continues.  Any other failure is reported on stderr and the
    process exits with status 1 (``SystemExit``).
    """
    tr = None
    try:
        tr = TabularReader(input_file, skip=skip, comment_char=comment_char,
                           filters=filters)
        for linenum, fields in enumerate(tr):
            try:
                output.write('%s\n' % '\t'.join(fields))
            except Exception as e:
                print('Failed at line: %d err: %s' % (linenum, e),
                      file=sys.stderr)
    except Exception as e:
        print('Failed: %s' % (e), file=sys.stderr)
        # sys.exit rather than the ``exit`` helper, which is installed by
        # the ``site`` module for interactive use and may be absent.
        sys.exit(1)
    finally:
        # Close the input only when the reader opened it from a path; a
        # file object supplied by the caller stays open.
        if tr is not None and tr.tsv_file is not input_file:
            tr.tsv_file.close()