filters.py @ 0:859064f07be4 draft (Mercurial repository: iuc/sqlite_to_tabular)
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/query_tabular commit 74915fc9cee746bbce1c4b507e13231259de177d
author   | iuc
date     | Tue, 18 Jul 2017 09:07:26 -0400
parents  |
children | c1b700bc0150
#!/usr/bin/env python

from __future__ import print_function

import re
import sys


class LineFilter(object):
    def __init__(self, source, filter_dict):
        self.source = source
        self.filter_dict = filter_dict
        # default: pass each line through with the trailing newline removed
        self.func = lambda i, l: l.rstrip('\r\n') if l else None
        self.src_lines = []
        self.src_line_cnt = 0
        if not filter_dict:
            return
        if filter_dict['filter'] == 'regex':
            rgx = re.compile(filter_dict['pattern'])
            if filter_dict['action'] == 'exclude_match':
                self.func = lambda i, l: l if not rgx.match(l) else None
            elif filter_dict['action'] == 'include_match':
                self.func = lambda i, l: l if rgx.match(l) else None
            elif filter_dict['action'] == 'exclude_find':
                self.func = lambda i, l: l if not rgx.search(l) else None
            elif filter_dict['action'] == 'include_find':
                self.func = lambda i, l: l if rgx.search(l) else None
        elif filter_dict['filter'] == 'select_columns':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            self.func = lambda i, l: self.select_columns(l, cols)
        elif filter_dict['filter'] == 'replace':
            p = filter_dict['pattern']
            r = filter_dict['replace']
            c = int(filter_dict['column']) - 1
            self.func = lambda i, l: '\t'.join(
                [x if j != c else re.sub(p, r, x) for j, x in enumerate(l.split('\t'))])
        elif filter_dict['filter'] == 'prepend_line_num':
            self.func = lambda i, l: '%d\t%s' % (i, l)
        elif filter_dict['filter'] == 'append_line_num':
            self.func = lambda i, l: '%s\t%d' % (l.rstrip('\r\n'), i)
        elif filter_dict['filter'] == 'prepend_text':
            s = filter_dict['column_text']
            self.func = lambda i, l: '%s\t%s' % (s, l)
        elif filter_dict['filter'] == 'append_text':
            s = filter_dict['column_text']
            self.func = lambda i, l: '%s\t%s' % (l.rstrip('\r\n'), s)
        elif filter_dict['filter'] == 'skip':
            cnt = filter_dict['count']
            self.func = lambda i, l: l if i > cnt else None
        elif filter_dict['filter'] == 'normalize':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            sep = filter_dict['separator']
            self.func = lambda i, l: self.normalize(l, cols, sep)

    def __iter__(self):
        return self

    def __next__(self):
        if not self.src_lines:
            self.get_lines()
        if self.src_lines:
            return self.src_lines.pop(0)
        raise StopIteration

    next = __next__

    def select_columns(self, line, cols):
        fields = line.split('\t')
        return '\t'.join([fields[x] for x in cols])

    def normalize(self, line, split_cols, sep):
        # split the selected columns on sep and emit one output line per
        # resulting value, repeating the unsplit columns on every line
        lines = []
        fields = line.rstrip('\r\n').split('\t')
        split_fields = dict()
        cnt = 0
        for c in split_cols:
            if c < len(fields):
                split_fields[c] = fields[c].split(sep)
                cnt = max(cnt, len(split_fields[c]))
        if cnt == 0:
            lines.append('\t'.join(fields))
        else:
            for n in range(0, cnt):
                flds = [x if c not in split_cols else split_fields[c][n]
                        if n < len(split_fields[c])
                        else '' for (c, x) in enumerate(fields)]
                lines.append('\t'.join(flds))
        return lines

    def get_lines(self):
        # pull lines from the source until the filter yields output
        for i, next_line in enumerate(self.source):
            self.src_line_cnt += 1
            line = self.func(self.src_line_cnt, next_line)
            if line:
                if isinstance(line, list):
                    self.src_lines.extend(line)
                else:
                    self.src_lines.append(line)
                return

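# --- Illustrative usage sketch; not part of the original filters.py ---
# LineFilter wraps any iterable of lines and applies one filter_dict per
# stage, so filters can be chained. The sample lines and expected output
# below are assumptions for demonstration only.
#
#     lines = iter(['# header\n', 'a;b\tx\n', 'c\ty\n'])
#     no_comments = LineFilter(lines, {'filter': 'regex', 'pattern': '#',
#                                      'action': 'exclude_match'})
#     split_first = LineFilter(no_comments, {'filter': 'normalize',
#                                            'columns': ['1'],
#                                            'separator': ';'})
#     print(list(split_first))  # expected: ['a\tx', 'b\tx', 'c\ty']
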
class TabularReader:
    """
    Tabular file iterator. Returns each line as a list of fields.
    """
    def __init__(self, input_file, skip=0, comment_char=None, col_idx=None,
                 filters=None):
        self.skip = skip
        self.comment_char = comment_char
        self.col_idx = col_idx
        self.filters = filters
        self.tsv_file = \
            input_file if hasattr(input_file, 'readline') else open(input_file)
        if skip and skip > 0:
            for i in range(skip):
                if not self.tsv_file.readline():
                    break
        source = LineFilter(self.tsv_file, None)
        if comment_char:
            source = LineFilter(source,
                                {"filter": "regex", "pattern": comment_char,
                                 "action": "exclude_match"})
        if filters:
            for f in filters:
                source = LineFilter(source, f)
        self.source = source

    def __iter__(self):
        return self

    def __next__(self):
        ''' Iteration '''
        for i, line in enumerate(self.source):
            fields = line.rstrip('\r\n').split('\t')
            if self.col_idx:
                fields = [fields[i] for i in self.col_idx]
            return fields
        raise StopIteration

    next = __next__

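# --- Illustrative usage sketch; not part of the original filters.py ---
# TabularReader layers LineFilters over a path or file-like object and
# yields each surviving line as a list of fields. The path and filter
# specification below are assumptions for demonstration only.
#
#     reader = TabularReader('input.tsv', skip=1, comment_char='#',
#                            filters=[{'filter': 'select_columns',
#                                      'columns': ['1', '3']}])
#     for fields in reader:
#         print(fields)  # one list per data line, e.g. ['chr1', '1042']
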
def filter_file(input_file, output, skip=0, comment_char='#', filters=None):
    data_lines = 0
    try:
        tr = TabularReader(input_file, skip=skip, comment_char=comment_char,
                           filters=filters)
        for linenum, fields in enumerate(tr):
            data_lines += 1
            try:
                output.write('%s\n' % '\t'.join(fields))
            except Exception as e:
                print('Failed at line: %d err: %s' % (linenum, e),
                      file=sys.stderr)
    except Exception as e:
        exit('Error: %s' % (e))
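
A minimal usage sketch, assuming the module is importable as `filters`; the filter specification and the stdin/stdout plumbing are illustrative, not part of this commit:

    import sys

    import filters

    # Skip one header row, drop '#' comment lines, and prepend a line number
    # to each surviving row; the filter dict uses keys handled by
    # LineFilter.__init__ above.
    filters.filter_file(sys.stdin, sys.stdout, skip=1, comment_char='#',
                        filters=[{'filter': 'prepend_line_num'}])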