comparison filters.py @ 0:859064f07be4 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/query_tabular commit 74915fc9cee746bbce1c4b507e13231259de177d
author iuc
date Tue, 18 Jul 2017 09:07:26 -0400
parents
children c1b700bc0150
comparison
equal deleted inserted replaced
-1:000000000000 0:859064f07be4
1 #!/usr/bin/env python
2
3 from __future__ import print_function
4
5 import re
6 import sys
7
8
class LineFilter(object):
    """
    Iterator that applies one line-level filter to the lines of ``source``.

    ``source`` is any iterable of text lines (for example a file object or
    another LineFilter, which is how filters are chained).  ``filter_dict``
    selects the operation through its 'filter' key; when it is empty or
    None each line is only stripped of its trailing CR/LF.

    Recognised 'filter' values and the additional keys they read:
      regex            - 'pattern', 'action' (exclude_match, include_match,
                         exclude_find, include_find)
      select_columns   - 'columns' (1-based column numbers)
      replace          - 'pattern', 'replace', 'column' (1-based)
      prepend_line_num - (none)
      append_line_num  - (none)
      prepend_text     - 'column_text'
      append_text      - 'column_text'
      skip             - 'count'
      normalize        - 'columns' (1-based), 'separator'

    An unrecognised 'filter' value or regex 'action' leaves the default
    strip-only behaviour in place.  Line numbers handed to a filter are
    1-based counts of the lines this instance has read from ``source``.
    A filter result that is falsy (None or an empty string) drops the line.
    """

    def __init__(self, source, filter_dict):
        self.source = source
        self.filter_dict = filter_dict
        # Default behaviour: pass the line through minus its line terminator.
        self.func = lambda i, ln: ln.rstrip('\r\n') if ln else None
        # Output lines produced but not yet handed to the consumer.
        self.src_lines = []
        self.src_line_cnt = 0
        if not filter_dict:
            return
        if filter_dict['filter'] == 'regex':
            rgx = re.compile(filter_dict['pattern'])
            if filter_dict['action'] == 'exclude_match':
                self.func = lambda i, ln: ln if not rgx.match(ln) else None
            elif filter_dict['action'] == 'include_match':
                self.func = lambda i, ln: ln if rgx.match(ln) else None
            elif filter_dict['action'] == 'exclude_find':
                self.func = lambda i, ln: ln if not rgx.search(ln) else None
            elif filter_dict['action'] == 'include_find':
                self.func = lambda i, ln: ln if rgx.search(ln) else None
        elif filter_dict['filter'] == 'select_columns':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            self.func = lambda i, ln: self.select_columns(ln, cols)
        elif filter_dict['filter'] == 'replace':
            # Compile once here instead of on every line.
            rgx = re.compile(filter_dict['pattern'])
            repl = filter_dict['replace']
            col = int(filter_dict['column']) - 1
            self.func = lambda i, ln: '\t'.join(
                [x if j != col else rgx.sub(repl, x)
                 for j, x in enumerate(ln.split('\t'))])
        elif filter_dict['filter'] == 'prepend_line_num':
            self.func = lambda i, ln: '%d\t%s' % (i, ln)
        elif filter_dict['filter'] == 'append_line_num':
            self.func = lambda i, ln: '%s\t%d' % (ln.rstrip('\r\n'), i)
        elif filter_dict['filter'] == 'prepend_text':
            s = filter_dict['column_text']
            self.func = lambda i, ln: '%s\t%s' % (s, ln)
        elif filter_dict['filter'] == 'append_text':
            s = filter_dict['column_text']
            self.func = lambda i, ln: '%s\t%s' % (ln.rstrip('\r\n'), s)
        elif filter_dict['filter'] == 'skip':
            # Coerce like the other numeric options so a count supplied as
            # a string does not break the comparison on Python 3.
            cnt = int(filter_dict['count'])
            self.func = lambda i, ln: ln if i > cnt else None
        elif filter_dict['filter'] == 'normalize':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            sep = filter_dict['separator']
            self.func = lambda i, ln: self.normalize(ln, cols, sep)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next output line, reading from source when needed."""
        if not self.src_lines:
            self.get_lines()
        if self.src_lines:
            return self.src_lines.pop(0)
        raise StopIteration

    # Python 2 iterator protocol.
    next = __next__

    def select_columns(self, line, cols):
        """
        Return the tab-joined fields of ``line`` at 0-based indexes ``cols``.

        The line terminator is stripped before splitting so that picking
        the last column in a non-final position cannot embed a newline in
        the middle of the result.  Raises IndexError if an index is beyond
        the number of fields.
        """
        fields = line.rstrip('\r\n').split('\t')
        return '\t'.join([fields[x] for x in cols])

    def normalize(self, line, split_cols, sep):
        """
        Expand multi-valued fields of ``line`` into one line per value.

        Each field whose 0-based index is in ``split_cols`` is split on
        ``sep``.  The result is a list with as many lines as the longest
        split; other fields are repeated on every line and shorter splits
        are padded with empty strings.
        """
        lines = []
        fields = line.rstrip('\r\n').split('\t')
        split_fields = dict()
        cnt = 0
        for c in split_cols:
            if c < len(fields):
                split_fields[c] = fields[c].split(sep)
                cnt = max(cnt, len(split_fields[c]))
        if cnt == 0:
            # None of the requested columns exists on this line.
            lines.append('\t'.join(fields))
        else:
            for n in range(0, cnt):
                flds = []
                for c, x in enumerate(fields):
                    if c not in split_cols:
                        flds.append(x)
                    elif n < len(split_fields[c]):
                        flds.append(split_fields[c][n])
                    else:
                        flds.append('')
                lines.append('\t'.join(flds))
        return lines

    def get_lines(self):
        """
        Read from source until one input line yields output.

        The resulting line (or lines, when the filter returns a list) is
        queued on ``self.src_lines``.  Input lines whose filter result is
        falsy are consumed and dropped.
        """
        for next_line in self.source:
            self.src_line_cnt += 1
            line = self.func(self.src_line_cnt, next_line)
            if line:
                if isinstance(line, list):
                    self.src_lines.extend(line)
                else:
                    self.src_lines.append(line)
                return
100
101
class TabularReader(object):
    """
    Tabular file iterator. Returns a list

    Each iteration yields one row as a list of tab-separated field strings.

    input_file   - a path to open, or an object with a ``readline`` method
                   that is used as-is.
    skip         - number of leading lines to discard before filtering.
    comment_char - when set, used as a regular expression; lines it matches
                   at their start are excluded.
    col_idx      - optional list of 0-based field indexes to return, in
                   that order.
    filters      - optional list of filter dicts, applied in order, each
                   wrapped in a LineFilter.

    A file opened here from a path is closed once iteration is exhausted
    or when close() is called; a caller-supplied file object is never
    closed by this class.
    """
    def __init__(self, input_file, skip=0, comment_char=None, col_idx=None,
                 filters=None):
        self.skip = skip
        self.comment_char = comment_char
        self.col_idx = col_idx
        self.filters = filters
        # Remember whether the handle is ours so that only a file opened
        # here is closed by close().
        self._owns_file = not hasattr(input_file, 'readline')
        self.tsv_file = open(input_file) if self._owns_file else input_file
        if skip and skip > 0:
            for _ in range(skip):
                if not self.tsv_file.readline():
                    break
        # The base filter strips line terminators and drops empty lines.
        source = LineFilter(self.tsv_file, None)
        if comment_char:
            source = LineFilter(source,
                                {"filter": "regex", "pattern": comment_char,
                                 "action": "exclude_match"})
        if filters:
            for f in filters:
                source = LineFilter(source, f)
        self.source = source

    def __iter__(self):
        return self

    def __next__(self):
        """
        Return the next row as a list of fields.

        Raises StopIteration when the source is exhausted, and IndexError
        if ``col_idx`` names a field the row does not have.
        """
        try:
            line = next(self.source)
        except StopIteration:
            self.close()
            raise
        fields = line.rstrip('\r\n').split('\t')
        if self.col_idx:
            fields = [fields[idx] for idx in self.col_idx]
        return fields

    # Python 2 iterator protocol.
    next = __next__

    def close(self):
        """
        End iteration and close the underlying file if it was opened here.

        Safe to call more than once.
        """
        # Swap in an empty iterator so later next() calls raise
        # StopIteration instead of reading from a closed file.
        self.source = iter(())
        if self._owns_file:
            self.tsv_file.close()
141
142
def filter_file(input_file, output, skip=0, comment_char='#', filters=None):
    """
    Filter a tabular file and write the resulting rows to ``output``.

    input_file   - path or file object handed to TabularReader.
    output       - object with a ``write`` method; each row is written as
                   tab-joined fields followed by a newline.
    skip         - number of leading input lines to discard.
    comment_char - pattern for lines to exclude (passed to TabularReader).
    filters      - optional list of filter dicts (passed to TabularReader).

    A failure while writing a single row is reported on stderr with the
    row's 0-based index and processing continues.  Any other exception
    terminates the program through SystemExit with an 'Error: ...' message.
    """
    try:
        tr = TabularReader(input_file, skip=skip, comment_char=comment_char,
                           filters=filters)
        for linenum, fields in enumerate(tr):
            try:
                output.write('%s\n' % '\t'.join(fields))
            except Exception as e:
                print('Failed at line: %d err: %s' % (linenum, e),
                      file=sys.stderr)
    except Exception as e:
        # sys.exit rather than the bare exit() builtin: the latter is
        # injected by the site module and is not guaranteed to exist.
        sys.exit('Error: %s' % (e))