Mercurial > repos > iuc > sqlite_to_tabular
comparison filters.py @ 0:859064f07be4 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/query_tabular commit 74915fc9cee746bbce1c4b507e13231259de177d
| author | iuc |
|---|---|
| date | Tue, 18 Jul 2017 09:07:26 -0400 |
| parents | |
| children | c1b700bc0150 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:859064f07be4 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 from __future__ import print_function | |
| 4 | |
| 5 import re | |
| 6 import sys | |
| 7 | |
| 8 | |
class LineFilter(object):
    """
    Iterator that applies one line-level filter to a source of text lines.

    ``source`` is any iterable of lines: an open file, another LineFilter
    (filters are chained by nesting), a list, ...
    ``filter_dict`` selects the transformation through its 'filter' key.
    A falsy ``filter_dict`` only strips the trailing line terminator.

    Recognized filters and the keys they read:
      regex            'pattern' and 'action', one of exclude_match,
                       include_match, exclude_find, include_find
      select_columns   'columns' (1-based column numbers)
      replace          'pattern', 'replace', 'column' (1-based)
      prepend_line_num / append_line_num
      prepend_text / append_text   'column_text'
      skip             'count' (number of leading lines to drop)
      normalize        'columns' (1-based), 'separator'

    An unrecognized filter name (or regex action) leaves the default
    terminator-stripping behaviour in place.  Any line for which the filter
    produces a falsy value (None or '') is dropped from the output.
    """

    def __init__(self, source, filter_dict):
        # Each get_lines() call resumes iteration over self.source, so it must
        # be a real iterator; a bare list would restart from its first element
        # every time and never terminate.  iter() is a no-op for files and
        # for other LineFilter instances.
        self.source = iter(source)
        self.filter_dict = filter_dict
        # self.func(line_number, line) returns a str, a list of str, or a
        # falsy value meaning "drop this line".
        self.func = lambda i, l: l.rstrip('\r\n') if l else None
        self.src_lines = []     # filtered lines waiting to be returned
        self.src_line_cnt = 0   # 1-based count of lines read from source
        if not filter_dict:
            return
        name = filter_dict['filter']
        if name == 'regex':
            rgx = re.compile(filter_dict['pattern'])
            action = filter_dict['action']
            if action == 'exclude_match':
                self.func = lambda i, l: l if not rgx.match(l) else None
            elif action == 'include_match':
                self.func = lambda i, l: l if rgx.match(l) else None
            elif action == 'exclude_find':
                self.func = lambda i, l: l if not rgx.search(l) else None
            elif action == 'include_find':
                self.func = lambda i, l: l if rgx.search(l) else None
        elif name == 'select_columns':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            self.func = lambda i, l: self.select_columns(l, cols)
        elif name == 'replace':
            # Compile once instead of handing the raw pattern to re.sub for
            # every field of every line.
            sub_rgx = re.compile(filter_dict['pattern'])
            r = filter_dict['replace']
            c = int(filter_dict['column']) - 1
            self.func = lambda i, l: '\t'.join(
                [x if j != c else sub_rgx.sub(r, x)
                 for j, x in enumerate(l.split('\t'))])
        elif name == 'prepend_line_num':
            self.func = lambda i, l: '%d\t%s' % (i, l)
        elif name == 'append_line_num':
            self.func = lambda i, l: '%s\t%d' % (l.rstrip('\r\n'), i)
        elif name == 'prepend_text':
            s = filter_dict['column_text']
            self.func = lambda i, l: '%s\t%s' % (s, l)
        elif name == 'append_text':
            s = filter_dict['column_text']
            self.func = lambda i, l: '%s\t%s' % (l.rstrip('\r\n'), s)
        elif name == 'skip':
            # Coerce like the column filters do: a count that arrives as a
            # string would otherwise make ``i > cnt`` raise on Python 3.
            cnt = int(filter_dict['count'])
            self.func = lambda i, l: l if i > cnt else None
        elif name == 'normalize':
            cols = [int(c) - 1 for c in filter_dict['columns']]
            sep = filter_dict['separator']
            self.func = lambda i, l: self.normalize(l, cols, sep)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next filtered line, refilling the buffer on demand."""
        if not self.src_lines:
            self.get_lines()
        if self.src_lines:
            return self.src_lines.pop(0)
        raise StopIteration

    # Python 2 iterator protocol.
    next = __next__

    def select_columns(self, line, cols):
        """Return the tab-joined fields of ``line`` at 0-based ``cols``."""
        fields = line.split('\t')
        return '\t'.join([fields[x] for x in cols])

    def normalize(self, line, split_cols, sep):
        """
        Expand ``line`` into one line per value of its multi-valued columns.

        Each column in ``split_cols`` (0-based) is split on ``sep``.  The
        result has as many lines as the longest split; columns with fewer
        values are padded with '' and all other columns are repeated.
        Returns a list of lines without line terminators.
        """
        lines = []
        fields = line.rstrip('\r\n').split('\t')
        split_fields = dict()
        cnt = 0
        for c in split_cols:
            if c < len(fields):
                split_fields[c] = fields[c].split(sep)
                cnt = max(cnt, len(split_fields[c]))
        if cnt == 0:
            # None of the requested columns exist in this line.
            lines.append('\t'.join(fields))
        else:
            for n in range(0, cnt):
                flds = []
                for c, x in enumerate(fields):
                    if c not in split_cols:
                        flds.append(x)
                    elif n < len(split_fields[c]):
                        flds.append(split_fields[c][n])
                    else:
                        flds.append('')
                lines.append('\t'.join(flds))
        return lines

    def get_lines(self):
        """
        Read from the source until one line survives the filter.

        The surviving result (a single line, or the list of lines produced
        by normalize) is appended to ``self.src_lines``.  Nothing is added
        once the source is exhausted.
        """
        for next_line in self.source:
            self.src_line_cnt += 1
            line = self.func(self.src_line_cnt, next_line)
            if line:
                if isinstance(line, list):
                    self.src_lines.extend(line)
                else:
                    self.src_lines.append(line)
                return
| 100 | |
| 101 | |
class TabularReader(object):
    """
    Tabular file iterator. Returns a list of fields for each line.

    input_file   - a path, or an object with a ``readline`` method that is
                   used as-is and never closed by this reader.
    skip         - number of leading lines to discard before filtering.
    comment_char - lines starting with this literal text are dropped.
    col_idx      - optional 0-based indexes selecting (and ordering) the
                   fields returned for each line.
    filters      - optional list of filter dicts, applied in order, each
                   wrapped in its own LineFilter.

    A file opened from a path is closed when iteration is exhausted, when
    close() is called, or on leaving a ``with`` block.
    """
    def __init__(self, input_file, skip=0, comment_char=None, col_idx=None,
                 filters=None):
        self.skip = skip
        self.comment_char = comment_char
        self.col_idx = col_idx
        self.filters = filters
        # Only a file opened here is ours to close.
        self._owns_file = not hasattr(input_file, 'readline')
        self.tsv_file = open(input_file) if self._owns_file else input_file
        try:
            if skip and skip > 0:
                for _ in range(skip):
                    if not self.tsv_file.readline():
                        break
            source = LineFilter(self.tsv_file, None)
            if comment_char:
                # The comment marker is literal text, not a regex: escape it
                # so markers such as '.' or '*' neither match every line nor
                # fail to compile.
                source = LineFilter(source,
                                    {"filter": "regex",
                                     "pattern": re.escape(comment_char),
                                     "action": "exclude_match"})
            if filters:
                for f in filters:
                    source = LineFilter(source, f)
            self.source = source
        except Exception:
            # Do not leak the handle when a filter definition is invalid.
            self.close()
            raise

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close the underlying file if this reader opened it."""
        if self._owns_file and not self.tsv_file.closed:
            self.tsv_file.close()

    def __next__(self):
        """Return the fields of the next line that passes all filters."""
        if self._owns_file and self.tsv_file.closed:
            # Already exhausted or explicitly closed; never read a closed file.
            raise StopIteration
        try:
            line = next(self.source)
        except StopIteration:
            self.close()
            raise
        fields = line.rstrip('\r\n').split('\t')
        if self.col_idx:
            fields = [fields[idx] for idx in self.col_idx]
        return fields

    # Python 2 iterator protocol.
    next = __next__
| 141 | |
| 142 | |
def filter_file(input_file, output, skip=0, comment_char='#', filters=None):
    """
    Filter a tabular file and write the surviving lines to ``output``.

    input_file   - path or file-like object passed to TabularReader.
    output       - object with a ``write`` method; each result line is
                   written tab-joined and terminated with a newline.
    skip         - number of leading lines to discard.
    comment_char - lines starting with this text are dropped.
    filters      - optional list of filter dicts applied in order.

    A failure while writing one line is reported on stderr and processing
    continues with the next line.  Any other failure (opening the input,
    building or running a filter) ends the program through SystemExit with
    an 'Error: ...' message.
    """
    try:
        tr = TabularReader(input_file, skip=skip, comment_char=comment_char,
                           filters=filters)
        for linenum, fields in enumerate(tr):
            try:
                output.write('%s\n' % '\t'.join(fields))
            except Exception as e:
                print('Failed at line: %d err: %s' % (linenum, e),
                      file=sys.stderr)
    except Exception as e:
        # sys.exit rather than the bare exit(): the latter is injected by the
        # site module and is absent under ``python -S`` or embedded
        # interpreters.
        sys.exit('Error: %s' % (e))
