Mercurial > repos > peterjc > seq_filter_by_id
comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 7:fb1313d79396 draft
Uploaded v0.2.5, ignore blank names in tabular files (based on contribution from Gildas Le Corguille)
author | peterjc |
---|---|
date | Fri, 04 Nov 2016 08:11:08 -0400 |
parents | 03e134cae41a |
children | 2d4537dbf0bc |
comparison
legend: equal | deleted | inserted | replaced
6:03e134cae41a | 7:fb1313d79396 |
---|---|
72 help="Show version and quit") | 72 help="Show version and quit") |
73 | 73 |
74 options, args = parser.parse_args() | 74 options, args = parser.parse_args() |
75 | 75 |
76 if options.version: | 76 if options.version: |
77 print "v0.2.3" | 77 print "v0.2.5" |
78 sys.exit(0) | 78 sys.exit(0) |
79 | 79 |
80 in_file = options.input | 80 in_file = options.input |
81 seq_format = options.format | 81 seq_format = options.format |
82 out_positive_file = options.output_positive | 82 out_positive_file = options.output_positive |
91 if seq_format is None: | 91 if seq_format is None: |
92 sys.exit("Missing sequence format") | 92 sys.exit("Missing sequence format") |
93 if logic not in ["UNION", "INTERSECTION"]: | 93 if logic not in ["UNION", "INTERSECTION"]: |
94 sys.exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic) | 94 sys.exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic) |
95 if options.id_list and args: | 95 if options.id_list and args: |
96 sys.exit("Cannot accepted IDs via both -t and as tabular files") | 96 sys.exit("Cannot accept IDs via both -t in the command line, and as tabular files") |
97 elif not options.id_list and not args: | 97 elif not options.id_list and not args: |
98 sys.exit("Expected matched pairs of tabular files and columns (or -t given)") | 98 sys.exit("Expected matched pairs of tabular files and columns (or -t given)") |
99 if len(args) % 2: | 99 if len(args) % 2: |
100 sys.exit("Expected matched pairs of tabular files and columns, not: %r" % args) | 100 sys.exit("Expected matched pairs of tabular files and columns, not: %r" % args) |
101 | 101 |
179 '@': '__at__', | 179 '@': '__at__', |
180 '\n': '__cn__', | 180 '\n': '__cn__', |
181 '\r': '__cr__', | 181 '\r': '__cr__', |
182 '\t': '__tc__', | 182 '\t': '__tc__', |
183 '#': '__pd__', | 183 '#': '__pd__', |
184 } | 184 } |
185 | 185 |
186 # Read tabular file(s) and record all specified identifiers | 186 # Read tabular file(s) and record all specified identifiers |
187 ids = None # Will be a set | 187 ids = None # Will be a set |
188 if options.id_list: | 188 if options.id_list: |
189 assert not identifiers | 189 assert not identifiers |
204 if line.startswith("#"): | 204 if line.startswith("#"): |
205 # Ignore comments | 205 # Ignore comments |
206 continue | 206 continue |
207 parts = line.rstrip("\n").split("\t") | 207 parts = line.rstrip("\n").split("\t") |
208 for col in columns: | 208 for col in columns: |
209 file_ids.add(clean_name(parts[col])) | 209 name = clean_name(parts[col]) |
210 if name: | |
211 file_ids.add(name) | |
210 else: | 212 else: |
211 # Single column, special case speed up | 213 # Single column, special case speed up |
212 col = columns[0] | 214 col = columns[0] |
213 for line in handle: | 215 for line in handle: |
214 if not line.strip(): #skip empty lines | 216 if not line.strip(): # skip empty lines |
215 continue | 217 continue |
216 if not line.startswith("#"): | 218 if not line.startswith("#"): |
217 file_ids.add(clean_name(line.rstrip("\n").split("\t")[col])) | 219 name = clean_name(line.rstrip("\n").split("\t")[col]) |
220 if name: | |
221 file_ids.add(name) | |
218 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns)) | 222 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns)) |
219 if ids is None: | 223 if ids is None: |
220 ids = file_ids | 224 ids = file_ids |
221 if logic == "UNION": | 225 if logic == "UNION": |
222 ids.update(file_ids) | 226 ids.update(file_ids) |