comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 7:fb1313d79396 draft

Uploaded v0.2.5, ignore blank names in tabular files (based on contribution from Gildas Le Corguille)
author peterjc
date Fri, 04 Nov 2016 08:11:08 -0400
parents 03e134cae41a
children 2d4537dbf0bc
comparison
legend: equal | deleted | inserted | replaced
left (parent): 6:03e134cae41a | right (this changeset): 7:fb1313d79396
72 help="Show version and quit") 72 help="Show version and quit")
73 73
74 options, args = parser.parse_args() 74 options, args = parser.parse_args()
75 75
76 if options.version: 76 if options.version:
77 print "v0.2.3" 77 print "v0.2.5"
78 sys.exit(0) 78 sys.exit(0)
79 79
80 in_file = options.input 80 in_file = options.input
81 seq_format = options.format 81 seq_format = options.format
82 out_positive_file = options.output_positive 82 out_positive_file = options.output_positive
91 if seq_format is None: 91 if seq_format is None:
92 sys.exit("Missing sequence format") 92 sys.exit("Missing sequence format")
93 if logic not in ["UNION", "INTERSECTION"]: 93 if logic not in ["UNION", "INTERSECTION"]:
94 sys.exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic) 94 sys.exit("Logic agrument should be 'UNION' or 'INTERSECTION', not %r" % logic)
95 if options.id_list and args: 95 if options.id_list and args:
96 sys.exit("Cannot accepted IDs via both -t and as tabular files") 96 sys.exit("Cannot accept IDs via both -t in the command line, and as tabular files")
97 elif not options.id_list and not args: 97 elif not options.id_list and not args:
98 sys.exit("Expected matched pairs of tabular files and columns (or -t given)") 98 sys.exit("Expected matched pairs of tabular files and columns (or -t given)")
99 if len(args) % 2: 99 if len(args) % 2:
100 sys.exit("Expected matched pairs of tabular files and columns, not: %r" % args) 100 sys.exit("Expected matched pairs of tabular files and columns, not: %r" % args)
101 101
179 '@': '__at__', 179 '@': '__at__',
180 '\n': '__cn__', 180 '\n': '__cn__',
181 '\r': '__cr__', 181 '\r': '__cr__',
182 '\t': '__tc__', 182 '\t': '__tc__',
183 '#': '__pd__', 183 '#': '__pd__',
184 } 184 }
185 185
186 # Read tabular file(s) and record all specified identifiers 186 # Read tabular file(s) and record all specified identifiers
187 ids = None # Will be a set 187 ids = None # Will be a set
188 if options.id_list: 188 if options.id_list:
189 assert not identifiers 189 assert not identifiers
204 if line.startswith("#"): 204 if line.startswith("#"):
205 # Ignore comments 205 # Ignore comments
206 continue 206 continue
207 parts = line.rstrip("\n").split("\t") 207 parts = line.rstrip("\n").split("\t")
208 for col in columns: 208 for col in columns:
209 file_ids.add(clean_name(parts[col])) 209 name = clean_name(parts[col])
210 if name:
211 file_ids.add(name)
210 else: 212 else:
211 # Single column, special case speed up 213 # Single column, special case speed up
212 col = columns[0] 214 col = columns[0]
213 for line in handle: 215 for line in handle:
214 if not line.strip(): #skip empty lines 216 if not line.strip(): # skip empty lines
215 continue 217 continue
216 if not line.startswith("#"): 218 if not line.startswith("#"):
217 file_ids.add(clean_name(line.rstrip("\n").split("\t")[col])) 219 name = clean_name(line.rstrip("\n").split("\t")[col])
220 if name:
221 file_ids.add(name)
218 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns)) 222 print "Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns))
219 if ids is None: 223 if ids is None:
220 ids = file_ids 224 ids = file_ids
221 if logic == "UNION": 225 if logic == "UNION":
222 ids.update(file_ids) 226 ids.update(file_ids)