comparison common.py @ 0:7db7ecc78ad6 draft

Uploaded
author damion
date Mon, 02 Mar 2015 20:46:00 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7db7ecc78ad6
1 import os.path
2 import sys
3 import re
4 import optparse
5 import subprocess
6 from shutil import move
7 import csv
8
# Default BLAST identifier patterns (raw strings: "\d" / "\|" in plain string
# literals are deprecated escape sequences in modern Python).
# Query ids like "Query_101" (but not "Query_101a" or "MyQuery_101").
re_default_query_id = re.compile(r"^Query_\d+$")
# Subject ids require some kind of numeric id,
# e.g. "Subject_1" or "gnl|BL_ORD_ID|221" (but not "Subject_" or "Subject_12a").
re_default_subject_id = re.compile(r"^(Subject_|gnl\|BL_ORD_ID\|)\d+$")
# Spot sequence ids that have accession ids in them,
# e.g. "gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]".
re_default_ncbi_id = re.compile(r"^gi\|\d+\|[a-z]+\|[a-zA-Z0-9_]+(\.\d+)?\|")
re_default_ref_id = re.compile(r"^ref\|[a-zA-Z0-9_]+\|[a-zA-Z0-9_]+(\.\d+)?\|")
22
23
def stop_err(msg):
    """Write *msg* (newline-terminated) to stderr and abort with exit status 1.

    @param msg string error message
    """
    sys.stderr.write("%s\n" % msg)
    raise SystemExit(1)
27
class MyParser(optparse.OptionParser):
    """
    From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output
    Provides a better class for displaying formatted help info in epilog() portion of optParse; allows for carriage returns.
    """
    def format_epilog(self, formatter):
        # Return the epilog verbatim so embedded newlines survive (the default
        # formatter re-wraps the text).
        # BUGFIX: fall back to "" when no epilog was supplied — the base class's
        # format_help() joins the result with "".join(), so returning None crashed.
        return self.epilog or ""
35
36
37
38 ## *********************************** FieldFilter ****************************
class FieldFilter(object):
    """Holds per-field filter criteria parsed from options.filters and applies
    them to tabular records via process()."""

    def __init__(self, tagGroup, options):
        """ Creates dictionary of fields that are to be filtered, and array of comparators and their values.
        Numeric filters have a single numeric value
        Each text filter is a string of phrases separated by "|"

        e.g. filters = "pident: > 97,score: > 37,sallseqdescr includes what | have|you"

        @param tagGroup object Provides .columns_in, list of valid column names
        @param options object Provides .filters string "[ [field name]: [comparator] [value],[[comparator] [value],]* ]*"
            and .drop_redundant_hits flag; .filters / .filters_HTML are rewritten in place for display
        @result .dict dictionary contains field name keys and arrays of [comparator, filterValue]
        """
        self.dict = {}
        # Numeric comparators coerce both sides to float; text comparators test
        # phrase containment.
        self.comparators = {
            '==': lambda x, y: float(x) == float(y),
            '!=': lambda x, y: float(x) != float(y),
            'gt': lambda x, y: float(x) > float(y),
            'gte': lambda x, y: float(x) >= float(y),
            'lt': lambda x, y: float(x) < float(y),
            'lte': lambda x, y: float(x) <= float(y),
            'includes': self.includesPhrase,
            'excludes': self.excludesPhrase
        }
        self.matches = {}  # qseqid-accession pairs already seen (redundant-hit filter)
        self.drop_redundant_hits = options.drop_redundant_hits

        if options.filters is not None:
            cleaned_filters = []
            for filterParam in options.filters.strip().strip(';').split(';'):
                filterSpec = filterParam.strip().split(":")
                filterField = filterSpec[0].strip()
                if len(filterField) > 0:
                    if filterField in self.dict:
                        stop_err("Filter field listed twice: \"" + filterField + "\". Please move constraints up to first use of field!")
                    field_name = cleanField(tagGroup.columns_in, filterField, 'Invalid field for filtering eh')
                    if len(filterSpec) > 1:  # we have start of filter field defn. "[field]:[crit]+,"

                        self.dict[field_name] = []  # new entry for filter field

                        for criterion in filterSpec[1].strip().strip(',').split(','):
                            criterionSpec = criterion.strip().split(' ')
                            comparator = criterionSpec[0]
                            if comparator not in self.comparators:
                                stop_err("Invalid comparator for field filter: \"" + comparator + "\"")
                            if len(criterionSpec) < 2:
                                stop_err("Missing value for field comparator: \"" + comparator + "\"")

                            # For text search, values are trimmed array of phrases
                            if comparator in ['includes', 'excludes']:
                                filterValue = list(map(str.strip, ' '.join(criterionSpec[1:]).split('|')))
                                # BUGFIX: wrap filter() in list() so this remains a
                                # list under Python 3 (filter() returns an iterator
                                # there), consistent with the list(map(...)) above.
                                filterValue = list(filter(None, filterValue))
                            else:
                                filterValue = criterionSpec[1]

                            self.dict[field_name].append([comparator, filterValue])

                        cleaned_filters.append(field_name + ':' + filterSpec[1])

            options.filters = ';'.join(cleaned_filters)
            # Adjust filter expression fieldnames for display (HTML-escaped and plain).
            words = {'gt': '&gt;', 'gte': '&gt;=', 'lt': '&lt;', 'lte': '&lt;=', ',': '', ':': ' '}
            options.filters_HTML = word_replace_all(options.filters, words)
            words = {'gt': '>', 'gte': '>=', 'lt': '<', 'lte': '<=', ',': '', ':': ' '}
            options.filters = word_replace_all(options.filters, words)

        else:
            options.filters = None
            options.filters_HTML = ''

    def __str__(self):
        # BUGFIX: self.label is never assigned anywhere in this class, so the
        # original raised AttributeError; report None when absent.
        return "label: %s dict: %s" % (getattr(self, 'label', None), str(self.dict))

    def includesPhrase(self, source, filter_phrases):
        """ Search for any phrase in filter_phrases occurring in source string.

        @param source string Text to search within
        @param filter_phrases array of phrases
        @return boolean True if any phrase is a substring of source
        """
        return any(x in source for x in filter_phrases)

    def excludesPhrase(self, source, filter_phrases):
        """Return True only when no phrase in filter_phrases occurs in source."""
        return not self.includesPhrase(source, filter_phrases)

    def process(self, record):
        """ For given record (an object) cycle through filters to see if any of record's attributes fail filter conditions.

        @uses self.dict
        @uses self.drop_redundant_hits
        @uses self.matches
        @param record object An object containing field & values read from a <hit> line.
        @return boolean True if all filter criteria succeed, False otherwise
        @raise KeyError when a filtered field does not exist on the record
        """
        # Block out repeated hits.
        # THIS ASSUMES BLASTn XML file is listing BEST HIT FIRST. Only appropriate
        # for searching for single hits within a reference sequence.
        if self.drop_redundant_hits:
            key = record.qseqid + '-' + record.accessionid
            if key in self.matches:
                return False
            self.matches[key] = True

        for key, constraints in self.dict.items():
            try:
                # The .loc table of fields has fieldnames without leading _ underscore.
                # Such fields are assumed to be added by code; leading underscore
                # fields are raw values read from the XML file directly.
                value = getattr(record, key)
                for constraint in constraints:
                    comparator = constraint[0]
                    userValue = constraint[1]
                    if not self.comparators[comparator](value, userValue):
                        return False  # failed a constraint
            except AttributeError:
                # print() function form works under both Python 2 and 3.
                print('A filter on field [' + key + '] was requested, but this field does not exist.')
                raise KeyError

        return True
166
167
168
class FieldSpec(object):
    """Field specification catalog for a galaxy tool form/process, loaded from a
    .loc 'tabular data' file."""

    def __init__(self, file_path, columns_in=None):
        """ READ FIELD SPECIFICATIONS of a particular galaxy tool form/process from a .loc 'tabular data' file

        Example blast_reporting_fields.tab file:

            #value  type     subtype  sort  filter  default  min  max  choose  name
            # Remember to edit tool_data_table_conf.xml for column spec!
            qseqid  numeric  int      1     1                          1       Query Seq-id
            sseqid  numeric  int      1     1                          1       Subject Seq-id
            pident  numeric  float    1     1       97       90   100  1       Percentage of identical matches

        - value is name of field: alphanumeric strings only.
        - type is 'text' or 'numeric' or 'bin'
        - subtype where applicable, indicates further validation function
        - sort indicates if field should be provided in sort menu
        - filter indicates if field should be in menu of fields that can be filtered
        - default is default value field should have if drawn on form
        - min is minimum range of field
        - max is maximum range of field
        - choose indicates if field can be chosen for an output column
        - name is textual name of field as it should appear on pulldown menus

        @param file_path string full name and path of .loc file containing pertinent field names and their specifications.
        @param columns_in list of available column names (defaults to a new empty list)
        @result .dict dictionary keyed by field name
        """
        self.dict = {}
        # BUGFIX: was a mutable default argument (columns_in=[]) shared across
        # all instances; use None sentinel instead.
        self.columns_in = [] if columns_in is None else columns_in

        # NOTE(review): binary mode is the Python 2 csv convention; Python 3's
        # csv requires text mode — confirm target interpreter before porting.
        with open(file_path, 'rb') as f:
            reader = csv.DictReader(f, delimiter='\t')  # 1st row read as field name header by default
            try:
                for row in reader:
                    myKey = row['#value']
                    # Some lines begin with '#' for value: ignore them. The reader
                    # consumed column names from the first row; "#value" is the
                    # name of the first column.
                    if not myKey[0] == '#':
                        row.pop("#value", None)
                        self.dict[myKey] = row
            except csv.Error as e:
                # BUGFIX: original referenced undefined name 'filename'.
                stop_err('file %s, line %d: %s' % (file_path, reader.line_num, e))

    def initColumns(self, columns_out, custom_columns):
        """ Augment columns with fieldSpec label and some sorting defaults.

        Default sorts: qseqid is marked as sorted asc; score as sorted desc.
        No need to move sorted fields around. This basically creates the spec used
        to generate the tab-delimited file; the only other calculation done for
        that is the row_limit cut, if any.

        @param columns_out list of output column/field names
        @param custom_columns string ";"-separated "[field]:[group]:[sort]:[label]" specs, or None
        @return list of column spec dicts {field, group, sort, label, type}
        """
        column_spec = list(columns_out)
        for (i, spec) in enumerate(column_spec):
            spec_field = spec.lstrip("_")

            if spec_field == 'qseqid':
                sort = 'asc'
                group = 'section'
            elif spec_field == 'score':
                sort = 'desc'
                group = 'column'
            else:
                sort = ''
                group = 'column'

            column_spec[i] = {
                'field': spec,
                'group': group,
                'sort': sort,
                'label': self.getAttribute(spec_field, 'short_name'),
                'type': self.getAttribute(spec_field, 'type')
            }

        # For the HTML (OR XML) report we allow users to specify columns of data
        # to represent sections of the report or table sections. Selected columns
        # either enhance an existing column's info, or add a new column. If a
        # selected column is sorted, it is inserted/moved to after the last SORTED
        # column in data — primary/secondary etc sorting is preserved.
        if custom_columns is not None:

            custom_spec = [x.strip() for x in custom_columns.split(';')]
            for spec in custom_spec:
                params = [p.strip() for p in spec.rstrip(":").split(":")]
                parlen = len(params)
                if parlen > 0 and params[0] != '':
                    field_name = cleanField(self.columns_in, params[0])  # Halts if it finds a field mismatch

                    group = 'column'
                    sort = ''
                    if parlen > 1 and params[1] in ['column', 'hidden', 'table', 'section']:
                        group = params[1]
                    if parlen > 2 and params[2] in ['asc', 'desc']:
                        sort = params[2]

                    # All column_spec entries have a fieldspec entry; the default
                    # label comes from there unless the user supplied one.
                    spec_field = field_name.lstrip("_")
                    label = self.getAttribute(spec_field, 'short_name')
                    if parlen > 3:
                        label = params[3]

                    field = {
                        'field': field_name,
                        'group': group,
                        'sort': sort,
                        'label': label,
                        'type': self.getAttribute(spec_field, 'type')
                    }

                    # If field is a 'section' move it right after last existing
                    # 'section' (if not matched); if it's a 'table' move it after
                    # last existing 'table'; otherwise append to column list.
                    found = False  # once True, rest of loop only removes any duplicate mention of the field
                    for (ptr, target) in enumerate(column_spec):

                        found_name = spec_field == target['field'].lstrip("_")
                        if found:
                            if found_name:  # Found duplicate name
                                del column_spec[ptr]
                                break
                        elif found_name:
                            found = True
                            column_spec[ptr] = field  # Overwrite spec.
                            break

                        elif field['group'] == 'section':
                            if target['group'] != 'section':  # time to insert section
                                found = True
                                column_spec.insert(ptr, field)

                        elif field['group'] == 'table':
                            if target['group'] == 'column' or target['group'] == 'hidden':
                                found = True
                                column_spec.insert(ptr, field)

                    if not found:  # didn't find place for field above
                        column_spec.append(field)

        return column_spec

    def getAttribute(self, fieldName, attribute):
        """ Retrieve attribute of a given field

        @param fieldName string
        @param attribute string
        @return string value of attribute
        """
        return self.dict[fieldName][attribute]
329
330
def word_replace_all(text, dictionary):
    """Replace every whole word of text that appears as a key in dictionary.

    text is split on runs of non-word characters, and the separators are kept
    (the pattern is captured), so words adjacent to punctuation still match —
    a plain str.split() approach would miss those.

    @param text string
    @param dictionary dict mapping word (or punctuation run) -> replacement
    @return string with all replacements applied
    """
    # Raw string avoids the deprecated "\W" escape in a normal string literal.
    textArray = re.split(r'(\W+)', text)
    for ptr, w in enumerate(textArray):
        if w in dictionary:
            textArray[ptr] = dictionary[w]
    return ''.join(textArray)
336
337
def cleanField(columns_in, field_name, msg = 'Not a valid field name'):
    """Validate field_name and return the matching key from columns_in.

    Accepts either the name as given or its leading-underscore variant (raw
    fields passed from the source file). Halts the program via stop_err() when
    the name is malformed or matches no known column.

    @param columns_in list of known column names
    @param field_name string candidate field name (word characters only)
    @param msg string prefix used in error messages
    @return string the matching column key (possibly underscore-prefixed)
    """
    if not field_name.replace('_','').isalnum():
        stop_err(msg + ': [' + field_name+']')
    for candidate in (field_name, '_' + field_name):
        if candidate in columns_in:
            return candidate
    # Neither form exists: abort (stop_err never returns).
    stop_err(msg + ':'+ field_name + '- no such field')
349
350
def fileSort(out_file, fields):
    """
    fileSort() uses linux "sort" to handle possibility of giant file sizes.

    Matches sortable fields to columns to produce -k[col],[col] parameters that
    start and end sorting. Note that sort takes in columns with primary listed
    first, then secondary etc, and that the file to be sorted can't have 1st
    line column headers.

    sort attributes:

        -f ignore case
        -r reverse (i.e. descending)
        -n numeric
        -k[start col],[end col] range of text that sort will be performed on
        -s stabilize sort: disables last-resort comparison so lines in which all
           fields compare equal are left in their original relative order.
           Note: this might not be available on all linux flavours?
        -V sorts numbers within text - if number is leading then field is
           essentially treated as numeric; so -n isn't needed for numeric fields.

    Note: some attention may need to be given to locale settings for command
    line sort. May need to set export LC_ALL=C or export LANG=C to ensure same
    results on all systems.

    @param out_file string File path of file to resort (sorted in place via -o)
    @param fields list of column spec dicts; 'sort' is '', 'asc' or 'desc' and
        'field' is the column name (validated as alphanumeric + underscores)
    """
    sortparam = []
    for colPtr, field in enumerate(fields):
        if field['sort']:
            field_name = field['field']
            if not field_name.replace('_', '').isalnum():
                # BUGFIX: original concatenated the dict itself ("... + field"),
                # which raised TypeError; report the offending name instead.
                stop_err("Invalid field to sort on: " + field_name)

            ordering = '' if field['sort'] == "asc" else 'r'
            column = str(colPtr + 1)
            # V sorts numbers AND text (check server's version of sort)
            sortparam.append('-k' + column + 'V' + ordering + ',' + column)

    if len(sortparam) > 0:
        args = ['sort', '-s', '-f', '-V', '-t\t'] + sortparam + ['-o' + out_file, out_file]
        subprocess.call(args)
395
396
397
def fileTabular(in_file, tagGroup, options):
    """Produces tabular report format. Takes in tabular data + metainformation
    about that file, and iterates through rows. Not a query-based approach.

    It trims off the sort-only columns (prelim - final), and optionally adds a
    column label header (not done in fileSort() because it would get mixed into
    the sort there).
    NOTE: RUN THIS AFTER fileHTML() BECAUSE IT MAY TRIM FIELDS THAT HTML REPORT NEEDS

    @param in_file string Full file path (rewritten in place)
    @param tagGroup object Includes .columns (list of {field, label, ...} dicts)
    @param options object Includes .column_labels ('label', 'field' or falsy)
    """
    # Text mode works with csv under both Python 2 (on linux) and Python 3;
    # the original binary modes broke the csv module under Python 3.
    fp_in = open(in_file, "r")
    fp_out = open(in_file + '.tmp', 'w')

    try:
        reader = csv.reader(fp_in, delimiter="\t")
        writer = csv.writer(fp_out, delimiter="\t")

        # WRITE TABULAR HEADER
        if options.column_labels:
            if options.column_labels == 'label':
                tabHeader = [field['label'] for field in tagGroup.columns]
            else:
                # Tabular data header: strip leading underscores off of any labels.
                tabHeader = [field['field'].lstrip('_') for field in tagGroup.columns]

            writer.writerow(tabHeader)

        # Keep only the first len(columns) cells of each row (drops sort-only columns).
        for row in reader:
            writer.writerow([row[idx] for idx in range(len(tagGroup.columns))])

        move(in_file + '.tmp', in_file)  # Overwrites in_file

    except IOError as e:
        print('Operation failed: %s' % e.strerror)

    fp_in.close()
    fp_out.close()
441
442
443
def fileSelections(in_file, selection_file, tagGroup, options):
    """ Produces selection report format.
    For selection file we need: qseqid, qseq, sseqid, sseq, plus a grouping
    number (incremented per query section) and a running row count.

    @param in_file string Full file path of tabular input
    @param selection_file string output path, or the literal string 'None' to skip
    @param tagGroup object Includes .columns (list of {field, ...} dicts)
    @param options object (unused here; kept for interface consistency)
    """
    # BUGFIX: the original only opened fp_out when selection_file != 'None' but
    # used it unconditionally below, raising NameError; bail out early instead.
    if selection_file == 'None':
        return

    # Text mode: csv under Python 3 requires it (binary 'rb' was Python 2 usage).
    fp_in = open(in_file, "r")
    fp_out = open(selection_file, 'w')

    try:
        reader = csv.reader(fp_in, delimiter="\t")
        writer = csv.writer(fp_out, delimiter="\t")

        # Locate the columns needed for the selection file.
        for (idx, field) in enumerate(tagGroup.columns):
            fieldname = field['field']
            if fieldname == 'qseqid': qseqid_col = idx
            elif fieldname == '_qseq': qseq_col = idx
            elif fieldname == 'sseqid': sseqid_col = idx
            elif fieldname == '_sseq': sseq_col = idx
        # NOTE(review): if any of the four columns is missing, the writes below
        # raise NameError — presumably callers always supply them; confirm.

        selectrow_count = 0
        grouping = -1
        old_section = ''
        for row in reader:

            selectrow_count += 1
            # New query section: emit the query row itself and bump the group id.
            if row[qseqid_col] != old_section:
                old_section = row[qseqid_col]
                grouping += 1
                writer.writerow([row[qseqid_col], row[qseq_col], grouping, selectrow_count])
                selectrow_count += 1

            writer.writerow([row[sseqid_col], row[sseq_col], grouping, selectrow_count])

    except IOError as e:
        print('Operation failed: %s' % e.strerror)

    fp_in.close()
    fp_out.close()
492
def testSuite(test_ids, tests, output_dir):
    """Run one or more blast_reporting regression tests and context-diff their
    outputs against the reference files in test-data/.

    @param test_ids string 'all', or comma-separated list of keys into tests
    @param tests dict test id -> {'input':..., 'outputs':..., 'options':...}
    @param output_dir string directory prefix prepended to each output file
    """
    import difflib  # hoisted out of the per-file loop

    if test_ids == 'all':
        test_ids = sorted(tests.keys())
    else:
        test_ids = test_ids.split(',')

    for test_id in test_ids:
        if test_id not in tests:
            stop_err("\nExpecting one or more test ids from " + str(sorted(tests.keys())))

        test = tests[test_id]
        test['base_dir'] = os.path.dirname(__file__)
        # Each output file has to be prefixed with the output folder.
        test['tmp_output'] = (' ' + test['outputs']).replace(' ', ' ' + output_dir)
        # Note: output_dir output files don't get cleaned up after each test. Should they?!
        params = '%(base_dir)s/blast_reporting.py %(base_dir)s/test-data/%(input)s%(tmp_output)s %(options)s' % test
        print("Testing" + test_id + ': ' + params)
        os.system(params)
        for fname in test['outputs'].split(' '):
            # BUGFIX: close both compared files (the original leaked the handles);
            # 'fname' also avoids shadowing the 'file' builtin.
            with open(test['base_dir'] + '/test-data/' + fname) as f1, open(output_dir + fname) as f2:
                diff = difflib.context_diff(f1.readlines(), f2.readlines(), lineterm='', n=0)
                # One Galaxy issue: it doesn't convert entities when user downloads
                # file. BUT IT DOES when generating directly to command line?
                print('\nCompare ' + fname)
                print('\n'.join(list(diff)))

    # NOTE(review): exits with status 1 even on success — presumably intentional
    # to stop further pipeline processing; confirm.
    stop_err("\nTest finished.")