0
|
1 from __future__ import division
|
|
2 import sys
|
|
3 import pandas as pd
|
|
4 import itertools as it
|
|
5 import scipy.stats as st
|
|
6 import collections
|
|
7 import lxml.etree as ET
|
|
8 import pickle as pk
|
|
9 import math
|
|
10 import os
|
|
11 import argparse
|
|
12 from svglib.svglib import svg2rlg
|
|
13 from reportlab.graphics import renderPDF
|
|
14
|
16
|
15 ########################## argparse ##########################################
|
0
|
16
|
|
17 def process_args(args):
|
|
18 parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
|
|
19 description = 'process some value\'s'+
|
|
20 ' genes to create a comparison\'s map.')
|
48
|
21 parser.add_argument('-cr', '--custom_rules',
|
0
|
22 type = str,
|
48
|
23 default = 'false',
|
|
24 choices = ['true', 'false'],
|
|
25 help = 'choose whether to use custom rules')
|
|
26 parser.add_argument('-cc', '--custom_rule',
|
0
|
27 type = str,
|
48
|
28 help='custom rules to use')
|
|
29 parser.add_argument('-cm', '--custom_map',
|
|
30 type = str,
|
|
31 help='custom map to use')
|
0
|
32 parser.add_argument('-n', '--none',
|
|
33 type = str,
|
|
34 default = 'true',
|
|
35 choices = ['true', 'false'],
|
|
36 help = 'compute Nan values')
|
|
37 parser.add_argument('-pv' ,'--pValue',
|
|
38 type = float,
|
47
|
39 default = 0.1,
|
0
|
40 help = 'P-Value threshold (default: %(default)s)')
|
|
41 parser.add_argument('-fc', '--fChange',
|
|
42 type = float,
|
|
43 default = 1.5,
|
|
44 help = 'Fold-Change threshold (default: %(default)s)')
|
|
45 parser.add_argument('-td', '--tool_dir',
|
|
46 type = str,
|
|
47 required = True,
|
|
48 help = 'your tool directory')
|
|
49 parser.add_argument('-op', '--option',
|
|
50 type = str,
|
47
|
51 choices = ['datasets', 'dataset_class'],
|
0
|
52 help='dataset or dataset and class')
|
|
53 parser.add_argument('-ol', '--out_log',
|
|
54 help = "Output log")
|
|
55 parser.add_argument('-id', '--input_data',
|
|
56 type = str,
|
|
57 help = 'input dataset')
|
|
58 parser.add_argument('-ic', '--input_class',
|
|
59 type = str,
|
|
60 help = 'sample group specification')
|
16
|
61 parser.add_argument('-gs', '--generate_svg',
|
|
62 type = str,
|
|
63 default = 'true',
|
|
64 choices = ['true', 'false'],
|
|
65 help = 'generate svg map')
|
|
66 parser.add_argument('-gp', '--generate_pdf',
|
|
67 type = str,
|
|
68 default = 'true',
|
|
69 choices = ['true', 'false'],
|
|
70 help = 'generate pdf map')
|
47
|
71 parser.add_argument('-on', '--control',
|
|
72 type = str)
|
|
73 parser.add_argument('-co', '--comparison',
|
|
74 type = str,
|
|
75 default = '1vs1',
|
|
76 choices = ['manyvsmany', 'onevsrest', 'onevsmany'])
|
|
77 parser.add_argument('-ids', '--input_datas',
|
16
|
78 type = str,
|
47
|
79 nargs = '+',
|
|
80 help = 'input datasets')
|
|
81 parser.add_argument('-na', '--names',
|
|
82 type = str,
|
|
83 nargs = '+',
|
|
84 help = 'input names')
|
|
85
|
0
|
86 args = parser.parse_args()
|
|
87 return args
|
|
88
|
|
89 ########################### warning ###########################################
|
|
90
|
|
91 def warning(s):
|
|
92 args = process_args(sys.argv)
|
|
93 with open(args.out_log, 'a') as log:
|
|
94 log.write(s)
|
|
95
|
|
96 ############################ dataset input ####################################
|
|
97
|
|
98 def read_dataset(data, name):
|
|
99 try:
|
16
|
100 dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python')
|
0
|
101 except pd.errors.EmptyDataError:
|
|
102 sys.exit('Execution aborted: wrong format of ' + name + '\n')
|
|
103 if len(dataset.columns) < 2:
|
|
104 sys.exit('Execution aborted: wrong format of ' + name + '\n')
|
|
105 return dataset
|
|
106
|
|
107 ############################ dataset name #####################################
|
|
108
|
|
109 def name_dataset(name_data, count):
|
|
110 if str(name_data) == 'Dataset':
|
|
111 return str(name_data) + '_' + str(count)
|
|
112 else:
|
|
113 return str(name_data)
|
|
114
|
|
115 ############################ load id e rules ##################################
|
|
116
|
|
117 def load_id_rules(reactions):
|
|
118 ids, rules = [], []
|
|
119 for key, value in reactions.items():
|
|
120 ids.append(key)
|
|
121 rules.append(value)
|
|
122 return (ids, rules)
|
|
123
|
|
124 ############################ check_methods ####################################
|
|
125
|
|
126 def gene_type(l, name):
|
|
127 if check_hgnc(l):
|
|
128 return 'hugo_id'
|
|
129 elif check_ensembl(l):
|
|
130 return 'ensembl_gene_id'
|
|
131 elif check_symbol(l):
|
|
132 return 'symbol'
|
|
133 elif check_entrez(l):
|
|
134 return 'entrez_id'
|
|
135 else:
|
|
136 sys.exit('Execution aborted:\n' +
|
|
137 'gene ID type in ' + name + ' not supported. Supported ID'+
|
|
138 'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n')
|
|
139
|
|
140 def check_hgnc(l):
|
|
141 if len(l) > 5:
|
|
142 if (l.upper()).startswith('HGNC:'):
|
|
143 return l[5:].isdigit()
|
|
144 else:
|
|
145 return False
|
|
146 else:
|
|
147 return False
|
|
148
|
|
149 def check_ensembl(l):
|
|
150 if len(l) == 15:
|
|
151 if (l.upper()).startswith('ENS'):
|
|
152 return l[4:].isdigit()
|
|
153 else:
|
|
154 return False
|
|
155 else:
|
|
156 return False
|
|
157
|
|
158 def check_symbol(l):
|
|
159 if len(l) > 0:
|
|
160 if l[0].isalpha() and l[1:].isalnum():
|
|
161 return True
|
|
162 else:
|
|
163 return False
|
|
164 else:
|
|
165 return False
|
|
166
|
|
167 def check_entrez(l):
|
|
168 if len(l) > 0:
|
|
169 return l.isdigit()
|
|
170 else:
|
|
171 return False
|
|
172
|
|
173 def check_bool(b):
|
|
174 if b == 'true':
|
|
175 return True
|
|
176 elif b == 'false':
|
|
177 return False
|
|
178
|
|
179 ############################ resolve_methods ##################################
|
|
180
|
|
181 def replace_gene_value(l, d):
|
|
182 tmp = []
|
|
183 err = []
|
|
184 while l:
|
|
185 if isinstance(l[0], list):
|
|
186 tmp_rules, tmp_err = replace_gene_value(l[0], d)
|
|
187 tmp.append(tmp_rules)
|
|
188 err.extend(tmp_err)
|
|
189 else:
|
|
190 value = replace_gene(l[0], d)
|
|
191 tmp.append(value)
|
|
192 if value == None:
|
|
193 err.append(l[0])
|
|
194 l = l[1:]
|
|
195 return (tmp, err)
|
|
196
|
35
|
197
|
0
|
198 def replace_gene(l, d):
|
|
199 if l =='and' or l == 'or':
|
|
200 return l
|
|
201 else:
|
|
202 value = d.get(l, None)
|
|
203 if not(value == None or isinstance(value, (int, float))):
|
|
204 sys.exit('Execution aborted: ' + value + ' value not valid\n')
|
|
205 return value
|
|
206
|
|
207 def computes(val1, op, val2, cn):
|
|
208 if val1 != None and val2 != None:
|
|
209 if op == 'and':
|
|
210 return min(val1, val2)
|
|
211 else:
|
|
212 return val1 + val2
|
|
213 elif op == 'and':
|
|
214 if cn is True:
|
|
215 if val1 != None:
|
|
216 return val1
|
|
217 elif val2 != None:
|
|
218 return val2
|
|
219 else:
|
|
220 return None
|
|
221 else:
|
|
222 return None
|
|
223 else:
|
|
224 if val1 != None:
|
|
225 return val1
|
|
226 elif val2 != None:
|
|
227 return val2
|
|
228 else:
|
|
229 return None
|
|
230
|
|
231 def control(ris, l, cn):
|
|
232 if len(l) == 1:
|
|
233 if isinstance(l[0], (float, int)) or l[0] == None:
|
|
234 return l[0]
|
|
235 elif isinstance(l[0], list):
|
|
236 return control(None, l[0], cn)
|
|
237 else:
|
|
238 return False
|
|
239 elif len(l) > 2:
|
|
240 return control_list(ris, l, cn)
|
|
241 else:
|
|
242 return False
|
|
243
|
|
244 def control_list(ris, l, cn):
|
|
245 while l:
|
|
246 if len(l) == 1:
|
|
247 return False
|
|
248 elif (isinstance(l[0], (float, int)) or
|
|
249 l[0] == None) and l[1] in ['and', 'or']:
|
|
250 if isinstance(l[2], (float, int)) or l[2] == None:
|
|
251 ris = computes(l[0], l[1], l[2], cn)
|
|
252 elif isinstance(l[2], list):
|
|
253 tmp = control(None, l[2], cn)
|
|
254 if tmp is False:
|
|
255 return False
|
|
256 else:
|
|
257 ris = computes(l[0], l[1], tmp, cn)
|
|
258 else:
|
|
259 return False
|
|
260 l = l[3:]
|
|
261 elif l[0] in ['and', 'or']:
|
|
262 if isinstance(l[1], (float, int)) or l[1] == None:
|
|
263 ris = computes(ris, l[0], l[1], cn)
|
|
264 elif isinstance(l[1], list):
|
|
265 tmp = control(None,l[1], cn)
|
|
266 if tmp is False:
|
|
267 return False
|
|
268 else:
|
|
269 ris = computes(ris, l[0], tmp, cn)
|
|
270 else:
|
|
271 return False
|
|
272 l = l[2:]
|
|
273 elif isinstance(l[0], list) and l[1] in ['and', 'or']:
|
|
274 if isinstance(l[2], (float, int)) or l[2] == None:
|
|
275 tmp = control(None, l[0], cn)
|
|
276 if tmp is False:
|
|
277 return False
|
|
278 else:
|
|
279 ris = computes(tmp, l[1], l[2], cn)
|
|
280 elif isinstance(l[2], list):
|
|
281 tmp = control(None, l[0], cn)
|
|
282 tmp2 = control(None, l[2], cn)
|
|
283 if tmp is False or tmp2 is False:
|
|
284 return False
|
|
285 else:
|
|
286 ris = computes(tmp, l[1], tmp2, cn)
|
|
287 else:
|
|
288 return False
|
|
289 l = l[3:]
|
|
290 else:
|
|
291 return False
|
|
292 return ris
|
|
293
|
|
294 ############################ map_methods ######################################
|
|
295
|
|
296 def fold_change(avg1, avg2):
|
|
297 if avg1 == 0 and avg2 == 0:
|
|
298 return 0
|
|
299 elif avg1 == 0:
|
|
300 return '-INF'
|
|
301 elif avg2 == 0:
|
|
302 return 'INF'
|
|
303 else:
|
|
304 return math.log(avg1 / avg2, 2)
|
|
305
|
|
306 def fix_style(l, col, width, dash):
|
|
307 tmp = l.split(';')
|
|
308 flag_col = False
|
|
309 flag_width = False
|
|
310 flag_dash = False
|
|
311 for i in range(len(tmp)):
|
|
312 if tmp[i].startswith('stroke:'):
|
|
313 tmp[i] = 'stroke:' + col
|
|
314 flag_col = True
|
|
315 if tmp[i].startswith('stroke-width:'):
|
|
316 tmp[i] = 'stroke-width:' + width
|
|
317 flag_width = True
|
|
318 if tmp[i].startswith('stroke-dasharray:'):
|
|
319 tmp[i] = 'stroke-dasharray:' + dash
|
|
320 flag_dash = True
|
|
321 if not flag_col:
|
|
322 tmp.append('stroke:' + col)
|
|
323 if not flag_width:
|
|
324 tmp.append('stroke-width:' + width)
|
|
325 if not flag_dash:
|
|
326 tmp.append('stroke-dasharray:' + dash)
|
|
327 return ';'.join(tmp)
|
|
328
|
|
329 def fix_map(d, core_map, threshold_P_V, threshold_F_C, max_F_C):
|
|
330 maxT = 12
|
|
331 minT = 2
|
|
332 grey = '#BEBEBE'
|
|
333 blue = '#0000FF'
|
|
334 red = '#E41A1C'
|
|
335 for el in core_map.iter():
|
|
336 el_id = str(el.get('id'))
|
|
337 if el_id.startswith('R_'):
|
|
338 tmp = d.get(el_id[2:])
|
|
339 if tmp != None:
|
|
340 p_val = tmp[0]
|
|
341 f_c = tmp[1]
|
|
342 if p_val < threshold_P_V:
|
|
343 if not isinstance(f_c, str):
|
|
344 if abs(f_c) < math.log(threshold_F_C, 2):
|
|
345 col = grey
|
|
346 width = str(minT)
|
|
347 else:
|
|
348 if f_c < 0:
|
|
349 col = blue
|
|
350 elif f_c > 0:
|
|
351 col = red
|
|
352 width = str(max((abs(f_c) * maxT) / max_F_C, minT))
|
|
353 else:
|
|
354 if f_c == '-INF':
|
|
355 col = blue
|
|
356 elif f_c == 'INF':
|
|
357 col = red
|
|
358 width = str(maxT)
|
|
359 dash = 'none'
|
|
360 else:
|
|
361 dash = '5,5'
|
|
362 col = grey
|
|
363 width = str(minT)
|
|
364 el.set('style', fix_style(el.get('style'), col, width, dash))
|
|
365 return core_map
|
|
366
|
|
367 ############################ make recon #######################################
|
|
368
|
|
369 def check_and_doWord(l):
|
|
370 tmp = []
|
|
371 tmp_genes = []
|
|
372 count = 0
|
|
373 while l:
|
|
374 if count >= 0:
|
|
375 if l[0] == '(':
|
|
376 count += 1
|
|
377 tmp.append(l[0])
|
|
378 l.pop(0)
|
|
379 elif l[0] == ')':
|
|
380 count -= 1
|
|
381 tmp.append(l[0])
|
|
382 l.pop(0)
|
|
383 elif l[0] == ' ':
|
|
384 l.pop(0)
|
|
385 else:
|
|
386 word = []
|
|
387 while l:
|
|
388 if l[0] in [' ', '(', ')']:
|
|
389 break
|
|
390 else:
|
|
391 word.append(l[0])
|
|
392 l.pop(0)
|
|
393 word = ''.join(word)
|
|
394 tmp.append(word)
|
|
395 if not(word in ['or', 'and']):
|
|
396 tmp_genes.append(word)
|
|
397 else:
|
|
398 return False
|
|
399 if count == 0:
|
|
400 return (tmp, tmp_genes)
|
|
401 else:
|
|
402 return False
|
|
403
|
|
404 def brackets_to_list(l):
|
|
405 tmp = []
|
|
406 while l:
|
|
407 if l[0] == '(':
|
|
408 l.pop(0)
|
|
409 tmp.append(resolve_brackets(l))
|
|
410 else:
|
|
411 tmp.append(l[0])
|
|
412 l.pop(0)
|
|
413 return tmp
|
|
414
|
|
415 def resolve_brackets(l):
|
|
416 tmp = []
|
|
417 while l[0] != ')':
|
|
418 if l[0] == '(':
|
|
419 l.pop(0)
|
|
420 tmp.append(resolve_brackets(l))
|
|
421 else:
|
|
422 tmp.append(l[0])
|
|
423 l.pop(0)
|
|
424 l.pop(0)
|
|
425 return tmp
|
|
426
|
|
427 def priorityAND(l):
|
|
428 tmp = []
|
|
429 flag = True
|
|
430 while l:
|
|
431 if len(l) == 1:
|
|
432 if isinstance(l[0], list):
|
|
433 tmp.append(priorityAND(l[0]))
|
|
434 else:
|
|
435 tmp.append(l[0])
|
|
436 l = l[1:]
|
|
437 elif l[0] == 'or':
|
|
438 tmp.append(l[0])
|
|
439 flag = False
|
|
440 l = l[1:]
|
|
441 elif l[1] == 'or':
|
|
442 if isinstance(l[0], list):
|
|
443 tmp.append(priorityAND(l[0]))
|
|
444 else:
|
|
445 tmp.append(l[0])
|
|
446 tmp.append(l[1])
|
|
447 flag = False
|
|
448 l = l[2:]
|
|
449 elif l[1] == 'and':
|
|
450 tmpAnd = []
|
|
451 if isinstance(l[0], list):
|
|
452 tmpAnd.append(priorityAND(l[0]))
|
|
453 else:
|
|
454 tmpAnd.append(l[0])
|
|
455 tmpAnd.append(l[1])
|
|
456 if isinstance(l[2], list):
|
|
457 tmpAnd.append(priorityAND(l[2]))
|
|
458 else:
|
|
459 tmpAnd.append(l[2])
|
|
460 l = l[3:]
|
|
461 while l:
|
|
462 if l[0] == 'and':
|
|
463 tmpAnd.append(l[0])
|
|
464 if isinstance(l[1], list):
|
|
465 tmpAnd.append(priorityAND(l[1]))
|
|
466 else:
|
|
467 tmpAnd.append(l[1])
|
|
468 l = l[2:]
|
|
469 elif l[0] == 'or':
|
|
470 flag = False
|
|
471 break
|
13
|
472 if flag == True: #when there are only AND in list
|
0
|
473 tmp.extend(tmpAnd)
|
|
474 elif flag == False:
|
|
475 tmp.append(tmpAnd)
|
|
476 return tmp
|
|
477
|
|
478 def checkRule(l):
|
|
479 if len(l) == 1:
|
|
480 if isinstance(l[0], list):
|
|
481 if checkRule(l[0]) is False:
|
|
482 return False
|
|
483 elif len(l) > 2:
|
|
484 if checkRule2(l) is False:
|
|
485 return False
|
|
486 else:
|
|
487 return False
|
|
488 return True
|
|
489
|
|
490 def checkRule2(l):
|
|
491 while l:
|
|
492 if len(l) == 1:
|
|
493 return False
|
|
494 elif isinstance(l[0], list) and l[1] in ['and', 'or']:
|
|
495 if checkRule(l[0]) is False:
|
|
496 return False
|
|
497 if isinstance(l[2], list):
|
|
498 if checkRule(l[2]) is False:
|
|
499 return False
|
|
500 l = l[3:]
|
|
501 elif l[1] in ['and', 'or']:
|
|
502 if isinstance(l[2], list):
|
|
503 if checkRule(l[2]) is False:
|
|
504 return False
|
|
505 l = l[3:]
|
|
506 elif l[0] in ['and', 'or']:
|
|
507 if isinstance(l[1], list):
|
|
508 if checkRule(l[1]) is False:
|
|
509 return False
|
|
510 l = l[2:]
|
|
511 else:
|
|
512 return False
|
|
513 return True
|
|
514
|
|
515 def do_rules(rules):
|
|
516 split_rules = []
|
|
517 err_rules = []
|
|
518 tmp_gene_in_rule = []
|
|
519 for i in range(len(rules)):
|
|
520 tmp = list(rules[i])
|
|
521 if tmp:
|
|
522 tmp, tmp_genes = check_and_doWord(tmp)
|
|
523 tmp_gene_in_rule.extend(tmp_genes)
|
|
524 if tmp is False:
|
|
525 split_rules.append([])
|
|
526 err_rules.append(rules[i])
|
|
527 else:
|
|
528 tmp = brackets_to_list(tmp)
|
|
529 if checkRule(tmp):
|
|
530 split_rules.append(priorityAND(tmp))
|
|
531 else:
|
|
532 split_rules.append([])
|
|
533 err_rules.append(rules[i])
|
|
534 else:
|
|
535 split_rules.append([])
|
|
536 if err_rules:
|
|
537 warning('Warning: wrong format rule in ' + str(err_rules) + '\n')
|
|
538 return (split_rules, list(set(tmp_gene_in_rule)))
|
|
539
|
|
540
|
|
541 ############################ gene #############################################
|
|
542
|
|
543 def data_gene(gene, type_gene, name, gene_custom):
|
35
|
544 args = process_args(sys.argv)
|
0
|
545 for i in range(len(gene)):
|
|
546 tmp = gene.iloc[i, 0]
|
|
547 if tmp.startswith(' ') or tmp.endswith(' '):
|
|
548 gene.iloc[i, 0] = (tmp.lstrip()).rstrip()
|
|
549 gene_dup = [item for item, count in
|
|
550 collections.Counter(gene[gene.columns[0]]).items() if count > 1]
|
|
551 pat_dup = [item for item, count in
|
|
552 collections.Counter(list(gene.columns)).items() if count > 1]
|
35
|
553
|
0
|
554 if gene_dup:
|
|
555 if gene_custom == None:
|
|
556 if args.rules_selector == 'HMRcore':
|
|
557 gene_in_rule = pk.load(open(args.tool_dir +
|
|
558 '/local/HMRcore_genes.p', 'rb'))
|
|
559 elif args.rules_selector == 'Recon':
|
|
560 gene_in_rule = pk.load(open(args.tool_dir +
|
|
561 '/local/Recon_genes.p', 'rb'))
|
|
562 gene_in_rule = gene_in_rule.get(type_gene)
|
|
563 else:
|
|
564 gene_in_rule = gene_custom
|
|
565 tmp = []
|
|
566 for i in gene_dup:
|
|
567 if gene_in_rule.get(i) == 'ok':
|
|
568 tmp.append(i)
|
|
569 if tmp:
|
|
570 sys.exit('Execution aborted because gene ID '
|
|
571 +str(tmp)+' in '+name+' is duplicated\n')
|
|
572 if pat_dup:
|
|
573 warning('Warning: duplicated label\n' + str(pat_dup) + 'in ' + name +
|
|
574 '\n')
|
35
|
575
|
0
|
576 return (gene.set_index(gene.columns[0])).to_dict()
|
|
577
|
|
578 ############################ resolve ##########################################
|
|
579
|
|
580 def resolve(genes, rules, ids, resolve_none, name):
|
|
581 resolve_rules = {}
|
|
582 not_found = []
|
|
583 flag = False
|
|
584 for key, value in genes.items():
|
|
585 tmp_resolve = []
|
|
586 for i in range(len(rules)):
|
|
587 tmp = rules[i]
|
|
588 if tmp:
|
|
589 tmp, err = replace_gene_value(tmp, value)
|
|
590 if err:
|
|
591 not_found.extend(err)
|
|
592 ris = control(None, tmp, resolve_none)
|
|
593 if ris is False or ris == None:
|
|
594 tmp_resolve.append(None)
|
|
595 else:
|
|
596 tmp_resolve.append(ris)
|
|
597 flag = True
|
|
598 else:
|
35
|
599 tmp_resolve.append(None)
|
0
|
600 resolve_rules[key] = tmp_resolve
|
|
601 if flag is False:
|
|
602 warning('Warning: no computable score (due to missing gene values)' +
|
|
603 'for class ' + name + ', the class has been disregarded\n')
|
|
604 return (None, None)
|
|
605 return (resolve_rules, list(set(not_found)))
|
|
606
|
|
607 ############################ split class ######################################
|
|
608
|
|
609 def split_class(classes, resolve_rules):
|
|
610 class_pat = {}
|
|
611 for i in range(len(classes)):
|
|
612 classe = classes.iloc[i, 1]
|
|
613 if not pd.isnull(classe):
|
|
614 l = []
|
|
615 for j in range(i, len(classes)):
|
|
616 if classes.iloc[j, 1] == classe:
|
|
617 pat_id = classes.iloc[j, 0]
|
|
618 tmp = resolve_rules.get(pat_id, None)
|
|
619 if tmp != None:
|
|
620 l.append(tmp)
|
|
621 classes.iloc[j, 1] = None
|
|
622 if l:
|
|
623 class_pat[classe] = list(map(list, zip(*l)))
|
|
624 else:
|
|
625 warning('Warning: no sample found in class ' + classe +
|
|
626 ', the class has been disregarded\n')
|
|
627 return class_pat
|
|
628
|
|
629 ############################ map ##############################################
|
|
630
|
47
|
631 def maps(core_map, class_pat, ids, threshold_P_V, threshold_F_C, create_svg, create_pdf, comparison, control):
|
0
|
632 args = process_args(sys.argv)
|
|
633 if (not class_pat) or (len(class_pat.keys()) < 2):
|
|
634 sys.exit('Execution aborted: classes provided for comparisons are ' +
|
|
635 'less than two\n')
|
47
|
636
|
|
637 if comparison == "manyvsmany":
|
|
638 for i, j in it.combinations(class_pat.keys(), 2):
|
|
639 tmp = {}
|
|
640 count = 0
|
|
641 max_F_C = 0
|
|
642 for l1, l2 in zip(class_pat.get(i), class_pat.get(j)):
|
|
643 try:
|
|
644 stat_D, p_value = st.ks_2samp(l1, l2)
|
|
645 #sum(l1) da errore secondo me perchè ha null
|
|
646 avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
|
|
647 if not isinstance(avg, str):
|
|
648 if max_F_C < abs(avg):
|
|
649 max_F_C = abs(avg)
|
|
650 tmp[ids[count]] = [float(p_value), avg]
|
|
651 count += 1
|
|
652 except (TypeError, ZeroDivisionError):
|
|
653 count += 1
|
|
654 tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv'
|
|
655 tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
|
|
656 tmp_csv = tmp_csv.reset_index()
|
|
657 header = ['ids', 'P_Value', 'Log2(fold change)']
|
|
658 tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
|
|
659
|
|
660 if create_svg or create_pdf:
|
48
|
661 if args.custom_rules == 'false' or (args.custom_rules == 'true'
|
|
662 and args.custom_map != ''):
|
47
|
663 fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
|
|
664 file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg'
|
|
665 with open(file_svg, 'wb') as new_map:
|
|
666 new_map.write(ET.tostring(core_map))
|
|
667
|
|
668
|
|
669 if create_pdf:
|
|
670 file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf'
|
|
671 renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
|
|
672
|
|
673 if not create_svg:
|
|
674 os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg')
|
|
675 elif comparison == "onevsrest":
|
|
676 for single_cluster in class_pat.keys():
|
|
677 t = []
|
|
678 for k in class_pat.keys():
|
|
679 if k != single_cluster:
|
|
680 t.append(class_pat.get(k))
|
|
681 rest = []
|
|
682 for i in t:
|
|
683 rest = rest + i
|
|
684
|
|
685 tmp = {}
|
|
686 count = 0
|
|
687 max_F_C = 0
|
|
688
|
|
689 for l1, l2 in zip(rest, class_pat.get(single_cluster)):
|
|
690 try:
|
|
691 stat_D, p_value = st.ks_2samp(l1, l2)
|
|
692 avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
|
|
693 if not isinstance(avg, str):
|
|
694 if max_F_C < abs(avg):
|
|
695 max_F_C = abs(avg)
|
|
696 tmp[ids[count]] = [float(p_value), avg]
|
|
697 count += 1
|
|
698 except (TypeError, ZeroDivisionError):
|
|
699 count += 1
|
|
700 tab = 'result/' + single_cluster + '_vs_rest (Tabular Result).tsv'
|
|
701 tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
|
|
702 tmp_csv = tmp_csv.reset_index()
|
|
703 header = ['ids', 'P_Value', 'Log2(fold change)']
|
|
704 tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
|
|
705
|
|
706 if create_svg or create_pdf:
|
65
|
707 if args.custom_rules == 'false' or (args.custom_rules == 'true'
|
|
708 and args.custom_map != ''):
|
47
|
709 fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
|
|
710 file_svg = 'result/' + single_cluster + '_vs_ rest (SVG Map).svg'
|
|
711 with open(file_svg, 'wb') as new_map:
|
|
712 new_map.write(ET.tostring(core_map))
|
|
713
|
|
714
|
|
715 if create_pdf:
|
|
716 file_pdf = 'result/' + single_cluster + '_vs_ rest (PDF Map).pdf'
|
|
717 renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
|
|
718
|
|
719 if not create_svg:
|
|
720 os.remove('result/' + single_cluster + '_vs_ rest (SVG Map).svg')
|
|
721
|
|
722 elif comparison == "onevsmany":
|
|
723 for i, j in it.combinations(class_pat.keys(), 2):
|
65
|
724 if i != control and j != control:
|
|
725 continue
|
|
726 if i == control and j == control:
|
|
727 continue
|
47
|
728 tmp = {}
|
|
729 count = 0
|
|
730 max_F_C = 0
|
|
731 for l1, l2 in zip(class_pat.get(i), class_pat.get(j)):
|
|
732 try:
|
|
733 stat_D, p_value = st.ks_2samp(l1, l2)
|
|
734 #sum(l1) da errore secondo me perchè ha null
|
|
735 avg = fold_change(sum(l1) / len(l1), sum(l2) / len(l2))
|
|
736 if not isinstance(avg, str):
|
|
737 if max_F_C < abs(avg):
|
|
738 max_F_C = abs(avg)
|
|
739 tmp[ids[count]] = [float(p_value), avg]
|
|
740 count += 1
|
|
741 except (TypeError, ZeroDivisionError):
|
|
742 count += 1
|
|
743 tab = 'result/' + i + '_vs_' + j + ' (Tabular Result).tsv'
|
|
744 tmp_csv = pd.DataFrame.from_dict(tmp, orient = "index")
|
|
745 tmp_csv = tmp_csv.reset_index()
|
|
746 header = ['ids', 'P_Value', 'Log2(fold change)']
|
|
747 tmp_csv.to_csv(tab, sep = '\t', index = False, header = header)
|
|
748
|
|
749 if create_svg or create_pdf:
|
65
|
750 if args.custom_rules == 'false' or (args.custom_rules == 'true'
|
|
751 and args.custom_map != ''):
|
47
|
752 fix_map(tmp, core_map, threshold_P_V, threshold_F_C, max_F_C)
|
|
753 file_svg = 'result/' + i + '_vs_' + j + ' (SVG Map).svg'
|
|
754 with open(file_svg, 'wb') as new_map:
|
|
755 new_map.write(ET.tostring(core_map))
|
|
756
|
|
757
|
|
758 if create_pdf:
|
|
759 file_pdf = 'result/' + i + '_vs_' + j + ' (PDF Map).pdf'
|
|
760 renderPDF.drawToFile(svg2rlg(file_svg), file_pdf)
|
|
761
|
|
762 if not create_svg:
|
|
763 os.remove('result/' + i + '_vs_' + j + ' (SVG Map).svg')
|
16
|
764
|
47
|
765
|
|
766
|
|
767
|
0
|
768 return None
|
|
769
|
|
770 ############################ MAIN #############################################
|
|
771
|
|
772 def main():
|
|
773 args = process_args(sys.argv)
|
16
|
774
|
|
775 create_svg = check_bool(args.generate_svg)
|
|
776 create_pdf = check_bool(args.generate_pdf)
|
35
|
777
|
47
|
778 if os.path.isdir('result') == False:
|
|
779 os.makedirs('result')
|
|
780
|
0
|
781 class_pat = {}
|
16
|
782
|
47
|
783 if args.option == 'datasets':
|
0
|
784 num = 1
|
|
785 for i, j in zip(args.input_datas, args.names):
|
|
786 name = name_dataset(j, num)
|
47
|
787 resolve_rules = read_dataset(i, name)
|
|
788
|
|
789 resolve_rules.iloc[:, 0] = (resolve_rules.iloc[:, 0]).astype(str)
|
16
|
790
|
47
|
791 ids = pd.Series.tolist(resolve_rules.iloc[:, 0])
|
16
|
792
|
47
|
793 resolve_rules = resolve_rules.drop(resolve_rules.columns[[0]], axis=1)
|
|
794 resolve_rules = resolve_rules.replace({'None': None})
|
|
795 resolve_rules = resolve_rules.to_dict('list')
|
16
|
796
|
47
|
797 #Converto i valori da str a float
|
|
798 to_float = lambda x: float(x) if (x != None) else None
|
|
799
|
|
800 resolve_rules_float = {}
|
|
801
|
|
802 for k in resolve_rules:
|
|
803 resolve_rules_float[k] = list(map(to_float, resolve_rules[k])); resolve_rules_float
|
|
804
|
0
|
805 if resolve_rules != None:
|
47
|
806 class_pat[name] = list(map(list, zip(*resolve_rules_float.values())))
|
|
807
|
0
|
808 num += 1
|
47
|
809
|
|
810 if args.option == 'dataset_class':
|
|
811 name = 'RAS'
|
|
812 resolve_rules = read_dataset(args.input_data, name)
|
|
813 resolve_rules.iloc[:, 0] = (resolve_rules.iloc[:, 0]).astype(str)
|
|
814
|
|
815 ids = pd.Series.tolist(resolve_rules.iloc[:, 0])
|
|
816
|
|
817 resolve_rules = resolve_rules.drop(resolve_rules.columns[[0]], axis=1)
|
|
818 resolve_rules = resolve_rules.replace({'None': None})
|
|
819 resolve_rules = resolve_rules.to_dict('list')
|
|
820
|
|
821 #Converto i valori da str a float
|
|
822 to_float = lambda x: float(x) if (x != None) else None
|
|
823
|
|
824 resolve_rules_float = {}
|
|
825
|
|
826 for k in resolve_rules:
|
|
827 resolve_rules_float[k] = list(map(to_float, resolve_rules[k])); resolve_rules_float
|
|
828
|
0
|
829 classes = read_dataset(args.input_class, 'class')
|
|
830 classes = classes.astype(str)
|
47
|
831
|
|
832 if resolve_rules_float != None:
|
|
833 class_pat = split_class(classes, resolve_rules_float)
|
28
|
834
|
48
|
835
|
|
836 if args.custom_rules == 'true':
|
|
837 try:
|
|
838 core_map = ET.parse(args.custom_map)
|
|
839 except (ET.XMLSyntaxError, ET.XMLSchemaParseError):
|
|
840 sys.exit('Execution aborted: custom map in wrong format')
|
|
841 else:
|
0
|
842 core_map = ET.parse(args.tool_dir+'/local/HMRcoreMap.svg')
|
16
|
843
|
65
|
844 class_pat_trim = {}
|
|
845
|
|
846 for key in class_pat.keys():
|
|
847 class_pat_trim[key.strip()] = class_pat[key]
|
|
848
|
|
849
|
|
850 maps(core_map, class_pat_trim, ids, args.pValue, args.fChange, create_svg, create_pdf, args.comparison, args.control)
|
16
|
851
|
|
852 print('Execution succeded')
|
47
|
853
|
|
854 return None
|
16
|
855
|
0
|
856
|
|
857 ###############################################################################
|
|
858
|
|
859 if __name__ == "__main__":
|
|
860 main()
|