isaslicer.py @ 0:8dab200e02cb draft
"planemo upload commit 239561a6401593c5f87df40ac971a9aa393c4663-dirty"
author: prog
date: Tue, 07 Jan 2020 09:05:21 -0500
#!/usr/bin/env python3

import argparse
import glob
import json
import logging
import os
import re
import shutil
import sys
import tempfile
import zipfile

import pandas as pd
from isatools import isatab
from isatools.model import OntologyAnnotation
from isatools.net import mtbls as MTBLS

logger = None

# isaslicer.py <command> <study_id> [ command-specific options ]
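# A few illustrative invocations (study IDs, paths, and factor names below
# are placeholders, not taken from a real study):
#   isaslicer.py mtbls-get-study-archive MTBLS1 ./MTBLS1 --format zip
#   isaslicer.py mtbls-get-factors MTBLS1 factors.json
#   isaslicer.py isa-tab-get-data-list ./my-isatab out.json \
#       --json-query '{"Gender": "Male"}'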


def make_parser():
    parser = argparse.ArgumentParser(description="ISA slicer")

    parser.add_argument('--log-level', choices=[
        'DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL'],
        default='INFO', help="Set the desired logging level")

    subparsers = parser.add_subparsers(
        title='Actions',
        dest='command')  # specified subcommand will be available in attribute 'command'
    subparsers.required = True

    # mtblisa commands

    subparser = subparsers.add_parser(
        'mtbls-get-study-archive', aliases=['gsa'],
        help="Get ISA study from MetaboLights as zip archive")
    subparser.set_defaults(func=get_study_archive_command)
    subparser.add_argument('study_id')
    subparser.add_argument(
        'output', metavar="OUTPUT",
        help="Name of output archive (extension will be added)")
    subparser.add_argument('--format', metavar="FMT", choices=[
        'zip', 'tar', 'gztar', 'bztar', 'xztar'], default='zip',
        help="Type of archive to create")

    subparser = subparsers.add_parser('mtbls-get-study', aliases=['gs'],
                                      help="Get ISA study from MetaboLights")
    subparser.set_defaults(func=get_study_command)
    subparser.add_argument('study_id')
    subparser.add_argument('output', metavar="PATH", help="Name of output")
    subparser.add_argument(
        '-f', '--isa-format', choices=['isa-tab', 'isa-json'],
        metavar="FORMAT", default='isa-tab', help="Desired ISA format")

    subparser = subparsers.add_parser(
        'mtbls-get-factors', aliases=['gf'],
        help="Get factor names from a study in json format")
    subparser.set_defaults(func=get_factors_command)
    subparser.add_argument('study_id')
    subparser.add_argument(
        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file")

    subparser = subparsers.add_parser(
        'mtbls-get-factor-values', aliases=['gfv'],
        help="Get factor values from a study in json format")
    subparser.set_defaults(func=get_factor_values_command)
    subparser.add_argument('study_id')
    subparser.add_argument(
        'factor', help="The desired factor. Use `get-factors` to get the list "
        "of available factors")
    subparser.add_argument(
        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file")

    subparser = subparsers.add_parser('mtbls-get-data-list', aliases=['gd'],
                                      help="Get data files list in json format")
    subparser.set_defaults(func=get_data_files_command)
    subparser.add_argument('study_id')
    subparser.add_argument('output', nargs='?', type=argparse.FileType('w'),
                           default=sys.stdout, help="Output file")
    subparser.add_argument(
        '--json-query',
        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}')")
    subparser.add_argument(
        '--galaxy_parameters_file',
        help="Path to JSON file containing input Galaxy JSON")

    subparser = subparsers.add_parser(
        'mtbls-get-factors-summary', aliases=['gsum'],
        help="Get the variables summary from a study, in json format")
    subparser.set_defaults(func=get_summary_command)
    subparser.add_argument('study_id')
    subparser.add_argument(
        'json_output', nargs='?', type=argparse.FileType('w'),
        default=sys.stdout, help="Output JSON file")
    subparser.add_argument(
        'html_output', nargs='?', type=argparse.FileType('w'),
        default=sys.stdout, help="Output HTML file")

    # isaslicer commands on path to unpacked ISA-Tab as input

    subparser = subparsers.add_parser(
        'isa-tab-get-factors', aliases=['isagf'],
        help="Get factor names from a study in json format")
    subparser.set_defaults(func=isatab_get_factor_names_command)
    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
    subparser.add_argument(
        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file")

    subparser = subparsers.add_parser(
        'zip-get-factors', aliases=['zipgf'],
        help="Get factor names from a study in json format")
    subparser.set_defaults(func=zip_get_factor_names_command)
    subparser.add_argument('input_path', type=str,
                           help="Input ISA-Tab zip path")
    subparser.add_argument(
        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file")

    subparser = subparsers.add_parser(
        'isa-tab-get-factor-values', aliases=['isagfv'],
        help="Get factor values from a study in json format")
    subparser.set_defaults(func=isatab_get_factor_values_command)
    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
    subparser.add_argument(
        'factor', help="The desired factor. Use `get-factors` to get the list "
        "of available factors")
    subparser.add_argument(
        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file")

    subparser = subparsers.add_parser(
        'zip-get-factor-values', aliases=['zipgfv'],
        help="Get factor values from a study in json format")
    subparser.set_defaults(func=zip_get_factor_values_command)
    subparser.add_argument('input_path', type=str,
                           help="Input ISA-Tab zip path")
    subparser.add_argument(
        'factor', help="The desired factor. Use `get-factors` to get the list "
        "of available factors")
    subparser.add_argument(
        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file")

    subparser = subparsers.add_parser('isa-tab-get-data-list', aliases=['isagdl'],
                                      help="Get data files list in json format")
    subparser.set_defaults(func=isatab_get_data_files_list_command)
    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
    subparser.add_argument('output', nargs='?', type=argparse.FileType('w'),
                           default=sys.stdout, help="Output file")
    subparser.add_argument(
        '--json-query',
        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}')")
    subparser.add_argument(
        '--galaxy_parameters_file',
        help="Path to JSON file containing input Galaxy JSON")

    subparser = subparsers.add_parser('zip-get-data-list', aliases=['zipgdl'],
                                      help="Get data files list in json format")
    subparser.set_defaults(func=zip_get_data_files_list_command)
    subparser.add_argument('input_path', type=str,
                           help="Input ISA-Tab zip path")
    subparser.add_argument('output', nargs='?', type=argparse.FileType('w'),
                           default=sys.stdout, help="Output file")
    subparser.add_argument(
        '--json-query',
        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}')")
    subparser.add_argument(
        '--galaxy_parameters_file',
        help="Path to JSON file containing input Galaxy JSON")

    subparser = subparsers.add_parser('isa-tab-get-data-collection',
                                      aliases=['isagdc'],
                                      help="Get data files collection")
    subparser.set_defaults(func=isatab_get_data_files_collection_command)
    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
    subparser.add_argument('output_path', type=str,
                           help="Output data files path")
    subparser.add_argument(
        '--json-query',
        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}')")
    subparser.add_argument(
        '--galaxy_parameters_file',
        help="Path to JSON file containing input Galaxy JSON")

    subparser = subparsers.add_parser('zip-get-data-collection',
                                      aliases=['zipgdc'],
                                      help="Get data files collection")
    subparser.set_defaults(func=zip_get_data_files_collection_command)
    subparser.add_argument('input_path', type=str,
                           help="Input ISA-Tab zip path")
    subparser.add_argument('output_path', type=str,
                           help="Output data files path")
    subparser.add_argument(
        '--json-query',
        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}')")

    subparser = subparsers.add_parser(
        'isa-tab-get-factors-summary', aliases=['isasum'],
        help="Get the variables summary from a study, in json format")
    subparser.set_defaults(func=isatab_get_factors_summary_command)
    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
    subparser.add_argument(
        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file")

    subparser = subparsers.add_parser(
        'zip-get-factors-summary', aliases=['zipsum'],
        help="Get the variables summary from a study, in json format")
    subparser.set_defaults(func=zip_get_factors_summary_command)
    subparser.add_argument('input_path', type=str,
                           help="Input ISA-Tab zip path")
    subparser.add_argument(
        'json_output', nargs='?', type=argparse.FileType('w'),
        default=sys.stdout,
        help="Output JSON file")
    subparser.add_argument(
        'html_output', nargs='?', type=argparse.FileType('w'),
        default=sys.stdout,
        help="Output HTML file")

    subparser = subparsers.add_parser(
        'isaslicer2-slice', aliases=['slice2'],
        help="Slice ISA-Tabs version 2")
    subparser.set_defaults(func=query_isatab)
    subparser.add_argument('--source_dir', type=str,
                           help="Input path to unpacked ISA-Tab directory")
    subparser.add_argument(
        '--galaxy_parameters_file', type=argparse.FileType(mode='r'),
        help="Path to JSON file containing input Galaxy JSON")
    subparser.add_argument('--output', type=argparse.FileType(mode='w'),
                           help="Output JSON file for the slice results")

    subparser = subparsers.add_parser(
        'filter-data', aliases=['filter'],
        help="Filter out data based on slicer2")
    subparser.set_defaults(func=filter_data)
    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
    subparser.add_argument('output_path', type=str,
                           help="Output data files path")
    subparser.add_argument('--slice', type=argparse.FileType(mode='r'),
                           help="Slice results JSON (as written by "
                                "'isaslicer2-slice')")
    subparser.add_argument('--filename_filter', type=str,
                           help="shell-like wildcard to filter files")

    return parser


def filter_data(options):
    loglines = []
    source_dir = options.input_path if options.input_path else ""
    output_path = options.output_path
    filename_filter = options.filename_filter
    if source_dir:
        if not os.path.exists(source_dir):
            raise IOError('Source path does not exist!')
    data_files = []
    slice_json = options.slice
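    # The slice file is assumed to be the JSON written by the
    # 'isaslicer2-slice' command below, e.g. (illustrative):
    #   {"query": {...},
    #    "results": [{"sample_name": "s1", "data_files": ["f1.mzML"]}]}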
    for result in json.load(slice_json)['results']:
        data_files.extend(result.get('data_files', []))
    reduced_data_files = list(set(data_files))
    filtered_files = glob.glob(os.path.join(source_dir, filename_filter))
    to_copy = []
    for filepath in filtered_files:
        if os.path.basename(filepath) in reduced_data_files:
            to_copy.append(filepath)
    loglines.append("Using slice results from {}\n".format(slice_json.name))
    for filepath in to_copy:
        loglines.append("Copying {}\n".format(os.path.basename(filepath)))
        # try:
        #     shutil.copyfile(
        #         filepath, os.path.join(output_path, os.path.basename(filepath)))
        # except Exception as e:
        #     print(e)
        #     exit(1)
        try:
            os.symlink(
                filepath, os.path.join(output_path, os.path.basename(filepath)))
        except Exception as e:
            print(e)
            sys.exit(1)
    with open('cli.log', 'w') as fp:
        fp.writelines(loglines)


def query_isatab(options):
    source_dir = options.source_dir if options.source_dir else ""
    galaxy_parameters_file = options.galaxy_parameters_file
    output = options.output

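    # Assumed shape of the Galaxy parameters JSON, inferred from the lookups
    # below (values are illustrative):
    #   {"input": {"mtbls_id": "MTBLS1"},
    #    "query": {
    #        "measurement_type": "metabolite profiling",
    #        "technology_type": "mass spectrometry",
    #        "factor_selection": [
    #            {"factor_name": "Gender", "factor_value": "Male"}],
    #        "characteristics_selection": [
    #            {"characteristic_name": "...", "characteristic_value": "..."}],
    #        "parameter_selection": [
    #            {"parameter_name": "...", "parameter_value": "..."}]}}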
    debug = True
    if galaxy_parameters_file:
        galaxy_parameters = json.load(galaxy_parameters_file)
        print('Galaxy parameters:')
        print(json.dumps(galaxy_parameters, indent=4))
    else:
        raise IOError('Could not load Galaxy parameters file!')
    if source_dir:
        if not os.path.exists(source_dir):
            raise IOError('Source path does not exist!')
    query = galaxy_parameters['query']
    if debug:
        print('Query is:')
        print(json.dumps(query, indent=4))  # for debugging only
    if source_dir:
        investigation = isatab.load(source_dir)
    else:
        tmp = tempfile.mkdtemp()
        _ = MTBLS.get(galaxy_parameters['input']['mtbls_id'], tmp)
        investigation = isatab.load(tmp)
    # filter assays by mt/tt
    matching_assays = []
    mt = query.get('measurement_type', '').strip()
    tt = query.get('technology_type', '').strip()
    if mt and tt:
        for study in investigation.studies:
            matching_assays.extend(
                [x for x in study.assays if x.measurement_type.term == mt
                 and x.technology_type.term == tt])
    elif mt and not tt:
        for study in investigation.studies:
            matching_assays.extend(
                [x for x in study.assays if x.measurement_type.term == mt])
    elif not mt and tt:
        for study in investigation.studies:
            matching_assays.extend(
                [x for x in study.assays if x.technology_type.term == tt])
    else:
        for study in investigation.studies:
            matching_assays.extend(study.assays)
    assay_samples = []
    for assay in matching_assays:
        assay_samples.extend(assay.samples)
    if debug:
        print('Total samples: {}'.format(len(assay_samples)))

    # filter samples by fv
    factor_selection = {
        x.get('factor_name').strip(): x.get('factor_value').strip() for x in
        query.get('factor_selection', [])}

    fv_samples = set()
    if factor_selection:
        samples_to_remove = set()
        for f, v in factor_selection.items():
            for sample in assay_samples:
                for fv in [x for x in sample.factor_values if
                           x.factor_name.name == f]:
                    if isinstance(fv.value, OntologyAnnotation):
                        if fv.value.term == v:
                            fv_samples.add(sample)
                    elif fv.value == v:
                        fv_samples.add(sample)
        for f, v in factor_selection.items():
            for sample in fv_samples:
                for fv in [x for x in sample.factor_values if
                           x.factor_name.name == f]:
                    if isinstance(fv.value, OntologyAnnotation):
                        if fv.value.term != v:
                            samples_to_remove.add(sample)
                    elif fv.value != v:
                        samples_to_remove.add(sample)
        final_fv_samples = fv_samples.difference(samples_to_remove)
    else:
        final_fv_samples = assay_samples

    # filter samples by characteristic
    characteristics_selection = {
        x.get('characteristic_name').strip():
        x.get('characteristic_value').strip() for x in
        query.get('characteristics_selection', [])}

    cv_samples = set()
    if characteristics_selection:
        first_pass = True
        samples_to_remove = set()
        for c, v in characteristics_selection.items():
            if first_pass:
                for sample in final_fv_samples:
                    for cv in [x for x in sample.characteristics if
                               x.category.term == c]:
                        if isinstance(cv.value, OntologyAnnotation):
                            if cv.value.term == v:
                                cv_samples.add(sample)
                        elif cv.value == v:
                            cv_samples.add(sample)
                    for source in sample.derives_from:
                        for cv in [x for x in source.characteristics if
                                   x.category.term == c]:
                            if isinstance(cv.value, OntologyAnnotation):
                                if cv.value.term == v:
                                    cv_samples.add(sample)
                            elif cv.value == v:
                                cv_samples.add(sample)
                first_pass = False
            else:
                for sample in cv_samples:
                    for cv in [x for x in sample.characteristics if
                               x.category.term == c]:
                        if isinstance(cv.value, OntologyAnnotation):
                            if cv.value.term != v:
                                samples_to_remove.add(sample)
                        elif cv.value != v:
                            samples_to_remove.add(sample)
                    for source in sample.derives_from:
                        for cv in [x for x in source.characteristics if
                                   x.category.term == c]:
                            if isinstance(cv.value, OntologyAnnotation):
                                if cv.value.term != v:
                                    samples_to_remove.add(sample)
                            elif cv.value != v:
                                samples_to_remove.add(sample)
        final_cv_samples = cv_samples.difference(samples_to_remove)
    else:
        final_cv_samples = final_fv_samples

    # filter samples by process parameter
    parameters_selection = {
        x.get('parameter_name').strip():
        x.get('parameter_value').strip() for x in
        query.get('parameter_selection', [])}

    final_samples = final_cv_samples

    if debug:
        print('Final number of samples: {}'.format(len(final_samples)))
    results = []
    for sample in final_samples:
        results.append({
            'sample_name': sample.name,
            'data_files': []
        })
    for result in results:
        sample_name = result['sample_name']
        if source_dir:
            table_files = glob.iglob(os.path.join(source_dir, 'a_*'))
        else:
            table_files = glob.iglob(os.path.join(tmp, 'a_*'))
        for table_file in table_files:
            with open(table_file) as fp:
                df = isatab.load_table(fp)
            data_files = []
            table_headers = list(df.columns.values)
            sample_rows = df.loc[df['Sample Name'] == sample_name]
            data_node_labels = [
                'Raw Data File', 'Raw Spectral Data File',
                'Derived Spectral Data File',
                'Derived Array Data File', 'Array Data File',
                'Protein Assignment File', 'Peptide Assignment File',
                'Post Translational Modification Assignment File',
                'Acquisition Parameter Data File',
                'Free Induction Decay Data File',
                'Derived Array Data Matrix File', 'Image File',
                'Derived Data File', 'Metabolite Assignment File']
            if parameters_selection:
                for p, v in parameters_selection.items():
                    sample_pv_rows = sample_rows.loc[
                        sample_rows['Parameter Value[{}]'.format(p)] == v]
                    for node_label in data_node_labels:
                        if node_label in table_headers:
                            data_files.extend(
                                list(sample_pv_rows[node_label]))
                result['data_files'].extend(list(set(
                    i for i in list(data_files) if
                    str(i) not in ('nan', ''))))
            else:
                for node_label in data_node_labels:
                    if node_label in table_headers:
                        data_files.extend(list(sample_rows[node_label]))
                result['data_files'].extend(
                    list(set(i for i in list(data_files) if
                             str(i) not in ('nan', ''))))
    results_json = {
        'query': query,
        'results': results
    }
    json.dump(results_json, output, indent=4)

    # if galaxy_parameters['input']['collection_output']:
    #     logger = logging.getLogger()
    #     logger.debug("copying data files to %s", os.path.dirname(output))
    #     for result in results:
    #         for data_file_name in result['data_files']:
    #             logging.info("Copying {}".format(data_file_name))
    #             shutil.copy(os.path.join(source_dir, data_file_name),
    #                         os.path.dirname(output))
    #     logger.info(
    #         "Finished writing data files to {}".format(os.path.dirname(output)))


def get_study_archive_command(options):
    study_id = options.study_id

    logger.info("Downloading study %s into archive at path %s.%s",
                study_id, options.output, options.format)

    tmpdir = MTBLS.get(study_id)
    logger.debug("MTBLS.get returned '%s'", tmpdir)
    if tmpdir is not None:
        try:
            shutil.make_archive(
                options.output, options.format, tmpdir, logger=logger)
            logger.info("ISA archive written")
        finally:
            logger.debug("Trying to clean up tmp dir %s", tmpdir)
            shutil.rmtree(tmpdir, ignore_errors=True)
    else:
        raise RuntimeError("Error downloading ISA study")

# mtblisa commands


def get_study_command(options):
    if os.path.exists(options.output):
        raise RuntimeError("Selected output path {} already exists!".format(
            options.output))

    if options.isa_format == "isa-tab":
        tmp_data = None
        try:
            logger.info("Downloading study %s", options.study_id)
            tmp_data = MTBLS.get(options.study_id)
            if tmp_data is None:
                raise RuntimeError("Error downloading ISA study")

            logger.debug(
                "Finished downloading data. Moving to final location %s",
                options.output)
            shutil.move(tmp_data, options.output)
            logger.info("ISA archive written to %s", options.output)
        finally:
            if tmp_data:
                # try to clean up any temporary files left behind
                logger.debug("Deleting %s, if there's anything there",
                             tmp_data)
                shutil.rmtree(tmp_data, ignore_errors=True)
    elif options.isa_format == "isa-json":
        isajson = MTBLS.getj(options.study_id)
        if isajson is None:
            raise RuntimeError("Error downloading ISA study")

        logger.debug(
            "Finished downloading data. Dumping json to final location %s",
            options.output)
        os.makedirs(options.output)
        json_file = os.path.join(options.output, "{}.json".format(
            isajson['identifier']))
        with open(json_file, 'w') as fd:
            json.dump(isajson, fd)
        logger.info("ISA-JSON written to %s", options.output)
    else:
        raise ValueError("BUG! Got an invalid isa format '{}'".format(
            options.isa_format))


def get_factors_command(options):
    logger.info("Getting factors for study %s. Writing to %s.",
                options.study_id, options.output.name)
    factor_names = MTBLS.get_factor_names(options.study_id)
    if factor_names is not None:
        json.dump(list(factor_names), options.output, indent=4)
        logger.debug("Factor names written")
    else:
        raise RuntimeError("Error downloading factors.")


def get_factor_values_command(options):
    logger.info("Getting values for factor {factor} in study {study_id}. "
                "Writing to {output_file}.".format(
                    factor=options.factor, study_id=options.study_id,
                    output_file=options.output.name))
    fvs = MTBLS.get_factor_values(options.study_id, options.factor)
    if fvs is not None:
        json.dump(list(fvs), options.output, indent=4)
        logger.debug("Factor values written to {}".format(options.output.name))
    else:
        raise RuntimeError("Error getting factor values")


def get_data_files_command(options):
    logger.info("Getting data files for study %s. Writing to %s.",
                options.study_id, options.output.name)
    if options.json_query:
        logger.debug("This is the specified query:\n%s", options.json_query)
        json_struct = json.loads(options.json_query)
        data_files = MTBLS.get_data_files(options.study_id, json_struct)
    elif options.galaxy_parameters_file:
        logger.debug("Using input Galaxy JSON parameters from:\n%s",
                     options.galaxy_parameters_file)
        with open(options.galaxy_parameters_file) as json_fp:
            galaxy_json = json.load(json_fp)
            json_struct = {}
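            # Assumed shape of the Galaxy parameters file, inferred from the
            # loop below (values are illustrative):
            #   {"factor_value_series": [
            #       {"factor_name": "Gender", "factor_value": "Male"}]}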
            for fv_item in galaxy_json['factor_value_series']:
                json_struct[fv_item['factor_name']] = fv_item['factor_value']
            data_files = MTBLS.get_data_files(options.study_id, json_struct)
    else:
        logger.debug("No query was specified")
        data_files = MTBLS.get_data_files(options.study_id)

    logger.debug("Result data files list: %s", data_files)
    if data_files is None:
        raise RuntimeError("Error getting data files with isatools")

    logger.debug("dumping data files to %s", options.output.name)
    json.dump(list(data_files), options.output, indent=4)
    logger.info("Finished writing data files to {}".format(
        options.output.name))


def build_html_data_files_list(data_files_list):
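    # Input is assumed to be shaped like the output of slice_data_files(),
    # e.g. (illustrative):
    #   [{'sample': 'sample-1', 'data_files': ['file1.mzML', 'file2.mzML']}]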
    data_files_table = '<table>'
    data_files_table += '<tr><th>Sample Name</th><th>Data File Names</th></tr>'
    for data_file in data_files_list:
        sample_name = data_file['sample']
        data_files = ', '.join(data_file['data_files'])
        data_files_table += \
            '<tr><td>{sample_name}</td><td>{data_files}</td></tr>'.format(
                sample_name=sample_name, data_files=data_files)
    data_files_table += '</table>'
    html_data_files_list = """
<html>
<head>
<title>ISA-Tab Factors Summary</title>
</head>
<body>
{summary_table}
</body>
</html>
""".format(summary_table=data_files_table)
    return html_data_files_list


def build_html_summary(summary):
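    # 'summary' is assumed to be a list of per-sample dicts such as those
    # produced by get_study_variable_summary(), e.g. (illustrative):
    #   [{'sample_name': 's1', 'Gender': 'Male'}, ...]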
    study_groups = {}
    for item in summary:
        sample_name = item['sample_name']
        study_factors = []
        for factor_item in [x for x in item.items() if x[0] != "sample_name"]:
            study_factors.append(': '.join([factor_item[0], factor_item[1]]))
        study_group = ', '.join(study_factors)
        if study_group not in study_groups.keys():
            study_groups[study_group] = []
        study_groups[study_group].append(sample_name)
    summary_table = '<table>'
    summary_table += '<tr><th>Study group</th><th>Number of samples</th></tr>'
    for item in study_groups.items():
        study_group = item[0]
        num_samples = len(item[1])
        summary_table += \
            '<tr><td>{study_group}</td><td>{num_samples}</td></tr>'.format(
                study_group=study_group, num_samples=num_samples)
    summary_table += '</table>'
    html_summary = """
<html>
<head>
<title>ISA-Tab Factors Summary</title>
</head>
<body>
{summary_table}
</body>
</html>
""".format(summary_table=summary_table)
    return html_summary


def get_summary_command(options):
    logger.info("Getting summary for study %s. Writing to %s.",
                options.study_id, options.json_output.name)

    summary = MTBLS.get_study_variable_summary(options.study_id)
    # new_summary = []
    # for item in summary:
    #     new_summary.append(
    #         {k: v for k, v in item.items() if k is not "sample_name"})
    # summary = new_summary
    if summary is not None:
        json.dump(summary, options.json_output, indent=4)
        logger.debug("Summary dumped to JSON")
        html_summary = build_html_summary(summary)
        with options.html_output as html_fp:
            html_fp.write(html_summary)
    else:
        raise RuntimeError("Error getting study summary")


# isaslicer commands

def isatab_get_data_files_list_command(options):
    logger.info("Getting data files for study %s. Writing to %s.",
                options.input_path, options.output.name)
    if options.json_query:
        logger.debug("This is the specified query:\n%s", options.json_query)
        json_struct = json.loads(options.json_query)
    elif options.galaxy_parameters_file:
        logger.debug("Using input Galaxy JSON parameters from:\n%s",
                     options.galaxy_parameters_file)
        with open(options.galaxy_parameters_file) as json_fp:
            galaxy_json = json.load(json_fp)
            json_struct = {}
            for fv_item in galaxy_json['factor_value_series']:
                json_struct[fv_item['factor_name']] = fv_item['factor_value']
    else:
        logger.debug("No query was specified")
        json_struct = None
    factor_selection = json_struct
    input_path = options.input_path
    result = slice_data_files(input_path, factor_selection=factor_selection)
    data_files = result
    logger.debug("Result data files list: %s", data_files)
    if data_files is None:
        raise RuntimeError("Error getting data files with isatools")

    logger.debug("dumping data files to %s", options.output.name)
    json.dump(list(data_files), options.output, indent=4)
    logger.info("Finished writing data files to {}".format(
        options.output.name))


def zip_get_data_files_list_command(options):
    logger.info("Getting data files for study %s. Writing to %s.",
                options.input_path, options.output.name)
    if options.json_query:
        logger.debug("This is the specified query:\n%s", options.json_query)
        json_struct = json.loads(options.json_query)
    elif options.galaxy_parameters_file:
        logger.debug("Using input Galaxy JSON parameters from:\n%s",
                     options.galaxy_parameters_file)
        with open(options.galaxy_parameters_file) as json_fp:
            galaxy_json = json.load(json_fp)
            json_struct = {}
            for fv_item in galaxy_json['factor_value_series']:
                json_struct[fv_item['factor_name']] = fv_item['factor_value']
    else:
        logger.debug("No query was specified")
        json_struct = None
    factor_selection = json_struct
    input_path = options.input_path
    with zipfile.ZipFile(input_path) as zfp:
        tmpdir = tempfile.mkdtemp()
        zfp.extractall(path=tmpdir)
        result = slice_data_files(tmpdir, factor_selection=factor_selection)
    data_files = result
    logger.debug("Result data files list: %s", data_files)
    if data_files is None:
        raise RuntimeError("Error getting data files with isatools")
    logger.debug("dumping data files to %s", options.output.name)
    json.dump(list(data_files), options.output, indent=4)
    logger.info("Finished writing data files to {}".format(
        options.output.name))
    shutil.rmtree(tmpdir)


def isatab_get_data_files_collection_command(options):
    logger.info("Getting data files for study %s. Writing to %s.",
                options.input_path, options.output_path)
    if options.json_query:
        logger.debug("This is the specified query:\n%s", options.json_query)
    else:
        logger.debug("No query was specified")
    input_path = options.input_path
    if options.json_query is not None:
        json_struct = json.loads(options.json_query)
    elif options.galaxy_parameters_file:
        logger.debug("Using input Galaxy JSON parameters from:\n%s",
                     options.galaxy_parameters_file)
        with open(options.galaxy_parameters_file) as json_fp:
            galaxy_json = json.load(json_fp)
            json_struct = {}
            for fv_item in galaxy_json['factor_value_series']:
                json_struct[fv_item['factor_name']] = fv_item['factor_value']
    else:
        logger.debug("No query was specified")
        json_struct = None
    factor_selection = json_struct
    result = slice_data_files(input_path, factor_selection=factor_selection)
    data_files = result
    logger.debug("Result data files list: %s", data_files)
    if data_files is None:
        raise RuntimeError("Error getting data files with isatools")
    output_path = options.output_path
    logger.debug("copying data files to %s", output_path)
    for result in data_files:
        for data_file_name in result['data_files']:
            logging.info("Copying {}".format(data_file_name))
            shutil.copy(os.path.join(input_path, data_file_name), output_path)
    logger.info("Finished writing data files to {}".format(output_path))


def zip_get_data_files_collection_command(options):
    logger.info("Getting data files for study %s. Writing to %s.",
                options.input_path, options.output_path)
    if options.json_query:
        logger.debug("This is the specified query:\n%s", options.json_query)
    else:
        logger.debug("No query was specified")
    input_path = options.input_path
    output_path = options.output_path
    if options.json_query is not None:
        json_struct = json.loads(options.json_query)
        factor_selection = json_struct
    else:
        factor_selection = None
    with zipfile.ZipFile(input_path) as zfp:
        tmpdir = tempfile.mkdtemp()
        zfp.extractall(path=tmpdir)
        result = slice_data_files(tmpdir, factor_selection=factor_selection)
    data_files = result
    logger.debug("Result data files list: %s", data_files)
    if data_files is None:
        raise RuntimeError("Error getting data files with isatools")
    logger.debug("copying data files to %s", output_path)
    for result in data_files:
        for data_file_name in result['data_files']:
            logging.info("Copying {}".format(data_file_name))
            shutil.copy(os.path.join(tmpdir, data_file_name), output_path)
    logger.info("Finished writing data files to {}".format(output_path))
    shutil.rmtree(tmpdir)


def slice_data_files(dir_path, factor_selection=None):
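    """Collect sample names, optionally filtered by factor values, and the
    data files linked to them from an unpacked ISA-Tab directory.

    Returns a list of dicts (shape inferred from the callers above), e.g.:
        [{'sample': 's1', 'data_files': ['f1.mzML'], 'query_used': {...}}]
    """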
    results = []
    # first collect matching samples
    for table_file in glob.iglob(os.path.join(dir_path, '[as]_*')):
        logger.info('Loading {table_file}'.format(table_file=table_file))

        with open(table_file) as fp:
            df = isatab.load_table(fp)

        if factor_selection is None:
            matches = df['Sample Name'].items()

            for indx, match in matches:
                sample_name = match
                if len([r for r in results if r['sample'] ==
                        sample_name]) == 1:
                    continue
                else:
                    results.append(
                        {
                            'sample': sample_name,
                            'data_files': []
                        }
                    )

        else:
            for factor_name, factor_value in factor_selection.items():
                if 'Factor Value[{}]'.format(factor_name) in list(
                        df.columns.values):
                    matches = df.loc[df['Factor Value[{factor}]'.format(
                        factor=factor_name)] == factor_value][
                        'Sample Name'].items()

                    for indx, match in matches:
                        sample_name = match
                        if len([r for r in results if r['sample'] ==
                                sample_name]) == 1:
                            continue
                        else:
                            results.append(
                                {
                                    'sample': sample_name,
                                    'data_files': [],
                                    'query_used': factor_selection
                                }
                            )

    # now collect the data files relating to the samples
    for result in results:
        sample_name = result['sample']

        for table_file in glob.iglob(os.path.join(dir_path, 'a_*')):
            with open(table_file) as fp:
                df = isatab.load_table(fp)

            data_files = []

            table_headers = list(df.columns.values)
            sample_rows = df.loc[df['Sample Name'] == sample_name]

            data_node_labels = [
                'Raw Data File',
                'Raw Spectral Data File',
                'Derived Spectral Data File',
                'Derived Array Data File',
                'Array Data File',
                'Protein Assignment File',
                'Peptide Assignment File',
                'Post Translational Modification Assignment File',
                'Acquisition Parameter Data File',
                'Free Induction Decay Data File',
                'Derived Array Data Matrix File',
                'Image File',
                'Derived Data File',
                'Metabolite Assignment File']
            for node_label in data_node_labels:
                if node_label in table_headers:
                    data_files.extend(list(sample_rows[node_label]))

            result['data_files'] = [i for i in list(data_files) if
                                    str(i) != 'nan']
    return results


def isatab_get_factor_names_command(options):
    input_path = options.input_path
    logger.info("Getting factors for study %s. Writing to %s.",
                input_path, options.output.name)
    _RX_FACTOR_VALUE = re.compile(r'Factor Value\[(.*?)\]')
    factors = set()
    for table_file in glob.iglob(os.path.join(input_path, '[as]_*')):
        with open(table_file) as fp:
            df = isatab.load_table(fp)

        factors_headers = [header for header in list(df.columns.values)
                           if _RX_FACTOR_VALUE.match(header)]

        for header in factors_headers:
            factors.add(header[13:-1])  # the name inside 'Factor Value[...]'
    if factors is not None:
        json.dump(list(factors), options.output, indent=4)
        logger.debug("Factor names written")
    else:
        raise RuntimeError("Error reading factors.")


def zip_get_factor_names_command(options):
    input_path = options.input_path
    logger.info("Getting factors for study %s. Writing to %s.",
                input_path, options.output.name)
    # unpack input_path
    with zipfile.ZipFile(input_path) as zfp:
        tmpdir = tempfile.mkdtemp()
        zfp.extractall(path=tmpdir)
        _RX_FACTOR_VALUE = re.compile(r'Factor Value\[(.*?)\]')
        factors = set()
        for table_file in glob.iglob(os.path.join(tmpdir, '[as]_*')):
            logging.info('Searching {}'.format(table_file))
            with open(table_file) as fp:
                df = isatab.load_table(fp)

            factors_headers = [header for header in list(df.columns.values)
                               if _RX_FACTOR_VALUE.match(header)]

            for header in factors_headers:
                factors.add(header[13:-1])
    if factors is not None:
        json.dump(list(factors), options.output, indent=4)
        logger.debug("Factor names written")
    else:
        raise RuntimeError("Error reading factors.")
    shutil.rmtree(tmpdir)


def isatab_get_factor_values_command(options):
    logger.info("Getting values for factor {factor} in study {input_path}. "
                "Writing to {output_file}.".format(
                    factor=options.factor, input_path=options.input_path,
                    output_file=options.output.name))
    fvs = set()

    input_path = options.input_path
    factor_name = options.factor

    for table_file in glob.iglob(os.path.join(input_path, '[as]_*')):
        with open(table_file) as fp:
            df = isatab.load_table(fp)

        if 'Factor Value[{factor}]'.format(factor=factor_name) in \
                list(df.columns.values):
            for _, match in df[
                    'Factor Value[{factor}]'.format(
                        factor=factor_name)].items():
                try:
                    match = match.item()
                except AttributeError:
                    pass

                if isinstance(match, (str, int, float)):
                    if str(match) != 'nan':
                        fvs.add(match)
    if fvs is not None:
        json.dump(list(fvs), options.output, indent=4)
        logger.debug("Factor values written to {}".format(options.output.name))
    else:
        raise RuntimeError("Error getting factor values")


def zip_get_factor_values_command(options):
    input_path = options.input_path
    logger.info("Getting values for factor {factor} in study {input_path}. "
                "Writing to {output_file}.".format(
                    factor=options.factor, input_path=options.input_path,
                    output_file=options.output.name))
    fvs = set()
    factor_name = options.factor

    # unpack input_path
    with zipfile.ZipFile(input_path) as zfp:
        tmpdir = tempfile.mkdtemp()
        zfp.extractall(path=tmpdir)
        for table_file in glob.glob(os.path.join(tmpdir, '[as]_*')):
            logging.info('Searching {}'.format(table_file))
            with open(table_file) as fp:
                df = isatab.load_table(fp)
            if 'Factor Value[{factor}]'.format(factor=factor_name) in \
                    list(df.columns.values):
                for _, match in df[
                        'Factor Value[{factor}]'.format(
                            factor=factor_name)].items():
                    try:
                        match = match.item()
                    except AttributeError:
                        pass

                    if isinstance(match, (str, int, float)):
                        if str(match) != 'nan':
                            fvs.add(match)
    if fvs is not None:
        json.dump(list(fvs), options.output, indent=4)
        logger.debug("Factor values written to {}".format(options.output.name))
    else:
        raise RuntimeError("Error getting factor values")
    shutil.rmtree(tmpdir)


def isatab_get_factors_summary_command(options):
    logger.info("Getting summary for study %s. Writing to %s.",
                options.input_path, options.output.name)
    input_path = options.input_path
    ISA = isatab.load(input_path)

    all_samples = []
    for study in ISA.studies:
        all_samples.extend(study.samples)

    samples_and_fvs = []

    for sample in all_samples:
        sample_and_fvs = {
            'sample_name': sample.name,
        }

        for fv in sample.factor_values:
            if isinstance(fv.value, (str, int, float)):
                fv_value = fv.value
                sample_and_fvs[fv.factor_name.name] = fv_value
            elif isinstance(fv.value, OntologyAnnotation):
                fv_value = fv.value.term
                sample_and_fvs[fv.factor_name.name] = fv_value

        samples_and_fvs.append(sample_and_fvs)

    df = pd.DataFrame(samples_and_fvs)
    nunique = df.apply(pd.Series.nunique)
    cols_to_drop = nunique[nunique == 1].index

    df = df.drop(cols_to_drop, axis=1)
    summary = df.to_dict(orient='records')
    if summary is not None:
        json.dump(summary, options.output, indent=4)
        logger.debug("Summary dumped to JSON")
        # html_summary = build_html_summary(summary)
        # with options.html_output as html_fp:
        #     html_fp.write(html_summary)
    else:
        raise RuntimeError("Error getting study summary")


def zip_get_factors_summary_command(options):
    logger.info("Getting summary for study %s. Writing to %s.",
                options.input_path, options.json_output.name)
    input_path = options.input_path
    with zipfile.ZipFile(input_path) as zfp:
        tmpdir = tempfile.mkdtemp()
        zfp.extractall(path=tmpdir)
        ISA = isatab.load(tmpdir)
    all_samples = []
    for study in ISA.studies:
        all_samples.extend(study.samples)
    samples_and_fvs = []
    for sample in all_samples:
        sample_and_fvs = {
            'sample_name': sample.name,
        }
        for fv in sample.factor_values:
            if isinstance(fv.value, (str, int, float)):
                fv_value = fv.value
                sample_and_fvs[fv.factor_name.name] = fv_value
            elif isinstance(fv.value, OntologyAnnotation):
                fv_value = fv.value.term
                sample_and_fvs[fv.factor_name.name] = fv_value
        samples_and_fvs.append(sample_and_fvs)
    df = pd.DataFrame(samples_and_fvs)
    nunique = df.apply(pd.Series.nunique)
    cols_to_drop = nunique[nunique == 1].index
    df = df.drop(cols_to_drop, axis=1)
    summary = df.to_dict(orient='records')
    if summary is not None:
        json.dump(summary, options.json_output, indent=4)
        logger.debug("Summary dumped to JSON")
        print(json.dumps(summary, indent=4))
        html_summary = build_html_summary(summary)
        with options.html_output as html_fp:
            html_fp.write(html_summary)
    else:
        raise RuntimeError("Error getting study summary")
    shutil.rmtree(tmpdir)


def get_study_groups(input_path):
    factors_summary = get_study_variable_summary(input_path=input_path)
    study_groups = {}

    for factors_item in factors_summary:
        fvs = tuple(factors_item[k] for k in factors_item.keys()
                    if k != 'sample_name')

        if fvs in study_groups.keys():
            study_groups[fvs].append(factors_item['sample_name'])
        else:
            study_groups[fvs] = [factors_item['sample_name']]
    return study_groups


def get_study_groups_samples_sizes(input_path):
    study_groups = get_study_groups(input_path=input_path)
    return list(map(lambda x: (x[0], len(x[1])), study_groups.items()))


def get_sources_for_sample(input_path, sample_name):
    ISA = isatab.load(input_path)
    hits = []

    for study in ISA.studies:
        for sample in study.samples:
            if sample.name == sample_name:
                print('found a hit: {sample_name}'.format(
                    sample_name=sample.name))

                for source in sample.derives_from:
                    hits.append(source.name)
    return hits


def get_data_for_sample(input_path, sample_name):
    ISA = isatab.load(input_path)
    hits = []
    for study in ISA.studies:
        for assay in study.assays:
            for data in assay.data_files:
                if sample_name in [x.name for x in data.generated_from]:
                    logger.info('found a hit: {filename}'.format(
                        filename=data.filename))
                    hits.append(data)
    return hits


def get_study_groups_data_sizes(input_path):
    study_groups = get_study_groups(input_path=input_path)
    return list(map(lambda x: (x[0], len(x[1])), study_groups.items()))


def get_characteristics_summary(input_path):
    """
    This function generates a characteristics summary for a MetaboLights
    study.

    :param input_path: Input path to ISA-Tab
    :return: A list of dicts summarising the set of characteristic names
    and values associated with each sample

    Note: it only returns a summary of characteristics with variable values.

    Example usage:
        characteristics_summary = get_characteristics_summary('/path/to/my/study/')
        [
            {
                "name": "6089if_9",
                "Variant": "Synechocystis sp. PCC 6803.sll0171.ko"
            },
            {
                "name": "6089if_43",
                "Variant": "Synechocystis sp. PCC 6803.WT.none"
            },
        ]
    """
    ISA = isatab.load(input_path)

    all_samples = []
    for study in ISA.studies:
        all_samples.extend(study.samples)

    samples_and_characs = []
    for sample in all_samples:
        sample_and_characs = {
            'name': sample.name
        }

        for source in sample.derives_from:
            for c in source.characteristics:
                if isinstance(c.value, (str, int, float)):
                    c_value = c.value
                    sample_and_characs[c.category.term] = c_value
                elif isinstance(c.value, OntologyAnnotation):
                    c_value = c.value.term
                    sample_and_characs[c.category.term] = c_value

        samples_and_characs.append(sample_and_characs)

    df = pd.DataFrame(samples_and_characs)
    nunique = df.apply(pd.Series.nunique)
    cols_to_drop = nunique[nunique == 1].index

    df = df.drop(cols_to_drop, axis=1)
    return df.to_dict(orient='records')


def get_study_variable_summary(input_path):
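    # Builds one record per sample combining factor values and source
    # characteristics; columns that never vary across samples are dropped.
    # Illustrative output record:
    #   {'sample_name': 's1', 'source_name': 'src1', 'Gender': 'Male'}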
    ISA = isatab.load(input_path)

    all_samples = []
    for study in ISA.studies:
        all_samples.extend(study.samples)

    samples_and_variables = []
    for sample in all_samples:
        sample_and_vars = {
            'sample_name': sample.name
        }

        for fv in sample.factor_values:
            if isinstance(fv.value, (str, int, float)):
                fv_value = fv.value
                sample_and_vars[fv.factor_name.name] = fv_value
            elif isinstance(fv.value, OntologyAnnotation):
                fv_value = fv.value.term
                sample_and_vars[fv.factor_name.name] = fv_value

        for source in sample.derives_from:
            sample_and_vars['source_name'] = source.name
            for c in source.characteristics:
                if isinstance(c.value, (str, int, float)):
                    c_value = c.value
                    sample_and_vars[c.category.term] = c_value
                elif isinstance(c.value, OntologyAnnotation):
                    c_value = c.value.term
                    sample_and_vars[c.category.term] = c_value

        samples_and_variables.append(sample_and_vars)

    df = pd.DataFrame(samples_and_variables)
    nunique = df.apply(pd.Series.nunique)
    cols_to_drop = nunique[nunique == 1].index

    df = df.drop(cols_to_drop, axis=1)
    return df.to_dict(orient='records')


def get_study_group_factors(input_path):
    factors_list = []

    for table_file in glob.iglob(os.path.join(input_path, '[as]_*')):
        with open(table_file) as fp:
            df = isatab.load_table(fp)

        factor_columns = [x for x in df.columns if x.startswith(
            'Factor Value')]
        if len(factor_columns) > 0:
            factors_list = df[factor_columns].drop_duplicates()\
                .to_dict(orient='records')
    return factors_list


def get_filtered_df_on_factors_list(input_path):
    factors_list = get_study_group_factors(input_path=input_path)
    queries = []

    for item in factors_list:
        query_str = []

        for k, v in item.items():
            k = k.replace(' ', '_').replace('[', '_').replace(']', '_')
            if isinstance(v, str):
                v = v.replace(' ', '_').replace('[', '_').replace(']', '_')
            query_str.append("{k} == '{v}' and ".format(k=k, v=v))

        query_str = ''.join(query_str)[:-4]
        queries.append(query_str)

    for table_file in glob.iglob(os.path.join(input_path, '[as]_*')):
        with open(table_file) as fp:
            df = isatab.load_table(fp)

        cols = df.columns
        cols = cols.map(
            lambda x: x.replace(' ', '_') if isinstance(x, str) else x)
        df.columns = cols

        cols = df.columns
        cols = cols.map(
            lambda x: x.replace('[', '_') if isinstance(x, str) else x)
        df.columns = cols

        cols = df.columns
        cols = cols.map(
            lambda x: x.replace(']', '_') if isinstance(x, str) else x)
        df.columns = cols

        for query in queries:
            # query uses pandas.eval, which evaluates queries like pure
            # Python notation
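            # An illustrative query string as built above, after the column
            # renames (names and values are hypothetical):
            #   "Factor_Value_Gender_ == 'Male' and Factor_Value_Age_ == '38'"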
            df2 = df.query(query)
            if 'Sample_Name' in df.columns:
                print('Group: {query} / Sample_Name: {sample_name}'.format(
                    query=query, sample_name=list(df2['Sample_Name'])))

            if 'Source_Name' in df.columns:
                print('Group: {} / Sources_Name: {}'.format(
                    query, list(df2['Source_Name'])))

            if 'Raw_Spectral_Data_File' in df.columns:
                print('Group: {query} / Raw_Spectral_Data_File: {filename}'
                      .format(query=query[13:-2],
                              filename=list(df2['Raw_Spectral_Data_File'])))
    return queries


def datatype_get_summary_command(options):
    logger.info("Getting summary for study %s. Writing to %s.",
                options.study_id, options.output.name)

    summary = get_study_variable_summary(options.study_id)
    print('summary: ', list(summary))
    if summary is not None:
        json.dump(summary, options.output, indent=4)
        logger.debug("Summary dumped")
    else:
        raise RuntimeError("Error getting study summary")


# logging and argument parsing

def _configure_logger(options):
    logging_level = getattr(logging, options.log_level, logging.INFO)
    logging.basicConfig(level=logging_level)

    global logger
    logger = logging.getLogger()
    # there's a bug somewhere: the level set through basicConfig isn't
    # taking effect, so set it explicitly on the root logger as well
    logger.setLevel(logging_level)


def _parse_args(args):
    parser = make_parser()
    options = parser.parse_args(args)
    return options


def main(args):
    options = _parse_args(args)
    _configure_logger(options)
    # run subcommand
    options.func(options)


if __name__ == '__main__':
    try:
        main(sys.argv[1:])
        sys.exit(0)
    except Exception as e:
        # logger may still be None if the failure happened before
        # _configure_logger() ran
        (logger or logging.getLogger()).exception(e)
        sys.exit(e.code if hasattr(e, "code") else 99)