Mercurial > repos > prog > mtblsdwnld
comparison isaslicer.py @ 0:8dab200e02cb draft
"planemo upload commit 239561a6401593c5f87df40ac971a9aa393c4663-dirty"
| author | prog |
|---|---|
| date | Tue, 07 Jan 2020 09:05:21 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:8dab200e02cb |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 import argparse | |
| 4 import glob | |
| 5 import json | |
| 6 import logging | |
| 7 import os | |
| 8 import re | |
| 9 import shutil | |
| 10 import sys | |
| 11 import tempfile | |
| 12 import zipfile | |
| 13 | |
| 14 import pandas as pd | |
| 15 from isatools import isatab | |
| 16 from isatools.model import OntologyAnnotation | |
| 17 from isatools.net import mtbls as MTBLS | |
| 18 | |
| 19 logger = None | |
| 20 | |
| 21 # isaslicer.py <command> <study_id> [ command-specific options ] | |
| 22 | |
| 23 | |
def make_parser():
    """Build the command-line parser for the ISA slicer tool.

    Each subcommand registers its handler via ``set_defaults(func=...)`` so
    the caller can dispatch with ``options.func(options)``; the chosen
    subcommand name itself is available as ``options.command``.

    Returns:
        argparse.ArgumentParser: the fully configured parser.
    """
    parser = argparse.ArgumentParser(description="ISA slicer")

    # 'WARN'/'FATAL' are accepted aliases of WARNING/CRITICAL in the
    # logging module's level-name table.
    parser.add_argument('--log-level', choices=[
        'DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL'],
        default='INFO', help="Set the desired logging level")

    subparsers = parser.add_subparsers(
        title='Actions',
        dest='command')  # specified subcommand will be available in attribute 'command'
    subparsers.required = True  # a subcommand must always be supplied

    # mtblisa commands: take a MetaboLights study id and fetch remotely

    subparser = subparsers.add_parser(
        'mtbls-get-study-archive', aliases=['gsa'],
        help="Get ISA study from MetaboLights as zip archive")
    subparser.set_defaults(func=get_study_archive_command)
    subparser.add_argument('study_id')
    subparser.add_argument(
        'output', metavar="OUTPUT",
        help="Name of output archive (extension will be added)")
    subparser.add_argument('--format', metavar="FMT", choices=[
        'zip', 'tar', 'gztar', 'bztar', 'xztar'], default='zip',
        help="Type of archive to create")

    subparser = subparsers.add_parser('mtbls-get-study', aliases=['gs'],
                                      help="Get ISA study from MetaboLights")
    subparser.set_defaults(func=get_study_command)
    subparser.add_argument('study_id')
    subparser.add_argument('output', metavar="PATH", help="Name of output")
    subparser.add_argument(
        '-f', '--isa-format', choices=['isa-tab', 'isa-json'],
        metavar="FORMAT", default='isa-tab', help="Desired ISA format")

    subparser = subparsers.add_parser(
        'mtbls-get-factors', aliases=['gf'],
        help="Get factor names from a study in json format")
    subparser.set_defaults(func=get_factors_command)
    subparser.add_argument('study_id')
    subparser.add_argument(
        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file")

    subparser = subparsers.add_parser(
        'mtbls-get-factor-values', aliases=['gfv'],
        help="Get factor values from a study in json format")
    subparser.set_defaults(func=get_factor_values_command)
    subparser.add_argument('study_id')
    subparser.add_argument(
        'factor', help="The desired factor. Use `get-factors` to get the list "
        "of available factors")
    subparser.add_argument(
        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file")

    subparser = subparsers.add_parser('mtbls-get-data-list', aliases=['gd'],
                                      help="Get data files list in json format")
    subparser.set_defaults(func=get_data_files_command)
    subparser.add_argument('study_id')
    subparser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
                           help="Output file")
    subparser.add_argument(
        '--json-query',
        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}'")
    subparser.add_argument(
        '--galaxy_parameters_file',
        help="Path to JSON file containing input Galaxy JSON")

    subparser = subparsers.add_parser(
        'mtbls-get-factors-summary', aliases=['gsum'],
        help="Get the variables summary from a study, in json format")
    subparser.set_defaults(func=get_summary_command)
    subparser.add_argument('study_id')
    subparser.add_argument(
        'json_output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output JSON file")
    subparser.add_argument(
        'html_output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output HTML file")

    # isaslicer commands on path to unpacked ISA-Tab as input

    subparser = subparsers.add_parser(
        'isa-tab-get-factors', aliases=['isagf'],
        help="Get factor names from a study in json format")
    subparser.set_defaults(func=isatab_get_factor_names_command)
    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
    subparser.add_argument(
        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file")

    subparser = subparsers.add_parser(
        'zip-get-factors', aliases=['zipgf'],
        help="Get factor names from a study in json format")
    subparser.set_defaults(func=zip_get_factor_names_command)
    subparser.add_argument('input_path', type=str,
                           help="Input ISA-Tab zip path")
    subparser.add_argument(
        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file")

    subparser = subparsers.add_parser(
        'isa-tab-get-factor-values', aliases=['isagfv'],
        help="Get factor values from a study in json format")
    subparser.set_defaults(func=isatab_get_factor_values_command)
    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
    subparser.add_argument(
        'factor', help="The desired factor. Use `get-factors` to get the list "
        "of available factors")
    subparser.add_argument(
        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file")

    subparser = subparsers.add_parser(
        'zip-get-factor-values', aliases=['zipgfv'],
        help="Get factor values from a study in json format")
    subparser.set_defaults(func=zip_get_factor_values_command)
    subparser.add_argument('input_path', type=str,
                           help="Input ISA-Tab zip path")
    subparser.add_argument(
        'factor', help="The desired factor. Use `get-factors` to get the list "
        "of available factors")
    subparser.add_argument(
        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file")

    subparser = subparsers.add_parser('isa-tab-get-data-list', aliases=['isagdl'],
                                      help="Get data files list in json format")
    subparser.set_defaults(func=isatab_get_data_files_list_command)
    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
    subparser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
                           help="Output file")
    subparser.add_argument(
        '--json-query',
        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}'")
    subparser.add_argument(
        '--galaxy_parameters_file',
        help="Path to JSON file containing input Galaxy JSON")

    subparser = subparsers.add_parser('zip-get-data-list', aliases=['zipgdl'],
                                      help="Get data files list in json format")
    subparser.set_defaults(func=zip_get_data_files_list_command)
    subparser.add_argument('input_path', type=str, help="Input ISA-Tab zip path")
    subparser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
                           help="Output file")
    subparser.add_argument(
        '--json-query',
        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}'")
    subparser.add_argument(
        '--galaxy_parameters_file',
        help="Path to JSON file containing input Galaxy JSON")

    subparser = subparsers.add_parser('isa-tab-get-data-collection', aliases=['isagdc'],
                                      help="Get data files collection")
    subparser.set_defaults(func=isatab_get_data_files_collection_command)
    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
    subparser.add_argument('output_path', type=str, help="Output data files path")
    subparser.add_argument(
        '--json-query',
        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}'")
    subparser.add_argument(
        '--galaxy_parameters_file',
        help="Path to JSON file containing input Galaxy JSON")

    subparser = subparsers.add_parser('zip-get-data-collection', aliases=['zipgdc'],
                                      help="Get data files collection")
    subparser.set_defaults(func=zip_get_data_files_collection_command)
    subparser.add_argument('input_path', type=str, help="Input ISA-Tab zip path")
    subparser.add_argument('output_path', type=str, help="Output data files path")
    subparser.add_argument(
        '--json-query',
        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}'")

    subparser = subparsers.add_parser(
        'isa-tab-get-factors-summary', aliases=['isasum'],
        help="Get the variables summary from a study, in json format")
    subparser.set_defaults(func=isatab_get_factors_summary_command)
    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
    subparser.add_argument(
        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
        help="Output file")

    subparser = subparsers.add_parser(
        'zip-get-factors-summary', aliases=['zipsum'],
        help="Get the variables summary from a study, in json format")
    subparser.set_defaults(func=zip_get_factors_summary_command)
    subparser.add_argument('input_path', type=str,
                           help="Input ISA-Tab zip path")
    subparser.add_argument(
        'json_output', nargs='?', type=argparse.FileType('w'),
        default=sys.stdout,
        help="Output JSON file")
    subparser.add_argument(
        'html_output', nargs='?', type=argparse.FileType('w'),
        default=sys.stdout,
        help="Output HTML file")

    subparser = subparsers.add_parser(
        'isaslicer2-slice', aliases=['slice2'],
        help="Slice ISA-Tabs version 2")
    subparser.set_defaults(func=query_isatab)
    subparser.add_argument('--source_dir', type=str,
                           help="Input ISA-Tab zip path")
    subparser.add_argument(
        '--galaxy_parameters_file', type=argparse.FileType(mode='r'),
        help="Path to JSON file containing input Galaxy JSON")
    subparser.add_argument('--output', type=argparse.FileType(mode='w'),
                           help="Input ISA-Tab zip path")

    subparser = subparsers.add_parser(
        'filter-data', aliases=['filter'],
        help="Filter out data based on slicer2")
    subparser.set_defaults(func=filter_data)
    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
    subparser.add_argument('output_path', type=str, help="Output data files path")
    subparser.add_argument('--slice', type=argparse.FileType(mode='r'),
                           help="slice")
    subparser.add_argument('--filename_filter', type=str, help="shell-like wildcard to filter files")

    return parser
| 245 | |
| 246 | |
def filter_data(options):
    """Symlink data files selected by an ``isaslicer2-slice`` run into a directory.

    Reads the slice JSON from ``options.slice`` (an open file object), takes the
    union of all ``data_files`` across its ``results``, and symlinks every file
    in ``options.input_path`` matching ``options.filename_filter`` whose
    basename is in that union into ``options.output_path``. A transcript is
    written to ``cli.log`` in the current working directory.

    Exits the process with status 1 if a symlink cannot be created.
    """
    loglines = []
    source_dir = options.input_path if options.input_path else ""
    output_path = options.output_path
    # Fall back to matching everything when no wildcard was supplied
    # (previously a missing --filename_filter crashed os.path.join).
    filename_filter = options.filename_filter if options.filename_filter else '*'
    if source_dir and not os.path.exists(source_dir):
        raise IOError('Source path does not exist!')
    slice_json = options.slice
    data_files = []
    for result in json.load(slice_json)['results']:
        data_files.extend(result.get('data_files', []))
    reduced_data_files = set(data_files)  # de-duplicate across results
    filtered_files = glob.glob(os.path.join(source_dir, filename_filter))
    to_copy = [filepath for filepath in filtered_files
               if os.path.basename(filepath) in reduced_data_files]
    loglines.append("Using slice results from {}\n".format(slice_json.name))
    for filepath in to_copy:
        loglines.append("Copying {}\n".format(os.path.basename(filepath)))
        try:
            # Symlink rather than copy to avoid duplicating large data files.
            os.symlink(
                filepath, os.path.join(output_path, os.path.basename(filepath)))
        except OSError as e:
            print(e)
            sys.exit(1)
    with open('cli.log', 'w') as fp:
        fp.writelines(loglines)
| 282 | |
| 283 | |
def query_isatab(options):
    """Slice an ISA-Tab study according to a Galaxy-supplied JSON query.

    Loads the ISA-Tab from ``options.source_dir`` (or, when absent, downloads
    the study named in the Galaxy parameters from MetaboLights into a temp
    dir), then narrows down samples in stages: assays by measurement /
    technology type, samples by factor values, then by characteristics, and
    finally rows by process parameter values while collecting data file
    names from the assay tables. Writes ``{'query': ..., 'results': [...]}``
    as JSON to ``options.output``, one result per matching sample.
    """
    source_dir = options.source_dir if options.source_dir else ""
    galaxy_parameters_file = options.galaxy_parameters_file
    output = options.output

    debug = True
    if galaxy_parameters_file:
        galaxy_parameters = json.load(galaxy_parameters_file)
        print('Galaxy parameters:')
        print(json.dumps(galaxy_parameters, indent=4))
    else:
        raise IOError('Could not load Galaxy parameters file!')
    if source_dir:
        if not os.path.exists(source_dir):
            raise IOError('Source path does not exist!')
    query = galaxy_parameters['query']
    if debug:
        print('Query is:')
        print(json.dumps(query, indent=4))  # for debugging only
    if source_dir:
        investigation = isatab.load(source_dir)
    else:
        # No local copy supplied: fetch the study from MetaboLights.
        tmp = tempfile.mkdtemp()
        _ = MTBLS.get(galaxy_parameters['input']['mtbls_id'], tmp)
        investigation = isatab.load(tmp)
    # filter assays by measurement type (mt) / technology type (tt)
    matching_assays = []
    # NOTE(review): assumes the query always contains both keys; if either
    # .get() returned None, .strip() would raise AttributeError — confirm
    # against the Galaxy tool wrapper.
    mt = query.get('measurement_type').strip()
    tt = query.get('technology_type').strip()
    if mt and tt:
        for study in investigation.studies:
            matching_assays.extend(
                [x for x in study.assays if x.measurement_type.term == mt
                 and x.technology_type.term == tt])
    elif mt and not tt:
        for study in investigation.studies:
            matching_assays.extend(
                [x for x in study.assays if x.measurement_type.term == mt])
    elif not mt and tt:
        for study in investigation.studies:
            matching_assays.extend(
                [x for x in study.assays if x.technology_type.term == tt])
    else:
        # Neither filter given: keep every assay of every study.
        for study in investigation.studies:
            matching_assays.extend(study.assays)
    assay_samples = []
    for assay in matching_assays:
        assay_samples.extend(assay.samples)
    if debug:
        print('Total samples: {}'.format(len(assay_samples)))

    # filter samples by factor value (fv)
    factor_selection = {
        x.get('factor_name').strip(): x.get('factor_value').strip() for x in
        query.get('factor_selection', [])}

    fv_samples = set()
    if factor_selection:
        samples_to_remove = set()
        # First pass: collect every sample matching at least one requested
        # factor value.
        for f, v in factor_selection.items():
            for sample in assay_samples:
                for fv in [x for x in sample.factor_values if
                           x.factor_name.name == f]:
                    if isinstance(fv.value, OntologyAnnotation):
                        if fv.value.term == v:
                            fv_samples.add(sample)
                    elif fv.value == v:
                        fv_samples.add(sample)
        # Second pass: drop collected samples that mismatch any requested
        # factor, so the overall selection is an AND across factors.
        for f, v in factor_selection.items():
            for sample in fv_samples:
                for fv in [x for x in sample.factor_values if
                           x.factor_name.name == f]:
                    if isinstance(fv.value, OntologyAnnotation):
                        if fv.value.term != v:
                            samples_to_remove.add(sample)
                    elif fv.value != v:
                        samples_to_remove.add(sample)
        final_fv_samples = fv_samples.difference(samples_to_remove)
    else:
        final_fv_samples = assay_samples

    # filter samples by characteristic value (cv)
    characteristics_selection = {
        x.get('characteristic_name').strip():
        x.get('characteristic_value').strip() for x in
        query.get('characteristics_selection', [])}

    cv_samples = set()
    if characteristics_selection:
        first_pass = True
        samples_to_remove = set()
        for c, v in characteristics_selection.items():
            if first_pass:
                # Seed the kept set from the factor-filtered samples; also
                # inspect characteristics inherited from each sample's
                # source materials (sample.derives_from).
                for sample in final_fv_samples:
                    for cv in [x for x in sample.characteristics if
                               x.category.term == c]:
                        if isinstance(cv.value, OntologyAnnotation):
                            if cv.value.term == v:
                                cv_samples.add(sample)
                        elif cv.value == v:
                            cv_samples.add(sample)
                    for source in sample.derives_from:
                        for cv in [x for x in source.characteristics if
                                   x.category.term == c]:
                            if isinstance(cv.value, OntologyAnnotation):
                                if cv.value.term == v:
                                    cv_samples.add(sample)
                            elif cv.value == v:
                                cv_samples.add(sample)
                first_pass = False
            else:
                # Later characteristics remove mismatching samples (AND).
                for sample in cv_samples:
                    for cv in [x for x in sample.characteristics if
                               x.category.term == c]:
                        if isinstance(cv.value, OntologyAnnotation):
                            if cv.value.term != v:
                                samples_to_remove.add(sample)
                        elif cv.value != v:
                            samples_to_remove.add(sample)
                    for source in sample.derives_from:
                        for cv in [x for x in source.characteristics if
                                   x.category.term == c]:
                            if isinstance(cv.value, OntologyAnnotation):
                                if cv.value.term != v:
                                    samples_to_remove.add(sample)
                            elif cv.value != v:
                                samples_to_remove.add(sample)
        final_cv_samples = cv_samples.difference(samples_to_remove)
    else:
        final_cv_samples = final_fv_samples

    # filter by process parameter values — applied later, row-by-row, while
    # scanning the assay tables below
    parameters_selection = {
        x.get('parameter_name').strip():
        x.get('parameter_value').strip() for x in
        query.get('parameter_selection', [])}

    final_samples = final_cv_samples

    if debug:
        print('Final number of samples: {}'.format(len(final_samples)))
    results = []
    for sample in final_samples:
        results.append({
            'sample_name': sample.name,
            'data_files': []
        })
    for result in results:
        sample_name = result['sample_name']
        # Scan every assay table file (a_*) for this sample's rows.
        if source_dir:
            table_files = glob.iglob(os.path.join(source_dir, 'a_*'))
        else:
            table_files = glob.iglob(os.path.join(tmp, 'a_*'))
        for table_file in table_files:
            with open(table_file) as fp:
                df = isatab.load_table(fp)
                data_files = []
                table_headers = list(df.columns.values)
                sample_rows = df.loc[df['Sample Name'] == sample_name]
                # Column headers that may hold data file names in ISA-Tab.
                data_node_labels = [
                    'Raw Data File', 'Raw Spectral Data File',
                    'Derived Spectral Data File',
                    'Derived Array Data File', 'Array Data File',
                    'Protein Assignment File', 'Peptide Assignment File',
                    'Post Translational Modification Assignment File',
                    'Acquisition Parameter Data File',
                    'Free Induction Decay Data File',
                    'Derived Array Data Matrix File', 'Image File',
                    'Derived Data File', 'Metabolite Assignment File']
                if parameters_selection:
                    # Restrict this sample's rows to those matching every
                    # requested parameter value before collecting files.
                    for p, v in parameters_selection.items():
                        sample_pv_rows = sample_rows.loc[
                            sample_rows['Parameter Value[{}]'.format(p)] == v]
                        for node_label in data_node_labels:
                            if node_label in table_headers:
                                data_files.extend(
                                    list(sample_pv_rows[node_label]))
                    # De-duplicate and drop empty / NaN cells.
                    result['data_files'].extend(list(set(
                        i for i in list(data_files) if
                        str(i) not in ('nan', ''))))
                else:
                    for node_label in data_node_labels:
                        if node_label in table_headers:
                            data_files.extend(list(sample_rows[node_label]))
                    result['data_files'].extend(
                        list(set(i for i in list(data_files) if
                                 str(i) not in ('nan', ''))))
    results_json = {
        'query': query,
        'results': results
    }
    json.dump(results_json, output, indent=4)

    # if galaxy_parameters['input']['collection_output']:
    #     logger = logging.getLogger()
    #     logger.debug("copying data files to %s", os.path.dirname(output))
    #     for result in results:
    #         for data_file_name in result['data_files']:
    #             logging.info("Copying {}".format(data_file_name))
    #             shutil.copy(os.path.join(source_dir, data_file_name),
    #                         os.path.dirname(output))
    #     logger.info(
    #         "Finished writing data files to {}".format(os.path.dirname(output)))
| 487 | |
| 488 | |
def get_study_archive_command(options):
    """Download a MetaboLights study and pack it into a local archive.

    The archive format is taken from ``options.format`` (zip/tar/...) and
    the temporary download directory is always removed afterwards.

    Raises RuntimeError when the download fails.
    """
    study_id = options.study_id

    logger.info("Downloading study %s into archive at path %s.%s",
                study_id, options.output, options.format)

    download_dir = MTBLS.get(study_id)
    logger.debug("MTBLS.get returned '%s'", download_dir)
    if download_dir is None:
        raise RuntimeError("Error downloading ISA study")
    try:
        shutil.make_archive(
            options.output, options.format, download_dir, logger=logger)
        logger.info("ISA archive written")
    finally:
        # Remove the download whether or not archiving succeeded.
        logger.debug("Trying to clean up tmp dir %s", download_dir)
        shutil.rmtree(download_dir, ignore_errors=True)
| 507 | |
| 508 # mtblisa commands | |
| 509 | |
| 510 | |
def get_study_command(options):
    """Fetch an ISA study from MetaboLights as ISA-Tab or ISA-JSON.

    Writes the study to ``options.output``, which must not already exist.
    Raises RuntimeError on download failure or existing output path, and
    ValueError if ``options.isa_format`` is neither recognized value.
    """
    if os.path.exists(options.output):
        raise RuntimeError("Selected output path {} already exists!".format(
            options.output))

    if options.isa_format == "isa-tab":
        download_dir = None
        try:
            logger.info("Downloading study %s", options.study_id)
            download_dir = MTBLS.get(options.study_id)
            if download_dir is None:
                raise RuntimeError("Error downloading ISA study")

            logger.debug(
                "Finished downloading data. Moving to final location %s",
                options.output)
            shutil.move(download_dir, options.output)
            logger.info("ISA archive written to %s", options.output)
        finally:
            if download_dir:
                # Best-effort cleanup of whatever the download left behind
                # (harmless after a successful move).
                logger.debug("Deleting %s, if there's anything there", download_dir)
                shutil.rmtree(download_dir, ignore_errors=True)
    elif options.isa_format == "isa-json":
        isa_json = MTBLS.getj(options.study_id)
        if isa_json is None:
            raise RuntimeError("Error downloading ISA study")

        logger.debug(
            "Finished downloading data. Dumping json to final location %s",
            options.output)
        os.makedirs(options.output)
        target_file = os.path.join(options.output, "{}.json".format(
            isa_json['identifier']))
        with open(target_file, 'w') as fd:
            json.dump(isa_json, fd)
        logger.info("ISA-JSON written to %s", options.output)
    else:
        raise ValueError("BUG! Got an invalid isa format '{}'".format(
            options.isa_format))
| 551 | |
| 552 | |
def get_factors_command(options):
    """Write the factor names of a MetaboLights study as a JSON list.

    Raises RuntimeError when the remote query fails (returns None).
    """
    logger.info("Getting factors for study %s. Writing to %s.",
                options.study_id, options.output.name)
    factor_names = MTBLS.get_factor_names(options.study_id)
    if factor_names is None:
        raise RuntimeError("Error downloading factors.")
    json.dump(list(factor_names), options.output, indent=4)
    logger.debug("Factor names written")
| 562 | |
| 563 | |
def get_factor_values_command(options):
    """Write the values of one factor of a MetaboLights study as a JSON list.

    Raises RuntimeError when the remote query fails (returns None).
    """
    # Lazy %-style logging args, consistent with the other command handlers.
    logger.info("Getting values for factor %s in study %s. Writing to %s.",
                options.factor, options.study_id, options.output.name)
    fvs = MTBLS.get_factor_values(options.study_id, options.factor)
    if fvs is not None:
        json.dump(list(fvs), options.output, indent=4)
        # Log the output file *name*; the original logged the file object's repr.
        logger.debug("Factor values written to %s", options.output.name)
    else:
        raise RuntimeError("Error getting factor values")
| 573 | |
| 574 | |
def get_data_files_command(options):
    """Write the data-files list for a MetaboLights study as JSON.

    The optional factor filter comes from ``--json-query`` (a JSON string)
    or from a Galaxy parameters file; with neither, all data files are
    listed. Raises RuntimeError when the remote query fails.
    """
    logger.info("Getting data files for study %s. Writing to %s.",
                options.study_id, options.output.name)
    if options.json_query:
        logger.debug("This is the specified query:\n%s", options.json_query)
        json_struct = json.loads(options.json_query)
        data_files = MTBLS.get_data_files(options.study_id, json_struct)
    elif options.galaxy_parameters_file:
        logger.debug("Using input Galaxy JSON parameters from:\n%s",
                     options.galaxy_parameters_file)
        with open(options.galaxy_parameters_file) as json_fp:
            galaxy_json = json.load(json_fp)
            # Flatten the Galaxy factor/value series into a simple dict query.
            json_struct = {}
            for fv_item in galaxy_json['factor_value_series']:
                json_struct[fv_item['factor_name']] = fv_item['factor_value']
            data_files = MTBLS.get_data_files(options.study_id, json_struct)
    else:
        logger.debug("No query was specified")
        data_files = MTBLS.get_data_files(options.study_id)

    logger.debug("Result data files list: %s", data_files)
    if data_files is None:
        raise RuntimeError("Error getting data files with isatools")

    logger.debug("dumping data files to %s", options.output.name)
    json.dump(list(data_files), options.output, indent=4)
    # Log the output file *name*; the original logged the file object's repr.
    logger.info("Finished writing data files to %s", options.output.name)
| 602 | |
| 603 | |
def build_html_data_files_list(data_files_list):
    """Render a sample -> data-files mapping as an HTML document.

    Args:
        data_files_list: iterable of dicts with keys ``sample`` (str) and
            ``data_files`` (list of str).

    Returns:
        str: a complete HTML page containing one table row per sample.
    """
    data_files_table = '<table>'
    data_files_table += '<tr><th>Sample Name</th><th>Data File Names</th></tr>'
    for data_file in data_files_list:
        sample_name = data_file['sample']
        data_files = ', '.join(data_file['data_files'])
        # Bug fix: rows were previously emitted without a closing </tr>.
        data_files_table += '<tr><td>{sample_name}</td><td>{data_files}</td></tr>' \
            .format(sample_name=sample_name, data_files=data_files)
    # Bug fix: the table was never closed (compare build_html_summary).
    data_files_table += '</table>'
    html_data_files_list = """
<html>
<head>
<title>ISA-Tab Factors Summary</title>
</head>
<body>
{summary_table}
</body>
</html>
""".format(summary_table=data_files_table)
    return html_data_files_list
| 623 | |
| 624 | |
def build_html_summary(summary):
    """Render a factors summary as an HTML page of study groups.

    Args:
        summary: iterable of dicts, each with a ``sample_name`` key plus one
            key per factor (string values). Samples sharing identical factor
            values form one study group.

    Returns:
        str: a complete HTML page with one row per study group and the
        number of samples in it.
    """
    study_groups = {}
    for record in summary:  # renamed: the original reused `item` for the inner loop
        sample_name = record['sample_name']
        # Build a "Factor: value, Factor: value" label from all non-name keys.
        study_factors = [': '.join([key, value])
                         for key, value in record.items()
                         if key != "sample_name"]
        study_group = ', '.join(study_factors)
        study_groups.setdefault(study_group, []).append(sample_name)
    summary_table = '<table>'
    summary_table += '<tr><th>Study group</th><th>Number of samples</th></tr>'
    for study_group, samples in study_groups.items():
        # Bug fix: rows were previously emitted without a closing </tr>.
        summary_table += '<tr><td>{study_group}</td><td>{num_samples}</td></tr>' \
            .format(study_group=study_group, num_samples=len(samples))
    summary_table += '</table>'
    html_summary = """
<html>
<head>
<title>ISA-Tab Factors Summary</title>
</head>
<body>
{summary_table}
</body>
</html>
""".format(summary_table=summary_table)
    return html_summary
| 655 | |
| 656 | |
def get_summary_command(options):
    """Fetch a study's variable summary and write it as JSON plus HTML.

    Dumps the raw summary to ``options.json_output`` and a study-group
    table (via build_html_summary) to ``options.html_output``. Raises
    RuntimeError when the remote query fails (returns None).
    """
    logger.info("Getting summary for study %s. Writing to %s.",
                options.study_id, options.json_output.name)

    summary = MTBLS.get_study_variable_summary(options.study_id)
    if summary is None:
        raise RuntimeError("Error getting study summary")
    json.dump(summary, options.json_output, indent=4)
    logger.debug("Summary dumped to JSON")
    with options.html_output as html_fp:
        html_fp.write(build_html_summary(summary))
| 675 | |
| 676 | |
| 677 # isaslicer commands | |
| 678 | |
def isatab_get_data_files_list_command(options):
    """List data files for an unpacked ISA-Tab study, optionally filtered.

    The factor filter comes from ``--json-query`` (a JSON string) or a
    Galaxy parameters file; with neither, all data files are listed.
    Output is written as JSON to ``options.output``. Raises RuntimeError
    when slicing fails (returns None).
    """
    logger.info("Getting data files for study %s. Writing to %s.",
                options.input_path, options.output.name)
    if options.json_query:
        logger.debug("This is the specified query:\n%s", options.json_query)
        json_struct = json.loads(options.json_query)
    elif options.galaxy_parameters_file:
        logger.debug("Using input Galaxy JSON parameters from:\n%s",
                     options.galaxy_parameters_file)
        with open(options.galaxy_parameters_file) as json_fp:
            galaxy_json = json.load(json_fp)
            # Flatten the Galaxy factor/value series into a simple dict query.
            json_struct = {}
            for fv_item in galaxy_json['factor_value_series']:
                json_struct[fv_item['factor_name']] = fv_item['factor_value']
    else:
        logger.debug("No query was specified")
        json_struct = None
    factor_selection = json_struct
    input_path = options.input_path
    data_files = slice_data_files(input_path, factor_selection=factor_selection)
    logger.debug("Result data files list: %s", data_files)
    if data_files is None:
        raise RuntimeError("Error getting data files with isatools")

    logger.debug("dumping data files to %s", options.output.name)
    json.dump(list(data_files), options.output, indent=4)
    # Log the output file *name*; the original logged the file object's repr.
    logger.info("Finished writing data files to %s", options.output.name)
| 707 | |
| 708 | |
def zip_get_data_files_list_command(options):
    """List data files for a zipped ISA-Tab study, optionally filtered.

    Extracts the zip to a temporary directory, slices it exactly like
    isatab_get_data_files_list_command, and writes the result as JSON to
    ``options.output``. Raises RuntimeError when slicing fails.
    """
    logger.info("Getting data files for study %s. Writing to %s.",
                options.input_path, options.output.name)
    if options.json_query:
        logger.debug("This is the specified query:\n%s", options.json_query)
        json_struct = json.loads(options.json_query)
    elif options.galaxy_parameters_file:
        logger.debug("Using input Galaxy JSON parameters from:\n%s",
                     options.galaxy_parameters_file)
        with open(options.galaxy_parameters_file) as json_fp:
            galaxy_json = json.load(json_fp)
            # Flatten the Galaxy factor/value series into a simple dict query.
            json_struct = {}
            for fv_item in galaxy_json['factor_value_series']:
                json_struct[fv_item['factor_name']] = fv_item['factor_value']
    else:
        logger.debug("No query was specified")
        json_struct = None
    factor_selection = json_struct
    input_path = options.input_path
    tmpdir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(input_path) as zfp:
            zfp.extractall(path=tmpdir)
        data_files = slice_data_files(tmpdir, factor_selection=factor_selection)
        logger.debug("Result data files list: %s", data_files)
        if data_files is None:
            raise RuntimeError("Error getting data files with isatools")
        logger.debug("dumping data files to %s", options.output.name)
        json.dump(list(data_files), options.output, indent=4)
        # Log the output file *name*; the original logged the file object's repr.
        logger.info("Finished writing data files to %s", options.output.name)
    finally:
        # Bug fix: previously the extraction dir leaked whenever slicing
        # raised before reaching the rmtree call.
        shutil.rmtree(tmpdir)
| 740 | |
| 741 | |
def isatab_get_data_files_collection_command(options):
    """Copy the data files selected by a factor query from an unpacked
    ISA-tab study directory into ``options.output_path``.

    The query is taken from ``options.json_query`` (a JSON object mapping
    factor names to values) or ``options.galaxy_parameters_file`` (Galaxy
    JSON with a ``factor_value_series`` list); with neither, every
    sample's data files are copied.
    """
    logger.info("Getting data files for study %s. Writing to %s.",
                options.input_path, options.output_path)
    input_path = options.input_path
    # The original parsed/logged the query in two separate places; do it once.
    if options.json_query is not None:
        logger.debug("This is the specified query:\n%s", options.json_query)
        factor_selection = json.loads(options.json_query)
    elif options.galaxy_parameters_file:
        logger.debug("Using input Galaxy JSON parameters from:\n%s",
                     options.galaxy_parameters_file)
        with open(options.galaxy_parameters_file) as json_fp:
            galaxy_json = json.load(json_fp)
        factor_selection = {
            fv_item['factor_name']: fv_item['factor_value']
            for fv_item in galaxy_json['factor_value_series']}
    else:
        logger.debug("No query was specified")
        factor_selection = None
    data_files = slice_data_files(input_path,
                                  factor_selection=factor_selection)
    logger.debug("Result data files list: %s", data_files)
    if data_files is None:
        raise RuntimeError("Error getting data files with isatools")
    output_path = options.output_path
    logger.debug("copying data files to %s", output_path)
    for result in data_files:
        for data_file_name in result['data_files']:
            # use the module logger, not the root logger (was logging.info)
            logger.info("Copying {}".format(data_file_name))
            shutil.copy(os.path.join(input_path, data_file_name), output_path)
    logger.info("Finished writing data files to {}".format(output_path))
| 776 | |
| 777 | |
def zip_get_data_files_collection_command(options):
    """Copy the data files selected by a factor query out of a zipped
    ISA-tab study into ``options.output_path``.

    Mirrors :func:`isatab_get_data_files_collection_command`, but first
    extracts the archive to a temporary directory.
    """
    logger.info("Getting data files for study %s. Writing to %s.",
                options.input_path, options.output_path)
    input_path = options.input_path
    output_path = options.output_path
    # Accept a Galaxy parameters file too, for consistency with the
    # sibling commands; getattr() keeps this safe if the subparser does
    # not define the option.
    galaxy_parameters_file = getattr(options, 'galaxy_parameters_file', None)
    if options.json_query is not None:
        logger.debug("This is the specified query:\n%s", options.json_query)
        factor_selection = json.loads(options.json_query)
    elif galaxy_parameters_file:
        logger.debug("Using input Galaxy JSON parameters from:\n%s",
                     galaxy_parameters_file)
        with open(galaxy_parameters_file) as json_fp:
            galaxy_json = json.load(json_fp)
        factor_selection = {
            fv_item['factor_name']: fv_item['factor_value']
            for fv_item in galaxy_json['factor_value_series']}
    else:
        logger.debug("No query was specified")
        factor_selection = None
    tmpdir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(input_path) as zfp:
            zfp.extractall(path=tmpdir)
        data_files = slice_data_files(tmpdir,
                                      factor_selection=factor_selection)
        logger.debug("Result data files list: %s", data_files)
        if data_files is None:
            raise RuntimeError("Error getting data files with isatools")
        logger.debug("copying data files to %s", output_path)
        for result in data_files:
            for data_file_name in result['data_files']:
                # module logger, not the root logger (was logging.info)
                logger.info("Copying {}".format(data_file_name))
                shutil.copy(os.path.join(tmpdir, data_file_name), output_path)
        logger.info("Finished writing data files to {}".format(output_path))
    finally:
        # remove the scratch directory even if copying fails
        # (the original leaked it on error)
        shutil.rmtree(tmpdir)
| 807 | |
| 808 | |
def slice_data_files(dir, factor_selection=None):
    """Collect the data files attached to each matching sample.

    Scans the study (``s_*``) and assay (``a_*``) tables found in *dir*.
    When *factor_selection* is given (a dict mapping factor names to
    factor values) only samples matching at least one of the given
    name/value pairs are kept; otherwise every sample is kept.

    :param dir: directory containing the ISA-tab table files
    :param factor_selection: optional {factor_name: factor_value} query
    :return: list of dicts, one per sample, each with keys ``sample``,
        ``data_files`` and (when a query was used) ``query_used``
    """
    results = []
    seen_samples = set()  # O(1) dedupe instead of the original O(n^2) scan
    # first collect matching samples
    # '[as]_*' fixes the original '[a|s]_*' pattern, whose character
    # class also matched a literal '|'.
    for table_file in glob.iglob(os.path.join(dir, '[as]_*')):
        logger.info('Loading {table_file}'.format(table_file=table_file))

        # glob already returns paths rooted at *dir*; re-joining with
        # *dir* (as the original did) broke relative input paths.
        with open(table_file) as fp:
            df = isatab.load_table(fp)

        if factor_selection is None:
            # no query: keep every sample once
            for _, sample_name in df['Sample Name'].items():
                if sample_name in seen_samples:
                    continue
                seen_samples.add(sample_name)
                results.append({
                    'sample': sample_name,
                    'data_files': []
                })
        else:
            # keep samples matching any requested factor name/value pair
            for factor_name, factor_value in factor_selection.items():
                fv_column = 'Factor Value[{}]'.format(factor_name)
                if fv_column not in list(df.columns.values):
                    continue
                matches = df.loc[df[fv_column] == factor_value]['Sample Name']
                for _, sample_name in matches.items():
                    if sample_name in seen_samples:
                        continue
                    seen_samples.add(sample_name)
                    results.append({
                        'sample': sample_name,
                        'data_files': [],
                        'query_used': factor_selection
                    })

    # now collect the data files relating to the samples
    # (hoisted out of the loops: these labels never change)
    data_node_labels = [
        'Raw Data File',
        'Raw Spectral Data File',
        'Derived Spectral Data File',
        'Derived Array Data File',
        'Array Data File',
        'Protein Assignment File',
        'Peptide Assignment File',
        'Post Translational Modification Assignment File',
        'Acquisition Parameter Data File',
        'Free Induction Decay Data File',
        'Derived Array Data Matrix File',
        'Image File',
        'Derived Data File',
        'Metabolite Assignment File']
    for result in results:
        sample_name = result['sample']
        for table_file in glob.iglob(os.path.join(dir, 'a_*')):
            with open(table_file) as fp:
                df = isatab.load_table(fp)

            data_files = []
            table_headers = list(df.columns.values)
            sample_rows = df.loc[df['Sample Name'] == sample_name]
            for node_label in data_node_labels:
                if node_label in table_headers:
                    data_files.extend(list(sample_rows[node_label]))

            # drop pandas NaN placeholders
            # NOTE(review): as in the original, the last a_* table wins
            # here; extend across tables if multi-assay support is
            # intended -- confirm.
            result['data_files'] = [i for i in data_files
                                    if str(i) != 'nan']
    return results
| 891 | |
| 892 | |
def isatab_get_factor_names_command(options):
    """Write the set of factor names used in an unpacked ISA-tab study
    (``options.input_path``) to ``options.output`` as a JSON list.
    """
    input_path = options.input_path
    logger.info("Getting factors for study %s. Writing to %s.",
                input_path, options.output.name)
    _RX_FACTOR_VALUE = re.compile(r'Factor Value\[(.*?)\]')
    factors = set()
    # '[as]_*' fixes the '[a|s]_*' char class, which also matched '|'
    for table_file in glob.iglob(os.path.join(input_path, '[as]_*')):
        # glob paths are already rooted at input_path; re-joining (as the
        # original did) broke relative input paths
        with open(table_file) as fp:
            df = isatab.load_table(fp)

        for header in list(df.columns.values):
            match = _RX_FACTOR_VALUE.match(header)
            if match:
                # extract the name via the regex group instead of the
                # original's fragile header[13:-1] slice
                factors.add(match.group(1))
    # 'factors' is always a set (never None), so dump unconditionally;
    # the original's RuntimeError branch was unreachable
    json.dump(list(factors), options.output, indent=4)
    logger.debug("Factor names written")
| 913 | |
| 914 | |
def zip_get_factor_names_command(options):
    """Write the set of factor names used in a zipped ISA-tab study
    (``options.input_path``) to ``options.output`` as a JSON list.
    """
    input_path = options.input_path
    logger.info("Getting factors for study %s. Writing to %s.",
                input_path, options.output.name)
    _RX_FACTOR_VALUE = re.compile(r'Factor Value\[(.*?)\]')
    factors = set()
    # unpack input_path into a scratch directory
    tmpdir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(input_path) as zfp:
            zfp.extractall(path=tmpdir)
        # '[as]_*' fixes the '[a|s]_*' char class, which also matched '|'
        for table_file in glob.iglob(os.path.join(tmpdir, '[as]_*')):
            # module logger, not the root logger (was logging.info)
            logger.info('Searching {}'.format(table_file))
            # glob paths are already rooted at tmpdir; no second join
            with open(table_file) as fp:
                df = isatab.load_table(fp)

            for header in list(df.columns.values):
                match = _RX_FACTOR_VALUE.match(header)
                if match:
                    # regex group instead of the fragile header[13:-1] slice
                    factors.add(match.group(1))
        # 'factors' is always a set (never None); the original's
        # RuntimeError branch was unreachable
        json.dump(list(factors), options.output, indent=4)
        logger.debug("Factor names written")
    finally:
        # remove the scratch directory even on error (original leaked it)
        shutil.rmtree(tmpdir)
| 941 | |
| 942 | |
def isatab_get_factor_values_command(options):
    """Write the distinct values of factor ``options.factor`` found in an
    unpacked ISA-tab study to ``options.output`` as a JSON list.
    """
    logger.info("Getting values for factor {factor} in study {input_path}. Writing to {output_file}."
                .format(factor=options.factor, input_path=options.input_path, output_file=options.output.name))
    fvs = set()

    input_path = options.input_path
    factor_name = options.factor
    fv_column = 'Factor Value[{factor}]'.format(factor=factor_name)

    # '[as]_*' fixes the '[a|s]_*' char class, which also matched '|'
    for table_file in glob.iglob(os.path.join(input_path, '[as]_*')):
        # open the glob result directly; re-joining with input_path (as
        # the original did) broke relative input paths
        with open(table_file) as fp:
            df = isatab.load_table(fp)

        if fv_column in list(df.columns.values):
            # Series.items() replaces iteritems(), removed in pandas 2.0
            for _, match in df[fv_column].items():
                try:
                    match = match.item()  # unwrap numpy scalars
                except AttributeError:
                    pass

                # skip pandas NaN placeholders and non-scalar cells
                if isinstance(match, (str, int, float)) and \
                        str(match) != 'nan':
                    fvs.add(match)
    # fvs is always a set, never None; the original's error branch was
    # unreachable
    json.dump(list(fvs), options.output, indent=4)
    logger.debug("Factor values written to {}".format(options.output.name))
| 973 | |
| 974 | |
def zip_get_factor_values_command(options):
    """Write the distinct values of factor ``options.factor`` found in a
    zipped ISA-tab study to ``options.output`` as a JSON list.
    """
    input_path = options.input_path
    logger.info("Getting values for factor {factor} in study {input_path}. "
                "Writing to {output_file}.".format(
                    factor=options.factor, input_path=options.input_path,
                    output_file=options.output.name))
    fvs = set()
    factor_name = options.factor
    fv_column = 'Factor Value[{factor}]'.format(factor=factor_name)

    # unpack input_path into a scratch directory
    tmpdir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(input_path) as zfp:
            zfp.extractall(path=tmpdir)
        for table_file in glob.glob(os.path.join(tmpdir, '[as]_*')):
            # module logger, not the root logger (was logging.info)
            logger.info('Searching {}'.format(table_file))
            # BUG FIX: the original opened
            # os.path.join(input_path, table_file), i.e. relative to the
            # *archive* path rather than the extraction directory
            with open(table_file) as fp:
                df = isatab.load_table(fp)
            if fv_column in list(df.columns.values):
                # Series.items() replaces iteritems(), removed in pandas 2.0
                for _, match in df[fv_column].items():
                    try:
                        match = match.item()  # unwrap numpy scalars
                    except AttributeError:
                        pass

                    # skip pandas NaN placeholders and non-scalar cells
                    if isinstance(match, (str, int, float)) and \
                            str(match) != 'nan':
                        fvs.add(match)
        # fvs is always a set, never None; the original's error branch
        # was unreachable
        json.dump(list(fvs), options.output, indent=4)
        logger.debug("Factor values written to {}".format(
            options.output.name))
    finally:
        # remove the scratch directory even on error (original leaked it)
        shutil.rmtree(tmpdir)
| 1013 | |
| 1014 | |
def isatab_get_factors_summary_command(options):
    """Write a per-sample summary of factor values for an unpacked
    ISA-tab study to ``options.output`` as JSON.

    Factors whose value is constant across all samples are dropped, so
    only the variables that discriminate study groups remain.
    """
    logger.info("Getting summary for study %s. Writing to %s.",
                options.input_path, options.output.name)
    input_path = options.input_path
    ISA = isatab.load(input_path)

    all_samples = []
    for study in ISA.studies:
        all_samples.extend(study.samples)

    samples_and_fvs = []
    for sample in all_samples:
        sample_and_fvs = {
            'sample_name': sample.name,
        }
        for fv in sample.factor_values:
            # keep plain scalar values as-is; flatten ontology terms
            if isinstance(fv.value, (str, int, float)):
                sample_and_fvs[fv.factor_name.name] = fv.value
            elif isinstance(fv.value, OntologyAnnotation):
                sample_and_fvs[fv.factor_name.name] = fv.value.term
        samples_and_fvs.append(sample_and_fvs)

    df = pd.DataFrame(samples_and_fvs)
    # drop columns with a single unique value (non-discriminating factors)
    nunique = df.apply(pd.Series.nunique)
    df = df.drop(nunique[nunique == 1].index, axis=1)
    summary = df.to_dict(orient='records')
    # to_dict() always returns a list, so the original's
    # `summary is None` error branch was unreachable
    json.dump(summary, options.output, indent=4)
    logger.debug("Summary dumped to JSON")
| 1056 | |
| 1057 | |
def zip_get_factors_summary_command(options):
    """Write a per-sample summary of factor values for a zipped ISA-tab
    study to ``options.json_output`` (JSON) and ``options.html_output``
    (HTML).
    """
    logger.info("Getting summary for study %s. Writing to %s.",
                options.input_path, options.json_output.name)
    input_path = options.input_path
    tmpdir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(input_path) as zfp:
            zfp.extractall(path=tmpdir)
        ISA = isatab.load(tmpdir)
        all_samples = []
        for study in ISA.studies:
            all_samples.extend(study.samples)
        samples_and_fvs = []
        for sample in all_samples:
            sample_and_fvs = {
                'sample_name': sample.name,
            }
            for fv in sample.factor_values:
                # keep plain scalar values as-is; flatten ontology terms
                if isinstance(fv.value, (str, int, float)):
                    sample_and_fvs[fv.factor_name.name] = fv.value
                elif isinstance(fv.value, OntologyAnnotation):
                    sample_and_fvs[fv.factor_name.name] = fv.value.term
            samples_and_fvs.append(sample_and_fvs)
        df = pd.DataFrame(samples_and_fvs)
        # drop non-discriminating (constant) factor columns
        nunique = df.apply(pd.Series.nunique)
        df = df.drop(nunique[nunique == 1].index, axis=1)
        # to_dict() always returns a list, so the original's
        # `summary is None` error branch was unreachable
        summary = df.to_dict(orient='records')
        json.dump(summary, options.json_output, indent=4)
        logger.debug("Summary dumped to JSON")
        print(json.dumps(summary, indent=4))
        # NOTE(review): build_html_summary is not defined in this view of
        # the module -- confirm it is provided elsewhere in the file
        html_summary = build_html_summary(summary)
        with options.html_output as html_fp:
            html_fp.write(html_summary)
    finally:
        # remove the scratch directory even if summarisation fails
        # (the original leaked it on error)
        shutil.rmtree(tmpdir)
| 1097 | |
| 1098 | |
def get_study_groups(input_path):
    """Group sample names by their combination of study variable values.

    :param input_path: path to an unpacked ISA-tab study
    :return: dict mapping a tuple of variable values to the list of
        sample names sharing that combination

    BUG FIX: the original called isatab_get_factors_summary_command(),
    which expects an argparse options object and returns None (it writes
    its result to a file), so iterating its return value raised
    TypeError. get_study_variable_summary() returns the per-sample
    summary directly; its identifier keys are 'sample_name' and
    'source_name', not 'name'.
    """
    factors_summary = get_study_variable_summary(input_path=input_path)
    study_groups = {}
    for factors_item in factors_summary:
        fvs = tuple(factors_item[k] for k in factors_item.keys()
                    if k not in ('sample_name', 'source_name'))
        study_groups.setdefault(fvs, []).append(factors_item['sample_name'])
    return study_groups
| 1111 | |
| 1112 | |
def get_study_groups_samples_sizes(input_path):
    """Return (factor-value combination, sample count) pairs for a study."""
    groups = get_study_groups(input_path=input_path)
    return [(fvs, len(samples)) for fvs, samples in groups.items()]
| 1116 | |
| 1117 | |
def get_sources_for_sample(input_path, sample_name):
    """Return the names of the source materials a sample derives from.

    :param input_path: path to an unpacked ISA-tab study
    :param sample_name: name of the sample to look up
    :return: list of source names
    """
    ISA = isatab.load(input_path)
    hits = []

    for study in ISA.studies:
        for sample in study.samples:
            if sample.name == sample_name:
                # module logger instead of print(), for consistency with
                # the rest of the module
                logger.info('found a hit: %s', sample.name)

                for source in sample.derives_from:
                    hits.append(source.name)
    return hits
| 1131 | |
| 1132 | |
def get_data_for_sample(input_path, sample_name):
    """Return the data file objects generated from a given sample.

    :param input_path: path to an unpacked ISA-tab study
    :param sample_name: name of the sample to look up
    :return: list of isatools data-file objects
    """
    ISA = isatab.load(input_path)
    hits = []
    for study in ISA.studies:
        for assay in study.assays:
            for data in assay.data_files:
                if sample_name in [x.name for x in data.generated_from]:
                    # BUG FIX: the original format string contained no
                    # placeholder, so the filename was never logged
                    logger.info('found a hit: {filename}'.format(
                        filename=data.filename))
                    hits.append(data)
    return hits
| 1144 | |
| 1145 | |
def get_study_groups_data_sizes(input_path):
    """Return (factor-value combination, group size) pairs for a study.

    NOTE(review): currently identical to get_study_groups_samples_sizes;
    presumably it should count data files per group -- confirm intent.
    """
    groups = get_study_groups(input_path=input_path)
    return [(fvs, len(members)) for fvs, members in groups.items()]
| 1149 | |
| 1150 | |
def get_characteristics_summary(input_path):
    """Generate a characteristics summary for a MetaboLights study.

    :param input_path: Input path to ISA-tab
    :return: A list of dicts summarising the set of characteristic names
        and values associated with each sample

    Only characteristics whose value varies across samples are kept.

    Example usage:
        characteristics_summary = get_characteristics_summary('/path/to/my/study/')
        [
            {
                "name": "6089if_9",
                "Variant": "Synechocystis sp. PCC 6803.sll0171.ko"
            },
            {
                "name": "6089if_43",
                "Variant": "Synechocystis sp. PCC 6803.WT.none"
            },
        ]
    """
    investigation = isatab.load(input_path)

    samples = [sample
               for study in investigation.studies
               for sample in study.samples]

    rows = []
    for sample in samples:
        row = {'name': sample.name}
        for source in sample.derives_from:
            for characteristic in source.characteristics:
                value = characteristic.value
                # flatten ontology terms; keep plain scalars as-is
                if isinstance(value, OntologyAnnotation):
                    row[characteristic.category.term] = value.term
                elif isinstance(value, (str, int, float)):
                    row[characteristic.category.term] = value
        rows.append(row)

    frame = pd.DataFrame(rows)
    # drop columns with a single unique value (nothing varies there)
    uniques = frame.apply(pd.Series.nunique)
    frame = frame.drop(uniques[uniques == 1].index, axis=1)
    return frame.to_dict(orient='records')
| 1206 | |
| 1207 | |
def get_study_variable_summary(input_path):
    """Summarise, per sample, the factor values and source
    characteristics that vary across the study.

    :param input_path: path to an unpacked ISA-tab study
    :return: list of dicts (one per sample) keyed by variable name;
        columns that are constant across all samples are omitted
    """
    investigation = isatab.load(input_path)

    samples = [sample
               for study in investigation.studies
               for sample in study.samples]

    rows = []
    for sample in samples:
        row = {'sample_name': sample.name}

        for fv in sample.factor_values:
            # flatten ontology terms; keep plain scalars as-is
            if isinstance(fv.value, OntologyAnnotation):
                row[fv.factor_name.name] = fv.value.term
            elif isinstance(fv.value, (str, int, float)):
                row[fv.factor_name.name] = fv.value

        for source in sample.derives_from:
            row['source_name'] = source.name
            for characteristic in source.characteristics:
                value = characteristic.value
                if isinstance(value, OntologyAnnotation):
                    row[characteristic.category.term] = value.term
                elif isinstance(value, (str, int, float)):
                    row[characteristic.category.term] = value

        rows.append(row)

    frame = pd.DataFrame(rows)
    # drop columns with a single unique value (nothing varies there)
    uniques = frame.apply(pd.Series.nunique)
    frame = frame.drop(uniques[uniques == 1].index, axis=1)
    return frame.to_dict(orient='records')
| 1247 | |
| 1248 | |
def get_study_group_factors(input_path):
    """Return the distinct factor-value combinations found in the study
    and assay tables of an unpacked ISA-tab study.

    :param input_path: directory containing the ISA-tab table files
    :return: list of {factor column: value} dicts
    """
    factors_list = []

    # '[as]_*' fixes the '[a|s]_*' char class, which also matched '|'
    for table_file in glob.iglob(os.path.join(input_path, '[as]_*')):
        # open the glob result directly; re-joining with input_path (as
        # the original did) broke relative input paths
        with open(table_file) as fp:
            df = isatab.load_table(fp)

        factor_columns = [x for x in df.columns if x.startswith(
            'Factor Value')]
        if len(factor_columns) > 0:
            # NOTE(review): this overwrites the previous table's result,
            # so only the last table with factor columns contributes --
            # confirm whether accumulation was intended
            factors_list = df[factor_columns].drop_duplicates()\
                .to_dict(orient='records')
    return factors_list
| 1262 | |
| 1263 | |
def get_filtered_df_on_factors_list(input_path):
    """Print, per study group, the sample/source/raw-data columns of the
    table rows matching each factor-value combination.

    :param input_path: directory containing the ISA-tab table files
    :return: the list of pandas query strings, one per study group
    """
    factors_list = get_study_group_factors(input_path=input_path)

    # build one pandas query string per factor-value combination,
    # normalising ' ', '[' and ']' to '_' to form valid identifiers
    queries = []
    for item in factors_list:
        query_str = []

        for k, v in item.items():
            k = k.replace(' ', '_').replace('[', '_').replace(']', '_')
            if isinstance(v, str):
                v = v.replace(' ', '_').replace('[', '_').replace(']', '_')
                query_str.append("{k} == '{v}' and ".format(k=k, v=v))

        query_str = ''.join(query_str)[:-4]  # strip the trailing ' and'
        queries.append(query_str)

    # '[as]_*' fixes the '[a|s]_*' char class, which also matched '|'
    for table_file in glob.iglob(os.path.join(input_path, '[as]_*')):
        # open the glob result directly; re-joining with input_path (as
        # the original did) broke relative input paths
        with open(table_file) as fp:
            df = isatab.load_table(fp)

        # normalise column names the same way as the query identifiers
        # (one pass instead of the original's three separate maps)
        df.columns = df.columns.map(
            lambda x: x.replace(' ', '_').replace('[', '_').replace(']', '_')
            if isinstance(x, str) else x)

        for query in queries:
            # query uses pandas.eval, which evaluates queries like pure
            # Python notation
            df2 = df.query(query)
            if 'Sample_Name' in df.columns:
                print('Group: {query} / Sample_Name: {sample_name}'.format(
                    query=query, sample_name=list(df2['Sample_Name'])))

            if 'Source_Name' in df.columns:
                print('Group: {} / Sources_Name: {}'.format(
                    query, list(df2['Source_Name'])))

            if 'Raw_Spectral_Data_File' in df.columns:
                # BUG FIX: the original format string had no {filename}
                # placeholder, so the file names were never printed
                print('Group: {query} / Raw_Spectral_Data_File: {filename}'
                      .format(query=query[13:-2],
                              filename=list(df2['Raw_Spectral_Data_File'])))
    return queries
| 1316 | |
| 1317 | |
def datatype_get_summary_command(options):
    """Dump the study variable summary for ``options.study_id`` to
    ``options.output`` as JSON.
    """
    logger.info("Getting summary for study %s. Writing to %s.",
                options.study_id, options.output.name)

    summary = get_study_variable_summary(options.study_id)
    print('summary: ', list(summary))
    # guard clause instead of the original if/else
    if summary is None:
        raise RuntimeError("Error getting study summary")
    json.dump(summary, options.output, indent=4)
    logger.debug("Summary dumped")
| 1329 | |
| 1330 | |
| 1331 # logging and argument parsing | |
| 1332 | |
| 1333 def _configure_logger(options): | |
| 1334 logging_level = getattr(logging, options.log_level, logging.INFO) | |
| 1335 logging.basicConfig(level=logging_level) | |
| 1336 | |
| 1337 global logger | |
| 1338 logger = logging.getLogger() | |
| 1339 logger.setLevel(logging_level) # there's a bug somewhere. The level set through basicConfig isn't taking effect | |
| 1340 | |
| 1341 | |
def _parse_args(args):
    """Parse command-line arguments with the module's argument parser."""
    return make_parser().parse_args(args)
| 1346 | |
| 1347 | |
def main(args):
    """Entry point: parse arguments, configure logging, run subcommand."""
    options = _parse_args(args)
    _configure_logger(options)
    # each subparser registered its handler via set_defaults(func=...)
    options.func(options)
| 1353 | |
| 1354 | |
if __name__ == '__main__':
    try:
        main(sys.argv[1:])
        sys.exit(0)
    except Exception as e:
        # logger is None if the failure happened before _configure_logger
        # ran; fall back to the root logger so the traceback is not lost
        # (the original crashed with AttributeError in that case).
        # logger.exception() already logs the message and traceback, so
        # the original's duplicate logger.error(e) call was dropped.
        (logger if logger is not None else logging.getLogger()).exception(e)
        sys.exit(e.code if hasattr(e, "code") else 99)
