diff isaslicer.py @ 0:8dab200e02cb draft

"planemo upload commit 239561a6401593c5f87df40ac971a9aa393c4663-dirty"
author prog
date Tue, 07 Jan 2020 09:05:21 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/isaslicer.py	Tue Jan 07 09:05:21 2020 -0500
@@ -0,0 +1,1362 @@
+#!/usr/bin/env python3
+
+import argparse
+import glob
+import json
+import logging
+import os
+import re
+import shutil
+import sys
+import tempfile
+import zipfile
+
+import pandas as pd
+from isatools import isatab
+from isatools.model import OntologyAnnotation
+from isatools.net import mtbls as MTBLS
+
+logger = None
+
+#    isaslicer.py <command> <study_id> [ command-specific options ]
+
+
+def make_parser():
+    """Build and return the argparse CLI parser for the ISA slicer.
+
+    One required subcommand is defined per action. Each subcommand stores
+    its handler in the 'func' attribute via set_defaults(), so the caller
+    can dispatch with options.func(options) after parse_args().
+    """
+    parser = argparse.ArgumentParser( description="ISA slicer")
+
+    parser.add_argument('--log-level', choices=[
+        'DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL'],
+        default='INFO', help="Set the desired logging level")
+
+    subparsers = parser.add_subparsers(
+        title='Actions',
+        dest='command')  # specified subcommand will be available in attribute 'command'
+    subparsers.required = True
+
+    # mtblisa commands
+
+    subparser = subparsers.add_parser(
+        'mtbls-get-study-archive', aliases=['gsa'],
+        help="Get ISA study from MetaboLights as zip archive")
+    subparser.set_defaults(func=get_study_archive_command)
+    subparser.add_argument('study_id')
+    subparser.add_argument(
+        'output', metavar="OUTPUT",
+        help="Name of output archive (extension will be added)")
+    subparser.add_argument('--format', metavar="FMT", choices=[
+        'zip', 'tar', 'gztar', 'bztar', 'xztar'], default='zip',
+        help="Type of archive to create")
+
+    subparser = subparsers.add_parser('mtbls-get-study', aliases=['gs'],
+                                      help="Get ISA study from MetaboLights")
+    subparser.set_defaults(func=get_study_command)
+    subparser.add_argument('study_id')
+    subparser.add_argument('output', metavar="PATH", help="Name of output")
+    subparser.add_argument(
+        '-f', '--isa-format', choices=['isa-tab', 'isa-json'],
+        metavar="FORMAT", default='isa-tab', help="Desired ISA format")
+
+    subparser = subparsers.add_parser(
+        'mtbls-get-factors', aliases=['gf'],
+        help="Get factor names from a study in json format")
+    subparser.set_defaults(func=get_factors_command)
+    subparser.add_argument('study_id')
+    subparser.add_argument(
+        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
+        help="Output file")
+
+    subparser = subparsers.add_parser(
+        'mtbls-get-factor-values', aliases=['gfv'],
+        help="Get factor values from a study in json format")
+    subparser.set_defaults(func=get_factor_values_command)
+    subparser.add_argument('study_id')
+    subparser.add_argument(
+        'factor', help="The desired factor. Use `get-factors` to get the list "
+                       "of available factors")
+    subparser.add_argument(
+        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
+        help="Output file")
+
+    subparser = subparsers.add_parser('mtbls-get-data-list', aliases=['gd'],
+                                      help="Get data files list in json format")
+    subparser.set_defaults(func=get_data_files_command)
+    subparser.add_argument('study_id')
+    subparser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
+                           help="Output file")
+    # NOTE(review): the help text below is missing its closing parenthesis;
+    # it is a runtime string so it is left untouched here.
+    subparser.add_argument(
+        '--json-query',
+        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}'")
+    subparser.add_argument(
+        '--galaxy_parameters_file',
+        help="Path to JSON file containing input Galaxy JSON")
+
+    subparser = subparsers.add_parser(
+        'mtbls-get-factors-summary', aliases=['gsum'],
+        help="Get the variables summary from a study, in json format")
+    subparser.set_defaults(func=get_summary_command)
+    subparser.add_argument('study_id')
+    subparser.add_argument(
+        'json_output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
+        help="Output JSON file")
+    subparser.add_argument(
+        'html_output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
+        help="Output HTML file")
+
+    # isaslicer commands on path to unpacked ISA-Tab as input
+
+    subparser = subparsers.add_parser(
+        'isa-tab-get-factors', aliases=['isagf'],
+        help="Get factor names from a study in json format")
+    subparser.set_defaults(func=isatab_get_factor_names_command)
+    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
+    subparser.add_argument(
+        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
+        help="Output file")
+
+    subparser = subparsers.add_parser(
+        'zip-get-factors', aliases=['zipgf'],
+        help="Get factor names from a study in json format")
+    subparser.set_defaults(func=zip_get_factor_names_command)
+    subparser.add_argument('input_path', type=str,
+                           help="Input ISA-Tab zip path")
+    subparser.add_argument(
+        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
+        help="Output file")
+
+    subparser = subparsers.add_parser(
+        'isa-tab-get-factor-values', aliases=['isagfv'],
+        help="Get factor values from a study in json format")
+    subparser.set_defaults(func=isatab_get_factor_values_command)
+    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
+    subparser.add_argument(
+        'factor', help="The desired factor. Use `get-factors` to get the list "
+                       "of available factors")
+    subparser.add_argument(
+        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
+        help="Output file")
+
+    subparser = subparsers.add_parser(
+        'zip-get-factor-values', aliases=['zipgfv'],
+        help="Get factor values from a study in json format")
+    subparser.set_defaults(func=zip_get_factor_values_command)
+    subparser.add_argument('input_path', type=str,
+                           help="Input ISA-Tab zip path")
+    subparser.add_argument(
+        'factor', help="The desired factor. Use `get-factors` to get the list "
+                       "of available factors")
+    subparser.add_argument(
+        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
+        help="Output file")
+
+    subparser = subparsers.add_parser('isa-tab-get-data-list', aliases=['isagdl'],
+                                      help="Get data files list in json format")
+    subparser.set_defaults(func=isatab_get_data_files_list_command)
+    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
+    subparser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
+                           help="Output file")
+    subparser.add_argument(
+        '--json-query',
+        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}'")
+    subparser.add_argument(
+        '--galaxy_parameters_file',
+        help="Path to JSON file containing input Galaxy JSON")
+
+    subparser = subparsers.add_parser('zip-get-data-list', aliases=['zipgdl'],
+                                      help="Get data files list in json format")
+    subparser.set_defaults(func=zip_get_data_files_list_command)
+    subparser.add_argument('input_path', type=str, help="Input ISA-Tab zip path")
+    subparser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
+                           help="Output file")
+    subparser.add_argument(
+        '--json-query',
+        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}'")
+    subparser.add_argument(
+        '--galaxy_parameters_file',
+        help="Path to JSON file containing input Galaxy JSON")
+
+    subparser = subparsers.add_parser('isa-tab-get-data-collection', aliases=['isagdc'],
+                                      help="Get data files collection")
+    subparser.set_defaults(func=isatab_get_data_files_collection_command)
+    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
+    subparser.add_argument('output_path', type=str, help="Output data files path")
+    subparser.add_argument(
+        '--json-query',
+        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}'")
+    subparser.add_argument(
+        '--galaxy_parameters_file',
+        help="Path to JSON file containing input Galaxy JSON")
+
+    subparser = subparsers.add_parser('zip-get-data-collection', aliases=['zipgdc'],
+                                      help="Get data files collection")
+    subparser.set_defaults(func=zip_get_data_files_collection_command)
+    subparser.add_argument('input_path', type=str, help="Input ISA-Tab zip path")
+    subparser.add_argument('output_path', type=str, help="Output data files path")
+    subparser.add_argument(
+        '--json-query',
+        help="Factor query in JSON (e.g., '{\"Gender\":\"Male\"}'")
+
+    subparser = subparsers.add_parser(
+        'isa-tab-get-factors-summary', aliases=['isasum'],
+        help="Get the variables summary from a study, in json format")
+    subparser.set_defaults(func=isatab_get_factors_summary_command)
+    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
+    subparser.add_argument(
+        'output', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
+        help="Output file")
+
+    subparser = subparsers.add_parser(
+        'zip-get-factors-summary', aliases=['zipsum'],
+        help="Get the variables summary from a study, in json format")
+    subparser.set_defaults(func=zip_get_factors_summary_command)
+    subparser.add_argument('input_path', type=str,
+                           help="Input ISA-Tab zip path")
+    subparser.add_argument(
+        'json_output', nargs='?', type=argparse.FileType('w'),
+        default=sys.stdout,
+        help="Output JSON file")
+    subparser.add_argument(
+        'html_output', nargs='?', type=argparse.FileType('w'),
+        default=sys.stdout,
+        help="Output HTML file")
+
+    subparser = subparsers.add_parser(
+        'isaslicer2-slice', aliases=['slice2'],
+        help="Slice ISA-Tabs version 2")
+    subparser.set_defaults(func=query_isatab)
+    subparser.add_argument('--source_dir', type=str,
+                           help="Input ISA-Tab zip path")
+    subparser.add_argument(
+        '--galaxy_parameters_file', type=argparse.FileType(mode='r'),
+        help="Path to JSON file containing input Galaxy JSON")
+    # NOTE(review): help text below looks copy-pasted; this option is the
+    # output file for the slice results, not a zip path -- confirm and fix.
+    subparser.add_argument('--output', type=argparse.FileType(mode='w'),
+                           help="Input ISA-Tab zip path")
+
+    subparser = subparsers.add_parser(
+        'filter-data', aliases=['filter'],
+        help="Filter out data based on slicer2")
+    subparser.set_defaults(func=filter_data)
+    subparser.add_argument('input_path', type=str, help="Input ISA-Tab path")
+    subparser.add_argument('output_path', type=str, help="Output data files path")
+    subparser.add_argument('--slice', type=argparse.FileType(mode='r'),
+                           help="slice")
+    subparser.add_argument('--filename_filter', type=str, help="shell-like wildcard to filter files")
+
+    return parser
+
+
+def filter_data(options):
+    """Link data files selected by a slicer2 result into an output directory.
+
+    Loads the slice JSON from options.slice, collects the union of
+    'data_files' across its 'results', and symlinks every file under
+    options.input_path matching options.filename_filter whose basename
+    appears in that set into options.output_path. Progress lines are
+    written to ./cli.log in the current working directory.
+    """
+    loglines = []
+    source_dir = options.input_path if options.input_path else ""
+    output_path = options.output_path
+    filename_filter = options.filename_filter
+    if source_dir:
+        if not os.path.exists(source_dir):
+            raise IOError('Source path does not exist!')
+    data_files = []
+    slice_json = options.slice
+    for result in json.load(slice_json)['results']:
+        data_files.extend(result.get('data_files', []))
+    # de-duplicate file names across all slice results
+    reduced_data_files = list(set(data_files))
+    filtered_files = glob.glob(os.path.join(source_dir, filename_filter))
+    to_copy = []
+    for filepath in filtered_files:
+        if os.path.basename(filepath) in reduced_data_files:
+            to_copy.append(filepath)
+    loglines.append("Using slice results from {}\n".format(slice_json.name))
+    for filepath in to_copy:
+        loglines.append("Copying {}\n".format(os.path.basename(filepath)))
+        # try:
+        #     shutil.copyfile(
+        #         filepath, os.path.join(output_path, os.path.basename(filepath)))
+        # except Exception as e:
+        #     print(e)
+        #     exit(1)
+        # NOTE(review): files are symlinked, not copied (copy variant kept
+        # commented out above); downstream consumers must tolerate links.
+        # NOTE(review): builtin exit(1) is used; sys.exit(1) is conventional.
+        try:
+            os.symlink(
+                filepath, os.path.join(output_path, os.path.basename(filepath)))
+        except Exception as e:
+            print(e)
+            exit(1)
+    with open('cli.log', 'w') as fp:
+        fp.writelines(loglines)
+
+
+def query_isatab(options):
+    """Slice an ISA-Tab study (local dir or MetaboLights download) by a query.
+
+    Reads a JSON query (measurement/technology type plus factor,
+    characteristic and parameter selections) from
+    options.galaxy_parameters_file, filters the matching assays' samples,
+    then scans the assay table files (a_*) for each surviving sample and
+    dumps {'query': ..., 'results': [{'sample_name', 'data_files'}, ...]}
+    as JSON to options.output.
+
+    NOTE(review): query['measurement_type'] / ['technology_type'] are
+    assumed present and non-None -- .strip() will raise AttributeError if
+    either key is missing; confirm the Galaxy JSON always supplies them.
+    NOTE(review): when no source_dir is given, the MTBLS temp dir 'tmp' is
+    never removed.
+    """
+    source_dir = options.source_dir if options.source_dir else ""
+    galaxy_parameters_file = options.galaxy_parameters_file
+    output = options.output
+
+    debug = True
+    if galaxy_parameters_file:
+        galaxy_parameters = json.load(galaxy_parameters_file)
+        print('Galaxy parameters:')
+        print(json.dumps(galaxy_parameters, indent=4))
+    else:
+        raise IOError('Could not load Galaxy parameters file!')
+    if source_dir:
+        if not os.path.exists(source_dir):
+            raise IOError('Source path does not exist!')
+    query = galaxy_parameters['query']
+    if debug:
+        print('Query is:')
+        print(json.dumps(query, indent=4))  # for debugging only
+    if source_dir:
+        investigation = isatab.load(source_dir)
+    else:
+        tmp = tempfile.mkdtemp()
+        _ = MTBLS.get(galaxy_parameters['input']['mtbls_id'], tmp)
+        investigation = isatab.load(tmp)
+    # filter assays by mt/tt
+    matching_assays = []
+    mt = query.get('measurement_type').strip()
+    tt = query.get('technology_type').strip()
+    if mt and tt:
+        for study in investigation.studies:
+            matching_assays.extend(
+                [x for x in study.assays if x.measurement_type.term == mt
+                 and x.technology_type.term == tt])
+    elif mt and not tt:
+        for study in investigation.studies:
+            matching_assays.extend(
+                [x for x in study.assays if x.measurement_type.term == mt])
+    elif not mt and tt:
+        for study in investigation.studies:
+            matching_assays.extend(
+                [x for x in study.assays if x.technology_type.term == tt])
+    else:
+        # no mt/tt constraint: keep every assay of every study
+        for study in investigation.studies:
+            matching_assays.extend(study.assays)
+    assay_samples = []
+    for assay in matching_assays:
+        assay_samples.extend(assay.samples)
+    if debug:
+        print('Total samples: {}'.format(len(assay_samples)))
+
+    # filter samples by fv
+    factor_selection = {
+        x.get('factor_name').strip(): x.get('factor_value').strip() for x in
+        query.get('factor_selection', [])}
+
+    # two passes: first collect samples matching ANY selected factor value,
+    # then remove samples that contradict any selection (so the result is
+    # samples satisfying ALL selections)
+    fv_samples = set()
+    if factor_selection:
+        samples_to_remove = set()
+        for f, v in factor_selection.items():
+            for sample in assay_samples:
+                for fv in [x for x in sample.factor_values if
+                           x.factor_name.name == f]:
+                    if isinstance(fv.value, OntologyAnnotation):
+                        if fv.value.term == v:
+                            fv_samples.add(sample)
+                    elif fv.value == v:
+                        fv_samples.add(sample)
+        for f, v in factor_selection.items():
+            for sample in fv_samples:
+                for fv in [x for x in sample.factor_values if
+                           x.factor_name.name == f]:
+                    if isinstance(fv.value, OntologyAnnotation):
+                        if fv.value.term != v:
+                            samples_to_remove.add(sample)
+                    elif fv.value != v:
+                        samples_to_remove.add(sample)
+        final_fv_samples = fv_samples.difference(samples_to_remove)
+    else:
+        final_fv_samples = assay_samples
+
+    # filter samples by characteristic
+    characteristics_selection = {
+        x.get('characteristic_name').strip():
+            x.get('characteristic_value').strip() for x in
+            query.get('characteristics_selection', [])}
+
+    # same add-then-prune approach as above; characteristics are checked on
+    # the sample itself and on the sources it derives from
+    cv_samples = set()
+    if characteristics_selection:
+        first_pass = True
+        samples_to_remove = set()
+        for c, v in characteristics_selection.items():
+            if first_pass:
+                for sample in final_fv_samples:
+                    for cv in [x for x in sample.characteristics if
+                               x.category.term == c]:
+                        if isinstance(cv.value, OntologyAnnotation):
+                            if cv.value.term == v:
+                                cv_samples.add(sample)
+                        elif cv.value == v:
+                            cv_samples.add(sample)
+                    for source in sample.derives_from:
+                        for cv in [x for x in source.characteristics if
+                                   x.category.term == c]:
+                            if isinstance(cv.value, OntologyAnnotation):
+                                if cv.value.term == v:
+                                    cv_samples.add(sample)
+                            elif cv.value == v:
+                                cv_samples.add(sample)
+                first_pass = False
+            else:
+                for sample in cv_samples:
+                    for cv in [x for x in sample.characteristics if
+                               x.category.term == c]:
+                        if isinstance(cv.value, OntologyAnnotation):
+                            if cv.value.term != v:
+                                samples_to_remove.add(sample)
+                        elif cv.value != v:
+                            samples_to_remove.add(sample)
+                    for source in sample.derives_from:
+                        for cv in [x for x in source.characteristics if
+                                   x.category.term == c]:
+                            if isinstance(cv.value, OntologyAnnotation):
+                                if cv.value.term != v:
+                                    samples_to_remove.add(sample)
+                            elif cv.value != v:
+                                samples_to_remove.add(sample)
+        final_cv_samples = cv_samples.difference(samples_to_remove)
+    else:
+        final_cv_samples = final_fv_samples
+
+    # filter samples by process parameter
+    parameters_selection = {
+        x.get('parameter_name').strip():
+            x.get('parameter_value').strip() for x in
+        query.get('parameter_selection', [])}
+
+    # NOTE(review): parameter selection does NOT prune the sample set; it is
+    # only applied below to pick data-file rows from the assay tables.
+    final_samples = final_cv_samples
+
+    if debug:
+        print('Final number of samples: {}'.format(len(final_samples)))
+    results = []
+    for sample in final_samples:
+        results.append({
+            'sample_name': sample.name,
+            'data_files': []
+        })
+    for result in results:
+        sample_name = result['sample_name']
+        if source_dir:
+            table_files = glob.iglob(os.path.join(source_dir, 'a_*'))
+        else:
+            table_files = glob.iglob(os.path.join(tmp, 'a_*'))
+        for table_file in table_files:
+            with open(table_file) as fp:
+                df = isatab.load_table(fp)
+                data_files = []
+                table_headers = list(df.columns.values)
+                sample_rows = df.loc[df['Sample Name'] == sample_name]
+                # ISA-Tab column names whose cells hold data file names
+                data_node_labels = [
+                    'Raw Data File', 'Raw Spectral Data File',
+                    'Derived Spectral Data File',
+                    'Derived Array Data File', 'Array Data File',
+                    'Protein Assignment File', 'Peptide Assignment File',
+                    'Post Translational Modification Assignment File',
+                    'Acquisition Parameter Data File',
+                    'Free Induction Decay Data File',
+                    'Derived Array Data Matrix File', 'Image File',
+                    'Derived Data File', 'Metabolite Assignment File']
+                if parameters_selection:
+                    for p, v in parameters_selection.items():
+                        sample_pv_rows = sample_rows.loc[
+                            sample_rows['Parameter Value[{}]'.format(p)] == v]
+                        for node_label in data_node_labels:
+                            if node_label in table_headers:
+                                data_files.extend(
+                                    list(sample_pv_rows[node_label]))
+                    # drop NaN/empty cells before reporting
+                    result['data_files'].extend(list(set(
+                        i for i in list(data_files) if
+                        str(i) not in ('nan', ''))))
+                else:
+                    for node_label in data_node_labels:
+                        if node_label in table_headers:
+                            data_files.extend(list(sample_rows[node_label]))
+                    result['data_files'].extend(
+                        list(set(i for i in list(data_files) if
+                                 str(i) not in ('nan', ''))))
+    results_json = {
+        'query': query,
+        'results': results
+    }
+    json.dump(results_json, output, indent=4)
+
+    # if galaxy_parameters['input']['collection_output']:
+    #     logger = logging.getLogger()
+    #     logger.debug("copying data files to %s", os.path.dirname(output))
+    #     for result in results:
+    #         for data_file_name in result['data_files']:
+    #             logging.info("Copying {}".format(data_file_name))
+    #             shutil.copy(os.path.join(source_dir, data_file_name),
+    #                         os.path.dirname(output))
+    #     logger.info(
+    #       "Finished writing data files to {}".format(os.path.dirname(output)))
+
+
+def get_study_archive_command(options):
+    """Download a MetaboLights study and pack it into a local archive.
+
+    Fetches study_id via MTBLS.get() into a temp dir, writes it as
+    <options.output>.<options.format> with shutil.make_archive, and always
+    removes the temp dir afterwards. Raises RuntimeError if the download
+    fails.
+
+    NOTE(review): module-level 'logger' is None at import time -- it must be
+    configured elsewhere (presumably in main) before this runs; confirm.
+    """
+    study_id = options.study_id
+
+    logger.info("Downloading study %s into archive at path %s.%s",
+                study_id, options.output, options.format)
+
+    tmpdir = MTBLS.get(study_id)
+    logger.debug("MTBLS.get returned '%s'", tmpdir)
+    if tmpdir is not None:
+        try:
+            shutil.make_archive(
+                options.output, options.format, tmpdir, logger=logger)
+            logger.info("ISA archive written")
+        finally:
+            # clean up even if make_archive raised
+            logger.debug("Trying to clean up tmp dir %s", tmpdir)
+            shutil.rmtree(tmpdir, ignore_errors=True)
+    else:
+        raise RuntimeError("Error downloading ISA study")
+
+# mtblisa commands
+
+
+def get_study_command(options):
+    """Download a MetaboLights study to options.output as ISA-Tab or ISA-JSON.
+
+    For 'isa-tab' the downloaded temp dir is moved to options.output; for
+    'isa-json' the JSON is fetched and dumped to
+    options.output/<identifier>.json. Raises RuntimeError when the output
+    path already exists or the download fails; ValueError guards against an
+    isa_format argparse should have rejected.
+    """
+    if os.path.exists(options.output):
+        raise RuntimeError("Selected output path {} already exists!".format(
+            options.output))
+
+    if options.isa_format == "isa-tab":
+        tmp_data = None
+        try:
+            logger.info("Downloading study %s", options.study_id)
+            tmp_data = MTBLS.get(options.study_id)
+            if tmp_data is None:
+                raise RuntimeError("Error downloading ISA study")
+
+            logger.debug(
+                "Finished downloading data. Moving to final location %s",
+                options.output)
+            shutil.move(tmp_data, options.output)
+            logger.info("ISA archive written to %s", options.output)
+        finally:
+            if tmp_data:
+                # try to clean up any temporary files left behind
+                logger.debug("Deleting %s, if there's anything there", tmp_data)
+                shutil.rmtree(tmp_data, ignore_errors=True)
+    elif options.isa_format == "isa-json":
+        isajson = MTBLS.getj(options.study_id)
+        if isajson is None:
+            raise RuntimeError("Error downloading ISA study")
+
+        logger.debug(
+            "Finished downloading data. Dumping json to final location %s",
+            options.output)
+        os.makedirs(options.output)
+        json_file = os.path.join(options.output, "{}.json".format(
+            isajson['identifier']))
+        with open(json_file, 'w') as fd:
+            json.dump(isajson, fd)
+        logger.info("ISA-JSON written to %s", options.output)
+    else:
+        raise ValueError("BUG! Got an invalid isa format '{}'".format(
+            options.isa_format))
+
+
+def get_factors_command(options):
+    """Fetch the factor names of a MetaboLights study and dump them as JSON.
+
+    Writes the list to the already-open options.output file; raises
+    RuntimeError when MTBLS returns None.
+    """
+    logger.info("Getting factors for study %s. Writing to %s.",
+                options.study_id, options.output.name)
+    factor_names = MTBLS.get_factor_names(options.study_id)
+    if factor_names is not None:
+        json.dump(list(factor_names), options.output, indent=4)
+        logger.debug("Factor names written")
+    else:
+        raise RuntimeError("Error downloading factors.")
+
+
+def get_factor_values_command(options):
+    """Fetch the values of one factor of a MetaboLights study, dump as JSON.
+
+    Writes the value list to the already-open options.output file; raises
+    RuntimeError when MTBLS returns None.
+    """
+    logger.info("Getting values for factor {factor} in study {study_id}. Writing to {output_file}."
+                .format(factor=options.factor, study_id=options.study_id, output_file=options.output.name))
+    fvs = MTBLS.get_factor_values(options.study_id, options.factor)
+    if fvs is not None:
+        json.dump(list(fvs), options.output, indent=4)
+        # NOTE(review): this logs the file OBJECT repr, not options.output.name
+        logger.debug("Factor values written to {}".format(options.output))
+    else:
+        raise RuntimeError("Error getting factor values")
+
+
+def get_data_files_command(options):
+    """List a MetaboLights study's data files, optionally filtered by factors.
+
+    The factor filter comes from --json-query (inline JSON) or from a Galaxy
+    parameters file whose 'factor_value_series' entries are flattened to a
+    {factor_name: factor_value} dict; with neither, all data files are
+    listed. The result is dumped as JSON to options.output.
+    """
+    logger.info("Getting data files for study %s. Writing to %s.",
+                options.study_id, options.output.name)
+    if options.json_query:
+        logger.debug("This is the specified query:\n%s", options.json_query)
+        json_struct = json.loads(options.json_query)
+        data_files = MTBLS.get_data_files(options.study_id, json_struct)
+    elif options.galaxy_parameters_file:
+        logger.debug("Using input Galaxy JSON parameters from:\n%s",
+                     options.galaxy_parameters_file)
+        with open(options.galaxy_parameters_file) as json_fp:
+            galaxy_json = json.load(json_fp)
+            json_struct = {}
+            for fv_item in galaxy_json['factor_value_series']:
+                json_struct[fv_item['factor_name']] = fv_item['factor_value']
+            data_files = MTBLS.get_data_files(options.study_id, json_struct)
+    else:
+        logger.debug("No query was specified")
+        data_files = MTBLS.get_data_files(options.study_id)
+
+    logger.debug("Result data files list: %s", data_files)
+    if data_files is None:
+        raise RuntimeError("Error getting data files with isatools")
+
+    logger.debug("dumping data files to %s", options.output.name)
+    json.dump(list(data_files), options.output, indent=4)
+    # NOTE(review): this logs the file OBJECT repr, not options.output.name
+    logger.info("Finished writing data files to {}".format(options.output))
+
+
+def build_html_data_files_list(data_files_list):
+    """Render a sample-name / data-files table as a small HTML page.
+
+    Each item of data_files_list must provide 'sample' (str) and
+    'data_files' (iterable of str). Returns the full HTML document string.
+
+    NOTE(review): the generated rows are never closed with </tr> and, unlike
+    build_html_summary, the table is never closed with </table> -- browsers
+    tolerate this, but it is malformed HTML and should be fixed.
+    """
+    data_files_table = '<table>'
+    data_files_table += '<tr><th>Sample Name</th><th>Data File Names</th></tr>'
+    for data_file in data_files_list:
+        sample_name = data_file['sample']
+        data_files = ', '.join(data_file['data_files'])
+        data_files_table += '<tr><td>{sample_name}</td><td>{data_files}</td>' \
+            .format(sample_name=sample_name, data_files=data_files)
+    html_data_files_list = """
+    <html>
+    <head>
+    <title>ISA-Tab Factors Summary</title>
+    </head>
+    <body>
+    {summary_table}
+    </body>
+    </html>
+""".format(summary_table=data_files_table)
+    return html_data_files_list
+
+
+def build_html_summary(summary):
+    """Render a study-group / sample-count summary table as an HTML page.
+
+    Groups the per-sample dicts in 'summary' by the concatenation of their
+    non-'sample_name' key:value pairs and reports the number of samples in
+    each group. Returns the full HTML document string.
+
+    NOTE(review): the inner loop rebinds 'item', shadowing the outer loop
+    variable -- harmless here but confusing; rows also lack a closing </tr>.
+    """
+    study_groups = {}
+    for item in summary:
+        sample_name = item['sample_name']
+        study_factors = []
+        for item in [x for x in item.items() if x[0] != "sample_name"]:
+            study_factors.append(': '.join([item[0], item[1]]))
+        study_group = ', '.join(study_factors)
+        if study_group not in study_groups.keys():
+            study_groups[study_group] = []
+        study_groups[study_group].append(sample_name)
+    summary_table = '<table>'
+    summary_table += '<tr><th>Study group</th><th>Number of samples</th></tr>'
+    for item in study_groups.items():
+        study_group = item[0]
+        num_samples = len(item[1])
+        summary_table += '<tr><td>{study_group}</td><td>{num_samples}</td>' \
+            .format(study_group=study_group, num_samples=num_samples)
+    summary_table += '</table>'
+    html_summary = """
+<html>
+<head>
+<title>ISA-Tab Factors Summary</title>
+</head>
+<body>
+{summary_table}
+</body>
+</html>
+""".format(summary_table=summary_table)
+    return html_summary
+
+
+def get_summary_command(options):
+    """Fetch a MetaboLights study's variable summary; write JSON and HTML.
+
+    Dumps the summary to options.json_output and an HTML rendering (via
+    build_html_summary) to options.html_output. Raises RuntimeError when
+    MTBLS returns None.
+    """
+    logger.info("Getting summary for study %s. Writing to %s.",
+                options.study_id, options.json_output.name)
+
+    summary = MTBLS.get_study_variable_summary(options.study_id)
+    # new_summary = []
+    # for item in summary:
+    #     new_summary.append(
+    #         {k: v for k, v in item.items() if k is not "sample_name"})
+    # summary = new_summary
+    if summary is not None:
+        json.dump(summary, options.json_output, indent=4)
+        logger.debug("Summary dumped to JSON")
+        html_summary = build_html_summary(summary)
+        with options.html_output as html_fp:
+            html_fp.write(html_summary)
+    else:
+        raise RuntimeError("Error getting study summary")
+
+
+# isaslicer commands
+
+def isatab_get_data_files_list_command(options):
+    """List data files of a local (unpacked) ISA-Tab, optionally filtered.
+
+    The factor filter comes from --json-query (inline JSON) or from a Galaxy
+    parameters file flattened to a {factor_name: factor_value} dict; with
+    neither, no filter is applied. Delegates to slice_data_files() and dumps
+    the result as JSON to options.output.
+    """
+    logger.info("Getting data files for study %s. Writing to %s.",
+                options.input_path, options.output.name)
+    if options.json_query:
+        logger.debug("This is the specified query:\n%s", options.json_query)
+        json_struct = json.loads(options.json_query)
+    elif options.galaxy_parameters_file:
+        logger.debug("Using input Galaxy JSON parameters from:\n%s",
+                     options.galaxy_parameters_file)
+        with open(options.galaxy_parameters_file) as json_fp:
+            galaxy_json = json.load(json_fp)
+            json_struct = {}
+            for fv_item in galaxy_json['factor_value_series']:
+                json_struct[fv_item['factor_name']] = fv_item['factor_value']
+    else:
+        logger.debug("No query was specified")
+        json_struct = None
+    factor_selection = json_struct
+    input_path = options.input_path
+    result = slice_data_files(input_path, factor_selection=factor_selection)
+    data_files = result
+    logger.debug("Result data files list: %s", data_files)
+    if data_files is None:
+        raise RuntimeError("Error getting data files with isatools")
+
+    logger.debug("dumping data files to %s", options.output.name)
+    json.dump(list(data_files), options.output, indent=4)
+    # NOTE(review): this logs the file OBJECT repr, not options.output.name
+    logger.info("Finished writing data files to {}".format(options.output))
+
+
+def zip_get_data_files_list_command(options):
+    """List data files of a zipped ISA-Tab, optionally filtered by factors.
+
+    Extracts the zip at options.input_path into a temp dir, runs
+    slice_data_files() there with the same factor filter resolution as the
+    isa-tab variant, dumps the result as JSON to options.output, then
+    removes the temp dir.
+
+    NOTE(review): the temp dir leaks if an exception is raised before the
+    final shutil.rmtree (e.g. the RuntimeError below); a try/finally would
+    be safer.
+    """
+    logger.info("Getting data files for study %s. Writing to %s.",
+                options.input_path, options.output.name)
+    if options.json_query:
+        logger.debug("This is the specified query:\n%s", options.json_query)
+        json_struct = json.loads(options.json_query)
+    elif options.galaxy_parameters_file:
+        logger.debug("Using input Galaxy JSON parameters from:\n%s",
+                     options.galaxy_parameters_file)
+        with open(options.galaxy_parameters_file) as json_fp:
+            galaxy_json = json.load(json_fp)
+            json_struct = {}
+            for fv_item in galaxy_json['factor_value_series']:
+                json_struct[fv_item['factor_name']] = fv_item['factor_value']
+    else:
+        logger.debug("No query was specified")
+        json_struct = None
+    factor_selection = json_struct
+    input_path = options.input_path
+    with zipfile.ZipFile(input_path) as zfp:
+        tmpdir = tempfile.mkdtemp()
+        zfp.extractall(path=tmpdir)
+        result = slice_data_files(tmpdir, factor_selection=factor_selection)
+        data_files = result
+        logger.debug("Result data files list: %s", data_files)
+    if data_files is None:
+        raise RuntimeError("Error getting data files with isatools")
+    logger.debug("dumping data files to %s", options.output.name)
+    json.dump(list(data_files), options.output, indent=4)
+    logger.info("Finished writing data files to {}".format(options.output))
+    shutil.rmtree(tmpdir)
+
+
def isatab_get_data_files_collection_command(options):
    """Copy data files matching a factor query out of an unpacked ISA-Tab study.

    The query comes from ``options.json_query`` (a JSON object of
    ``{factor_name: factor_value}``) or from a Galaxy parameters JSON file;
    with no query, every sample's data files are copied to
    ``options.output_path``.
    """
    logger.info("Getting data files for study %s. Writing to %s.",
                options.input_path, options.output_path)
    input_path = options.input_path
    # the original had a second, redundant if/else above this one that
    # logged "No query was specified" twice; a single branch suffices
    if options.json_query is not None:
        logger.debug("This is the specified query:\n%s", options.json_query)
        json_struct = json.loads(options.json_query)
    elif options.galaxy_parameters_file:
        logger.debug("Using input Galaxy JSON parameters from:\n%s",
                     options.galaxy_parameters_file)
        with open(options.galaxy_parameters_file) as json_fp:
            galaxy_json = json.load(json_fp)
            # flatten the Galaxy repeat structure into {factor_name: factor_value}
            json_struct = {}
            for fv_item in galaxy_json['factor_value_series']:
                json_struct[fv_item['factor_name']] = fv_item['factor_value']
    else:
        logger.debug("No query was specified")
        json_struct = None
    factor_selection = json_struct
    data_files = slice_data_files(input_path, factor_selection=factor_selection)
    logger.debug("Result data files list: %s", data_files)
    if data_files is None:
        raise RuntimeError("Error getting data files with isatools")
    output_path = options.output_path
    logger.debug("copying data files to %s", output_path)
    for result in data_files:
        for data_file_name in result['data_files']:
            # use the module logger (the original mixed in the root logger)
            logger.info("Copying {}".format(data_file_name))
            shutil.copy(os.path.join(input_path, data_file_name), output_path)
    logger.info("Finished writing data files to {}".format(output_path))
+
+
def zip_get_data_files_collection_command(options):
    """Copy data files matching a factor query out of a zipped ISA-Tab study
    into ``options.output_path``."""
    logger.info("Getting data files for study %s. Writing to %s.",
                options.input_path, options.output_path)
    if options.json_query is not None:
        logger.debug("This is the specified query:\n%s", options.json_query)
        factor_selection = json.loads(options.json_query)
    else:
        logger.debug("No query was specified")
        factor_selection = None
    output_path = options.output_path
    tmpdir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(options.input_path) as zfp:
            zfp.extractall(path=tmpdir)
        data_files = slice_data_files(tmpdir, factor_selection=factor_selection)
        logger.debug("Result data files list: %s", data_files)
        if data_files is None:
            raise RuntimeError("Error getting data files with isatools")
        logger.debug("copying data files to %s", output_path)
        for result in data_files:
            for data_file_name in result['data_files']:
                # module logger instead of the root logger, for consistency
                logger.info("Copying {}".format(data_file_name))
                shutil.copy(os.path.join(tmpdir, data_file_name), output_path)
        logger.info("Finished writing data files to {}".format(output_path))
    finally:
        # always remove the extraction directory — the original leaked it
        # whenever an exception was raised before the trailing rmtree
        shutil.rmtree(tmpdir)
+
+
def slice_data_files(dir, factor_selection=None):
    """Collect samples (optionally filtered by factor values) and their data
    files from the ISA-Tab tables found directly under *dir*.

    :param dir: directory containing the study (s_*) and assay (a_*) tables
    :param factor_selection: optional {factor_name: factor_value} filter
    :return: list of dicts with keys 'sample', 'data_files' and, when a
        filter was used, 'query_used'
    """
    results = []
    # First pass: collect the matching sample names.
    # BUG FIX: the original pattern '[a|s]_*' is a character class containing
    # 'a', '|' and 's', so it also matched files starting with '|';
    # '[as]_*' expresses the intent (assay and study tables).
    for table_file in glob.iglob(os.path.join(dir, '[as]_*')):
        logger.info('Loading {table_file}'.format(table_file=table_file))

        # BUG FIX: iglob already yields paths that include *dir*; the
        # original re-joined them with *dir*, which only resolved correctly
        # when *dir* was an absolute path.
        with open(table_file) as fp:
            df = isatab.load_table(fp)

            if factor_selection is None:
                # no filter: keep every distinct sample name
                for _, sample_name in df['Sample Name'].items():
                    if any(r['sample'] == sample_name for r in results):
                        continue
                    results.append({
                        'sample': sample_name,
                        'data_files': []
                    })
            else:
                for factor_name, factor_value in factor_selection.items():
                    fv_column = 'Factor Value[{}]'.format(factor_name)
                    if fv_column not in list(df.columns.values):
                        continue
                    matches = df.loc[df[fv_column] == factor_value][
                        'Sample Name'].items()
                    for _, sample_name in matches:
                        if any(r['sample'] == sample_name for r in results):
                            continue
                        results.append({
                            'sample': sample_name,
                            'data_files': [],
                            'query_used': factor_selection
                        })

    # Data file column headers recognised in assay tables.
    data_node_labels = [
        'Raw Data File',
        'Raw Spectral Data File',
        'Derived Spectral Data File',
        'Derived Array Data File',
        'Array Data File',
        'Protein Assignment File',
        'Peptide Assignment File',
        'Post Translational Modification Assignment File',
        'Acquisition Parameter Data File',
        'Free Induction Decay Data File',
        'Derived Array Data Matrix File',
        'Image File',
        'Derived Data File',
        'Metabolite Assignment File']

    # Second pass: attach the data files relating to each sample.
    for result in results:
        sample_name = result['sample']
        for table_file in glob.iglob(os.path.join(dir, 'a_*')):
            with open(table_file) as fp:
                df = isatab.load_table(fp)
                sample_rows = df.loc[df['Sample Name'] == sample_name]
                data_files = []
                for node_label in data_node_labels:
                    if node_label in list(df.columns.values):
                        data_files.extend(list(sample_rows[node_label]))
                # NOTE(review): as in the original, each assay table
                # overwrites the previous assignment, so the last a_* file
                # wins for samples appearing in several assays — confirm
                # this is intended.
                result['data_files'] = [i for i in data_files
                                        if str(i) != 'nan']
    return results
+
+
def isatab_get_factor_names_command(options):
    """Write the unique factor names found in an unpacked ISA-Tab study to
    ``options.output`` as a JSON list."""
    input_path = options.input_path
    logger.info("Getting factors for study %s. Writing to %s.",
                input_path, options.output.name)
    _RX_FACTOR_VALUE = re.compile(r'Factor Value\[(.*?)\]')
    factors = set()
    # BUG FIX: '[as]_*' matches study and assay tables; the original
    # '[a|s]_*' character class also matched files starting with '|'.
    for table_file in glob.iglob(os.path.join(input_path, '[as]_*')):
        # BUG FIX: iglob already yields the full path; the original
        # re-joined it with input_path, which broke for relative paths.
        with open(table_file) as fp:
            df = isatab.load_table(fp)
            for header in df.columns.values:
                match = _RX_FACTOR_VALUE.match(str(header))
                if match:
                    # capture group = name inside 'Factor Value[...]'
                    # (replaces the magic slice header[13:-1])
                    factors.add(match.group(1))
    # NB: the original guarded on `factors is not None`, which a set can
    # never be — the error branch was unreachable and has been removed.
    json.dump(list(factors), options.output, indent=4)
    logger.debug("Factor names written")
+
+
def zip_get_factor_names_command(options):
    """Write the unique factor names found in a zipped ISA-Tab study to
    ``options.output`` as a JSON list."""
    input_path = options.input_path
    logger.info("Getting factors for study %s. Writing to %s.",
                input_path, options.output.name)
    _RX_FACTOR_VALUE = re.compile(r'Factor Value\[(.*?)\]')
    factors = set()
    tmpdir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(input_path) as zfp:
            zfp.extractall(path=tmpdir)
        # '[as]_*' matches study and assay tables; the original '[a|s]_*'
        # character class also matched files starting with '|'
        for table_file in glob.iglob(os.path.join(tmpdir, '[as]_*')):
            # module logger instead of the root logger, for consistency
            logger.info('Searching {}'.format(table_file))
            # iglob already yields the full path; no need to re-join
            with open(table_file) as fp:
                df = isatab.load_table(fp)
                for header in df.columns.values:
                    match = _RX_FACTOR_VALUE.match(str(header))
                    if match:
                        # capture group = name inside 'Factor Value[...]'
                        factors.add(match.group(1))
    finally:
        # always remove the extraction directory — the original leaked it
        # whenever an exception was raised before the trailing rmtree
        shutil.rmtree(tmpdir)
    # the original's `factors is not None` guard was unreachable (a set is
    # never None), so the error branch has been removed
    json.dump(list(factors), options.output, indent=4)
    logger.debug("Factor names written")
+
+
def isatab_get_factor_values_command(options):
    """Write the distinct values of one factor in an unpacked ISA-Tab study
    to ``options.output`` as a JSON list."""
    logger.info("Getting values for factor {factor} in study {input_path}. Writing to {output_file}."
                .format(factor=options.factor, input_path=options.input_path, output_file=options.output.name))
    fvs = set()

    input_path = options.input_path
    fv_column = 'Factor Value[{factor}]'.format(factor=options.factor)

    # '[as]_*' matches study and assay tables; the original '[a|s]_*'
    # character class also matched files starting with '|'
    for table_file in glob.iglob(os.path.join(input_path, '[as]_*')):
        # BUG FIX: iglob already yields the full path; the original
        # re-joined it with input_path, which broke for relative paths.
        with open(table_file) as fp:
            df = isatab.load_table(fp)

            if fv_column in list(df.columns.values):
                # items() replaces the deprecated Series.iteritems()
                for _, match in df[fv_column].items():
                    try:
                        # unwrap numpy scalars to plain Python values
                        match = match.item()
                    except AttributeError:
                        pass
                    if isinstance(match, (str, int, float)):
                        if str(match) != 'nan':
                            fvs.add(match)
    # the original's `fvs is not None` guard was unreachable (a set is
    # never None), so the error branch has been removed
    json.dump(list(fvs), options.output, indent=4)
    logger.debug("Factor values written to {}".format(options.output))
+
+
def zip_get_factor_values_command(options):
    """Write the distinct values of one factor in a zipped ISA-Tab study
    to ``options.output`` as a JSON list."""
    input_path = options.input_path
    # the original emitted two overlapping startup messages; one suffices
    logger.info("Getting values for factor {factor} in study {input_path}. "
                "Writing to {output_file}.".format(
                    factor=options.factor, input_path=input_path,
                    output_file=options.output.name))
    fvs = set()
    fv_column = 'Factor Value[{factor}]'.format(factor=options.factor)

    tmpdir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(input_path) as zfp:
            zfp.extractall(path=tmpdir)
        # '[as]_*' matches study and assay tables; the original '[a|s]_*'
        # character class also matched files starting with '|'
        for table_file in glob.glob(os.path.join(tmpdir, '[as]_*')):
            logger.info('Searching {}'.format(table_file))
            # BUG FIX: the original opened os.path.join(input_path,
            # table_file) — the extracted tables live under tmpdir; it only
            # worked because table_file is absolute and os.path.join drops
            # the first component for absolute second arguments.
            with open(table_file) as fp:
                df = isatab.load_table(fp)
                if fv_column in list(df.columns.values):
                    # items() replaces the deprecated Series.iteritems()
                    for _, match in df[fv_column].items():
                        try:
                            # unwrap numpy scalars to plain Python values
                            match = match.item()
                        except AttributeError:
                            pass
                        if isinstance(match, (str, int, float)):
                            if str(match) != 'nan':
                                fvs.add(match)
    finally:
        # always remove the extraction directory — the original leaked it
        # whenever an exception was raised before the trailing rmtree
        shutil.rmtree(tmpdir)
    # the original's `fvs is not None` guard was unreachable (a set is
    # never None), so the error branch has been removed
    json.dump(list(fvs), options.output, indent=4)
    logger.debug("Factor values written to {}".format(options.output))
+
+
def isatab_get_factors_summary_command(options):
    """Dump a per-sample summary of the varying factor values of an unpacked
    ISA-Tab study as JSON to ``options.output``."""
    logger.info("Getting summary for study %s. Writing to %s.",
                options.input_path, options.output.name)
    investigation = isatab.load(options.input_path)

    records = []
    for study in investigation.studies:
        for sample in study.samples:
            record = {
                'sample_name': sample.name,
            }
            for fv in sample.factor_values:
                if isinstance(fv.value, (str, int, float)):
                    record[fv.factor_name.name] = fv.value
                elif isinstance(fv.value, OntologyAnnotation):
                    record[fv.factor_name.name] = fv.value.term
            records.append(record)

    frame = pd.DataFrame(records)
    # drop columns with a single unique value: they do not vary across
    # samples and add nothing to the summary
    uniques = frame.apply(pd.Series.nunique)
    frame = frame.drop(uniques[uniques == 1].index, axis=1)
    summary = frame.to_dict(orient='records')
    if summary is not None:
        json.dump(summary, options.output, indent=4)
        logger.debug("Summary dumped to JSON")
    else:
        raise RuntimeError("Error getting study summary")
+
+
def zip_get_factors_summary_command(options):
    """Write a per-sample summary of the varying factor values of a zipped
    ISA-Tab study, as JSON (``options.json_output``) and as HTML
    (``options.html_output``)."""
    logger.info("Getting summary for study %s. Writing to %s.",
                options.input_path, options.json_output.name)
    tmpdir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(options.input_path) as zfp:
            zfp.extractall(path=tmpdir)
        ISA = isatab.load(tmpdir)
        samples_and_fvs = []
        for study in ISA.studies:
            for sample in study.samples:
                sample_and_fvs = {
                    'sample_name': sample.name,
                }
                for fv in sample.factor_values:
                    if isinstance(fv.value, (str, int, float)):
                        sample_and_fvs[fv.factor_name.name] = fv.value
                    elif isinstance(fv.value, OntologyAnnotation):
                        sample_and_fvs[fv.factor_name.name] = fv.value.term
                samples_and_fvs.append(sample_and_fvs)
        df = pd.DataFrame(samples_and_fvs)
        # drop columns with a single unique value: they do not vary
        nunique = df.apply(pd.Series.nunique)
        df = df.drop(nunique[nunique == 1].index, axis=1)
        summary = df.to_dict(orient='records')
        json.dump(summary, options.json_output, indent=4)
        logger.debug("Summary dumped to JSON")
        # (a stray debug print of the summary to stdout has been removed)
        # NOTE(review): build_html_summary is not defined in this part of
        # the module — confirm it exists elsewhere in the file.
        html_summary = build_html_summary(summary)
        with options.html_output as html_fp:
            html_fp.write(html_summary)
    finally:
        # always remove the extraction directory — the original leaked it
        # whenever an exception was raised before the trailing rmtree
        shutil.rmtree(tmpdir)
+
+
def get_study_groups(input_path):
    """Group sample names by their combination of varying study variables.

    :param input_path: path to an unpacked ISA-Tab study
    :return: dict mapping a tuple of variable values to the list of sample
        names sharing that combination

    BUG FIX: the original called
    ``isatab_get_factors_summary_command(input_path=...)``, but that function
    takes an argparse namespace and returns None, so iterating the result
    always failed.  The per-sample summary is obtained from
    :func:`get_study_variable_summary` instead, whose records use the key
    'sample_name' (and 'source_name', which is excluded from the grouping
    key).  NOTE(review): confirm the intended grouping excludes source names.
    """
    factors_summary = get_study_variable_summary(input_path)
    study_groups = {}
    for factors_item in factors_summary:
        fvs = tuple(v for k, v in factors_item.items()
                    if k not in ('sample_name', 'source_name'))
        study_groups.setdefault(fvs, []).append(factors_item['sample_name'])
    return study_groups
+
+
def get_study_groups_samples_sizes(input_path):
    """Return (factor_value_tuple, number_of_samples) pairs, one per study
    group found under *input_path*."""
    groups = get_study_groups(input_path=input_path)
    return [(fvs, len(members)) for fvs, members in groups.items()]
+
+
def get_sources_for_sample(input_path, sample_name):
    """Return the names of the source materials a named sample derives from.

    :param input_path: path to an unpacked ISA-Tab study
    :param sample_name: name of the sample to look up
    :return: list of source names (possibly empty)
    """
    ISA = isatab.load(input_path)
    hits = []
    for study in ISA.studies:
        for sample in study.samples:
            if sample.name == sample_name:
                # use the module logger instead of print(), consistent with
                # the rest of the module
                logger.info('found a hit: {sample_name}'.format(
                    sample_name=sample.name))
                for source in sample.derives_from:
                    hits.append(source.name)
    return hits
+
+
def get_data_for_sample(input_path, sample_name):
    """Return the data file objects generated from a named sample.

    :param input_path: path to an unpacked ISA-Tab study
    :param sample_name: name of the sample to look up
    :return: list of data file objects (possibly empty)
    """
    ISA = isatab.load(input_path)
    hits = []
    for study in ISA.studies:
        for assay in study.assays:
            for data in assay.data_files:
                if sample_name in [x.name for x in data.generated_from]:
                    # BUG FIX: the original format string had lost its
                    # placeholder ('found a hit: (unknown)'), so the
                    # filename keyword was never rendered
                    logger.info('found a hit: {filename}'.format(
                        filename=data.filename))
                    hits.append(data)
    return hits
+
+
def get_study_groups_data_sizes(input_path):
    """Return (factor_value_tuple, group_size) pairs, one per study group.

    NOTE(review): this is currently identical to
    get_study_groups_samples_sizes — it counts samples per group, not data
    files.  Confirm that is the intended behaviour.
    """
    groups = get_study_groups(input_path=input_path)
    return [(fvs, len(members)) for fvs, members in groups.items()]
+
+
def get_characteristics_summary(input_path):
    """Generate a characteristics summary for a MetaboLights study.

    :param input_path: Input path to ISA-tab
    :return: A list of dicts summarising the set of characteristic names
        and values associated with each sample

    Note: it only returns a summary of characteristics with variable values.

    Example usage:
        characteristics_summary = get_characteristics_summary('/path/to/my/study/')
        [
            {
                "name": "6089if_9",
                "Variant": "Synechocystis sp. PCC 6803.sll0171.ko"
            },
            {
                "name": "6089if_43",
                "Variant": "Synechocystis sp. PCC 6803.WT.none"
            },
        ]
    """
    investigation = isatab.load(input_path)

    records = []
    for study in investigation.studies:
        for sample in study.samples:
            record = {
                'name': sample.name
            }
            for source in sample.derives_from:
                for characteristic in source.characteristics:
                    value = characteristic.value
                    if isinstance(value, (str, int, float)):
                        record[characteristic.category.term] = value
                    elif isinstance(value, OntologyAnnotation):
                        record[characteristic.category.term] = value.term
            records.append(record)

    frame = pd.DataFrame(records)
    # keep only characteristics whose value varies across samples
    counts = frame.apply(pd.Series.nunique)
    frame = frame.drop(counts[counts == 1].index, axis=1)
    return frame.to_dict(orient='records')
+
+
def get_study_variable_summary(input_path):
    """Summarise, per sample, every varying study variable: factor values
    plus the characteristics of the sample's sources.

    :param input_path: path to an unpacked ISA-Tab study
    :return: list of dicts, one per sample, with 'sample_name',
        'source_name' and one key per varying variable
    """
    investigation = isatab.load(input_path)

    records = []
    for study in investigation.studies:
        for sample in study.samples:
            record = {
                'sample_name': sample.name
            }

            for fv in sample.factor_values:
                if isinstance(fv.value, (str, int, float)):
                    record[fv.factor_name.name] = fv.value
                elif isinstance(fv.value, OntologyAnnotation):
                    record[fv.factor_name.name] = fv.value.term

            for source in sample.derives_from:
                # as in the original: if a sample has several sources the
                # last one's name wins
                record['source_name'] = source.name
                for characteristic in source.characteristics:
                    value = characteristic.value
                    if isinstance(value, (str, int, float)):
                        record[characteristic.category.term] = value
                    elif isinstance(value, OntologyAnnotation):
                        record[characteristic.category.term] = value.term

            records.append(record)

    frame = pd.DataFrame(records)
    # keep only variables whose value varies across samples
    counts = frame.apply(pd.Series.nunique)
    frame = frame.drop(counts[counts == 1].index, axis=1)
    return frame.to_dict(orient='records')
+
+
def get_study_group_factors(input_path):
    """Return the distinct factor-value combinations found in the study's
    table files, as a list of dicts (one per combination)."""
    factors_list = []
    # '[as]_*' matches study and assay tables; the original '[a|s]_*'
    # character class also matched files starting with '|'
    for table_file in glob.iglob(os.path.join(input_path, '[as]_*')):
        # BUG FIX: iglob already yields the full path; the original
        # re-joined it with input_path, which broke for relative paths
        with open(table_file) as fp:
            df = isatab.load_table(fp)
            factor_columns = [x for x in df.columns
                              if x.startswith('Factor Value')]
            if len(factor_columns) > 0:
                # NOTE(review): as in the original, each table overwrites
                # the previous result — the last table with factor columns
                # wins; confirm this is intended
                factors_list = df[factor_columns].drop_duplicates() \
                    .to_dict(orient='records')
    return factors_list
+
+
def get_filtered_df_on_factors_list(input_path):
    """Print sample/source/raw-file groupings for every distinct factor-value
    combination in the study, and return the pandas query strings used."""
    factors_list = get_study_group_factors(input_path=input_path)
    queries = []

    for item in factors_list:
        clauses = []
        for k, v in item.items():
            # pandas.query identifiers cannot contain spaces or brackets
            k = k.replace(' ', '_').replace('[', '_').replace(']', '_')
            if isinstance(v, str):
                v = v.replace(' ', '_').replace('[', '_').replace(']', '_')
                clauses.append("{k} == '{v}'".format(k=k, v=v))
        queries.append(' and '.join(clauses))

    # '[as]_*' matches study and assay tables; the original '[a|s]_*'
    # character class also matched files starting with '|'
    for table_file in glob.iglob(os.path.join(input_path, '[as]_*')):
        # iglob already yields the full path; the original re-joined it
        # with input_path, which broke for relative paths
        with open(table_file) as fp:
            df = isatab.load_table(fp)
            # normalise headers the same way as the query identifiers
            # (the original did this in three separate passes)
            df.columns = df.columns.map(
                lambda x: x.replace(' ', '_').replace('[', '_')
                .replace(']', '_') if isinstance(x, str) else x)

        for query in queries:
            # query uses pandas.eval, which evaluates queries like pure
            # Python notation
            df2 = df.query(query)
            if 'Sample_Name' in df.columns:
                print('Group: {query} / Sample_Name: {sample_name}'.format(
                    query=query, sample_name=list(df2['Sample_Name'])))

            if 'Source_Name' in df.columns:
                print('Group: {} / Sources_Name: {}'.format(
                    query, list(df2['Source_Name'])))

            if 'Raw_Spectral_Data_File' in df.columns:
                # BUG FIX: the original format string had lost its
                # placeholder (printed '(unknown)') and sliced the query
                # oddly; print the file list against the full query
                print('Group: {query} / Raw_Spectral_Data_File: {filename}'
                      .format(query=query,
                              filename=list(df2['Raw_Spectral_Data_File'])))
    return queries
+
+
def datatype_get_summary_command(options):
    """Write the per-sample study-variable summary for a study as JSON to
    ``options.output``."""
    logger.info("Getting summary for study %s. Writing to %s.",
                options.study_id, options.output.name)

    summary = get_study_variable_summary(options.study_id)
    # (a stray debug print of the whole summary to stdout has been removed)
    if summary is not None:
        json.dump(summary, options.output, indent=4)
        logger.debug("Summary dumped")
    else:
        raise RuntimeError("Error getting study summary")
+
+
+# logging and argument parsing
+
+def _configure_logger(options):
+    logging_level = getattr(logging, options.log_level, logging.INFO)
+    logging.basicConfig(level=logging_level)
+
+    global logger
+    logger = logging.getLogger()
+    logger.setLevel(logging_level)  # there's a bug somewhere.  The level set through basicConfig isn't taking effect
+
+
def _parse_args(args):
    """Parse the given command-line arguments with the module's parser."""
    return make_parser().parse_args(args)
+
+
def main(args):
    """Entry point: parse arguments, set up logging, and dispatch to the
    subcommand handler chosen by argparse (``options.func``)."""
    opts = _parse_args(args)
    _configure_logger(opts)
    opts.func(opts)
+
+
if __name__ == '__main__':
    try:
        main(sys.argv[1:])
        sys.exit(0)
    except Exception as e:
        # If the failure happened before _configure_logger ran, the module
        # logger is still None; fall back to the root logger so the
        # traceback is not masked by an AttributeError.
        log = logger if logger is not None else logging.getLogger()
        # exception() records the message together with the traceback; the
        # original additionally called logger.error(e), which logged the
        # same message twice.
        log.exception(e)
        sys.exit(e.code if hasattr(e, "code") else 99)