Mercurial > repos > pmac > iterativepca
diff iterative_pca.py @ 0:64e75e21466e draft default tip
Uploaded
author | pmac |
---|---|
date | Wed, 01 Jun 2016 03:38:39 -0400 |
parents | |
children |
line wrap: on
line diff
import os
import subprocess
import sys
import argparse
import datetime
import csv

import jinja2

import pedify

JINJA_ENVIRONMENT = jinja2.Environment(
    loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
    extensions=['jinja2.ext.autoescape'],
    autoescape=True)
DATA_TYPES = ["variant_data", "rds", "rdata", "numeric_ped"]
CLUSTERING_METHODS = ["dbscan", "hclust", "kmeans"]
CLUSTER_TRIMMING = ["sd", "mcd", "dbscan_outliers_only"]
HTML_TEMPLATE_NAME = "pca_report.html"
DEFAULT_SD_CUTOFF = 2
PATH_TO_R_FUNCTIONS = "{}/R_functions".format(os.path.dirname(os.path.abspath(__file__)))


def main():
    """Drive the iterative PCA pipeline.

    Parses the command line, validates conditionally-required options,
    optionally converts a variant-data text file to a numeric ped file
    (stage_text_to_ped), runs the R plotting script for the requested number
    of iterations, and finally renders an html report (produce_report).

    Exits non-zero via parser.error()/sys.exit(1) on invalid input.
    """
    ###===ARGUMENT PARSING===###
    parser = argparse.ArgumentParser()
    parser.add_argument("datafile_name", help="name of data file")
    parser.add_argument("data_type", type=str.lower, choices=DATA_TYPES, help="type of data file")
    parser.add_argument("iterations", help="max number of iterations to run",
                        type=int)
    parser.add_argument("--control_tag", help="substring present in the ids of all control samples, e.g. LP")
    parser.add_argument("--cases_tag", help="substring present in the ids of all case samples, e.g. HAPS")
    parser.add_argument("--config_file", help="name of file containing config info")
    parser.add_argument("--clustering_flag", help="Flag to indicate whether to do clustering", action="store_true")
    parser.add_argument("--clustering_method", type=str.lower, choices=CLUSTERING_METHODS,
                        help="algorithm used to find clusters")
    parser.add_argument("--cluster_trimming", type=str.lower, choices=CLUSTER_TRIMMING,
                        help="method used to identify and trim outlier clusters")
    parser.add_argument("-s", "--sd_cutoff", help="number of standard deviations at which to cut outliers")
    parser.add_argument("-o", "--out", help="base name of output files and directories; defaults to datafile name")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-r", "--relative_prefix", help="RELATIVE path CONTAINING the output directory")
    group.add_argument("-a", "--absolute_prefix", help="ABSOLUTE path CONTAINING the output directory")
    parser.add_argument("--html_file", help="ABSOLUTE PATH of output html file")
    parser.add_argument("--ethnicity_file", help="File containing data about the ethnicity of the samples")
    parser.add_argument("--galaxy", help="Flag to indicate we are running this script in Galaxy",
                        action="store_true")
    parser.add_argument("--reject_snps", help="file containing SNP ids to ignore/remove")
    parser.add_argument("--reject_samples", help="file containing sample ids to ignore/remove")

    args = parser.parse_args()

    # enforce conditionally required args
    if args.data_type == "variant_data":
        if args.config_file is None:
            parser.error("Input variant data files require a complementary config file "
                         "specified with '--config_file <filename>' flag")

    if args.config_file:
        if args.data_type != "variant_data":
            print("Warning! Your config file will be ignored as the input data is not a "
                  "text file containing variant data")

    if args.clustering_flag:
        if (args.clustering_method is None) or (args.cluster_trimming is None):
            parser.error("If --clustering_flag is set, "
                         "--clustering_method and --cluster_trimming options must also be specified")

    if args.clustering_method or args.cluster_trimming:
        if not args.clustering_flag:
            parser.error("--clustering_method and --cluster_trimming "
                         "options only valid if --clustering_flag is also set")

    # dbscan outliers only valid if cluster method == "DBSCAN"
    if args.cluster_trimming == "dbscan_outliers_only":
        if args.clustering_method != "dbscan":
            parser.error("dbscan_outliers_only cluster trimming method only "
                         "valid if --clustering_method == dbscan")

    dfname = args.datafile_name
    data_type = args.data_type
    n = args.iterations
    control_tag = args.control_tag
    cases_tag = args.cases_tag
    cfname = args.config_file
    cflag = args.clustering_flag
    cmethod = (args.clustering_method if args.clustering_method else "none")
    tmethod = (args.cluster_trimming if args.cluster_trimming else "none")
    sd_cutoff = (args.sd_cutoff if args.sd_cutoff else DEFAULT_SD_CUTOFF)
    basename = (args.out if args.out else dfname.split('/')[-1].split('.')[0])
    hfname = args.html_file
    galaxy_flag = args.galaxy
    ethfname = args.ethnicity_file
    xsnps_filename = args.reject_snps
    xsamples_filename = args.reject_samples

    # undo galaxy's character escaping before the config file is parsed
    if galaxy_flag and cfname:
        rc = format_galaxy_config_file(cfname)
        if rc != 0:
            print("Bad config file")
            # was sys.exit(): exiting without an argument reports success (0)
            sys.exit(1)

    path_here = os.path.dirname(os.path.abspath(__file__))
    # determine output directory
    if args.relative_prefix:
        outdir = "{}/{}/full_output_{}".format(path_here, args.relative_prefix, basename)
    elif args.absolute_prefix:
        outdir = "{}/full_output_{}".format(args.absolute_prefix, basename)
    else:
        outdir = "{}/full_output_{}".format(path_here, basename)
    if hfname is None:
        hfname = "{}/{}.html".format(outdir, basename)

    ####===BEGIN PIPELINE===####
    # create a log file
    lfname = "{}.log".format(basename)
    logfile = open(lfname, 'w')
    logfile.write("##Iterative PCA commenced at {}##\n".format(datetime.datetime.now()))

    # create output directory
    prepare_directory(outdir, logfile)

    # do pre-processing if necessary, else go straight to R
    if data_type == "variant_data":
        stage1_info = stage_text_to_ped(dfname, cfname, xsamples_filename,
                                        xsnps_filename, basename, outdir, logfile, True)
        data_source_name = "{}/{}".format(outdir, stage1_info['pfname'])
        data_type_out = "numeric_ped"
        xsamples_filename = "{}/{}".format(outdir, stage1_info['xsname'])
    elif data_type in ["rds", "rdata", "numeric_ped"]:
        data_source_name = dfname
        data_type_out = data_type
    else:
        print("Unrecognised input data type, exiting...")
        # was sys.exit(0): a failure path must not signal success
        sys.exit(1)

    # do iterative pca in R (much faster); argv list instead of a shell
    # string so filenames with quotes/spaces cannot break the command.
    # str() reproduces the old behaviour of passing 'None' for unset options.
    r_command = ["Rscript", "{}/iterative_pca_plot.R".format(path_here),
                 str(n), outdir, basename, data_source_name,
                 data_type_out, str(ethfname), str(control_tag), str(cases_tag),
                 str(sd_cutoff), cmethod, tmethod, PATH_TO_R_FUNCTIONS,
                 str(xsamples_filename), str(xsnps_filename)]
    subprocess.call(r_command)

    # generate dictionary of output file names
    if cflag:
        num_plots = 1
    else:
        num_plots = 2
    if ethfname:
        num_plots += 1
    if cases_tag or control_tag:
        num_plots += 1
    iteration_data = create_output_dict(outdir, basename, n, num_plots)

    # clean up logfile
    logfile.write("##Iterative PCA completed at {}##\n".format(datetime.datetime.now()))
    logfile.close()
    subprocess.call(["mv", lfname, outdir])
    lfname = "{}/{}".format(outdir, lfname)
    if cfname:
        # copy and rename config file into outdir (if provided)
        subprocess.call(["cp", cfname, "{}/{}.config".format(outdir, basename)])
    # output results summary in a nice html format
    produce_report(hfname, dfname, lfname, cflag,
                   cmethod, tmethod, sd_cutoff, iteration_data, galaxy_flag)
##############################################################################
def stage_text_to_ped(dfname, cfname, xsamples_filename, xsnps_filename, basename, outdir, logfile, numeric_flag):
    """Convert a text file containing variant data into a ped/map file pair.

    Inputs:
    - dfname: name of the text file to convert
    - cfname: config file with column locations and filtering options
    - xsamples_filename: file containing exact sample ids to exclude (or None)
    - xsnps_filename: file containing exact snp ids to exclude (or None)
    - basename: filename without the extension
    - outdir: full path of output directory
    - logfile: open log file; status lines are appended to it
    - numeric_flag: if True the ped file is numeric, otherwise genomic

    Writes <basename>.ped, <basename>.map and <basename>_xsamples.txt (ids of
    samples rejected by the filters) into outdir.

    Returns a dict with the NAMES (not paths) of the output files plus the
    control/cases tags taken from the parsed config.
    """
    logfile.write("##STAGE 1: Create .ped and .map files from input text file##\n")

    pfname = "{}/{}.ped".format(outdir, basename)
    mfname = "{}/{}.map".format(outdir, basename)
    xsname = "{}/{}_xsamples.txt".format(outdir, basename)

    pc = pedify.PedConverter()
    # read in config file
    rc = pc.read_config_file(cfname)
    # Add in string filters for excluded samples and snps
    if xsamples_filename:
        with open(xsamples_filename, 'r') as xs_file:
            xsamples = xs_file.read().split('\n')
        pc.cfg.add_string_filter("comp_generated_sample_cuts", pc.cfg.col_names['sample_id_column'],
                                 "exact", "reject", xsamples)
    if xsnps_filename:
        with open(xsnps_filename, 'r') as xsnp_file:
            xsnps = xsnp_file.read().split('\n')
        pc.cfg.add_string_filter("comp_generated_snp_cuts", pc.cfg.col_names['variant_id_column'],
                                 "exact", "reject", xsnps)
    if rc == 0:
        print('config file read successfully')
    else:
        # NOTE(review): as in the original, a bad config is reported but
        # processing still continues — confirm this best-effort is intended
        print('failed to read in config file successfully. Error code: {}'.format(rc))

    # read in data file
    rc = pc.read_data_file(dfname)
    if rc == 0:
        print('data file read successfully')
    else:
        print('failed to read in data file successfully. Error code: {}'.format(rc))

    # create output
    pc.create_ped_file(pfname, numeric=numeric_flag)
    pc.create_map_file(mfname)
    pc.create_excluded_samples_file(xsname)
    outdict = {
        "pfname": "{}.ped".format(basename),
        "mfname": "{}.map".format(basename),
        "xsname": "{}_xsamples.txt".format(basename),
        "control_tag": pc.cfg.control_info['control_tag']['tag'],
        "cases_tag": pc.cfg.control_info['cases_tag']['tag']
    }

    return outdict


def produce_report(hfname, dfname, lfname, cflag, cmethod, tmethod, sd_cutoff, iteration_list, galaxy_flag):
    """Produce the html report of all the data.

    Inputs:
    - hfname: ABSOLUTE PATH (including the filename) of the html file to write
    - dfname: path to the data file
    - lfname: path to the log file
    - cflag/cmethod/tmethod/sd_cutoff: clustering settings echoed in the report
    - iteration_list: list of dicts, each containing AT LEAST outdir, count,
      plot_paths, xsname and ofname for one iteration
    - galaxy_flag: when True, image paths are built from the last two path
      components instead of os.path.relpath (galaxy serving quirk)

    Returns the html string that was written to the html file.
    """
    hfdir = os.path.dirname(hfname)
    iteration_data = []
    for it in iteration_list:
        outdir = it['outdir']
        if galaxy_flag:
            dirs = outdir.split("/")
            relpath = "{}/{}".format(dirs[-2], dirs[-1])
        else:
            relpath = os.path.relpath(outdir, hfdir)

        ofname = "{}/{}".format(outdir, it['ofname'])
        xsname = "{}/{}".format(outdir, it['xsname'])
        # context managers guarantee the per-iteration files are closed
        with open(ofname, 'r') as outlier_file:
            ol = [line.split(',') for line in outlier_file]
        with open(xsname, 'r') as excluded_file:
            xl = [line.split(',') for line in excluded_file]
        # fix the paths to the images so they resolve relative to the html file
        rel_plot_paths = ["{}/{}".format(relpath, pp) for pp in it['plot_paths']]

        iteration_data.append({
            'count': it['count'],
            'ol': ol,
            'xl': xl,
            'rel_plot_paths': rel_plot_paths,
        })

    with open(lfname, 'r') as log_file:
        logcontent = log_file.read()
    template_values = {
        "date": datetime.datetime.now(),
        "dfname": dfname,
        "lfname": lfname,
        "logcontent": logcontent,
        "iteration_data": iteration_data,
        "cflag": cflag,
        "cmethod": cmethod,
        "tmethod": tmethod,
        "sd_cutoff": sd_cutoff,
    }
    template = JINJA_ENVIRONMENT.get_template(HTML_TEMPLATE_NAME)
    rendered_template = template.render(template_values)
    # render first, write second: the old code truncated the output file
    # before rendering, losing any previous report if rendering failed
    with open(hfname, 'w') as htmldoc:
        htmldoc.write(rendered_template)
    return rendered_template


def create_output_dict(outdir, basename, n, num_plots):
    """Generate the expected directory structure of the root output directory.

    Given the basename, the number of iterations n and the number of plots the
    R script emits per iteration, return a list of per-iteration dicts with
    keys outdir, ofname, xsname, count and plot_paths (names match what
    produce_report consumes).
    """
    iteration_list = []
    for i in range(n):
        i_outdir = "{}/output_{}_iteration_{}".format(outdir, basename, i)
        iteration_list.append({
            'outdir': i_outdir,
            'ofname': "{}_outliers.txt".format(basename),
            'xsname': "{}_xfile.txt".format(basename),
            'count': i,
            # plot numbering is 1-based to match the R script's file names
            'plot_paths': ["{}_plot_number_{}.png".format(basename, j + 1)
                           for j in range(num_plots)],
        })
    return iteration_list


###########################UTILITY########################################
def prepare_directory(outdir, logfile):
    """Recreate outdir from scratch, logging the equivalent shell commands."""
    logfile.write("##Setting up output directory:{}##\n".format(outdir))
    # argv lists instead of shell=True strings: paths with quotes/spaces
    # cannot break (or inject into) the command
    subprocess.call(["rm", "-rf", outdir])
    logfile.write("rm -rf '{}'\n".format(outdir))
    subprocess.call(["mkdir", "-p", outdir])
    logfile.write("mkdir -p '{}'\n".format(outdir))


def merge_dicts(dictList):
    """Merge a list of dicts into one dict; non-dict entries are skipped.

    Later entries win on key collisions.
    """
    result = {}
    for d in dictList:
        if isinstance(d, dict):
            result.update(d)
    return result
# Some odd rules regarding character escaping and such in galaxy: fix those here
def format_galaxy_config_file(cfname):
    """Rewrite cfname in place, undoing galaxy's escaping rules.

    Two transformations are applied:
    - within the #numeric_filters section, letter op-codes (g, l, e, ge, le)
      in the third comma-separated field are mapped back to comparison
      operators; fields already holding an operator pass through unchanged
    - the escape sequence "__pd__" is replaced by '#' on every line

    Returns 0 on success; 1 (leaving the file untouched) when a filter line
    contains an unrecognised op-code.
    """
    CHAR_CONVERSION = {
        'g': '>',
        'l': '<',
        'e': '==',
        # BUG FIX(review): 'ge' and 'le' were previously swapped
        # ('ge' -> '<=', 'le' -> '>='), inverting every bound filter
        'ge': '>=',
        'le': '<=',
    }
    with open(cfname, 'r') as cfile:
        lines = cfile.readlines()

    newlines = []
    section = None
    for line in lines:
        nl = line
        if line.startswith('#'):
            # track which config section we are in; only #numeric_filters
            # lines need op-code conversion
            section = "numeric_filters" if line.strip() == "#numeric_filters" else None
        elif section:
            tokens = line.split(',')
            op = tokens[2]
            if op in CHAR_CONVERSION:
                tokens[2] = CHAR_CONVERSION[op]
            elif op not in CHAR_CONVERSION.values():
                # error code 1; bad op codes (file left unmodified)
                return 1
            nl = ','.join(tokens)

        nl = nl.replace("__pd__", '#')
        newlines.append(nl)

    with open(cfname, 'w') as cfile:
        cfile.writelines(newlines)
    return 0
##############################################################################


if __name__ == "__main__":
    main()