annotate iterative_pca.py @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
64e75e21466e Uploaded
pmac
parents:
diff changeset
1 import os
64e75e21466e Uploaded
pmac
parents:
diff changeset
2 import subprocess
64e75e21466e Uploaded
pmac
parents:
diff changeset
3 import sys
64e75e21466e Uploaded
pmac
parents:
diff changeset
4 import argparse
64e75e21466e Uploaded
pmac
parents:
diff changeset
5 import datetime
64e75e21466e Uploaded
pmac
parents:
diff changeset
6 import csv
64e75e21466e Uploaded
pmac
parents:
diff changeset
7
64e75e21466e Uploaded
pmac
parents:
diff changeset
8 import jinja2
64e75e21466e Uploaded
pmac
parents:
diff changeset
9
64e75e21466e Uploaded
pmac
parents:
diff changeset
10 import pedify
64e75e21466e Uploaded
pmac
parents:
diff changeset
11
64e75e21466e Uploaded
pmac
parents:
diff changeset
12 JINJA_ENVIRONMENT = jinja2.Environment(
64e75e21466e Uploaded
pmac
parents:
diff changeset
13 loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
64e75e21466e Uploaded
pmac
parents:
diff changeset
14 extensions=['jinja2.ext.autoescape'],
64e75e21466e Uploaded
pmac
parents:
diff changeset
15 autoescape=True)
64e75e21466e Uploaded
pmac
parents:
diff changeset
16 DATA_TYPES = ["variant_data", "rds", "rdata", "numeric_ped"]
64e75e21466e Uploaded
pmac
parents:
diff changeset
17 CLUSTERING_METHODS = ["dbscan", "hclust", "kmeans"]
64e75e21466e Uploaded
pmac
parents:
diff changeset
18 CLUSTER_TRIMMING = ["sd", "mcd", "dbscan_outliers_only"]
64e75e21466e Uploaded
pmac
parents:
diff changeset
19 HTML_TEMPLATE_NAME = "pca_report.html"
64e75e21466e Uploaded
pmac
parents:
diff changeset
20 DEFAULT_SD_CUTOFF = 2
64e75e21466e Uploaded
pmac
parents:
diff changeset
21 PATH_TO_R_FUNCTIONS = "{}/R_functions".format(os.path.dirname(os.path.abspath(__file__)))
64e75e21466e Uploaded
pmac
parents:
diff changeset
22
64e75e21466e Uploaded
pmac
parents:
diff changeset
23
64e75e21466e Uploaded
pmac
parents:
diff changeset
24 def main():
64e75e21466e Uploaded
pmac
parents:
diff changeset
25 ###===ARGUMENT PARSING===###
64e75e21466e Uploaded
pmac
parents:
diff changeset
26 parser = argparse.ArgumentParser()
64e75e21466e Uploaded
pmac
parents:
diff changeset
27 parser.add_argument("datafile_name", help="name of data file")
64e75e21466e Uploaded
pmac
parents:
diff changeset
28 parser.add_argument("data_type", type=str.lower, choices=DATA_TYPES, help="type of data file")
64e75e21466e Uploaded
pmac
parents:
diff changeset
29 parser.add_argument("iterations", help="max number of iterations to run",
64e75e21466e Uploaded
pmac
parents:
diff changeset
30 type=int)
64e75e21466e Uploaded
pmac
parents:
diff changeset
31 parser.add_argument("--control_tag", help="substring present in the ids of all control samples, e.g. LP")
64e75e21466e Uploaded
pmac
parents:
diff changeset
32 parser.add_argument("--cases_tag", help="substring present in the ids of all case samples, e.g. HAPS")
64e75e21466e Uploaded
pmac
parents:
diff changeset
33 parser.add_argument("--config_file", help="name of file containing config info")
64e75e21466e Uploaded
pmac
parents:
diff changeset
34 parser.add_argument("--clustering_flag", help="Flag to indicate whether to do clustering", action="store_true")
64e75e21466e Uploaded
pmac
parents:
diff changeset
35 parser.add_argument("--clustering_method", type=str.lower, choices=CLUSTERING_METHODS,
64e75e21466e Uploaded
pmac
parents:
diff changeset
36 help="algorithm used to find clusters")
64e75e21466e Uploaded
pmac
parents:
diff changeset
37 parser.add_argument("--cluster_trimming", type=str.lower, choices=CLUSTER_TRIMMING,
64e75e21466e Uploaded
pmac
parents:
diff changeset
38 help="method used to identify and trim outlier clusters")
64e75e21466e Uploaded
pmac
parents:
diff changeset
39 parser.add_argument("-s", "--sd_cutoff", help="number of standard deviations at which to cut outliers")
64e75e21466e Uploaded
pmac
parents:
diff changeset
40 parser.add_argument("-o", "--out", help="base name of output files and directories; defaults to datafile name")
64e75e21466e Uploaded
pmac
parents:
diff changeset
41 group = parser.add_mutually_exclusive_group()
64e75e21466e Uploaded
pmac
parents:
diff changeset
42 group.add_argument("-r", "--relative_prefix", help="RELATIVE path CONTAINING the output directory")
64e75e21466e Uploaded
pmac
parents:
diff changeset
43 group.add_argument("-a", "--absolute_prefix", help="ABSOLUTE path CONTAINING the output directory")
64e75e21466e Uploaded
pmac
parents:
diff changeset
44 parser.add_argument("--html_file", help="ABSOLUTE PATH of output html file")
64e75e21466e Uploaded
pmac
parents:
diff changeset
45 parser.add_argument("--ethnicity_file", help="File containing data about the ethnicity of the samples")
64e75e21466e Uploaded
pmac
parents:
diff changeset
46 parser.add_argument("--galaxy", help="Flag to indicate we are running this script in Galaxy",
64e75e21466e Uploaded
pmac
parents:
diff changeset
47 action="store_true")
64e75e21466e Uploaded
pmac
parents:
diff changeset
48 parser.add_argument("--reject_snps", help="file containing SNP ids to ignore/remove")
64e75e21466e Uploaded
pmac
parents:
diff changeset
49 parser.add_argument("--reject_samples", help="file containing sample ids to ignore/remove")
64e75e21466e Uploaded
pmac
parents:
diff changeset
50
64e75e21466e Uploaded
pmac
parents:
diff changeset
51 args = parser.parse_args()
64e75e21466e Uploaded
pmac
parents:
diff changeset
52
64e75e21466e Uploaded
pmac
parents:
diff changeset
53 # enforce conditionally required args
64e75e21466e Uploaded
pmac
parents:
diff changeset
54 if args.data_type == "variant_data":
64e75e21466e Uploaded
pmac
parents:
diff changeset
55 if args.config_file == None:
64e75e21466e Uploaded
pmac
parents:
diff changeset
56 parser.error("Input variant data files require a complementary config file\
64e75e21466e Uploaded
pmac
parents:
diff changeset
57 specified with '--config_file <filename>' flag")
64e75e21466e Uploaded
pmac
parents:
diff changeset
58
64e75e21466e Uploaded
pmac
parents:
diff changeset
59 if args.config_file:
64e75e21466e Uploaded
pmac
parents:
diff changeset
60 if args.data_type != "variant_data":
64e75e21466e Uploaded
pmac
parents:
diff changeset
61 print "Warning! Your config file will be ignored as the input data is not a\
64e75e21466e Uploaded
pmac
parents:
diff changeset
62 text file containing variant data"
64e75e21466e Uploaded
pmac
parents:
diff changeset
63
64e75e21466e Uploaded
pmac
parents:
diff changeset
64 if args.clustering_flag:
64e75e21466e Uploaded
pmac
parents:
diff changeset
65 if (args.clustering_method == None) or (args.cluster_trimming == None):
64e75e21466e Uploaded
pmac
parents:
diff changeset
66 parser.error("If --clustering_flag is set,\
64e75e21466e Uploaded
pmac
parents:
diff changeset
67 --clustering_method and --cluster_trimming options must also be specified")
64e75e21466e Uploaded
pmac
parents:
diff changeset
68
64e75e21466e Uploaded
pmac
parents:
diff changeset
69 if args.clustering_method or args.cluster_trimming:
64e75e21466e Uploaded
pmac
parents:
diff changeset
70 if args.clustering_flag == False:
64e75e21466e Uploaded
pmac
parents:
diff changeset
71 parser.error("--clustering_method and --cluster_trimming\
64e75e21466e Uploaded
pmac
parents:
diff changeset
72 options only valid if --clustering_flag is also set")
64e75e21466e Uploaded
pmac
parents:
diff changeset
73
64e75e21466e Uploaded
pmac
parents:
diff changeset
74 # dbscan outliers only valid if cluster method == "DBSCAN"
64e75e21466e Uploaded
pmac
parents:
diff changeset
75 if args.cluster_trimming == "dbscan_outliers_only":
64e75e21466e Uploaded
pmac
parents:
diff changeset
76 if args.clustering_method != "dbscan":
64e75e21466e Uploaded
pmac
parents:
diff changeset
77 parser.error("dbscan_outliers_only cluster trimming method only\
64e75e21466e Uploaded
pmac
parents:
diff changeset
78 valid if --clustering_method == dbscan")
64e75e21466e Uploaded
pmac
parents:
diff changeset
79
64e75e21466e Uploaded
pmac
parents:
diff changeset
80 dfname = args.datafile_name
64e75e21466e Uploaded
pmac
parents:
diff changeset
81 data_type = args.data_type
64e75e21466e Uploaded
pmac
parents:
diff changeset
82 n = args.iterations
64e75e21466e Uploaded
pmac
parents:
diff changeset
83 control_tag = args.control_tag
64e75e21466e Uploaded
pmac
parents:
diff changeset
84 cases_tag = args.cases_tag
64e75e21466e Uploaded
pmac
parents:
diff changeset
85 cfname = args.config_file
64e75e21466e Uploaded
pmac
parents:
diff changeset
86 cflag = args.clustering_flag
64e75e21466e Uploaded
pmac
parents:
diff changeset
87 cmethod = (args.clustering_method if args.clustering_method else "none")
64e75e21466e Uploaded
pmac
parents:
diff changeset
88 tmethod = (args.cluster_trimming if args.cluster_trimming else "none")
64e75e21466e Uploaded
pmac
parents:
diff changeset
89 sd_cutoff = (args.sd_cutoff if args.sd_cutoff else DEFAULT_SD_CUTOFF)
64e75e21466e Uploaded
pmac
parents:
diff changeset
90 basename = (args.out if args.out else dfname.split('/')[-1].split('.')[0])
64e75e21466e Uploaded
pmac
parents:
diff changeset
91 hfname = args.html_file
64e75e21466e Uploaded
pmac
parents:
diff changeset
92 galaxy_flag = args.galaxy
64e75e21466e Uploaded
pmac
parents:
diff changeset
93 ethfname = args.ethnicity_file
64e75e21466e Uploaded
pmac
parents:
diff changeset
94 xsnps_filename = args.reject_snps
64e75e21466e Uploaded
pmac
parents:
diff changeset
95 xsamples_filename = args.reject_samples
64e75e21466e Uploaded
pmac
parents:
diff changeset
96
64e75e21466e Uploaded
pmac
parents:
diff changeset
97 # function used to handle strange galaxy behaviour...
64e75e21466e Uploaded
pmac
parents:
diff changeset
98 if galaxy_flag and cfname:
64e75e21466e Uploaded
pmac
parents:
diff changeset
99 rc = format_galaxy_config_file(cfname)
64e75e21466e Uploaded
pmac
parents:
diff changeset
100 if rc != 0:
64e75e21466e Uploaded
pmac
parents:
diff changeset
101 print "Bad config file"
64e75e21466e Uploaded
pmac
parents:
diff changeset
102 sys.exit()
64e75e21466e Uploaded
pmac
parents:
diff changeset
103
64e75e21466e Uploaded
pmac
parents:
diff changeset
104 path_here = os.path.dirname(os.path.abspath(__file__))
64e75e21466e Uploaded
pmac
parents:
diff changeset
105 # determine output directory
64e75e21466e Uploaded
pmac
parents:
diff changeset
106 if args.relative_prefix:
64e75e21466e Uploaded
pmac
parents:
diff changeset
107 outdir = "{}/{}/full_output_{}".format(path_here, args.relative_prefix, basename)
64e75e21466e Uploaded
pmac
parents:
diff changeset
108 elif args.absolute_prefix:
64e75e21466e Uploaded
pmac
parents:
diff changeset
109 outdir = "{}/full_output_{}".format(args.absolute_prefix, basename)
64e75e21466e Uploaded
pmac
parents:
diff changeset
110 else:
64e75e21466e Uploaded
pmac
parents:
diff changeset
111 outdir = "{}/full_output_{}".format(path_here, basename)
64e75e21466e Uploaded
pmac
parents:
diff changeset
112 if hfname == None:
64e75e21466e Uploaded
pmac
parents:
diff changeset
113 hfname = "{}/{}.html".format(outdir, basename)
64e75e21466e Uploaded
pmac
parents:
diff changeset
114
64e75e21466e Uploaded
pmac
parents:
diff changeset
115 ####===BEGIN PIPELINE===####
64e75e21466e Uploaded
pmac
parents:
diff changeset
116 # create a log file
64e75e21466e Uploaded
pmac
parents:
diff changeset
117 lfname = "{}.log".format(basename)
64e75e21466e Uploaded
pmac
parents:
diff changeset
118 logfile = open(lfname, 'w')
64e75e21466e Uploaded
pmac
parents:
diff changeset
119 logfile.write("##Iterative PCA commenced at {}##\n".format(datetime.datetime.now()))
64e75e21466e Uploaded
pmac
parents:
diff changeset
120
64e75e21466e Uploaded
pmac
parents:
diff changeset
121 # create output directory
64e75e21466e Uploaded
pmac
parents:
diff changeset
122 prepare_directory(outdir, logfile)
64e75e21466e Uploaded
pmac
parents:
diff changeset
123
64e75e21466e Uploaded
pmac
parents:
diff changeset
124 # do pre-processing if necessary, else go straight to R
64e75e21466e Uploaded
pmac
parents:
diff changeset
125 if data_type == "variant_data":
64e75e21466e Uploaded
pmac
parents:
diff changeset
126 stage1_info = stage_text_to_ped(dfname, cfname, xsamples_filename,
64e75e21466e Uploaded
pmac
parents:
diff changeset
127 xsnps_filename, basename, outdir, logfile, True)
64e75e21466e Uploaded
pmac
parents:
diff changeset
128 data_source_name = "{}/{}".format(outdir, stage1_info['pfname'])
64e75e21466e Uploaded
pmac
parents:
diff changeset
129 data_type_out = "numeric_ped"
64e75e21466e Uploaded
pmac
parents:
diff changeset
130 xsamples_filename = "{}/{}".format(outdir, stage1_info['xsname'])
64e75e21466e Uploaded
pmac
parents:
diff changeset
131 elif data_type in ["rds", "rdata", "numeric_ped"]:
64e75e21466e Uploaded
pmac
parents:
diff changeset
132 data_source_name = dfname
64e75e21466e Uploaded
pmac
parents:
diff changeset
133 data_type_out = data_type
64e75e21466e Uploaded
pmac
parents:
diff changeset
134 else:
64e75e21466e Uploaded
pmac
parents:
diff changeset
135 print "Unrecognised input data type, exiting..."
64e75e21466e Uploaded
pmac
parents:
diff changeset
136 sys.exit(0)
64e75e21466e Uploaded
pmac
parents:
diff changeset
137
64e75e21466e Uploaded
pmac
parents:
diff changeset
138 # do iterative pca in R (much faster)
64e75e21466e Uploaded
pmac
parents:
diff changeset
139 rCommand = "Rscript {}/iterative_pca_plot.R '{}' '{}' '{}' '{}' \
64e75e21466e Uploaded
pmac
parents:
diff changeset
140 '{}' '{}' '{}' '{}' '{}'\
64e75e21466e Uploaded
pmac
parents:
diff changeset
141 '{}' '{}' '{}' '{}' '{}'".format(
64e75e21466e Uploaded
pmac
parents:
diff changeset
142 path_here,
64e75e21466e Uploaded
pmac
parents:
diff changeset
143 n, outdir, basename, data_source_name,
64e75e21466e Uploaded
pmac
parents:
diff changeset
144 data_type_out, ethfname, control_tag, cases_tag, sd_cutoff,
64e75e21466e Uploaded
pmac
parents:
diff changeset
145 cmethod, tmethod, PATH_TO_R_FUNCTIONS, xsamples_filename, xsnps_filename)
64e75e21466e Uploaded
pmac
parents:
diff changeset
146
64e75e21466e Uploaded
pmac
parents:
diff changeset
147 subprocess.call(rCommand, shell=True)
64e75e21466e Uploaded
pmac
parents:
diff changeset
148 # generate dictionary of output file names
64e75e21466e Uploaded
pmac
parents:
diff changeset
149 if cflag:
64e75e21466e Uploaded
pmac
parents:
diff changeset
150 num_plots = 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
151 else:
64e75e21466e Uploaded
pmac
parents:
diff changeset
152 num_plots = 2
64e75e21466e Uploaded
pmac
parents:
diff changeset
153 if ethfname:
64e75e21466e Uploaded
pmac
parents:
diff changeset
154 num_plots += 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
155 if cases_tag or control_tag:
64e75e21466e Uploaded
pmac
parents:
diff changeset
156 num_plots += 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
157 iteration_data = create_output_dict(outdir, basename, n, num_plots)
64e75e21466e Uploaded
pmac
parents:
diff changeset
158
64e75e21466e Uploaded
pmac
parents:
diff changeset
159 # clean up logfile
64e75e21466e Uploaded
pmac
parents:
diff changeset
160 logfile.write("##Iterative PCA completed at {}##\n".format(datetime.datetime.now()))
64e75e21466e Uploaded
pmac
parents:
diff changeset
161 logfile.close()
64e75e21466e Uploaded
pmac
parents:
diff changeset
162 subprocess.call("mv '{}' '{}'".format(lfname, outdir), shell=True)
64e75e21466e Uploaded
pmac
parents:
diff changeset
163 lfname = "{}/{}".format(outdir, lfname)
64e75e21466e Uploaded
pmac
parents:
diff changeset
164 if cfname:
64e75e21466e Uploaded
pmac
parents:
diff changeset
165 # copy and rename config file into outdir (if provided)
64e75e21466e Uploaded
pmac
parents:
diff changeset
166 subprocess.call("cp '{}' '{}/{}.config'".format(cfname, outdir, basename), shell=True)
64e75e21466e Uploaded
pmac
parents:
diff changeset
167 # output results summary in a nice html format
64e75e21466e Uploaded
pmac
parents:
diff changeset
168 html_output = produce_report(hfname, dfname, lfname, cflag,
64e75e21466e Uploaded
pmac
parents:
diff changeset
169 cmethod, tmethod, sd_cutoff, iteration_data, galaxy_flag)
64e75e21466e Uploaded
pmac
parents:
diff changeset
170 ##############################################################################
64e75e21466e Uploaded
pmac
parents:
diff changeset
171
64e75e21466e Uploaded
pmac
parents:
diff changeset
172 """
64e75e21466e Uploaded
pmac
parents:
diff changeset
173 Function to convert a text file containing variant data into a numeric or genomic ped file
64e75e21466e Uploaded
pmac
parents:
diff changeset
174 Inputs:
64e75e21466e Uploaded
pmac
parents:
diff changeset
175 - dfname: The name of the text file to convert
64e75e21466e Uploaded
pmac
parents:
diff changeset
176 - cfname: Configuration file containing info on which columns to find required information in,
64e75e21466e Uploaded
pmac
parents:
diff changeset
177 as well as filtering options
64e75e21466e Uploaded
pmac
parents:
diff changeset
178 - xsamples_filename: A file containg a list of exact sample ids to exclude
64e75e21466e Uploaded
pmac
parents:
diff changeset
179 - xsnps_filename: A file containing a list of exact snp ids to exclude
64e75e21466e Uploaded
pmac
parents:
diff changeset
180 - basename: Filename without the extension
64e75e21466e Uploaded
pmac
parents:
diff changeset
181 - outdir: Full path of output directory
64e75e21466e Uploaded
pmac
parents:
diff changeset
182 - logfile: A file to keep track of program status. Any shell commands executed will be written to this file
64e75e21466e Uploaded
pmac
parents:
diff changeset
183 - numeric_flag: If true, ped file will be numeric.
64e75e21466e Uploaded
pmac
parents:
diff changeset
184 If false, ped file will be genomic
64e75e21466e Uploaded
pmac
parents:
diff changeset
185 Outputs:
64e75e21466e Uploaded
pmac
parents:
diff changeset
186 - ped file, either numeric or genomic
64e75e21466e Uploaded
pmac
parents:
diff changeset
187 - map file
64e75e21466e Uploaded
pmac
parents:
diff changeset
188 - excluded samples file, containing the ids of all samples which did not pass the filters and were rejected
64e75e21466e Uploaded
pmac
parents:
diff changeset
189 Returns:
64e75e21466e Uploaded
pmac
parents:
diff changeset
190 a dict containing the PATHS to important output files
64e75e21466e Uploaded
pmac
parents:
diff changeset
191 """
64e75e21466e Uploaded
pmac
parents:
diff changeset
192 def stage_text_to_ped(dfname, cfname, xsamples_filename, xsnps_filename, basename, outdir, logfile, numeric_flag):
64e75e21466e Uploaded
pmac
parents:
diff changeset
193 # Create ped and map files
64e75e21466e Uploaded
pmac
parents:
diff changeset
194 logfile.write("##STAGE 1: Create .ped and .map files from input text file##\n")
64e75e21466e Uploaded
pmac
parents:
diff changeset
195
64e75e21466e Uploaded
pmac
parents:
diff changeset
196 pfname = "{}/{}.ped".format(outdir, basename)
64e75e21466e Uploaded
pmac
parents:
diff changeset
197 mfname = "{}/{}.map".format(outdir, basename)
64e75e21466e Uploaded
pmac
parents:
diff changeset
198 xsname = "{}/{}_xsamples.txt".format(outdir, basename)
64e75e21466e Uploaded
pmac
parents:
diff changeset
199
64e75e21466e Uploaded
pmac
parents:
diff changeset
200 pc = pedify.PedConverter()
64e75e21466e Uploaded
pmac
parents:
diff changeset
201 # read in config file
64e75e21466e Uploaded
pmac
parents:
diff changeset
202 rc = pc.read_config_file(cfname)
64e75e21466e Uploaded
pmac
parents:
diff changeset
203 # Add in string filters for excluded samples and snps
64e75e21466e Uploaded
pmac
parents:
diff changeset
204 if xsamples_filename:
64e75e21466e Uploaded
pmac
parents:
diff changeset
205 xsamples = open(xsamples_filename, 'r').read().split('\n')
64e75e21466e Uploaded
pmac
parents:
diff changeset
206 pc.cfg.add_string_filter("comp_generated_sample_cuts", pc.cfg.col_names['sample_id_column'],
64e75e21466e Uploaded
pmac
parents:
diff changeset
207 "exact", "reject", xsamples)
64e75e21466e Uploaded
pmac
parents:
diff changeset
208 if xsnps_filename:
64e75e21466e Uploaded
pmac
parents:
diff changeset
209 xsnps = open(xsnps_filename, 'r').read().split('\n')
64e75e21466e Uploaded
pmac
parents:
diff changeset
210 pc.cfg.add_string_filter("comp_generated_snp_cuts", pc.cfg.col_names['variant_id_column'],
64e75e21466e Uploaded
pmac
parents:
diff changeset
211 "exact", "reject", xsnps)
64e75e21466e Uploaded
pmac
parents:
diff changeset
212 if rc == 0:
64e75e21466e Uploaded
pmac
parents:
diff changeset
213 print 'config file read successfully'
64e75e21466e Uploaded
pmac
parents:
diff changeset
214 else:
64e75e21466e Uploaded
pmac
parents:
diff changeset
215 print 'failed to read in config file successfully. Error code: {}'.format(rc)
64e75e21466e Uploaded
pmac
parents:
diff changeset
216
64e75e21466e Uploaded
pmac
parents:
diff changeset
217 # read in data file
64e75e21466e Uploaded
pmac
parents:
diff changeset
218 rc = pc.read_data_file(dfname)
64e75e21466e Uploaded
pmac
parents:
diff changeset
219 if rc == 0:
64e75e21466e Uploaded
pmac
parents:
diff changeset
220 print 'data file read successfully'
64e75e21466e Uploaded
pmac
parents:
diff changeset
221 else:
64e75e21466e Uploaded
pmac
parents:
diff changeset
222 print 'failed to read in data file successfully. Error code: {}'.format(rc)
64e75e21466e Uploaded
pmac
parents:
diff changeset
223
64e75e21466e Uploaded
pmac
parents:
diff changeset
224 # create output
64e75e21466e Uploaded
pmac
parents:
diff changeset
225 pc.create_ped_file(pfname, numeric=numeric_flag)
64e75e21466e Uploaded
pmac
parents:
diff changeset
226 pc.create_map_file(mfname)
64e75e21466e Uploaded
pmac
parents:
diff changeset
227 pc.create_excluded_samples_file(xsname)
64e75e21466e Uploaded
pmac
parents:
diff changeset
228 outdict = {
64e75e21466e Uploaded
pmac
parents:
diff changeset
229 "pfname": "{}.ped".format(basename),
64e75e21466e Uploaded
pmac
parents:
diff changeset
230 "mfname": "{}.map".format(basename),
64e75e21466e Uploaded
pmac
parents:
diff changeset
231 "xsname": "{}_xsamples.txt".format(basename),
64e75e21466e Uploaded
pmac
parents:
diff changeset
232 "control_tag": pc.cfg.control_info['control_tag']['tag'],
64e75e21466e Uploaded
pmac
parents:
diff changeset
233 "cases_tag": pc.cfg.control_info['cases_tag']['tag']
64e75e21466e Uploaded
pmac
parents:
diff changeset
234 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
235
64e75e21466e Uploaded
pmac
parents:
diff changeset
236 return outdict
64e75e21466e Uploaded
pmac
parents:
diff changeset
237
64e75e21466e Uploaded
pmac
parents:
diff changeset
238 """
64e75e21466e Uploaded
pmac
parents:
diff changeset
239 Function to produce the html report of all the data
64e75e21466e Uploaded
pmac
parents:
diff changeset
240 Inputs:
64e75e21466e Uploaded
pmac
parents:
diff changeset
241 hfname: The ABSOLUTE PATH (including the filename) of the html file to output
64e75e21466e Uploaded
pmac
parents:
diff changeset
242 dfname: path to datafile
64e75e21466e Uploaded
pmac
parents:
diff changeset
243 lfname: path to logfile
64e75e21466e Uploaded
pmac
parents:
diff changeset
244 sd_cutoff: multipler used to determine outliers
64e75e21466e Uploaded
pmac
parents:
diff changeset
245 iterationList: a list of dicts, each containing AT LEAST:
64e75e21466e Uploaded
pmac
parents:
diff changeset
246 - outdir: ABSOLUTE PATH to parent directory containing all the output from the iteration
64e75e21466e Uploaded
pmac
parents:
diff changeset
247 - count: which iteration this was
64e75e21466e Uploaded
pmac
parents:
diff changeset
248 - plotpaths: A list of paths to output plots
64e75e21466e Uploaded
pmac
parents:
diff changeset
249 - xsname: name of file containing ids of samples excluded from this iteration
64e75e21466e Uploaded
pmac
parents:
diff changeset
250 - ofname: name of file containing ids of outliers detected in this iteration
64e75e21466e Uploaded
pmac
parents:
diff changeset
251 returns:
64e75e21466e Uploaded
pmac
parents:
diff changeset
252 The html output that was written to the html file
64e75e21466e Uploaded
pmac
parents:
diff changeset
253 """
64e75e21466e Uploaded
pmac
parents:
diff changeset
254 def produce_report(hfname, dfname, lfname, cflag, cmethod, tmethod, sd_cutoff, iteration_list, galaxy_flag):
64e75e21466e Uploaded
pmac
parents:
diff changeset
255 hfdir = os.path.dirname(hfname)
64e75e21466e Uploaded
pmac
parents:
diff changeset
256 htmldoc = open(hfname, 'w')
64e75e21466e Uploaded
pmac
parents:
diff changeset
257 iteration_data = []
64e75e21466e Uploaded
pmac
parents:
diff changeset
258 for it in iteration_list:
64e75e21466e Uploaded
pmac
parents:
diff changeset
259 outdir = it['outdir']
64e75e21466e Uploaded
pmac
parents:
diff changeset
260 if galaxy_flag:
64e75e21466e Uploaded
pmac
parents:
diff changeset
261 dirs = outdir.split("/")
64e75e21466e Uploaded
pmac
parents:
diff changeset
262 relpath = "{}/{}".format(dirs[-2], dirs[-1])
64e75e21466e Uploaded
pmac
parents:
diff changeset
263 else:
64e75e21466e Uploaded
pmac
parents:
diff changeset
264 relpath = os.path.relpath(outdir, hfdir)
64e75e21466e Uploaded
pmac
parents:
diff changeset
265 it_dict = {}
64e75e21466e Uploaded
pmac
parents:
diff changeset
266
64e75e21466e Uploaded
pmac
parents:
diff changeset
267 ofname = "{}/{}".format(outdir, it['ofname'])
64e75e21466e Uploaded
pmac
parents:
diff changeset
268 xsname = "{}/{}".format(outdir, it['xsname'])
64e75e21466e Uploaded
pmac
parents:
diff changeset
269 of = open(ofname, 'r')
64e75e21466e Uploaded
pmac
parents:
diff changeset
270 xf = open(xsname, 'r')
64e75e21466e Uploaded
pmac
parents:
diff changeset
271 ol = []
64e75e21466e Uploaded
pmac
parents:
diff changeset
272 xl = []
64e75e21466e Uploaded
pmac
parents:
diff changeset
273 for line in of:
64e75e21466e Uploaded
pmac
parents:
diff changeset
274 ol.append(line.split(','))
64e75e21466e Uploaded
pmac
parents:
diff changeset
275 for line in xf:
64e75e21466e Uploaded
pmac
parents:
diff changeset
276 xl.append(line.split(','))
64e75e21466e Uploaded
pmac
parents:
diff changeset
277 of.close()
64e75e21466e Uploaded
pmac
parents:
diff changeset
278 xf.close()
64e75e21466e Uploaded
pmac
parents:
diff changeset
279 # fix the paths to the images
64e75e21466e Uploaded
pmac
parents:
diff changeset
280 rel_plot_paths = []
64e75e21466e Uploaded
pmac
parents:
diff changeset
281 for pp in it['plot_paths']:
64e75e21466e Uploaded
pmac
parents:
diff changeset
282 rel_plot_paths.append("{}/{}".format(relpath, pp))
64e75e21466e Uploaded
pmac
parents:
diff changeset
283
64e75e21466e Uploaded
pmac
parents:
diff changeset
284 it_dict['count'] = it['count']
64e75e21466e Uploaded
pmac
parents:
diff changeset
285 it_dict['ol'] = ol
64e75e21466e Uploaded
pmac
parents:
diff changeset
286 it_dict['xl'] = xl
64e75e21466e Uploaded
pmac
parents:
diff changeset
287 it_dict['rel_plot_paths']= rel_plot_paths
64e75e21466e Uploaded
pmac
parents:
diff changeset
288 iteration_data.append(it_dict)
64e75e21466e Uploaded
pmac
parents:
diff changeset
289
64e75e21466e Uploaded
pmac
parents:
diff changeset
290 logcontent = open(lfname, 'r').read()
64e75e21466e Uploaded
pmac
parents:
diff changeset
291 template_values = {
64e75e21466e Uploaded
pmac
parents:
diff changeset
292 "date": datetime.datetime.now(),
64e75e21466e Uploaded
pmac
parents:
diff changeset
293 "dfname": dfname,
64e75e21466e Uploaded
pmac
parents:
diff changeset
294 "lfname": lfname,
64e75e21466e Uploaded
pmac
parents:
diff changeset
295 "logcontent": logcontent,
64e75e21466e Uploaded
pmac
parents:
diff changeset
296 "iteration_data": iteration_data,
64e75e21466e Uploaded
pmac
parents:
diff changeset
297 "cflag": cflag,
64e75e21466e Uploaded
pmac
parents:
diff changeset
298 "cmethod": cmethod,
64e75e21466e Uploaded
pmac
parents:
diff changeset
299 "tmethod": tmethod,
64e75e21466e Uploaded
pmac
parents:
diff changeset
300 "sd_cutoff": sd_cutoff,
64e75e21466e Uploaded
pmac
parents:
diff changeset
301 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
302 template = JINJA_ENVIRONMENT.get_template(HTML_TEMPLATE_NAME)
64e75e21466e Uploaded
pmac
parents:
diff changeset
303 rendered_template = template.render(template_values)
64e75e21466e Uploaded
pmac
parents:
diff changeset
304 htmldoc.write(rendered_template)
64e75e21466e Uploaded
pmac
parents:
diff changeset
305 htmldoc.close()
64e75e21466e Uploaded
pmac
parents:
diff changeset
306 return rendered_template
64e75e21466e Uploaded
pmac
parents:
diff changeset
307
64e75e21466e Uploaded
pmac
parents:
diff changeset
308 """
64e75e21466e Uploaded
pmac
parents:
diff changeset
309 Generate the expected directory structure of the root output directory,
64e75e21466e Uploaded
pmac
parents:
diff changeset
310 given the basname, number of iterations, and number of expected plots output by
64e75e21466e Uploaded
pmac
parents:
diff changeset
311 the R script
64e75e21466e Uploaded
pmac
parents:
diff changeset
312 """
64e75e21466e Uploaded
pmac
parents:
diff changeset
313 def create_output_dict(outdir, basename, n, num_plots):
64e75e21466e Uploaded
pmac
parents:
diff changeset
314 iteration_list = []
64e75e21466e Uploaded
pmac
parents:
diff changeset
315 for i in range(n):
64e75e21466e Uploaded
pmac
parents:
diff changeset
316 it_dict = {}
64e75e21466e Uploaded
pmac
parents:
diff changeset
317 i_outdir = "{}/output_{}_iteration_{}".format(outdir, basename, i)
64e75e21466e Uploaded
pmac
parents:
diff changeset
318 it_dict['outdir'] = i_outdir
64e75e21466e Uploaded
pmac
parents:
diff changeset
319 it_dict['ofname'] = "{}_outliers.txt".format(basename)
64e75e21466e Uploaded
pmac
parents:
diff changeset
320 it_dict['xsname'] = "{}_xfile.txt".format(basename)
64e75e21466e Uploaded
pmac
parents:
diff changeset
321 it_dict['count'] = i
64e75e21466e Uploaded
pmac
parents:
diff changeset
322 it_dict['plot_paths'] = []
64e75e21466e Uploaded
pmac
parents:
diff changeset
323 for j in range(num_plots):
64e75e21466e Uploaded
pmac
parents:
diff changeset
324 it_dict['plot_paths'].append("{}_plot_number_{}.png".format(basename, j+1))
64e75e21466e Uploaded
pmac
parents:
diff changeset
325 iteration_list.append(it_dict)
64e75e21466e Uploaded
pmac
parents:
diff changeset
326 return iteration_list
64e75e21466e Uploaded
pmac
parents:
diff changeset
327
64e75e21466e Uploaded
pmac
parents:
diff changeset
328
64e75e21466e Uploaded
pmac
parents:
diff changeset
329 ###########################UTILITY########################################
64e75e21466e Uploaded
pmac
parents:
diff changeset
330 def prepare_directory(outdir, logfile):
64e75e21466e Uploaded
pmac
parents:
diff changeset
331 # set up output directory
64e75e21466e Uploaded
pmac
parents:
diff changeset
332 logfile.write("##Setting up output directory:{}##\n".format(outdir))
64e75e21466e Uploaded
pmac
parents:
diff changeset
333 subprocess.call("rm -rf '{}'".format(outdir), shell=True)
64e75e21466e Uploaded
pmac
parents:
diff changeset
334 logfile.write("rm -rf '{}'\n".format(outdir))
64e75e21466e Uploaded
pmac
parents:
diff changeset
335 subprocess.call("mkdir -p '{}'".format(outdir), shell=True)
64e75e21466e Uploaded
pmac
parents:
diff changeset
336 logfile.write("mkdir -p '{}'\n".format(outdir))
64e75e21466e Uploaded
pmac
parents:
diff changeset
337
64e75e21466e Uploaded
pmac
parents:
diff changeset
338 # takes a list of dicts, merges them into a single big dict
64e75e21466e Uploaded
pmac
parents:
diff changeset
339 def merge_dicts(dictList):
64e75e21466e Uploaded
pmac
parents:
diff changeset
340 result = {}
64e75e21466e Uploaded
pmac
parents:
diff changeset
341 for d in dictList:
64e75e21466e Uploaded
pmac
parents:
diff changeset
342 if type(d) == dict:
64e75e21466e Uploaded
pmac
parents:
diff changeset
343 result.update(d)
64e75e21466e Uploaded
pmac
parents:
diff changeset
344 return result
64e75e21466e Uploaded
pmac
parents:
diff changeset
345
64e75e21466e Uploaded
pmac
parents:
diff changeset
346 # Some odd rules regarding character escaping and such in galaxy: fix those here
64e75e21466e Uploaded
pmac
parents:
diff changeset
347 def format_galaxy_config_file(cfname):
64e75e21466e Uploaded
pmac
parents:
diff changeset
348 CHAR_CONVERSION = {
64e75e21466e Uploaded
pmac
parents:
diff changeset
349 'g': '>',
64e75e21466e Uploaded
pmac
parents:
diff changeset
350 'l': '<',
64e75e21466e Uploaded
pmac
parents:
diff changeset
351 'e': '==',
64e75e21466e Uploaded
pmac
parents:
diff changeset
352 'ge': '<=',
64e75e21466e Uploaded
pmac
parents:
diff changeset
353 'le': '>='
64e75e21466e Uploaded
pmac
parents:
diff changeset
354 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
355 cfile = open(cfname, 'r')
64e75e21466e Uploaded
pmac
parents:
diff changeset
356 lines = cfile.readlines()
64e75e21466e Uploaded
pmac
parents:
diff changeset
357 cfile.close()
64e75e21466e Uploaded
pmac
parents:
diff changeset
358
64e75e21466e Uploaded
pmac
parents:
diff changeset
359 newlines = []
64e75e21466e Uploaded
pmac
parents:
diff changeset
360 section = None
64e75e21466e Uploaded
pmac
parents:
diff changeset
361 for l in lines:
64e75e21466e Uploaded
pmac
parents:
diff changeset
362 nl = l
64e75e21466e Uploaded
pmac
parents:
diff changeset
363 if l[0] == '#':
64e75e21466e Uploaded
pmac
parents:
diff changeset
364 if l.strip() == "#numeric_filters":
64e75e21466e Uploaded
pmac
parents:
diff changeset
365 section = "numeric_filters"
64e75e21466e Uploaded
pmac
parents:
diff changeset
366 else:
64e75e21466e Uploaded
pmac
parents:
diff changeset
367 section = None
64e75e21466e Uploaded
pmac
parents:
diff changeset
368 else:
64e75e21466e Uploaded
pmac
parents:
diff changeset
369 if section:
64e75e21466e Uploaded
pmac
parents:
diff changeset
370 tokens = l.split(',')
64e75e21466e Uploaded
pmac
parents:
diff changeset
371 op = tokens[2]
64e75e21466e Uploaded
pmac
parents:
diff changeset
372 if op in CHAR_CONVERSION.keys():
64e75e21466e Uploaded
pmac
parents:
diff changeset
373 newop = CHAR_CONVERSION[op]
64e75e21466e Uploaded
pmac
parents:
diff changeset
374 elif op in CHAR_CONVERSION.values():
64e75e21466e Uploaded
pmac
parents:
diff changeset
375 newop = op
64e75e21466e Uploaded
pmac
parents:
diff changeset
376 else:
64e75e21466e Uploaded
pmac
parents:
diff changeset
377 # error code 1; bad op codes
64e75e21466e Uploaded
pmac
parents:
diff changeset
378 return 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
379 tokens[2] = newop
64e75e21466e Uploaded
pmac
parents:
diff changeset
380 nl = ','.join(tokens)
64e75e21466e Uploaded
pmac
parents:
diff changeset
381
64e75e21466e Uploaded
pmac
parents:
diff changeset
382
64e75e21466e Uploaded
pmac
parents:
diff changeset
383 nl = nl.replace("__pd__", '#')
64e75e21466e Uploaded
pmac
parents:
diff changeset
384 newlines.append(nl)
64e75e21466e Uploaded
pmac
parents:
diff changeset
385
64e75e21466e Uploaded
pmac
parents:
diff changeset
386 cfile = open(cfname, 'w')
64e75e21466e Uploaded
pmac
parents:
diff changeset
387 cfile.writelines(newlines)
64e75e21466e Uploaded
pmac
parents:
diff changeset
388 cfile.close()
64e75e21466e Uploaded
pmac
parents:
diff changeset
389 return 0
64e75e21466e Uploaded
pmac
parents:
diff changeset
390 ##############################################################################
64e75e21466e Uploaded
pmac
parents:
diff changeset
391
64e75e21466e Uploaded
pmac
parents:
diff changeset
392 if __name__ == "__main__":
64e75e21466e Uploaded
pmac
parents:
diff changeset
393 main()