Mercurial > repos > pmac > iterativepca
comparison R_functions/pipeline_code.R @ 0:64e75e21466e draft default tip
Uploaded
author | pmac |
---|---|
date | Wed, 01 Jun 2016 03:38:39 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:64e75e21466e |
---|---|
1 ### pipeline ### | |
2 # Complete a single iteration, which consists of: | |
3 # - Doing pca | |
4 # - Clustering, if required | |
5 # - Finding outliers | |
6 # - Setting up plots | |
7 # Outputs a list containing all the data regarding this iteration | |
8 single_iteration = function(outdir, basename, ped_data, xsamples, numsds, | |
9 cmethod, tmethod, control_tag, cases_tag, ethnicity_data=NULL) { | |
10 it_data = list() | |
11 it_data$old_xsamples = xsamples | |
12 # get data and do pca | |
13 pca_data = do_pca(ped_data) | |
14 it_data$pca_data = pca_data | |
15 it_data$plots = list() | |
16 plot_number = 1 | |
17 | |
18 # plot controls and cases | |
19 if (!is.null(control_tag) || !is.null(cases_tag)) { | |
20 it_data$plots[[plot_number]] = setup_cvc_plot(pca_data, control_tag, cases_tag) | |
21 plot_number = plot_number + 1 | |
22 } | |
23 | |
24 # if we have ethnicity data, setup a special plot | |
25 if (!is.null(ethnicity_data)) { | |
26 it_data$plots[[plot_number]] = setup_ethnicity_plot(pca_data, ethnicity_data) | |
27 plot_number = plot_number + 1 | |
28 } | |
29 | |
30 if (cmethod == "none") { | |
31 # get outliers by sd | |
32 all_outliers = outliers_by_sd(pca_data, xsamples, numsds) | |
33 new_xsamples = union(xsamples, pca_data$ids[all_outliers]) | |
34 it_data$xsamples = new_xsamples | |
35 # prepare outlier plot | |
36 it_data$plots[[plot_number]] = setup_ol_plot(pca_data, all_outliers) | |
37 plot_number = plot_number + 1 | |
38 # prepare sd plot | |
39 it_data$plots[[plot_number]] = setup_sd_plot(pca_data) | |
40 plot_number = plot_number + 1 | |
41 } else { | |
42 # do clustering | |
43 if (cmethod == "dbscan") { | |
44 emax = 2 | |
45 mp = 7 | |
46 clusters = automated_dbscan(pca_data, emax, mp) | |
47 } else if (cmethod == "hclust") { | |
48 clusters = automated_hclust(pca_data) | |
49 } else { | |
50 clusters = automated_hclust(pca_data) | |
51 } | |
52 | |
53 # get outliers | |
54 cluster_outliers = which(clusters == 0) | |
55 # get rejected clusters | |
56 centers = find_cluster_centers(clusters, pca_data$values) | |
57 if (tmethod == "mcd") { | |
58 rc = find_cluster_outliers_mcd(clusters, centers, pca_data, numsds, 2) | |
59 } else if (tmethod == "sd") { | |
60 rc = find_cluster_outliers_sd(clusters, centers, pca_data, numsds) | |
61 } else if (tmethod == "dbscan_outliers_only") { | |
62 rc = 0 | |
63 } | |
64 rc_indices = which(clusters %in% rc) | |
65 all_ol = union(cluster_outliers, rc_indices) | |
66 # it is possible that all samples get removed, in which case program will crash. | |
67 # so do not remove them | |
68 if (length(all_ol) == nrow(ped_data)) { | |
69 new_xsamples = xsamples | |
70 } else { | |
71 new_xsamples = union(xsamples, pca_data$ids[all_ol]) | |
72 } | |
73 it_data$xsamples = new_xsamples | |
74 # prepare plot | |
75 it_data$plots[[plot_number]] = setup_cluster_plot(pca_data, clusters, rc=rc) | |
76 plot_number = plot_number + 1 | |
77 } | |
78 it_data$outliers = setdiff(new_xsamples, xsamples) | |
79 it_data$num_plots = plot_number - 1 | |
80 return(it_data) | |
81 } | |
82 | |
83 # basically an inner join on a list of ids, and a table of ethnicity data | |
84 # if eth_data == null, then the second column is filled with NAs | |
85 add_ethnicity_data = function(ids, eth_data) { | |
86 n = length(ids) | |
87 if (!is.null(eth_data)) { | |
88 output = matrix(nrow=n, ncol=ncol(eth_data)) | |
89 colnames(output) = colnames(eth_data) | |
90 if (n > 0) { | |
91 for (i in 1:n) { | |
92 this_id = ids[i] | |
93 if (this_id %in% rownames(eth_data)) { | |
94 this_row = unlist(lapply(eth_data[this_id, ], as.character), use.names=FALSE) | |
95 } else { | |
96 this_row = c(this_id, rep(NA, ncol(output)-1)) | |
97 } | |
98 output[i, ] = this_row | |
99 } | |
100 } | |
101 } else { | |
102 output = cbind(ids, rep(NA, n)) | |
103 colnames(output) = c("IID", "Population") | |
104 } | |
105 return(output) | |
106 } | |
107 | |
108 generate_directory_name = function(outdir, basename, iteration) { | |
109 newdir = paste0("output_", basename, "_iteration_", iteration - 1) | |
110 full_path = paste0(outdir, "/", newdir) | |
111 return(full_path) | |
112 } |