annotate R_functions/pipeline_code.R @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
64e75e21466e Uploaded
pmac
parents:
diff changeset
1 ### pipeline ###
64e75e21466e Uploaded
pmac
parents:
diff changeset
2 # Complete a single iteration, which consists of:
64e75e21466e Uploaded
pmac
parents:
diff changeset
3 # - Doing pca
64e75e21466e Uploaded
pmac
parents:
diff changeset
4 # - Clustering, if required
64e75e21466e Uploaded
pmac
parents:
diff changeset
5 # - Finding outliers
64e75e21466e Uploaded
pmac
parents:
diff changeset
6 # - Setting up plots
64e75e21466e Uploaded
pmac
parents:
diff changeset
7 # Outputs a list containing all the data regarding this iteration
64e75e21466e Uploaded
pmac
parents:
diff changeset
8 single_iteration = function(outdir, basename, ped_data, xsamples, numsds,
64e75e21466e Uploaded
pmac
parents:
diff changeset
9 cmethod, tmethod, control_tag, cases_tag, ethnicity_data=NULL) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
10 it_data = list()
64e75e21466e Uploaded
pmac
parents:
diff changeset
11 it_data$old_xsamples = xsamples
64e75e21466e Uploaded
pmac
parents:
diff changeset
12 # get data and do pca
64e75e21466e Uploaded
pmac
parents:
diff changeset
13 pca_data = do_pca(ped_data)
64e75e21466e Uploaded
pmac
parents:
diff changeset
14 it_data$pca_data = pca_data
64e75e21466e Uploaded
pmac
parents:
diff changeset
15 it_data$plots = list()
64e75e21466e Uploaded
pmac
parents:
diff changeset
16 plot_number = 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
17
64e75e21466e Uploaded
pmac
parents:
diff changeset
18 # plot controls and cases
64e75e21466e Uploaded
pmac
parents:
diff changeset
19 if (!is.null(control_tag) || !is.null(cases_tag)) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
20 it_data$plots[[plot_number]] = setup_cvc_plot(pca_data, control_tag, cases_tag)
64e75e21466e Uploaded
pmac
parents:
diff changeset
21 plot_number = plot_number + 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
22 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
23
64e75e21466e Uploaded
pmac
parents:
diff changeset
24 # if we have ethnicity data, setup a special plot
64e75e21466e Uploaded
pmac
parents:
diff changeset
25 if (!is.null(ethnicity_data)) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
26 it_data$plots[[plot_number]] = setup_ethnicity_plot(pca_data, ethnicity_data)
64e75e21466e Uploaded
pmac
parents:
diff changeset
27 plot_number = plot_number + 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
28 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
29
64e75e21466e Uploaded
pmac
parents:
diff changeset
30 if (cmethod == "none") {
64e75e21466e Uploaded
pmac
parents:
diff changeset
31 # get outliers by sd
64e75e21466e Uploaded
pmac
parents:
diff changeset
32 all_outliers = outliers_by_sd(pca_data, xsamples, numsds)
64e75e21466e Uploaded
pmac
parents:
diff changeset
33 new_xsamples = union(xsamples, pca_data$ids[all_outliers])
64e75e21466e Uploaded
pmac
parents:
diff changeset
34 it_data$xsamples = new_xsamples
64e75e21466e Uploaded
pmac
parents:
diff changeset
35 # prepare outlier plot
64e75e21466e Uploaded
pmac
parents:
diff changeset
36 it_data$plots[[plot_number]] = setup_ol_plot(pca_data, all_outliers)
64e75e21466e Uploaded
pmac
parents:
diff changeset
37 plot_number = plot_number + 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
38 # prepare sd plot
64e75e21466e Uploaded
pmac
parents:
diff changeset
39 it_data$plots[[plot_number]] = setup_sd_plot(pca_data)
64e75e21466e Uploaded
pmac
parents:
diff changeset
40 plot_number = plot_number + 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
41 } else {
64e75e21466e Uploaded
pmac
parents:
diff changeset
42 # do clustering
64e75e21466e Uploaded
pmac
parents:
diff changeset
43 if (cmethod == "dbscan") {
64e75e21466e Uploaded
pmac
parents:
diff changeset
44 emax = 2
64e75e21466e Uploaded
pmac
parents:
diff changeset
45 mp = 7
64e75e21466e Uploaded
pmac
parents:
diff changeset
46 clusters = automated_dbscan(pca_data, emax, mp)
64e75e21466e Uploaded
pmac
parents:
diff changeset
47 } else if (cmethod == "hclust") {
64e75e21466e Uploaded
pmac
parents:
diff changeset
48 clusters = automated_hclust(pca_data)
64e75e21466e Uploaded
pmac
parents:
diff changeset
49 } else {
64e75e21466e Uploaded
pmac
parents:
diff changeset
50 clusters = automated_hclust(pca_data)
64e75e21466e Uploaded
pmac
parents:
diff changeset
51 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
52
64e75e21466e Uploaded
pmac
parents:
diff changeset
53 # get outliers
64e75e21466e Uploaded
pmac
parents:
diff changeset
54 cluster_outliers = which(clusters == 0)
64e75e21466e Uploaded
pmac
parents:
diff changeset
55 # get rejected clusters
64e75e21466e Uploaded
pmac
parents:
diff changeset
56 centers = find_cluster_centers(clusters, pca_data$values)
64e75e21466e Uploaded
pmac
parents:
diff changeset
57 if (tmethod == "mcd") {
64e75e21466e Uploaded
pmac
parents:
diff changeset
58 rc = find_cluster_outliers_mcd(clusters, centers, pca_data, numsds, 2)
64e75e21466e Uploaded
pmac
parents:
diff changeset
59 } else if (tmethod == "sd") {
64e75e21466e Uploaded
pmac
parents:
diff changeset
60 rc = find_cluster_outliers_sd(clusters, centers, pca_data, numsds)
64e75e21466e Uploaded
pmac
parents:
diff changeset
61 } else if (tmethod == "dbscan_outliers_only") {
64e75e21466e Uploaded
pmac
parents:
diff changeset
62 rc = 0
64e75e21466e Uploaded
pmac
parents:
diff changeset
63 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
64 rc_indices = which(clusters %in% rc)
64e75e21466e Uploaded
pmac
parents:
diff changeset
65 all_ol = union(cluster_outliers, rc_indices)
64e75e21466e Uploaded
pmac
parents:
diff changeset
66 # it is possible that all samples get removed, in which case program will crash.
64e75e21466e Uploaded
pmac
parents:
diff changeset
67 # so do not remove them
64e75e21466e Uploaded
pmac
parents:
diff changeset
68 if (length(all_ol) == nrow(ped_data)) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
69 new_xsamples = xsamples
64e75e21466e Uploaded
pmac
parents:
diff changeset
70 } else {
64e75e21466e Uploaded
pmac
parents:
diff changeset
71 new_xsamples = union(xsamples, pca_data$ids[all_ol])
64e75e21466e Uploaded
pmac
parents:
diff changeset
72 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
73 it_data$xsamples = new_xsamples
64e75e21466e Uploaded
pmac
parents:
diff changeset
74 # prepare plot
64e75e21466e Uploaded
pmac
parents:
diff changeset
75 it_data$plots[[plot_number]] = setup_cluster_plot(pca_data, clusters, rc=rc)
64e75e21466e Uploaded
pmac
parents:
diff changeset
76 plot_number = plot_number + 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
77 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
78 it_data$outliers = setdiff(new_xsamples, xsamples)
64e75e21466e Uploaded
pmac
parents:
diff changeset
79 it_data$num_plots = plot_number - 1
64e75e21466e Uploaded
pmac
parents:
diff changeset
80 return(it_data)
64e75e21466e Uploaded
pmac
parents:
diff changeset
81 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
82
64e75e21466e Uploaded
pmac
parents:
diff changeset
83 # basically an inner join on a list of ids, and a table of ethnicity data
64e75e21466e Uploaded
pmac
parents:
diff changeset
84 # if eth_data == null, then the second column is filled with NAs
64e75e21466e Uploaded
pmac
parents:
diff changeset
85 add_ethnicity_data = function(ids, eth_data) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
86 n = length(ids)
64e75e21466e Uploaded
pmac
parents:
diff changeset
87 if (!is.null(eth_data)) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
88 output = matrix(nrow=n, ncol=ncol(eth_data))
64e75e21466e Uploaded
pmac
parents:
diff changeset
89 colnames(output) = colnames(eth_data)
64e75e21466e Uploaded
pmac
parents:
diff changeset
90 if (n > 0) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
91 for (i in 1:n) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
92 this_id = ids[i]
64e75e21466e Uploaded
pmac
parents:
diff changeset
93 if (this_id %in% rownames(eth_data)) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
94 this_row = unlist(lapply(eth_data[this_id, ], as.character), use.names=FALSE)
64e75e21466e Uploaded
pmac
parents:
diff changeset
95 } else {
64e75e21466e Uploaded
pmac
parents:
diff changeset
96 this_row = c(this_id, rep(NA, ncol(output)-1))
64e75e21466e Uploaded
pmac
parents:
diff changeset
97 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
98 output[i, ] = this_row
64e75e21466e Uploaded
pmac
parents:
diff changeset
99 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
100 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
101 } else {
64e75e21466e Uploaded
pmac
parents:
diff changeset
102 output = cbind(ids, rep(NA, n))
64e75e21466e Uploaded
pmac
parents:
diff changeset
103 colnames(output) = c("IID", "Population")
64e75e21466e Uploaded
pmac
parents:
diff changeset
104 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
105 return(output)
64e75e21466e Uploaded
pmac
parents:
diff changeset
106 }
64e75e21466e Uploaded
pmac
parents:
diff changeset
107
64e75e21466e Uploaded
pmac
parents:
diff changeset
108 generate_directory_name = function(outdir, basename, iteration) {
64e75e21466e Uploaded
pmac
parents:
diff changeset
109 newdir = paste0("output_", basename, "_iteration_", iteration - 1)
64e75e21466e Uploaded
pmac
parents:
diff changeset
110 full_path = paste0(outdir, "/", newdir)
64e75e21466e Uploaded
pmac
parents:
diff changeset
111 return(full_path)
64e75e21466e Uploaded
pmac
parents:
diff changeset
112 }