Mercurial > repos > pmac > iterativepca
comparison R_functions/pipeline_code.R @ 0:64e75e21466e draft default tip
Uploaded
| author | pmac |
|---|---|
| date | Wed, 01 Jun 2016 03:38:39 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:64e75e21466e |
|---|---|
| 1 ### pipeline ### | |
| 2 # Complete a single iteration, which consists of: | |
| 3 # - Doing pca | |
| 4 # - Clustering, if required | |
| 5 # - Finding outliers | |
| 6 # - Setting up plots | |
| 7 # Outputs a list containing all the data regarding this iteration | |
| 8 single_iteration = function(outdir, basename, ped_data, xsamples, numsds, | |
| 9 cmethod, tmethod, control_tag, cases_tag, ethnicity_data=NULL) { | |
| 10 it_data = list() | |
| 11 it_data$old_xsamples = xsamples | |
| 12 # get data and do pca | |
| 13 pca_data = do_pca(ped_data) | |
| 14 it_data$pca_data = pca_data | |
| 15 it_data$plots = list() | |
| 16 plot_number = 1 | |
| 17 | |
| 18 # plot controls and cases | |
| 19 if (!is.null(control_tag) || !is.null(cases_tag)) { | |
| 20 it_data$plots[[plot_number]] = setup_cvc_plot(pca_data, control_tag, cases_tag) | |
| 21 plot_number = plot_number + 1 | |
| 22 } | |
| 23 | |
| 24 # if we have ethnicity data, setup a special plot | |
| 25 if (!is.null(ethnicity_data)) { | |
| 26 it_data$plots[[plot_number]] = setup_ethnicity_plot(pca_data, ethnicity_data) | |
| 27 plot_number = plot_number + 1 | |
| 28 } | |
| 29 | |
| 30 if (cmethod == "none") { | |
| 31 # get outliers by sd | |
| 32 all_outliers = outliers_by_sd(pca_data, xsamples, numsds) | |
| 33 new_xsamples = union(xsamples, pca_data$ids[all_outliers]) | |
| 34 it_data$xsamples = new_xsamples | |
| 35 # prepare outlier plot | |
| 36 it_data$plots[[plot_number]] = setup_ol_plot(pca_data, all_outliers) | |
| 37 plot_number = plot_number + 1 | |
| 38 # prepare sd plot | |
| 39 it_data$plots[[plot_number]] = setup_sd_plot(pca_data) | |
| 40 plot_number = plot_number + 1 | |
| 41 } else { | |
| 42 # do clustering | |
| 43 if (cmethod == "dbscan") { | |
| 44 emax = 2 | |
| 45 mp = 7 | |
| 46 clusters = automated_dbscan(pca_data, emax, mp) | |
| 47 } else if (cmethod == "hclust") { | |
| 48 clusters = automated_hclust(pca_data) | |
| 49 } else { | |
| 50 clusters = automated_hclust(pca_data) | |
| 51 } | |
| 52 | |
| 53 # get outliers | |
| 54 cluster_outliers = which(clusters == 0) | |
| 55 # get rejected clusters | |
| 56 centers = find_cluster_centers(clusters, pca_data$values) | |
| 57 if (tmethod == "mcd") { | |
| 58 rc = find_cluster_outliers_mcd(clusters, centers, pca_data, numsds, 2) | |
| 59 } else if (tmethod == "sd") { | |
| 60 rc = find_cluster_outliers_sd(clusters, centers, pca_data, numsds) | |
| 61 } else if (tmethod == "dbscan_outliers_only") { | |
| 62 rc = 0 | |
| 63 } | |
| 64 rc_indices = which(clusters %in% rc) | |
| 65 all_ol = union(cluster_outliers, rc_indices) | |
| 66 # it is possible that all samples get removed, in which case program will crash. | |
| 67 # so do not remove them | |
| 68 if (length(all_ol) == nrow(ped_data)) { | |
| 69 new_xsamples = xsamples | |
| 70 } else { | |
| 71 new_xsamples = union(xsamples, pca_data$ids[all_ol]) | |
| 72 } | |
| 73 it_data$xsamples = new_xsamples | |
| 74 # prepare plot | |
| 75 it_data$plots[[plot_number]] = setup_cluster_plot(pca_data, clusters, rc=rc) | |
| 76 plot_number = plot_number + 1 | |
| 77 } | |
| 78 it_data$outliers = setdiff(new_xsamples, xsamples) | |
| 79 it_data$num_plots = plot_number - 1 | |
| 80 return(it_data) | |
| 81 } | |
| 82 | |
| 83 # basically an inner join on a list of ids, and a table of ethnicity data | |
| 84 # if eth_data == null, then the second column is filled with NAs | |
| 85 add_ethnicity_data = function(ids, eth_data) { | |
| 86 n = length(ids) | |
| 87 if (!is.null(eth_data)) { | |
| 88 output = matrix(nrow=n, ncol=ncol(eth_data)) | |
| 89 colnames(output) = colnames(eth_data) | |
| 90 if (n > 0) { | |
| 91 for (i in 1:n) { | |
| 92 this_id = ids[i] | |
| 93 if (this_id %in% rownames(eth_data)) { | |
| 94 this_row = unlist(lapply(eth_data[this_id, ], as.character), use.names=FALSE) | |
| 95 } else { | |
| 96 this_row = c(this_id, rep(NA, ncol(output)-1)) | |
| 97 } | |
| 98 output[i, ] = this_row | |
| 99 } | |
| 100 } | |
| 101 } else { | |
| 102 output = cbind(ids, rep(NA, n)) | |
| 103 colnames(output) = c("IID", "Population") | |
| 104 } | |
| 105 return(output) | |
| 106 } | |
| 107 | |
| 108 generate_directory_name = function(outdir, basename, iteration) { | |
| 109 newdir = paste0("output_", basename, "_iteration_", iteration - 1) | |
| 110 full_path = paste0(outdir, "/", newdir) | |
| 111 return(full_path) | |
| 112 } |
