Mercurial > repos > pmac > iterativepca
diff R_functions/pipeline_code.R @ 0:64e75e21466e draft default tip
Uploaded
author | pmac |
---|---|
date | Wed, 01 Jun 2016 03:38:39 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/R_functions/pipeline_code.R Wed Jun 01 03:38:39 2016 -0400 @@ -0,0 +1,112 @@ +### pipeline ### +# Complete a single iteration, which consists of: +# - Doing pca +# - Clustering, if required +# - Finding outliers +# - Setting up plots +# Outputs a list containing all the data regarding this iteration +single_iteration = function(outdir, basename, ped_data, xsamples, numsds, + cmethod, tmethod, control_tag, cases_tag, ethnicity_data=NULL) { + it_data = list() + it_data$old_xsamples = xsamples + # get data and do pca + pca_data = do_pca(ped_data) + it_data$pca_data = pca_data + it_data$plots = list() + plot_number = 1 + + # plot controls and cases + if (!is.null(control_tag) || !is.null(cases_tag)) { + it_data$plots[[plot_number]] = setup_cvc_plot(pca_data, control_tag, cases_tag) + plot_number = plot_number + 1 + } + + # if we have ethnicity data, setup a special plot + if (!is.null(ethnicity_data)) { + it_data$plots[[plot_number]] = setup_ethnicity_plot(pca_data, ethnicity_data) + plot_number = plot_number + 1 + } + + if (cmethod == "none") { + # get outliers by sd + all_outliers = outliers_by_sd(pca_data, xsamples, numsds) + new_xsamples = union(xsamples, pca_data$ids[all_outliers]) + it_data$xsamples = new_xsamples + # prepare outlier plot + it_data$plots[[plot_number]] = setup_ol_plot(pca_data, all_outliers) + plot_number = plot_number + 1 + # prepare sd plot + it_data$plots[[plot_number]] = setup_sd_plot(pca_data) + plot_number = plot_number + 1 + } else { + # do clustering + if (cmethod == "dbscan") { + emax = 2 + mp = 7 + clusters = automated_dbscan(pca_data, emax, mp) + } else if (cmethod == "hclust") { + clusters = automated_hclust(pca_data) + } else { + clusters = automated_hclust(pca_data) + } + + # get outliers + cluster_outliers = which(clusters == 0) + # get rejected clusters + centers = find_cluster_centers(clusters, pca_data$values) + if (tmethod == "mcd") { + rc = find_cluster_outliers_mcd(clusters, centers, pca_data, numsds, 2) + } else if (tmethod == "sd") { + rc = find_cluster_outliers_sd(clusters, centers, pca_data, numsds) + } else if (tmethod == "dbscan_outliers_only") { + rc = 0 + } + rc_indices = which(clusters %in% rc) + all_ol = union(cluster_outliers, rc_indices) + # it is possible that all samples get removed, in which case program will crash. + # so do not remove them + if (length(all_ol) == nrow(ped_data)) { + new_xsamples = xsamples + } else { + new_xsamples = union(xsamples, pca_data$ids[all_ol]) + } + it_data$xsamples = new_xsamples + # prepare plot + it_data$plots[[plot_number]] = setup_cluster_plot(pca_data, clusters, rc=rc) + plot_number = plot_number + 1 + } + it_data$outliers = setdiff(new_xsamples, xsamples) + it_data$num_plots = plot_number - 1 + return(it_data) +} + +# basically an inner join on a list of ids, and a table of ethnicity data +# if eth_data == null, then the second column is filled with NAs +add_ethnicity_data = function(ids, eth_data) { + n = length(ids) + if (!is.null(eth_data)) { + output = matrix(nrow=n, ncol=ncol(eth_data)) + colnames(output) = colnames(eth_data) + if (n > 0) { + for (i in 1:n) { + this_id = ids[i] + if (this_id %in% rownames(eth_data)) { + this_row = unlist(lapply(eth_data[this_id, ], as.character), use.names=FALSE) + } else { + this_row = c(this_id, rep(NA, ncol(output)-1)) + } + output[i, ] = this_row + } + } + } else { + output = cbind(ids, rep(NA, n)) + colnames(output) = c("IID", "Population") + } + return(output) +} + +generate_directory_name = function(outdir, basename, iteration) { + newdir = paste0("output_", basename, "_iteration_", iteration - 1) + full_path = paste0(outdir, "/", newdir) + return(full_path) +} \ No newline at end of file