Mercurial > repos > pmac > iterativepca
view R_functions/pipeline_code.R @ 0:64e75e21466e draft default tip
Uploaded
author | pmac |
---|---|
date | Wed, 01 Jun 2016 03:38:39 -0400 |
parents | |
children |
line wrap: on
line source
### pipeline ### # Complete a single iteration, which consists of: # - Doing pca # - Clustering, if required # - Finding outliers # - Setting up plots # Outputs a list containing all the data regarding this iteration single_iteration = function(outdir, basename, ped_data, xsamples, numsds, cmethod, tmethod, control_tag, cases_tag, ethnicity_data=NULL) { it_data = list() it_data$old_xsamples = xsamples # get data and do pca pca_data = do_pca(ped_data) it_data$pca_data = pca_data it_data$plots = list() plot_number = 1 # plot controls and cases if (!is.null(control_tag) || !is.null(cases_tag)) { it_data$plots[[plot_number]] = setup_cvc_plot(pca_data, control_tag, cases_tag) plot_number = plot_number + 1 } # if we have ethnicity data, setup a special plot if (!is.null(ethnicity_data)) { it_data$plots[[plot_number]] = setup_ethnicity_plot(pca_data, ethnicity_data) plot_number = plot_number + 1 } if (cmethod == "none") { # get outliers by sd all_outliers = outliers_by_sd(pca_data, xsamples, numsds) new_xsamples = union(xsamples, pca_data$ids[all_outliers]) it_data$xsamples = new_xsamples # prepare outlier plot it_data$plots[[plot_number]] = setup_ol_plot(pca_data, all_outliers) plot_number = plot_number + 1 # prepare sd plot it_data$plots[[plot_number]] = setup_sd_plot(pca_data) plot_number = plot_number + 1 } else { # do clustering if (cmethod == "dbscan") { emax = 2 mp = 7 clusters = automated_dbscan(pca_data, emax, mp) } else if (cmethod == "hclust") { clusters = automated_hclust(pca_data) } else { clusters = automated_hclust(pca_data) } # get outliers cluster_outliers = which(clusters == 0) # get rejected clusters centers = find_cluster_centers(clusters, pca_data$values) if (tmethod == "mcd") { rc = find_cluster_outliers_mcd(clusters, centers, pca_data, numsds, 2) } else if (tmethod == "sd") { rc = find_cluster_outliers_sd(clusters, centers, pca_data, numsds) } else if (tmethod == "dbscan_outliers_only") { rc = 0 } rc_indices = which(clusters %in% rc) all_ol = union(cluster_outliers, rc_indices) # it is possible that all samples get removed, in which case program will crash. # so do not remove them if (length(all_ol) == nrow(ped_data)) { new_xsamples = xsamples } else { new_xsamples = union(xsamples, pca_data$ids[all_ol]) } it_data$xsamples = new_xsamples # prepare plot it_data$plots[[plot_number]] = setup_cluster_plot(pca_data, clusters, rc=rc) plot_number = plot_number + 1 } it_data$outliers = setdiff(new_xsamples, xsamples) it_data$num_plots = plot_number - 1 return(it_data) } # basically an inner join on a list of ids, and a table of ethnicity data # if eth_data == null, then the second column is filled with NAs add_ethnicity_data = function(ids, eth_data) { n = length(ids) if (!is.null(eth_data)) { output = matrix(nrow=n, ncol=ncol(eth_data)) colnames(output) = colnames(eth_data) if (n > 0) { for (i in 1:n) { this_id = ids[i] if (this_id %in% rownames(eth_data)) { this_row = unlist(lapply(eth_data[this_id, ], as.character), use.names=FALSE) } else { this_row = c(this_id, rep(NA, ncol(output)-1)) } output[i, ] = this_row } } } else { output = cbind(ids, rep(NA, n)) colnames(output) = c("IID", "Population") } return(output) } generate_directory_name = function(outdir, basename, iteration) { newdir = paste0("output_", basename, "_iteration_", iteration - 1) full_path = paste0(outdir, "/", newdir) return(full_path) }