iterativepca: R_functions/pipeline

comparison R_functions/pipeline_code.R @ 0:64e75e21466e draft default tip

Uploaded

author	pmac
date	Wed, 01 Jun 2016 03:38:39 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:64e75e21466e
+### pipeline ###
+# Complete a single iteration, which consists of:
+# - Doing pca
+# - Clustering, if required
+# - Finding outliers
+# - Setting up plots
+# Outputs a list containing all the data regarding this iteration
+single_iteration = function(outdir, basename, ped_data, xsamples, numsds,
+cmethod, tmethod, control_tag, cases_tag, ethnicity_data=NULL) {
+it_data = list()
+it_data$old_xsamples = xsamples
+# get data and do pca
+pca_data = do_pca(ped_data)
+it_data$pca_data = pca_data
+it_data$plots = list()
+plot_number = 1
+# plot controls and cases
+if (!is.null(control_tag) || !is.null(cases_tag)) {
+it_data$plots[[plot_number]] = setup_cvc_plot(pca_data, control_tag, cases_tag)
+plot_number = plot_number + 1
+}
+# if we have ethnicity data, setup a special plot
+if (!is.null(ethnicity_data)) {
+it_data$plots[[plot_number]] = setup_ethnicity_plot(pca_data, ethnicity_data)
+plot_number = plot_number + 1
+}
+if (cmethod == "none") {
+# get outliers by sd
+all_outliers = outliers_by_sd(pca_data, xsamples, numsds)
+new_xsamples = union(xsamples, pca_data$ids[all_outliers])
+it_data$xsamples = new_xsamples
+# prepare outlier plot
+it_data$plots[[plot_number]] = setup_ol_plot(pca_data, all_outliers)
+plot_number = plot_number + 1
+# prepare sd plot
+it_data$plots[[plot_number]] = setup_sd_plot(pca_data)
+plot_number = plot_number + 1
+} else {
+# do clustering
+if (cmethod == "dbscan") {
+emax = 2
+mp = 7
+clusters = automated_dbscan(pca_data, emax, mp)
+} else if (cmethod == "hclust") {
+clusters = automated_hclust(pca_data)
+} else {
+clusters = automated_hclust(pca_data)
+}
+# get outliers
+cluster_outliers = which(clusters == 0)
+# get rejected clusters
+centers = find_cluster_centers(clusters, pca_data$values)
+if (tmethod == "mcd") {
+rc = find_cluster_outliers_mcd(clusters, centers, pca_data, numsds, 2)
+} else if (tmethod == "sd") {
+rc = find_cluster_outliers_sd(clusters, centers, pca_data, numsds)
+} else if (tmethod == "dbscan_outliers_only") {
+rc = 0
+}
+rc_indices = which(clusters %in% rc)
+all_ol = union(cluster_outliers, rc_indices)
+# it is possible that all samples get removed, in which case program will crash.
+# so do not remove them
+if (length(all_ol) == nrow(ped_data)) {
+new_xsamples = xsamples
+} else {
+new_xsamples = union(xsamples, pca_data$ids[all_ol])
+}
+it_data$xsamples = new_xsamples
+# prepare plot
+it_data$plots[[plot_number]] = setup_cluster_plot(pca_data, clusters, rc=rc)
+plot_number = plot_number + 1
+}
+it_data$outliers = setdiff(new_xsamples, xsamples)
+it_data$num_plots = plot_number - 1
+return(it_data)
+}
+# basically an inner join on a list of ids, and a table of ethnicity data
+# if eth_data == null, then the second column is filled with NAs
+add_ethnicity_data = function(ids, eth_data) {
+n = length(ids)
+if (!is.null(eth_data)) {
+output = matrix(nrow=n, ncol=ncol(eth_data))
+colnames(output) = colnames(eth_data)
+if (n > 0) {
+for (i in 1:n) {
+this_id = ids[i]
+if (this_id %in% rownames(eth_data)) {
+this_row = unlist(lapply(eth_data[this_id, ], as.character), use.names=FALSE)
+} else {
+this_row = c(this_id, rep(NA, ncol(output)-1))
+}
+output[i, ] = this_row
+}
+}
+} else {
+output = cbind(ids, rep(NA, n))
+colnames(output) = c("IID", "Population")
+}
+return(output)
+}
+generate_directory_name = function(outdir, basename, iteration) {
+newdir = paste0("output_", basename, "_iteration_", iteration - 1)
+full_path = paste0(outdir, "/", newdir)
+return(full_path)
+}

Mercurial > repos > pmac > iterativepca

comparison R_functions/pipeline_code.R @ 0:64e75e21466e draft default tip