diff R_functions/pipeline_code.R @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/R_functions/pipeline_code.R	Wed Jun 01 03:38:39 2016 -0400
@@ -0,0 +1,112 @@
+### pipeline ###
+# Complete a single iteration, which consists of:
+# - Doing pca
+# - Clustering, if required
+# - Finding outliers
+# - Setting up plots
+# Outputs a list containing all the data regarding this iteration
+single_iteration = function(outdir, basename, ped_data, xsamples, numsds,
+                           cmethod, tmethod, control_tag, cases_tag, ethnicity_data=NULL) {
+  it_data = list()
+  it_data$old_xsamples = xsamples
+  # get data and do pca
+  pca_data = do_pca(ped_data)
+  it_data$pca_data = pca_data
+  it_data$plots = list()
+  plot_number = 1
+  
+  # plot controls and cases
+  if (!is.null(control_tag) || !is.null(cases_tag)) {
+    it_data$plots[[plot_number]] = setup_cvc_plot(pca_data, control_tag, cases_tag)
+    plot_number = plot_number + 1
+  }
+  
+  # if we have ethnicity data, setup a special plot
+  if (!is.null(ethnicity_data)) {
+    it_data$plots[[plot_number]] = setup_ethnicity_plot(pca_data, ethnicity_data)
+    plot_number = plot_number + 1
+  } 
+  
+  if (cmethod == "none") {
+    # get outliers by sd
+    all_outliers = outliers_by_sd(pca_data, xsamples, numsds)
+    new_xsamples = union(xsamples, pca_data$ids[all_outliers])
+    it_data$xsamples = new_xsamples
+    # prepare outlier plot
+    it_data$plots[[plot_number]] = setup_ol_plot(pca_data, all_outliers)
+    plot_number = plot_number + 1
+    # prepare sd plot
+    it_data$plots[[plot_number]] = setup_sd_plot(pca_data)
+    plot_number = plot_number + 1
+  } else {
+    # do clustering
+    if (cmethod == "dbscan") {
+      emax = 2
+      mp = 7
+      clusters = automated_dbscan(pca_data, emax, mp)
+    } else if (cmethod == "hclust") {
+      clusters = automated_hclust(pca_data)
+    } else {
+      clusters = automated_hclust(pca_data)
+    }
+    
+    # get outliers
+    cluster_outliers = which(clusters == 0)
+    # get rejected clusters
+    centers = find_cluster_centers(clusters, pca_data$values)
+    if (tmethod == "mcd") {
+      rc = find_cluster_outliers_mcd(clusters, centers, pca_data, numsds, 2)
+    } else if (tmethod == "sd") {
+      rc = find_cluster_outliers_sd(clusters, centers, pca_data, numsds)
+    } else if (tmethod == "dbscan_outliers_only") {
+      rc = 0
+    }
+    rc_indices = which(clusters %in% rc)
+    all_ol = union(cluster_outliers, rc_indices)
+    # it is possible that all samples get removed, in which case program will crash.
+    # so do not remove them
+    if (length(all_ol) == nrow(ped_data)) {
+      new_xsamples = xsamples
+    } else {
+      new_xsamples = union(xsamples, pca_data$ids[all_ol])
+    }
+    it_data$xsamples = new_xsamples
+    # prepare plot
+    it_data$plots[[plot_number]] = setup_cluster_plot(pca_data, clusters, rc=rc)
+    plot_number = plot_number + 1
+  }
+  it_data$outliers = setdiff(new_xsamples, xsamples)
+  it_data$num_plots = plot_number - 1
+  return(it_data)
+}
+
+# basically an inner join on a list of ids, and a table of ethnicity data
+# if eth_data == null, then the second column is filled with NAs
+add_ethnicity_data = function(ids, eth_data) {
+  n = length(ids)
+  if (!is.null(eth_data)) {
+    output = matrix(nrow=n, ncol=ncol(eth_data))
+    colnames(output) = colnames(eth_data)
+    if (n > 0) {
+      for (i in 1:n) {
+        this_id = ids[i]
+        if (this_id %in% rownames(eth_data)) {
+          this_row = unlist(lapply(eth_data[this_id, ], as.character), use.names=FALSE)
+        } else {
+          this_row = c(this_id, rep(NA, ncol(output)-1))
+        }
+        output[i, ] = this_row
+      }
+    }
+  } else {
+    output = cbind(ids, rep(NA, n))
+    colnames(output) = c("IID", "Population")
+  }
+  return(output)
+}
+
+generate_directory_name = function(outdir, basename, iteration) {
+  newdir = paste0("output_", basename, "_iteration_", iteration - 1)
+  full_path = paste0(outdir, "/", newdir)
+  return(full_path)
+}
\ No newline at end of file