annotate coral_multilocus_genotype.R @ 9:3c42de11ea1d draft default tip

Uploaded
author greg
date Wed, 24 Nov 2021 20:17:10 +0000
parents 33d759858625
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
1 #!/usr/bin/env Rscript
adaf89535d2e Uploaded
greg
parents:
diff changeset
2
adaf89535d2e Uploaded
greg
parents:
diff changeset
3 suppressPackageStartupMessages(library("adegenet"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
4 suppressPackageStartupMessages(library("ape"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
5 suppressPackageStartupMessages(library("data.table"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
6 suppressPackageStartupMessages(library("dbplyr"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
7 suppressPackageStartupMessages(library("dplyr"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
8 suppressPackageStartupMessages(library("ggplot2"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
9 suppressPackageStartupMessages(library("knitr"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
10 suppressPackageStartupMessages(library("maps"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
11 suppressPackageStartupMessages(library("mapproj"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
12 suppressPackageStartupMessages(library("optparse"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
13 suppressPackageStartupMessages(library("poppr"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
14 suppressPackageStartupMessages(library("RColorBrewer"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
15 suppressPackageStartupMessages(library("RPostgres"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
16 suppressPackageStartupMessages(library("SNPRelate"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
17 suppressPackageStartupMessages(library("tidyr"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
18 suppressPackageStartupMessages(library("vcfR"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
19 suppressPackageStartupMessages(library("vegan"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
20 suppressPackageStartupMessages(library("yarrr"))
adaf89535d2e Uploaded
greg
parents:
diff changeset
21 theme_set(theme_bw())
adaf89535d2e Uploaded
greg
parents:
diff changeset
22
adaf89535d2e Uploaded
greg
parents:
diff changeset
23 DEFAULT_MISSING_NUMERIC_VALUE <- -9.000000;
adaf89535d2e Uploaded
greg
parents:
diff changeset
24
adaf89535d2e Uploaded
greg
parents:
diff changeset
25 option_list <- list(
adaf89535d2e Uploaded
greg
parents:
diff changeset
26 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"),
adaf89535d2e Uploaded
greg
parents:
diff changeset
27 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"),
adaf89535d2e Uploaded
greg
parents:
diff changeset
28 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"),
adaf89535d2e Uploaded
greg
parents:
diff changeset
29 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"),
adaf89535d2e Uploaded
greg
parents:
diff changeset
30 make_option(c("--output_nj_phylogeny_tree"), action="store", dest="output_nj_phylogeny_tree", default=NULL, help="Flag to plot neighbor-joining phylogeny tree"),
adaf89535d2e Uploaded
greg
parents:
diff changeset
31 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="Flag to output stag db report file")
adaf89535d2e Uploaded
greg
parents:
diff changeset
32 )
adaf89535d2e Uploaded
greg
parents:
diff changeset
33
adaf89535d2e Uploaded
greg
parents:
diff changeset
34 parser <- OptionParser(usage="%prog [options] file", option_list=option_list);
adaf89535d2e Uploaded
greg
parents:
diff changeset
35 args <- parse_args(parser, positional_arguments=TRUE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
36 opt <- args$options;
adaf89535d2e Uploaded
greg
parents:
diff changeset
37
adaf89535d2e Uploaded
greg
parents:
diff changeset
38 get_file_path = function(dir, file_name) {
adaf89535d2e Uploaded
greg
parents:
diff changeset
39 file_path = paste(dir, file_name, sep="/");
adaf89535d2e Uploaded
greg
parents:
diff changeset
40 return(file_path);
adaf89535d2e Uploaded
greg
parents:
diff changeset
41 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
42
adaf89535d2e Uploaded
greg
parents:
diff changeset
43 get_database_connection <- function(db_conn_string) {
adaf89535d2e Uploaded
greg
parents:
diff changeset
44 # Instantiate database connection.
adaf89535d2e Uploaded
greg
parents:
diff changeset
45 # The connection string has this format:
adaf89535d2e Uploaded
greg
parents:
diff changeset
46 # postgresql://user:password@host/dbname
adaf89535d2e Uploaded
greg
parents:
diff changeset
47 conn_items <- strsplit(db_conn_string, "://")[[1]];
adaf89535d2e Uploaded
greg
parents:
diff changeset
48 string_needed <- conn_items[2];
adaf89535d2e Uploaded
greg
parents:
diff changeset
49 items_needed <- strsplit(string_needed, "@")[[1]];
adaf89535d2e Uploaded
greg
parents:
diff changeset
50 user_pass_string <- items_needed[1];
adaf89535d2e Uploaded
greg
parents:
diff changeset
51 host_dbname_string <- items_needed[2];
adaf89535d2e Uploaded
greg
parents:
diff changeset
52 user_pass_items <- strsplit(user_pass_string, ":")[[1]];
adaf89535d2e Uploaded
greg
parents:
diff changeset
53 host_dbname_items <- strsplit(host_dbname_string, "/")[[1]];
adaf89535d2e Uploaded
greg
parents:
diff changeset
54 user <- user_pass_items[1];
adaf89535d2e Uploaded
greg
parents:
diff changeset
55 pass <- user_pass_items[2];
adaf89535d2e Uploaded
greg
parents:
diff changeset
56 host <- host_dbname_items[1];
adaf89535d2e Uploaded
greg
parents:
diff changeset
57 dbname <- host_dbname_items[2];
adaf89535d2e Uploaded
greg
parents:
diff changeset
58 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass);
adaf89535d2e Uploaded
greg
parents:
diff changeset
59 return (conn);
adaf89535d2e Uploaded
greg
parents:
diff changeset
60 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
61
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
62 log_data_frame <- function(name, df) {
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
63 cat("\n", name, ":\n");
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
64 show(df);
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
65 cat("\n\n");
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
66 }
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
67
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
68 time_elapsed <- function(start_time) {
adaf89535d2e Uploaded
greg
parents:
diff changeset
69 cat("Elapsed time: ", proc.time() - start_time, "\n\n");
adaf89535d2e Uploaded
greg
parents:
diff changeset
70 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
71
adaf89535d2e Uploaded
greg
parents:
diff changeset
72 time_start <- function(msg) {
adaf89535d2e Uploaded
greg
parents:
diff changeset
73 start_time <- proc.time();
adaf89535d2e Uploaded
greg
parents:
diff changeset
74 cat(msg, "...\n");
adaf89535d2e Uploaded
greg
parents:
diff changeset
75 return(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
76 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
77
adaf89535d2e Uploaded
greg
parents:
diff changeset
78 write_data_frame <- function(dir, file_name, data_frame) {
adaf89535d2e Uploaded
greg
parents:
diff changeset
79 cat("\nWriting file: ", file_name, "\n");
adaf89535d2e Uploaded
greg
parents:
diff changeset
80 file_path <- get_file_path(dir, file_name);
adaf89535d2e Uploaded
greg
parents:
diff changeset
81 write.table(data_frame, file=file_path, quote=FALSE, row.names=FALSE, sep="\t");
adaf89535d2e Uploaded
greg
parents:
diff changeset
82 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
83
adaf89535d2e Uploaded
greg
parents:
diff changeset
84 # Prepare for processing.
adaf89535d2e Uploaded
greg
parents:
diff changeset
85 output_data_dir = "output_data_dir";
adaf89535d2e Uploaded
greg
parents:
diff changeset
86 output_plots_dir = "output_plots_dir";
adaf89535d2e Uploaded
greg
parents:
diff changeset
87 # Read in VCF input file.
adaf89535d2e Uploaded
greg
parents:
diff changeset
88 start_time <- time_start("Reading VCF input");
adaf89535d2e Uploaded
greg
parents:
diff changeset
89 vcf <- read.vcfR(opt$input_vcf);
adaf89535d2e Uploaded
greg
parents:
diff changeset
90 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
91
adaf89535d2e Uploaded
greg
parents:
diff changeset
92 # Convert VCF file into a genind for the Poppr package.
adaf89535d2e Uploaded
greg
parents:
diff changeset
93 start_time <- time_start("Converting VCF data to a genind object");
adaf89535d2e Uploaded
greg
parents:
diff changeset
94 genind_obj <- vcfR2genind(vcf);
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
95 cat("\ngenind_obj:\n");
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
96 genind_obj
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
97 cat("\n\n");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
98 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
99
adaf89535d2e Uploaded
greg
parents:
diff changeset
100 # Add population information to the genind object.
adaf89535d2e Uploaded
greg
parents:
diff changeset
101 population_info_data_table <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t", quote="");
adaf89535d2e Uploaded
greg
parents:
diff changeset
102 colnames(population_info_data_table) <- c("row_id", "affy_id", "user_specimen_id", "region");
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
103 cat("\npopulation_info_data_table:\n");
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
104 population_info_data_table
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
105 cat("\n\n");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
106 #write_data_frame(output_data_dir, "population_info_data_table", population_info_data_table);
adaf89535d2e Uploaded
greg
parents:
diff changeset
107 genind_obj@pop <- as.factor(population_info_data_table$region);
adaf89535d2e Uploaded
greg
parents:
diff changeset
108 strata(genind_obj) <- data.frame(pop(genind_obj));
adaf89535d2e Uploaded
greg
parents:
diff changeset
109
adaf89535d2e Uploaded
greg
parents:
diff changeset
110 # Convert genind object to a genclone object.
adaf89535d2e Uploaded
greg
parents:
diff changeset
111 start_time <- time_start("Converting the genind object to a genclone object");
adaf89535d2e Uploaded
greg
parents:
diff changeset
112 genind_clone <- as.genclone(genind_obj);
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
113 cat("\ngenind_clone:\n");
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
114 genind_clone
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
115 cat("\n\n");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
116 time_elapsed(start_time);
7
bcb28b49b0cc Uploaded
greg
parents: 6
diff changeset
117 # Remove genind object from memory.
bcb28b49b0cc Uploaded
greg
parents: 6
diff changeset
118 rm(genind_obj);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
119
adaf89535d2e Uploaded
greg
parents:
diff changeset
120 # Calculate the bitwise distance between individuals.
adaf89535d2e Uploaded
greg
parents:
diff changeset
121 start_time <- time_start("Calculating the bitwise distance between individuals");
adaf89535d2e Uploaded
greg
parents:
diff changeset
122 bitwise_distance <- bitwise.dist(genind_clone);
adaf89535d2e Uploaded
greg
parents:
diff changeset
123 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
124
adaf89535d2e Uploaded
greg
parents:
diff changeset
125 # Multilocus genotypes (threshold of 3.2%).
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
126 cat("\nFiltering multilocus genotypes with threshold of 3.2%...\n\n");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
127 mlg.filter(genind_clone, distance=bitwise_distance) <- 0.032;
adaf89535d2e Uploaded
greg
parents:
diff changeset
128
adaf89535d2e Uploaded
greg
parents:
diff changeset
129 # Create list of MLGs.
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
130 cat("\nCreating list of mlg_ids...\n\n");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
131 mlg_ids <- mlg.id(genind_clone);
8
33d759858625 Uploaded
greg
parents: 7
diff changeset
132 cat("\nCreated list of mlg_ids...\n\n");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
133
adaf89535d2e Uploaded
greg
parents:
diff changeset
134 # Read user's Affymetrix 96 well plate tabular file.
8
33d759858625 Uploaded
greg
parents: 7
diff changeset
135 cat("\nCreating affy_metadata_data_frame...\n\n");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
136 affy_metadata_data_frame <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings=c("", "NA"), quote="");
8
33d759858625 Uploaded
greg
parents: 7
diff changeset
137 cat("\nCreated affy_metadata_data_frame...\n\n");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
138 colnames(affy_metadata_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef",
adaf89535d2e Uploaded
greg
parents:
diff changeset
139 "region", "latitude", "longitude", "geographic_origin", "colony_location",
adaf89535d2e Uploaded
greg
parents:
diff changeset
140 "depth", "disease_resist", "bleach_resist", "mortality","tle",
adaf89535d2e Uploaded
greg
parents:
diff changeset
141 "spawning", "collector_last_name", "collector_first_name", "organization", "collection_date",
adaf89535d2e Uploaded
greg
parents:
diff changeset
142 "email", "seq_facility", "array_version", "public", "public_after_date",
adaf89535d2e Uploaded
greg
parents:
diff changeset
143 "sperm_motility", "healing_time", "dna_extraction_method", "dna_concentration", "registry_id",
adaf89535d2e Uploaded
greg
parents:
diff changeset
144 "result_folder_name", "plate_barcode");
adaf89535d2e Uploaded
greg
parents:
diff changeset
145 affy_metadata_data_frame$user_specimen_id <- as.character(affy_metadata_data_frame$user_specimen_id);
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
146 log_data_frame("affy_metadata_data_frame", affy_metadata_data_frame);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
147 user_specimen_ids <- as.character(affy_metadata_data_frame$user_specimen_id);
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
148 cat("\nuser_specimen_ids:\n", toString(user_specimen_ids), "\n\n");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
149 # The specimen_id_field_call_data_table looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
150 # user_specimen_ids V2
adaf89535d2e Uploaded
greg
parents:
diff changeset
151 # 1090 prolifera
adaf89535d2e Uploaded
greg
parents:
diff changeset
152 # 1091 prolifera
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
153 cat("\nCreating specimen_id_field_call_data_table...\n");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
154 specimen_id_field_call_data_table <- data.table(user_specimen_ids, affy_metadata_data_frame$field_call);
adaf89535d2e Uploaded
greg
parents:
diff changeset
155 # Rename the user_specimen_ids column.
adaf89535d2e Uploaded
greg
parents:
diff changeset
156 setnames(specimen_id_field_call_data_table, c("user_specimen_ids"), c("user_specimen_id"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
157 # Rename the V2 column.
adaf89535d2e Uploaded
greg
parents:
diff changeset
158 setnames(specimen_id_field_call_data_table, c("V2"), c("field_call"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
159
adaf89535d2e Uploaded
greg
parents:
diff changeset
160 # Connect to database.
adaf89535d2e Uploaded
greg
parents:
diff changeset
161 conn <- get_database_connection(opt$database_connection_string);
adaf89535d2e Uploaded
greg
parents:
diff changeset
162 # Import the sample table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
163 sample_table <- tbl(conn, "sample");
adaf89535d2e Uploaded
greg
parents:
diff changeset
164 # Import the genotype table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
165 genotype_table <- tbl(conn, "genotype");
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
166 # Import the probe_annotation table.
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
167 probe_annotation_table <- tbl(conn, "probe_annotation");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
168 # Select columns from the sample table and the
adaf89535d2e Uploaded
greg
parents:
diff changeset
169 # genotype table joined by genotype_id.
adaf89535d2e Uploaded
greg
parents:
diff changeset
170 sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, bcoral_genet_id, genotype_id);
adaf89535d2e Uploaded
greg
parents:
diff changeset
171 smlg <- sample_table_columns %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
172 left_join(genotype_table %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
173 select("id", "coral_mlg_clonal_id", "coral_mlg_rep_sample_id", "genetic_coral_species_call"),
adaf89535d2e Uploaded
greg
parents:
diff changeset
174 by=c("genotype_id"="id"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
175 # Name the columns.
adaf89535d2e Uploaded
greg
parents:
diff changeset
176 smlg_data_frame <- as.data.frame(smlg);
adaf89535d2e Uploaded
greg
parents:
diff changeset
177 colnames(smlg_data_frame) <- c("user_specimen_id", "affy_id", "bcoral_genet_id", "genotype_id",
9
3c42de11ea1d Uploaded
greg
parents: 8
diff changeset
178 "coral_mlg_clonal_id", "coral_mlg_rep_sample_id", "genetic_coral_species_call");
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
179 log_data_frame("smlg_data_frame", smlg_data_frame);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
180 # Missing GT in samples submitted.
adaf89535d2e Uploaded
greg
parents:
diff changeset
181 start_time <- time_start("Discovering missing GT in samples");
adaf89535d2e Uploaded
greg
parents:
diff changeset
182 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
183 missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
adaf89535d2e Uploaded
greg
parents:
diff changeset
184 missing_gt <- (missing_gt / nrow(vcf)) * 100;
adaf89535d2e Uploaded
greg
parents:
diff changeset
185 missing_gt_data_frame <- data.frame(missing_gt);
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
186 log_data_frame("missing_gt_data_frame", missing_gt_data_frame);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
187 # The specimen_id_field_call_data_table looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
188 # rn missing_gt
adaf89535d2e Uploaded
greg
parents:
diff changeset
189 # a100000-4368120-060520-256_I07.CEL 0.06092608
adaf89535d2e Uploaded
greg
parents:
diff changeset
190 # a100000-4368120-060520-256_K07.CEL 0.05077173
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
191 cat("\nCreating missing_gt_data_table...\n");
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
192 missing_gt_data_table <- setDT(missing_gt_data_frame, keep.rownames=TRUE)[];
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
193 # Rename the rn column.
adaf89535d2e Uploaded
greg
parents:
diff changeset
194 setnames(missing_gt_data_table, c("rn"), c("affy_id"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
195 # Rename the missing_gt column.
adaf89535d2e Uploaded
greg
parents:
diff changeset
196 setnames(missing_gt_data_table, c("missing_gt"), c("percent_missing_data_coral"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
197 # Round data to two digits.
adaf89535d2e Uploaded
greg
parents:
diff changeset
198 missing_gt_data_table$percent_missing_data_coral <- round(missing_gt_data_table$percent_missing_data_coral, digits=2);
adaf89535d2e Uploaded
greg
parents:
diff changeset
199 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
200
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
201 # Subset genotypes for the fixed SNPs by probe id.
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
202 # Select columns from the probe_annotation table.
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
203 probe_annotation_table_columns <- probe_annotation_table %>% select(probe_set_id, custid, fixed_status, acerv_allele);
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
204 # Convert to data frame.
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
205 fixed_snp_data_frame <- as.data.frame(probe_annotation_table_columns);
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
206 # Name the columns.
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
207 colnames(fixed_snp_data_frame) <- c("probe_set_id", "custid", "fixed_status", "acerv_allele");
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
208 # Filter unwanted rows.
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
209 fixed_snp_data_frame <- subset(fixed_snp_data_frame, fixed_snp_data_frame$fixed_status=="keep");
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
210 log_data_frame("fixed_snp_data_frame", fixed_snp_data_frame);
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
211 gt_fixed <- gt[rownames(gt) %in% fixed_snp_data_frame$probe_set_id, ];
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
212 log_data_frame("gt_fixed", gt_fixed);
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
213
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
214 # Missing GT in fixed SNPs.
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
215 missing_gt_fixed <- apply(gt_fixed, MARGIN=2, function(x){ sum(is.na(x))});
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
216 missing_gt_fixed <- (missing_gt_fixed / nrow(gt_fixed)) * 100;
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
217 missing_gt_fixed_data_frame <- data.frame(missing_gt_fixed);
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
218 log_data_frame("missing_gt_fixed_data_frame", missing_gt_fixed_data_frame);
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
219 cat("\nCreating missing_gt_fixed_data_table...\n");
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
220 missing_gt_fixed_data_table <- setDT(missing_gt_fixed_data_frame, keep.rownames=TRUE)[];
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
221 # Rename the rn column.
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
222 setnames(missing_gt_fixed_data_table, c("rn"), c("affy_id"));
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
223 # Rename the missing_gt column.
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
224 setnames(missing_gt_fixed_data_table, c("missing_gt_fixed"), c("percent_missing_data_fixed"));
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
225 # Round data to two digits.
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
226 missing_gt_fixed_data_table$percent_missing_data_fixed <- round(missing_gt_fixed_data_table$percent_missing_data_fixed, digits=2);
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
227
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
228 # Heterozygous alleles for fixed SNPs.
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
229 start_time <- time_start("Discovering heterozygous alleles");
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
230 heterozygous_alleles <- apply(gt_fixed, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))});
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
231 heterozygous_alleles <- (heterozygous_alleles / nrow(gt_fixed)) * 100;
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
232 heterozygous_alleles_data_frame <- data.frame(heterozygous_alleles);
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
233 log_data_frame("heterozygous_alleles_data_frame", heterozygous_alleles_data_frame);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
234 # The heterozygous_alleles_data_table looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
235 # rn heterozygous_alleles
adaf89535d2e Uploaded
greg
parents:
diff changeset
236 # a100000-4368120-060520-256_I07.CEL 73.94903
adaf89535d2e Uploaded
greg
parents:
diff changeset
237 # a100000-4368120-060520-256_K07.CEL 74.40089
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
238 cat("\nCreating heterozygous_alleles_data_table...\n");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
239 heterozygous_alleles_data_table <- setDT(heterozygous_alleles_data_frame, keep.rownames=TRUE)[];
adaf89535d2e Uploaded
greg
parents:
diff changeset
240 # Rename the rn column.
adaf89535d2e Uploaded
greg
parents:
diff changeset
241 setnames(heterozygous_alleles_data_table, c("rn"), c("affy_id"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
242 # Rename the heterozygous_alleles column.
adaf89535d2e Uploaded
greg
parents:
diff changeset
243 setnames(heterozygous_alleles_data_table, c("heterozygous_alleles"), c("percent_heterozygous_coral"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
244 # Round data to two digits.
adaf89535d2e Uploaded
greg
parents:
diff changeset
245 heterozygous_alleles_data_table$percent_heterozygous_coral <- round(heterozygous_alleles_data_table$percent_heterozygous_coral, digits=2);
adaf89535d2e Uploaded
greg
parents:
diff changeset
246 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
247
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
248 # Create list of Acerv reference and alternative probes.
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
249 rAC <- subset(fixed_snp_data_frame, fixed_snp_data_frame$acerv_allele=="reference");
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
250 aAC <- subset(fixed_snp_data_frame, fixed_snp_data_frame$acerv_allele=="alternative");
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
251
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
252 # Subset probes for the reference and alternative SNPs in Acerv.
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
253 ref_ac <- gt_fixed[rownames(gt_fixed) %in% rAC$probe,];
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
254 alt_ac <- gt_fixed[rownames(gt_fixed) %in% aAC$probe,];
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
255
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
256 # Reference alleles for each species.
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
257 reference_alleles_ac <- apply(ref_ac, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))});
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
258 reference_alleles_ap <- apply(alt_ac, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))});
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
259
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
260 # Alternative alleles for each species.
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
261 alternative_alleles_ac <- apply(alt_ac, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))});
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
262 alternative_alleles_ap <- apply(ref_ac, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))});
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
263
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
264 # Apalm alleles.
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
265 start_time <- time_start("Discovering reference alleles");
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
266 ap_sum <- rowSums(cbind(reference_alleles_ap,alternative_alleles_ap));
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
267 ap_alleles <- (ap_sum / nrow(gt_fixed)) * 100;
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
268 ap_alleles_data_frame <- data.frame(ap_alleles);
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
269 log_data_frame("ap_alleles_data_frame", ap_alleles_data_frame);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
270 # The reference_alleles_data_table looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
271 # rn reference_alleles
adaf89535d2e Uploaded
greg
parents:
diff changeset
272 # a100000-4368120-060520-256_I07.CEL 11.60642
adaf89535d2e Uploaded
greg
parents:
diff changeset
273 # a100000-4368120-060520-256_K07.CEL 11.45918
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
274 cat("\nCreating ap_alleles_data_table...\n");
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
275 ap_alleles_data_table <- setDT(ap_alleles_data_frame, keep.rownames=TRUE)[];
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
276 # Rename the rn column.
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
277 setnames(ap_alleles_data_table, c("rn"), c("affy_id"));
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
278 # Rename the reference_alleles column.
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
279 setnames(ap_alleles_data_table, c("ap_alleles"), c("percent_apalm_coral"));
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
280 # Round data to two digits.
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
281 ap_alleles_data_table$percent_apalm_coral <- round(ap_alleles_data_table$percent_apalm_coral, digits=2);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
282 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
283
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
284 # Acerv alleles.
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
285 start_time <- time_start("Discovering alternative alleles");
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
286 ac_sum <- rowSums(cbind(reference_alleles_ac,alternative_alleles_ac));
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
287 ac_alleles <- (ac_sum / nrow(gt_fixed)) * 100;
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
288 ac_alleles_data_frame <- data.frame(ac_alleles);
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
289 log_data_frame("ac_alleles_data_frame", ac_alleles_data_frame);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
290 # The alternative_alleles_data_table looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
291 # rn alternative_alleles
adaf89535d2e Uploaded
greg
parents:
diff changeset
292 # a100000-4368120-060520-256_I07.CEL 14.38363
adaf89535d2e Uploaded
greg
parents:
diff changeset
293 # a100000-4368120-060520-256_K07.CEL 14.08916
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
294 cat("\nCreating ac_alleles_data_table...\n");
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
295 ac_alleles_data_table <- setDT(ac_alleles_data_frame, keep.rownames=TRUE)[];
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
296 # Rename the rn column.
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
297 setnames(ac_alleles_data_table, c("rn"), c("affy_id"));
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
298 # Rename the alternative_alleles column.
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
299 setnames(ac_alleles_data_table, c("ac_alleles"), c("percent_acerv_coral"));
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
300 # Round data to two digits.
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
301 ac_alleles_data_table$percent_acerv_coral <- round(ac_alleles_data_table$percent_acerv_coral, digits=2);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
302 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
303
adaf89535d2e Uploaded
greg
parents:
diff changeset
304 # The mlg_ids_data_table looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
305 # mlg_ids
adaf89535d2e Uploaded
greg
parents:
diff changeset
306 # a550962-4368120-060520-500_M23.CEL
adaf89535d2e Uploaded
greg
parents:
diff changeset
307 # a550962-4368120-060520-256_A19.CEL
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
308 cat("\nCreating mlg_ids_data_table...\n");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
309 mlg_ids_data_table <- data.table(mlg_ids, keep.rownames=TRUE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
310 # Rename the mlg_ids column.
adaf89535d2e Uploaded
greg
parents:
diff changeset
311 setnames(mlg_ids_data_table, c("mlg_ids"), c("affy_id"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
312
adaf89535d2e Uploaded
greg
parents:
diff changeset
313 # sample_mlg_tibble looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
314 # A tibble: 262 x 3
adaf89535d2e Uploaded
greg
parents:
diff changeset
315 # Groups: group [?]
adaf89535d2e Uploaded
greg
parents:
diff changeset
316 # group affy_id coral_mlg_clonal_id coral_mlg_rep_sample_id
adaf89535d2e Uploaded
greg
parents:
diff changeset
317 # <int> <chr> <chr> <chr>
adaf89535d2e Uploaded
greg
parents:
diff changeset
318 # 1 a550962-4368.CEL NA 13905
adaf89535d2e Uploaded
greg
parents:
diff changeset
319 sample_mlg_tibble <- mlg_ids_data_table %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
320 group_by(row_number()) %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
321 dplyr::rename(group="row_number()") %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
322 unnest (affy_id) %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
323 # Join with mlg table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
324 left_join(smlg_data_frame %>%
8
33d759858625 Uploaded
greg
parents: 7
diff changeset
325 select("affy_id","coral_mlg_clonal_id", "coral_mlg_rep_sample_id",
33d759858625 Uploaded
greg
parents: 7
diff changeset
326 "genetic_coral_species_call", "bcoral_genet_id"),
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
327 by="affy_id");
adaf89535d2e Uploaded
greg
parents:
diff changeset
328
adaf89535d2e Uploaded
greg
parents:
diff changeset
329 # If found in database, group members on previous mlg id.
adaf89535d2e Uploaded
greg
parents:
diff changeset
330 uniques <- unique(sample_mlg_tibble[c("group", "coral_mlg_clonal_id")]);
adaf89535d2e Uploaded
greg
parents:
diff changeset
331 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),];
adaf89535d2e Uploaded
greg
parents:
diff changeset
332 na.mlg <- which(is.na(sample_mlg_tibble$coral_mlg_clonal_id));
adaf89535d2e Uploaded
greg
parents:
diff changeset
333 na.group <- sample_mlg_tibble$group[na.mlg];
adaf89535d2e Uploaded
greg
parents:
diff changeset
334 sample_mlg_tibble$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)];
adaf89535d2e Uploaded
greg
parents:
diff changeset
335
adaf89535d2e Uploaded
greg
parents:
diff changeset
336 # Find out if the sample mlg matched a previous genotyped sample.
adaf89535d2e Uploaded
greg
parents:
diff changeset
337 # sample_mlg_match_tibble looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
338 # A tibble: 262 x 4
adaf89535d2e Uploaded
greg
parents:
diff changeset
339 # Groups: group [230]
adaf89535d2e Uploaded
greg
parents:
diff changeset
340 # group affy_id coral_mlg_clonal_id db_match
adaf89535d2e Uploaded
greg
parents:
diff changeset
341 # <int> <chr> <chr> <chr>
adaf89535d2e Uploaded
greg
parents:
diff changeset
342 # 1 a550962-436.CEL NA no_match
adaf89535d2e Uploaded
greg
parents:
diff changeset
343 sample_mlg_match_tibble <- sample_mlg_tibble %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
344 group_by(group) %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
345 mutate(db_match = ifelse(is.na(coral_mlg_clonal_id), "no_match", "match"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
346
adaf89535d2e Uploaded
greg
parents:
diff changeset
347 # Create new mlg id for samples with no matches in the database.
adaf89535d2e Uploaded
greg
parents:
diff changeset
348 none <- unique(sample_mlg_match_tibble[c("group", "coral_mlg_clonal_id")]);
adaf89535d2e Uploaded
greg
parents:
diff changeset
349 none <- none[is.na(none$coral_mlg_clonal_id),];
adaf89535d2e Uploaded
greg
parents:
diff changeset
350 na.mlg2 <- which(is.na(sample_mlg_match_tibble$coral_mlg_clonal_id));
adaf89535d2e Uploaded
greg
parents:
diff changeset
351 n.g <- sample_mlg_match_tibble$group[na.mlg2];
adaf89535d2e Uploaded
greg
parents:
diff changeset
352 ct <- length(unique(n.g));
adaf89535d2e Uploaded
greg
parents:
diff changeset
353
adaf89535d2e Uploaded
greg
parents:
diff changeset
354 # List of new group ids, the sequence starts at the number of
adaf89535d2e Uploaded
greg
parents:
diff changeset
355 # ids present in sample_mlg_match_tibble$coral_mlg_clonal_ids
adaf89535d2e Uploaded
greg
parents:
diff changeset
356 # plus 1.
adaf89535d2e Uploaded
greg
parents:
diff changeset
357 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(sample_mlg_match_tibble["coral_mlg_clonal_id"]))) + 1), by=1, length=ct));
adaf89535d2e Uploaded
greg
parents:
diff changeset
358
adaf89535d2e Uploaded
greg
parents:
diff changeset
359 # Assign the new id iteratively for all that have NA.
adaf89535d2e Uploaded
greg
parents:
diff changeset
360 for (i in 1:length(na.mlg2)) {
adaf89535d2e Uploaded
greg
parents:
diff changeset
361 sample_mlg_match_tibble$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(sample_mlg_match_tibble$group[na.mlg2[i]], unique(n.g))];
adaf89535d2e Uploaded
greg
parents:
diff changeset
362 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
363
adaf89535d2e Uploaded
greg
parents:
diff changeset
364 # Subset population_info_data_table for all samples.
adaf89535d2e Uploaded
greg
parents:
diff changeset
365 # affy_id_user_specimen_id_vector looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
366 # affy_id user_specimen_id
adaf89535d2e Uploaded
greg
parents:
diff changeset
367 # a100000-432.CEL 13704
adaf89535d2e Uploaded
greg
parents:
diff changeset
368 affy_id_user_specimen_id_vector <- population_info_data_table[c(2, 3)];
adaf89535d2e Uploaded
greg
parents:
diff changeset
369
adaf89535d2e Uploaded
greg
parents:
diff changeset
370 # Merge data frames for final table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
371 start_time <- time_start("Merging data frames");
adaf89535d2e Uploaded
greg
parents:
diff changeset
372 stag_db_report <- specimen_id_field_call_data_table %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
373 left_join(affy_id_user_specimen_id_vector %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
374 select("affy_id", "user_specimen_id"),
adaf89535d2e Uploaded
greg
parents:
diff changeset
375 by="user_specimen_id") %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
376 mutate(db_record = ifelse(affy_id %in% smlg_data_frame$affy_id, "genotyped", "new")) %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
377 filter(db_record=="new") %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
378 left_join(sample_mlg_match_tibble %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
379 select("affy_id", "coral_mlg_clonal_id", "db_match"),
adaf89535d2e Uploaded
greg
parents:
diff changeset
380 by="affy_id") %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
381 left_join(missing_gt_data_table %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
382 select("affy_id", "percent_missing_data_coral"),
adaf89535d2e Uploaded
greg
parents:
diff changeset
383 by="affy_id") %>%
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
384 left_join(missing_gt_fixed_data_table %>%
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
385 select("affy_id", "percent_missing_data_fixed"),
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
386 by="affy_id") %>%
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
387 left_join(heterozygous_alleles_data_table %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
388 select("affy_id", "percent_heterozygous_coral"),
adaf89535d2e Uploaded
greg
parents:
diff changeset
389 by="affy_id") %>%
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
390 left_join(ac_alleles_data_table %>%
1
a690e0382ce4 Uploaded
greg
parents: 0
diff changeset
391 select("affy_id", "percent_acerv_coral"),
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
392 by="affy_id") %>%
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
393 left_join(ap_alleles_data_table %>%
1
a690e0382ce4 Uploaded
greg
parents: 0
diff changeset
394 select("affy_id", "percent_apalm_coral"),
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
395 by="affy_id") %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
396 mutate(db_match = ifelse(is.na(db_match), "failed", db_match))%>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
397 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>%
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
398 mutate(genetic_coral_species_call = ifelse(percent_apalm_coral >= 85 & percent_acerv_coral <= 10, "A.palmata", "other")) %>%
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
399 mutate(genetic_coral_species_call = ifelse(percent_acerv_coral >= 85 & percent_apalm_coral <= 10, "A.cervicornis", genetic_coral_species_call)) %>%
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
400 mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
401 ungroup() %>%
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
402 select(-group, -db_record);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
403 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
404
adaf89535d2e Uploaded
greg
parents:
diff changeset
405 start_time <- time_start("Writing csv output");
adaf89535d2e Uploaded
greg
parents:
diff changeset
406 write.csv(stag_db_report, file=opt$output_stag_db_report, quote=FALSE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
407 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
408
adaf89535d2e Uploaded
greg
parents:
diff changeset
409 # Representative clone for genotype table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
410 start_time <- time_start("Creating representative clone for genotype table");
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
411 no_dup_genotypes_genind <- clonecorrect(genind_clone, strata=~pop.genind_obj.);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
412 id_rep <- mlg.id(no_dup_genotypes_genind);
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
413 cat("\nCreating id_data_table...\n");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
414 id_data_table <- data.table(id_rep, keep.rownames=TRUE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
415 # Rename the id_rep column.
adaf89535d2e Uploaded
greg
parents:
diff changeset
416 setnames(id_data_table, c("id_rep"), c("affy_id"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
417 time_elapsed(start_time);
7
bcb28b49b0cc Uploaded
greg
parents: 6
diff changeset
418 # Remove clonecorrect genind from memory.
bcb28b49b0cc Uploaded
greg
parents: 6
diff changeset
419 rm(no_dup_genotypes_genind);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
420
adaf89535d2e Uploaded
greg
parents:
diff changeset
421 # Table of alleles for the new samples subset to new plate data.
adaf89535d2e Uploaded
greg
parents:
diff changeset
422 # Create vector indicating number of individuals desired from
adaf89535d2e Uploaded
greg
parents:
diff changeset
423 # affy_id column of stag_db_report data table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
424 i <- ifelse(is.na(stag_db_report[3]), "", stag_db_report[[3]]);
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
425 i <- i[!apply(i== "", 1, all), ];
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
426
adaf89535d2e Uploaded
greg
parents:
diff changeset
427 # Subset VCF to the user samples.
adaf89535d2e Uploaded
greg
parents:
diff changeset
428 start_time <- time_start("Subsetting vcf to the user samples");
6
0488ca4a8c4e Uploaded
greg
parents: 4
diff changeset
429 affy_list <- append(stag_db_report$affy_id,"FORMAT");
0488ca4a8c4e Uploaded
greg
parents: 4
diff changeset
430 svcf <- vcf[,colnames(vcf@gt) %in% affy_list];
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
431 write.vcf(svcf, "subset.vcf.gz");
7
bcb28b49b0cc Uploaded
greg
parents: 6
diff changeset
432
bcb28b49b0cc Uploaded
greg
parents: 6
diff changeset
433 # Remove original and subset VCFs written to file from R memory.
bcb28b49b0cc Uploaded
greg
parents: 6
diff changeset
434 rm(svcf);
bcb28b49b0cc Uploaded
greg
parents: 6
diff changeset
435 rm(vcf);
bcb28b49b0cc Uploaded
greg
parents: 6
diff changeset
436
bcb28b49b0cc Uploaded
greg
parents: 6
diff changeset
437 # Load in subset VCF.
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
438 vcf.fn <- "subset.vcf.gz";
adaf89535d2e Uploaded
greg
parents:
diff changeset
439 snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only");
adaf89535d2e Uploaded
greg
parents:
diff changeset
440 genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
441 gds_array <- read.gdsn(index.gdsn(genofile, "sample.id"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
442 # gds_array looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
443 # [1] "a550962-4368120-060520-500_A03.CEL" "a550962-4368120-060520-500_A05.CEL"
adaf89535d2e Uploaded
greg
parents:
diff changeset
444 gds_data_frame <- data.frame(gds_array);
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
445 log_data_frame("gds_data_frame", gds_data_frame);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
446 # gds_data_frame looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
447 # gds_array
adaf89535d2e Uploaded
greg
parents:
diff changeset
448 # a550962-4368120-060520-500_A03.CEL
adaf89535d2e Uploaded
greg
parents:
diff changeset
449 # a550962-4368120-060520-500_A05.CEL
4
34e02e8b4232 Uploaded
greg
parents: 3
diff changeset
450 cat("\nCreating gds_data_table...\n");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
451 gds_data_table <- setDT(gds_data_frame, keep.rownames=FALSE)[];
adaf89535d2e Uploaded
greg
parents:
diff changeset
452 # Rename the gds_array column.
adaf89535d2e Uploaded
greg
parents:
diff changeset
453 setnames(gds_data_table, c("gds_array"), c("affy_id"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
454 # affy_id_region_list looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
455 # affy_id region
adaf89535d2e Uploaded
greg
parents:
diff changeset
456 # a100000-4368120-060520-256_I07.CEL USVI
adaf89535d2e Uploaded
greg
parents:
diff changeset
457 # a100000-4368120-060520-256_K07.CEL USVI
adaf89535d2e Uploaded
greg
parents:
diff changeset
458 affy_id_region_list <- population_info_data_table[c(2,3,4)];
adaf89535d2e Uploaded
greg
parents:
diff changeset
459 gds_data_table_join <- gds_data_table %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
460 left_join(affy_id_region_list %>%
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
461 select("affy_id", "user_specimen_id", "region"),
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
462 by='affy_id')%>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
463 drop_na();
adaf89535d2e Uploaded
greg
parents:
diff changeset
464 samp.annot <- data.frame(pop.group=c(gds_data_table_join$region));
adaf89535d2e Uploaded
greg
parents:
diff changeset
465 add.gdsn(genofile, "sample.annot", samp.annot);
adaf89535d2e Uploaded
greg
parents:
diff changeset
466 # population_code looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
467 # [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733
adaf89535d2e Uploaded
greg
parents:
diff changeset
468 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009
adaf89535d2e Uploaded
greg
parents:
diff changeset
469 population_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
470 pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group")));
adaf89535d2e Uploaded
greg
parents:
diff changeset
471 # pop.group looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
472 # [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733
adaf89535d2e Uploaded
greg
parents:
diff changeset
473 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009
adaf89535d2e Uploaded
greg
parents:
diff changeset
474 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
475
adaf89535d2e Uploaded
greg
parents:
diff changeset
476 # Distance matrix calculation and sample labels change to user specimen ids.
adaf89535d2e Uploaded
greg
parents:
diff changeset
477 start_time <- time_start("Calculating distance matrix");
adaf89535d2e Uploaded
greg
parents:
diff changeset
478 ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
479 ibs$sample.id <-gds_data_table_join$user_specimen_id;
adaf89535d2e Uploaded
greg
parents:
diff changeset
480 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
481
adaf89535d2e Uploaded
greg
parents:
diff changeset
482 # Cluster analysis on the genome-wide IBS pairwise distance matrix.
adaf89535d2e Uploaded
greg
parents:
diff changeset
483 start_time <- time_start("Clustering the genome-wide IBS pairwise distance matrix");
adaf89535d2e Uploaded
greg
parents:
diff changeset
484 set.seed(100);
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
485 par(cex=0.6, cex.lab=1, cex.axis=1.5, cex.main=2);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
486 ibs.hc <- snpgdsHCluster(ibs);
adaf89535d2e Uploaded
greg
parents:
diff changeset
487 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
488
adaf89535d2e Uploaded
greg
parents:
diff changeset
489 # cols looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
490 # blue1 red green pink orange blue2
adaf89535d2e Uploaded
greg
parents:
diff changeset
491 # "#0C5BB0FF" "#EE0011FF" "#15983DFF" "#EC579AFF" "#FA6B09FF" "#149BEDFF"
adaf89535d2e Uploaded
greg
parents:
diff changeset
492 # green2 yellow turquoise poop
adaf89535d2e Uploaded
greg
parents:
diff changeset
493 # "#A1C720FF" "#FEC10BFF" "#16A08CFF" "#9A703EFF"
adaf89535d2e Uploaded
greg
parents:
diff changeset
494 cols <- piratepal("basel");
adaf89535d2e Uploaded
greg
parents:
diff changeset
495 set.seed(999);
adaf89535d2e Uploaded
greg
parents:
diff changeset
496
adaf89535d2e Uploaded
greg
parents:
diff changeset
497 # Generate plots.
adaf89535d2e Uploaded
greg
parents:
diff changeset
498 # Default clustering.
adaf89535d2e Uploaded
greg
parents:
diff changeset
499 start_time <- time_start("Creating ibs_default.pdf");
adaf89535d2e Uploaded
greg
parents:
diff changeset
500 # Start PDF device driver.
adaf89535d2e Uploaded
greg
parents:
diff changeset
501 dev.new(width=40, height=20);
adaf89535d2e Uploaded
greg
parents:
diff changeset
502 file_path = get_file_path(output_plots_dir, "ibs_default.pdf");
adaf89535d2e Uploaded
greg
parents:
diff changeset
503 pdf(file=file_path, width=40, height=20);
adaf89535d2e Uploaded
greg
parents:
diff changeset
504 rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15);
adaf89535d2e Uploaded
greg
parents:
diff changeset
505 snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular", yaxis.kinship=FALSE);
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
506 abline(h=0.032, lty=2);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
507 legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2);
adaf89535d2e Uploaded
greg
parents:
diff changeset
508 dev.off()
adaf89535d2e Uploaded
greg
parents:
diff changeset
509 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
510
adaf89535d2e Uploaded
greg
parents:
diff changeset
511 # Color cluster by region.
adaf89535d2e Uploaded
greg
parents:
diff changeset
512 start_time <- time_start("Creating ibs_region.pdf");
adaf89535d2e Uploaded
greg
parents:
diff changeset
513 # Start PDF device driver.
adaf89535d2e Uploaded
greg
parents:
diff changeset
514 dev.new(width=40, height=20);
adaf89535d2e Uploaded
greg
parents:
diff changeset
515 file_path = get_file_path(output_plots_dir, "ibs_region.pdf");
adaf89535d2e Uploaded
greg
parents:
diff changeset
516 pdf(file=file_path, width=40, height=20);
adaf89535d2e Uploaded
greg
parents:
diff changeset
517 race <- as.factor(population_code);
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
518 rv2 <- snpgdsCutTree(ibs.hc, samp.group=race, col.list=cols, pch.list=15);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
519 snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular", yaxis.kinship=FALSE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
520 legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2);
adaf89535d2e Uploaded
greg
parents:
diff changeset
521 dev.off()
adaf89535d2e Uploaded
greg
parents:
diff changeset
522 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
523
adaf89535d2e Uploaded
greg
parents:
diff changeset
524 # Missing data barplot.
adaf89535d2e Uploaded
greg
parents:
diff changeset
525 start_time <- time_start("Creating missing_data.pdf");
adaf89535d2e Uploaded
greg
parents:
diff changeset
526 population_info_data_table$miss <- stag_db_report$percent_missing_data_coral[match(missing_gt_data_frame$affy_id, stag_db_report$affy_id)];
adaf89535d2e Uploaded
greg
parents:
diff changeset
527 test2 <- which(!is.na(population_info_data_table$miss));
adaf89535d2e Uploaded
greg
parents:
diff changeset
528 miss96 <- population_info_data_table$miss[test2];
adaf89535d2e Uploaded
greg
parents:
diff changeset
529 name96 <- population_info_data_table$user_specimen_id[test2];
adaf89535d2e Uploaded
greg
parents:
diff changeset
530 # Start PDF device driver.
adaf89535d2e Uploaded
greg
parents:
diff changeset
531 dev.new(width=20, height=10);
adaf89535d2e Uploaded
greg
parents:
diff changeset
532 file_path = get_file_path(output_plots_dir, "missing_data.pdf");
adaf89535d2e Uploaded
greg
parents:
diff changeset
533 pdf(file=file_path, width=20, height=10);
adaf89535d2e Uploaded
greg
parents:
diff changeset
534 par(mar = c(8, 4, 4, 2));
adaf89535d2e Uploaded
greg
parents:
diff changeset
535 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n");
adaf89535d2e Uploaded
greg
parents:
diff changeset
536 text(cex=0.8, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1);
adaf89535d2e Uploaded
greg
parents:
diff changeset
537 dev.off()
adaf89535d2e Uploaded
greg
parents:
diff changeset
538 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
539
adaf89535d2e Uploaded
greg
parents:
diff changeset
540 # Sample MLG on a map.
adaf89535d2e Uploaded
greg
parents:
diff changeset
541 start_time <- time_start("Creating mlg_map.pdf");
adaf89535d2e Uploaded
greg
parents:
diff changeset
542 # Get the lattitude and longitude boundaries for rendering
adaf89535d2e Uploaded
greg
parents:
diff changeset
543 # the map. Tese boundaries will restrict the map to focus
adaf89535d2e Uploaded
greg
parents:
diff changeset
544 # (i.e., zoom) on the region of the world map from which
adaf89535d2e Uploaded
greg
parents:
diff changeset
545 # the samples were taken.
adaf89535d2e Uploaded
greg
parents:
diff changeset
546 max_latitude <- max(affy_metadata_data_frame$latitude, na.rm=TRUE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
547 min_latitude <- min(affy_metadata_data_frame$latitude, na.rm=TRUE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
548 latitude_range_vector <- c(min_latitude-3, max_latitude+3);
adaf89535d2e Uploaded
greg
parents:
diff changeset
549 max_longitude <- max(affy_metadata_data_frame$longitude, na.rm=TRUE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
550 min_longitude <- min(affy_metadata_data_frame$longitude, na.rm=TRUE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
551 longitude_range_vector <- c(min_longitude-3, max_longitude+3);
adaf89535d2e Uploaded
greg
parents:
diff changeset
552 # Get the palette colors for rendering plots.
adaf89535d2e Uploaded
greg
parents:
diff changeset
553 colors <- length(unique(stag_db_report$coral_mlg_clonal_id));
adaf89535d2e Uploaded
greg
parents:
diff changeset
554 # Get a color palette.
adaf89535d2e Uploaded
greg
parents:
diff changeset
555 palette <- colorRampPalette(piratepal("basel"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
556 # Start PDF device driver.
adaf89535d2e Uploaded
greg
parents:
diff changeset
557 dev.new(width=20, height=20);
adaf89535d2e Uploaded
greg
parents:
diff changeset
558 file_path = get_file_path(output_plots_dir, "mlg_map.pdf");
adaf89535d2e Uploaded
greg
parents:
diff changeset
559 pdf(file=file_path, width=20, height=20);
adaf89535d2e Uploaded
greg
parents:
diff changeset
560 world_data = map_data("world");
adaf89535d2e Uploaded
greg
parents:
diff changeset
561 # Add the coral_mlg_clonal_id column from the stag_db_report
adaf89535d2e Uploaded
greg
parents:
diff changeset
562 # data fram to the affy_metadata_data_frame.
adaf89535d2e Uploaded
greg
parents:
diff changeset
563 affy_metadata_data_frame$mlg <- stag_db_report$coral_mlg_clonal_id;
adaf89535d2e Uploaded
greg
parents:
diff changeset
564 # Get the number of colors needed from the palette for plotting
adaf89535d2e Uploaded
greg
parents:
diff changeset
565 # the sample locations on the world map.
adaf89535d2e Uploaded
greg
parents:
diff changeset
566 num_colors = length(unique(affy_metadata_data_frame$mlg));
adaf89535d2e Uploaded
greg
parents:
diff changeset
567 # Get a color palette.
adaf89535d2e Uploaded
greg
parents:
diff changeset
568 palette = colorRampPalette(piratepal("basel"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
569 ggplot() +
adaf89535d2e Uploaded
greg
parents:
diff changeset
570 geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), fill="white", colour="#7f7f7f") +
adaf89535d2e Uploaded
greg
parents:
diff changeset
571 coord_quickmap(xlim=longitude_range_vector, ylim=latitude_range_vector) +
adaf89535d2e Uploaded
greg
parents:
diff changeset
572 geom_point(data=affy_metadata_data_frame, aes(x=longitude, y=latitude, group=mlg, colour=mlg), alpha=.7, size=3) +
adaf89535d2e Uploaded
greg
parents:
diff changeset
573 scale_color_manual(values=palette(num_colors)) +
adaf89535d2e Uploaded
greg
parents:
diff changeset
574 theme(legend.position="bottom") +
adaf89535d2e Uploaded
greg
parents:
diff changeset
575 guides(color=guide_legend(nrow=8, byrow=F));
adaf89535d2e Uploaded
greg
parents:
diff changeset
576
adaf89535d2e Uploaded
greg
parents:
diff changeset
577 # Sample MLG on a map for each region.
adaf89535d2e Uploaded
greg
parents:
diff changeset
578 for (i in unique(affy_metadata_data_frame$region)) {
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
579 m <- i;
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
580 num_colors_2 = length(unique(affy_metadata_data_frame$mlg[which(affy_metadata_data_frame$region == m)]));
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
581 max_latitude_region <- max(affy_metadata_data_frame$latitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE);
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
582 min_latitude_region <- min(affy_metadata_data_frame$latitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE);
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
583 latitude_range_vector_region <- c(min_latitude_region-0.5, max_latitude_region+0.5);
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
584 max_longitude_region <- max(affy_metadata_data_frame$longitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE);
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
585 min_longitude_region <- min(affy_metadata_data_frame$longitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE);
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
586 longitude_range_vector_region <- c(min_longitude_region-0.5, max_longitude_region+0.5);
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
587 print(ggplot() +
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
588 geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region),
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
589 fill="grey", colour="#7f7f7f") +
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
590 coord_quickmap(xlim=longitude_range_vector_region, ylim=latitude_range_vector_region, clip = "on") +
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
591 geom_point(data=affy_metadata_data_frame[which(affy_metadata_data_frame$region == m),],
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
592 aes(x=longitude, y=latitude, group=mlg, colour=mlg), alpha=.5, size=3) +
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
593 scale_color_manual(values=palette(num_colors_2)) +
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
594 theme(legend.position="bottom") + labs(title=paste("MLG assignments for", m)) +
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
595 guides(color=guide_legend(nrow=8, byrow=F)));
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
596 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
597 dev.off()
adaf89535d2e Uploaded
greg
parents:
diff changeset
598 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
599
adaf89535d2e Uploaded
greg
parents:
diff changeset
600 if (!is.null(opt$output_nj_phylogeny_tree)) {
adaf89535d2e Uploaded
greg
parents:
diff changeset
601 # Create a phylogeny tree of samples based on distance matrices.
adaf89535d2e Uploaded
greg
parents:
diff changeset
602 # Start PDF device driver.
adaf89535d2e Uploaded
greg
parents:
diff changeset
603 start_time <- time_start("Creating nj_phylogeny_tree.pdf");
adaf89535d2e Uploaded
greg
parents:
diff changeset
604 # Table of alleles for the new samples subset to new plate data.
adaf89535d2e Uploaded
greg
parents:
diff changeset
605 # Create vector indicating number of individuals desired from
adaf89535d2e Uploaded
greg
parents:
diff changeset
606 # affy_id column of stag_db_report data table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
607 i <- ifelse(is.na(stag_db_report[1]), "", stag_db_report[[1]]);
adaf89535d2e Uploaded
greg
parents:
diff changeset
608 i <- i[!apply(i== "", 1, all),];
adaf89535d2e Uploaded
greg
parents:
diff changeset
609 sample_alleles_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE];
adaf89535d2e Uploaded
greg
parents:
diff changeset
610 dev.new(width=40, height=80);
adaf89535d2e Uploaded
greg
parents:
diff changeset
611 file_path = get_file_path(output_plots_dir, "nj_phylogeny_tree.pdf");
adaf89535d2e Uploaded
greg
parents:
diff changeset
612 pdf(file=file_path, width=40, height=80);
adaf89535d2e Uploaded
greg
parents:
diff changeset
613 # Organize branches by clade.
adaf89535d2e Uploaded
greg
parents:
diff changeset
614 nj_phylogeny_tree <- sample_alleles_vector %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
615 aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE, showtree = FALSE) %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
616 ladderize();
adaf89535d2e Uploaded
greg
parents:
diff changeset
617 nj_phylogeny_tree$tip.label <- stag_db_report$user_specimen_id[match(nj_phylogeny_tree$tip.label, stag_db_report$affy_id)];
adaf89535d2e Uploaded
greg
parents:
diff changeset
618 plot.phylo(nj_phylogeny_tree, tip.color=cols[sample_alleles_vector$pop], label.offset=0.0025, cex=0.6, font=2, lwd=4, align.tip.label=F, no.margin=T);
adaf89535d2e Uploaded
greg
parents:
diff changeset
619 # Add a scale bar showing 5% difference.
adaf89535d2e Uploaded
greg
parents:
diff changeset
620 add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=2);
adaf89535d2e Uploaded
greg
parents:
diff changeset
621 nodelabels(nj_phylogeny_tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
622 legend("topright", legend=c(levels(sample_alleles_vector$pop)), text.col=cols, xpd=T, cex=0.8);
adaf89535d2e Uploaded
greg
parents:
diff changeset
623 dev.off()
adaf89535d2e Uploaded
greg
parents:
diff changeset
624 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
625 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
626
adaf89535d2e Uploaded
greg
parents:
diff changeset
627 # Generate a pie chart for each sample with a genotype.
adaf89535d2e Uploaded
greg
parents:
diff changeset
628 # Store the numerical and user_specimen_id values from
adaf89535d2e Uploaded
greg
parents:
diff changeset
629 # stag_db_report for the charts (user_specimen_id names
adaf89535d2e Uploaded
greg
parents:
diff changeset
630 # will be used to label each chart).
adaf89535d2e Uploaded
greg
parents:
diff changeset
631 start_time <- time_start("Creating percent_breakdown.pdf");
3
43d00c21b135 Uploaded
greg
parents: 2
diff changeset
632 stag_db_report_data_table <- stag_db_report[c(-2, -3, -4, -5)];
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
633 # Remove NA and NaN values.
adaf89535d2e Uploaded
greg
parents:
diff changeset
634 stag_db_report_data_table <- na.omit(stag_db_report_data_table);
adaf89535d2e Uploaded
greg
parents:
diff changeset
635 # Translate to N (i.e., number of samples with a genotype)
adaf89535d2e Uploaded
greg
parents:
diff changeset
636 # columns and 5 rows.
adaf89535d2e Uploaded
greg
parents:
diff changeset
637 translated_stag_db_report_data_table <- t(stag_db_report_data_table);
adaf89535d2e Uploaded
greg
parents:
diff changeset
638 translated_stag_db_report_matrix <- as.matrix(translated_stag_db_report_data_table[-1,]);
adaf89535d2e Uploaded
greg
parents:
diff changeset
639 # Set the storage mode of the matrix to numeric. In some
adaf89535d2e Uploaded
greg
parents:
diff changeset
640 # cases this could result in the following:
adaf89535d2e Uploaded
greg
parents:
diff changeset
641 # Warning message:
adaf89535d2e Uploaded
greg
parents:
diff changeset
642 # In mde(x) : NAs introduced by coercion
adaf89535d2e Uploaded
greg
parents:
diff changeset
643 mode(translated_stag_db_report_matrix) <- "numeric";
adaf89535d2e Uploaded
greg
parents:
diff changeset
644 # Remove NA and NaN values that may have been introduced
adaf89535d2e Uploaded
greg
parents:
diff changeset
645 # by coercion.
adaf89535d2e Uploaded
greg
parents:
diff changeset
646 translated_stag_db_report_matrix <- na.omit(translated_stag_db_report_matrix);
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
647 translated_stag_db_report_matrix<-translated_stag_db_report_matrix[-c(1),];
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
648 #tsdbrm_row_means <- rowMeans(translated_stag_db_report_matrix, na.rm=TRUE);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
649 dev.new(width=10, height=7);
adaf89535d2e Uploaded
greg
parents:
diff changeset
650 file_path = get_file_path(output_plots_dir, "percent_breakdown.pdf");
adaf89535d2e Uploaded
greg
parents:
diff changeset
651 pdf(file=file_path, width=10, height=7);
adaf89535d2e Uploaded
greg
parents:
diff changeset
652 par(mfrow=c(3, 2));
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
653 col <- c("#A6A6A6","#FFA626","#EB0ACF", "#80FF00" );
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
654 # Generate a pie chart for each sample with genotypes.
adaf89535d2e Uploaded
greg
parents:
diff changeset
655 for (i in 1:ncol(translated_stag_db_report_matrix)) {
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
656 tmp_labels <- paste(c("no call", "heterozygous", "A. cervicornis", "A. palmata"), " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep="");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
657 main <- paste("Breakdown of SNP assignments for", translated_stag_db_report_data_table[1, i]);
adaf89535d2e Uploaded
greg
parents:
diff changeset
658 pie(translated_stag_db_report_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75);
adaf89535d2e Uploaded
greg
parents:
diff changeset
659 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
660 dev.off()
adaf89535d2e Uploaded
greg
parents:
diff changeset
661 time_elapsed(start_time);
adaf89535d2e Uploaded
greg
parents:
diff changeset
662
adaf89535d2e Uploaded
greg
parents:
diff changeset
663 # close GDS file.
adaf89535d2e Uploaded
greg
parents:
diff changeset
664 snpgdsClose(genofile);
adaf89535d2e Uploaded
greg
parents:
diff changeset
665
adaf89535d2e Uploaded
greg
parents:
diff changeset
666 # Prepare to output data frames for input to a downstream
adaf89535d2e Uploaded
greg
parents:
diff changeset
667 # tool that will use them to update the stag database.
adaf89535d2e Uploaded
greg
parents:
diff changeset
668 start_time <- time_start("Building data frames for insertion into database tables");
adaf89535d2e Uploaded
greg
parents:
diff changeset
669 # sample_prep_data_frame looks like this (split across comment lines):
adaf89535d2e Uploaded
greg
parents:
diff changeset
670 # user_specimen_id field_call bcoral_genet_id bsym_genet_id reef
adaf89535d2e Uploaded
greg
parents:
diff changeset
671 # test_002 prolifera NA NA JohnsonsReef
adaf89535d2e Uploaded
greg
parents:
diff changeset
672 # region latitude longitude geographic_origin colony_location
adaf89535d2e Uploaded
greg
parents:
diff changeset
673 # Bahamas 18.36173 -64.77430 Reef NA
adaf89535d2e Uploaded
greg
parents:
diff changeset
674 # depth disease_resist bleach_resist
adaf89535d2e Uploaded
greg
parents:
diff changeset
675 # 5 NA N
adaf89535d2e Uploaded
greg
parents:
diff changeset
676 # mortality tle spawning collector_last_name collector_first_name organization
adaf89535d2e Uploaded
greg
parents:
diff changeset
677 # NA NA False Kitchen Sheila Penn State
adaf89535d2e Uploaded
greg
parents:
diff changeset
678 # collection_date email seq_facility array_version public
adaf89535d2e Uploaded
greg
parents:
diff changeset
679 # 2018-11-08 k89@psu.edu Affymetrix 1 True
adaf89535d2e Uploaded
greg
parents:
diff changeset
680 # public_after_date sperm_motility healing_time dna_extraction_method
adaf89535d2e Uploaded
greg
parents:
diff changeset
681 # NA -9 -9 NA
adaf89535d2e Uploaded
greg
parents:
diff changeset
682 # dna_concentration registry_id result_folder_name plate_barcode mlg
adaf89535d2e Uploaded
greg
parents:
diff changeset
683 # NA NA PRO100175_PSU175_SAX_b02 P9SR10074 HG0227
adaf89535d2e Uploaded
greg
parents:
diff changeset
684 # affy_id percent_missing_data_coral percent_heterozygous_coral
adaf89535d2e Uploaded
greg
parents:
diff changeset
685 # a550962-436.CEL 1.06 19.10
1
a690e0382ce4 Uploaded
greg
parents: 0
diff changeset
686 # percent_acerv_coral percent_apalm_coral
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
687 # 40.10459 39.73396
adaf89535d2e Uploaded
greg
parents:
diff changeset
688 sample_prep_data_frame <- affy_metadata_data_frame %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
689 left_join(stag_db_report %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
690 select("user_specimen_id", "affy_id", "percent_missing_data_coral", "percent_heterozygous_coral",
1
a690e0382ce4 Uploaded
greg
parents: 0
diff changeset
691 "percent_acerv_coral", "percent_apalm_coral"),
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
692 by='user_specimen_id');
adaf89535d2e Uploaded
greg
parents:
diff changeset
693 # Get the number of rows for all data frames.
adaf89535d2e Uploaded
greg
parents:
diff changeset
694 num_rows <- nrow(sample_prep_data_frame);
adaf89535d2e Uploaded
greg
parents:
diff changeset
695 # Set the column names so that we can extract only those columns
adaf89535d2e Uploaded
greg
parents:
diff changeset
696 # needed for the sample table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
697 colnames(sample_prep_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef",
adaf89535d2e Uploaded
greg
parents:
diff changeset
698 "region", "latitude", "longitude", "geographic_origin", "colony_location",
adaf89535d2e Uploaded
greg
parents:
diff changeset
699 "depth", "disease_resist", "bleach_resist", "mortality", "tle",
adaf89535d2e Uploaded
greg
parents:
diff changeset
700 "spawning", "collector_last_name", "collector_first_name", "organization",
adaf89535d2e Uploaded
greg
parents:
diff changeset
701 "collection_date", "email", "seq_facility", "array_version", "public",
adaf89535d2e Uploaded
greg
parents:
diff changeset
702 "public_after_date", "sperm_motility", "healing_time", "dna_extraction_method",
adaf89535d2e Uploaded
greg
parents:
diff changeset
703 "dna_concentration", "registry_id", "result_folder_name", "plate_barcode",
adaf89535d2e Uploaded
greg
parents:
diff changeset
704 "mlg", "affy_id", "percent_missing_data_coral", "percent_heterozygous_coral",
1
a690e0382ce4 Uploaded
greg
parents: 0
diff changeset
705 "percent_acerv_coral", "percent_apalm_coral");
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
706
adaf89535d2e Uploaded
greg
parents:
diff changeset
707 # Output the data frame for updating the alleles table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
708 # Subset to only the new plate data.
adaf89535d2e Uploaded
greg
parents:
diff changeset
709 i <- ifelse(is.na(stag_db_report[3]), "", stag_db_report[[3]]);
adaf89535d2e Uploaded
greg
parents:
diff changeset
710 # Create a vector indicating the number of individuals desired
adaf89535d2e Uploaded
greg
parents:
diff changeset
711 # from the affy_id collumn in the report_user data table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
712 i <- i[!apply(i=="", 1, all),];
adaf89535d2e Uploaded
greg
parents:
diff changeset
713 # Subset the genclone object to the user data.
adaf89535d2e Uploaded
greg
parents:
diff changeset
714 allele_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE];
adaf89535d2e Uploaded
greg
parents:
diff changeset
715 # Convert the subset genclone to a data frame.
adaf89535d2e Uploaded
greg
parents:
diff changeset
716 allele_data_frame <- genind2df(allele_vector, sep="");
adaf89535d2e Uploaded
greg
parents:
diff changeset
717 allele_data_frame <- allele_data_frame %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
718 select(-pop);
adaf89535d2e Uploaded
greg
parents:
diff changeset
719 # Allele string for Allele.table in database.
adaf89535d2e Uploaded
greg
parents:
diff changeset
720 allele_table_data_frame <- unite(allele_data_frame, alleles, 1:19696, sep=" ", remove=TRUE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
721 allele_table_data_frame <- setDT(allele_table_data_frame, keep.rownames=TRUE)[];
adaf89535d2e Uploaded
greg
parents:
diff changeset
722 setnames(allele_table_data_frame, c("rn"), c("affy_id"));
adaf89535d2e Uploaded
greg
parents:
diff changeset
723 # write.csv(concat_sample_alleles,file=paste("Seed_genotype_alleles.csv",sep = ""),quote=FALSE,row.names=FALSE);
adaf89535d2e Uploaded
greg
parents:
diff changeset
724 write_data_frame(output_data_dir, "allele.tabular", allele_table_data_frame);
adaf89535d2e Uploaded
greg
parents:
diff changeset
725
adaf89535d2e Uploaded
greg
parents:
diff changeset
726 # Output the data frame for updating the experiment table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
727 experiment_table_data_frame <- data.frame(matrix(ncol=4, nrow=num_rows));
adaf89535d2e Uploaded
greg
parents:
diff changeset
728 colnames(experiment_table_data_frame) <- c("seq_facility", "array_version", "result_folder_name", "plate_barcode");
adaf89535d2e Uploaded
greg
parents:
diff changeset
729 for (i in 1:num_rows) {
adaf89535d2e Uploaded
greg
parents:
diff changeset
730 experiment_table_data_frame$seq_facility[i] <- sample_prep_data_frame$seq_facility[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
731 experiment_table_data_frame$array_version[i] <- sample_prep_data_frame$array_version[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
732 experiment_table_data_frame$result_folder_name[i] <- sample_prep_data_frame$result_folder_name[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
733 experiment_table_data_frame$plate_barcode[i] <- sample_prep_data_frame$plate_barcode[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
734 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
735 write_data_frame(output_data_dir, "experiment.tabular", experiment_table_data_frame);
adaf89535d2e Uploaded
greg
parents:
diff changeset
736
adaf89535d2e Uploaded
greg
parents:
diff changeset
737 # Output the data frame for updating the colony table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
738 # The geographic_origin value is used for deciding into which table
adaf89535d2e Uploaded
greg
parents:
diff changeset
739 # to insert the latitude and longitude values. If the geographic_origin
adaf89535d2e Uploaded
greg
parents:
diff changeset
740 # is "reef", the values will be inserted into the reef table, and if it is
adaf89535d2e Uploaded
greg
parents:
diff changeset
741 # "colony", the values will be inserted into the colony table. We insert
adaf89535d2e Uploaded
greg
parents:
diff changeset
742 # these values in both data frames so that the downstream tool that parses
adaf89535d2e Uploaded
greg
parents:
diff changeset
743 # them can determine the appropriate table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
744 colony_table_data_frame <- data.frame(matrix(ncol=4, nrow=num_rows));
adaf89535d2e Uploaded
greg
parents:
diff changeset
745 colnames(colony_table_data_frame) <- c("latitude", "longitude", "depth", "geographic_origin");
adaf89535d2e Uploaded
greg
parents:
diff changeset
746 for (i in 1:num_rows) {
adaf89535d2e Uploaded
greg
parents:
diff changeset
747 colony_table_data_frame$latitude[i] <- sample_prep_data_frame$latitude[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
748 colony_table_data_frame$longitude[i] <- sample_prep_data_frame$longitude[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
749 colony_table_data_frame$depth[i] <- sample_prep_data_frame$depth[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
750 colony_table_data_frame$geographic_origin[i] <- sample_prep_data_frame$geographic_origin[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
751 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
752 write_data_frame(output_data_dir, "colony.tabular", colony_table_data_frame);
adaf89535d2e Uploaded
greg
parents:
diff changeset
753
adaf89535d2e Uploaded
greg
parents:
diff changeset
754 # Output the data frame for populating the genotype table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
755 # Combine with previously genotyped samples.
adaf89535d2e Uploaded
greg
parents:
diff changeset
756 # prep_genotype_tibble looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
757 # A tibble: 220 x 7
adaf89535d2e Uploaded
greg
parents:
diff changeset
758 # Groups: group [?]
adaf89535d2e Uploaded
greg
parents:
diff changeset
759 # group affy_id coral_mlg_clona… user_specimen_id db_match
adaf89535d2e Uploaded
greg
parents:
diff changeset
760 # <int> <chr> <chr> <chr> <chr>
adaf89535d2e Uploaded
greg
parents:
diff changeset
761 # 1 a10000… 13905 HG0048 match
adaf89535d2e Uploaded
greg
parents:
diff changeset
762 # genetic_coral_species_call coral_mlg_rep_sample_id
adaf89535d2e Uploaded
greg
parents:
diff changeset
763 # <chr> <chr>
adaf89535d2e Uploaded
greg
parents:
diff changeset
764 # A.palmata 1104
adaf89535d2e Uploaded
greg
parents:
diff changeset
765 prep_genotype_tibble <- id_data_table %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
766 group_by(row_number()) %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
767 dplyr::rename(group='row_number()') %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
768 unnest(affy_id) %>%
8
33d759858625 Uploaded
greg
parents: 7
diff changeset
769 left_join(sample_mlg_match_tibble %>%
33d759858625 Uploaded
greg
parents: 7
diff changeset
770 select("affy_id", "coral_mlg_rep_sample_id", "coral_mlg_clonal_id",
33d759858625 Uploaded
greg
parents: 7
diff changeset
771 "genetic_coral_species_call", "bcoral_genet_id", "db_match"),
33d759858625 Uploaded
greg
parents: 7
diff changeset
772 by='affy_id') %>%
33d759858625 Uploaded
greg
parents: 7
diff changeset
773 right_join(sample_mlg_match_tibble %>%
33d759858625 Uploaded
greg
parents: 7
diff changeset
774 select("coral_mlg_rep_sample_id", "coral_mlg_clonal_id"),
33d759858625 Uploaded
greg
parents: 7
diff changeset
775 by='coral_mlg_clonal_id') %>%
33d759858625 Uploaded
greg
parents: 7
diff changeset
776 mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_rep_sample_id.x),coral_mlg_rep_sample_id.y,coral_mlg_rep_sample_id.x)) %>%
33d759858625 Uploaded
greg
parents: 7
diff changeset
777 ungroup() %>%
33d759858625 Uploaded
greg
parents: 7
diff changeset
778 dplyr::select(-coral_mlg_rep_sample_id.x,-coral_mlg_rep_sample_id.y, -group.x,-group.y) %>%
9
3c42de11ea1d Uploaded
greg
parents: 8
diff changeset
779 group_by(coral_mlg_clonal_id) %>%
3c42de11ea1d Uploaded
greg
parents: 8
diff changeset
780 arrange(coral_mlg_rep_sample_id) %>%
3c42de11ea1d Uploaded
greg
parents: 8
diff changeset
781 slice(1);
8
33d759858625 Uploaded
greg
parents: 7
diff changeset
782
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
783 # Confirm that the representative mlg is the same between runs.
adaf89535d2e Uploaded
greg
parents:
diff changeset
784 uniques2 <- unique(prep_genotype_tibble[c("group", "coral_mlg_rep_sample_id")]);
adaf89535d2e Uploaded
greg
parents:
diff changeset
785 uniques2 <- uniques2[!is.na(uniques2$coral_mlg_rep_sample_id),];
adaf89535d2e Uploaded
greg
parents:
diff changeset
786 na.mlg3 <- which(is.na(prep_genotype_tibble$coral_mlg_rep_sample_id));
adaf89535d2e Uploaded
greg
parents:
diff changeset
787 na.group2 <- prep_genotype_tibble$group[na.mlg3];
adaf89535d2e Uploaded
greg
parents:
diff changeset
788 prep_genotype_tibble$coral_mlg_rep_sample_id[na.mlg3] <- uniques2$coral_mlg_rep_sample_id[match(na.group2, uniques2$group)];
adaf89535d2e Uploaded
greg
parents:
diff changeset
789 # Transform the representative mlg column with new genotyped samples.
adaf89535d2e Uploaded
greg
parents:
diff changeset
790 # representative_mlg_tibble looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
791 # A tibble: 220 x 5
adaf89535d2e Uploaded
greg
parents:
diff changeset
792 # affy_id coral_mlg_rep_sa… coral_mlg_clona… user_specimen_id
adaf89535d2e Uploaded
greg
parents:
diff changeset
793 # <chr> <chr> <chr> <chr>
adaf89535d2e Uploaded
greg
parents:
diff changeset
794 # a100000-… 13905 HG0048 13905
adaf89535d2e Uploaded
greg
parents:
diff changeset
795 # genetic_coral_species_call bcoral_genet_id
adaf89535d2e Uploaded
greg
parents:
diff changeset
796 # <chr> <chr>
adaf89535d2e Uploaded
greg
parents:
diff changeset
797 # A.palmata C1651
adaf89535d2e Uploaded
greg
parents:
diff changeset
798 representative_mlg_tibble <- prep_genotype_tibble %>%
8
33d759858625 Uploaded
greg
parents: 7
diff changeset
799 mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_rep_sample_id) & (db_match =="no_match"), affy_id, coral_mlg_rep_sample_id)) %>%
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
800 ungroup() %>%
9
3c42de11ea1d Uploaded
greg
parents: 8
diff changeset
801 select(-group);
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
802 # prep_genotype_table_tibble looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
803 # affy_id coral_mlg_clonal_id user_specimen_id db_match
adaf89535d2e Uploaded
greg
parents:
diff changeset
804 # a550962...CEL HG0120 1090 match
adaf89535d2e Uploaded
greg
parents:
diff changeset
805 # genetic_coral_species_call coral_mlg_rep_sample_id
adaf89535d2e Uploaded
greg
parents:
diff changeset
806 # A.palmata 1104
adaf89535d2e Uploaded
greg
parents:
diff changeset
807 prep_genotype_table_tibble <- stag_db_report %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
808 select("affy_id", "coral_mlg_clonal_id", "user_specimen_id", "db_match", "genetic_coral_species_call") %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
809 left_join(representative_mlg_tibble %>%
9
3c42de11ea1d Uploaded
greg
parents: 8
diff changeset
810 select("coral_mlg_rep_sample_id", "coral_mlg_clonal_id"),
3c42de11ea1d Uploaded
greg
parents: 8
diff changeset
811 by='coral_mlg_clonal_id');
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
812 # genotype_table_tibble looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
813 # affy_id coral_mlg_clonal_id user_specimen_id db_match
adaf89535d2e Uploaded
greg
parents:
diff changeset
814 # a550962-436.CEL HG0120 1090 match
adaf89535d2e Uploaded
greg
parents:
diff changeset
815 # genetic_coral_species_call coral_mlg_rep_sample_id bcoral_genet_id
adaf89535d2e Uploaded
greg
parents:
diff changeset
816 # A.palmata 1104 <NA>
adaf89535d2e Uploaded
greg
parents:
diff changeset
817 genotype_table_tibble <- prep_genotype_table_tibble %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
818 left_join(affy_metadata_data_frame %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
819 select("user_specimen_id", "bcoral_genet_id"),
9
3c42de11ea1d Uploaded
greg
parents: 8
diff changeset
820 by='user_specimen_id');
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
821 write_data_frame(output_data_dir, "genotype.tabular", genotype_table_tibble);
adaf89535d2e Uploaded
greg
parents:
diff changeset
822
adaf89535d2e Uploaded
greg
parents:
diff changeset
823 # Output the file needed for populating the person table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
824 person_table_data_frame <- data.frame(matrix(ncol=4, nrow=num_rows));
adaf89535d2e Uploaded
greg
parents:
diff changeset
825 colnames(person_table_data_frame) <- c("last_name", "first_name", "organization", "email");
adaf89535d2e Uploaded
greg
parents:
diff changeset
826 # person_table_data_frame looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
827 # last_name first_name organization email
adaf89535d2e Uploaded
greg
parents:
diff changeset
828 # Kitchen Sheila Penn State s89@psu.edu
adaf89535d2e Uploaded
greg
parents:
diff changeset
829 for (i in 1:num_rows) {
adaf89535d2e Uploaded
greg
parents:
diff changeset
830 person_table_data_frame$last_name[i] <- sample_prep_data_frame$collector_last_name[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
831 person_table_data_frame$first_name[i] <- sample_prep_data_frame$collector_first_name[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
832 person_table_data_frame$organization[i] <- sample_prep_data_frame$organization[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
833 person_table_data_frame$email[i] <- sample_prep_data_frame$email[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
834 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
835 write_data_frame(output_data_dir, "person.tabular", person_table_data_frame);
adaf89535d2e Uploaded
greg
parents:
diff changeset
836
adaf89535d2e Uploaded
greg
parents:
diff changeset
837 # Output the file needed for populating the phenotype table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
838 phenotype_table_data_frame <- data.frame(matrix(ncol=7, nrow=num_rows));
adaf89535d2e Uploaded
greg
parents:
diff changeset
839 colnames(phenotype_table_data_frame) <- c("disease_resist", "bleach_resist", "mortality", "tle",
adaf89535d2e Uploaded
greg
parents:
diff changeset
840 "spawning", "sperm_motility", "healing_time");
adaf89535d2e Uploaded
greg
parents:
diff changeset
841 # phenotype_table_data_frame looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
842 # disease_resist bleach_resist mortality tle spawning sperm_motility healing_time
adaf89535d2e Uploaded
greg
parents:
diff changeset
843 # NA NA NA NA False NA NA
adaf89535d2e Uploaded
greg
parents:
diff changeset
844 for (i in 1:num_rows) {
adaf89535d2e Uploaded
greg
parents:
diff changeset
845 phenotype_table_data_frame$disease_resist[i] <- sample_prep_data_frame$disease_resist[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
846 phenotype_table_data_frame$bleach_resist[i] <- sample_prep_data_frame$bleach_resist[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
847 phenotype_table_data_frame$mortality[i] <- sample_prep_data_frame$mortality[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
848 phenotype_table_data_frame$tle[i] <- sample_prep_data_frame$tle[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
849 phenotype_table_data_frame$spawning[i] <- sample_prep_data_frame$spawning[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
850 phenotype_table_data_frame$sperm_motility[i] <- sample_prep_data_frame$sperm_motility[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
851 phenotype_table_data_frame$healing_time[i] <- sample_prep_data_frame$healing_time[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
852 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
853 write_data_frame(output_data_dir, "phenotype.tabular", phenotype_table_data_frame);
adaf89535d2e Uploaded
greg
parents:
diff changeset
854
adaf89535d2e Uploaded
greg
parents:
diff changeset
855 # Output the file needed for populating the reef table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
856 reef_table_data_frame <- data.frame(matrix(ncol=5, nrow=num_rows));
adaf89535d2e Uploaded
greg
parents:
diff changeset
857 colnames(reef_table_data_frame) <- c("name", "region", "latitude", "longitude", "geographic_origin");
adaf89535d2e Uploaded
greg
parents:
diff changeset
858 # The geographic_origin value is used for deciding into which table
adaf89535d2e Uploaded
greg
parents:
diff changeset
859 # to insert the latitude and longitude values. If the geographic_origin
adaf89535d2e Uploaded
greg
parents:
diff changeset
860 # is "reef", the values will be inserted into the reef table, and if it is
adaf89535d2e Uploaded
greg
parents:
diff changeset
861 # "colony", the values will be inserted into the colony table. We insert
adaf89535d2e Uploaded
greg
parents:
diff changeset
862 # these values in both data frames so that the downstream tool that parses
adaf89535d2e Uploaded
greg
parents:
diff changeset
863 # them can determine the appropriate table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
864 # reef_table_data_frame looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
865 # name region latitude longitude geographic_origin
adaf89535d2e Uploaded
greg
parents:
diff changeset
866 # JohnsonsReef Bahamas 18.361733 -64.7743 Reef
adaf89535d2e Uploaded
greg
parents:
diff changeset
867 for (i in 1:num_rows) {
adaf89535d2e Uploaded
greg
parents:
diff changeset
868 reef_table_data_frame$name[i] <- sample_prep_data_frame$reef[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
869 reef_table_data_frame$region[i] <- sample_prep_data_frame$region[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
870 reef_table_data_frame$latitude[i] <- sample_prep_data_frame$latitude[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
871 reef_table_data_frame$longitude[i] <- sample_prep_data_frame$longitude[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
872 reef_table_data_frame$geographic_origin[i] <- sample_prep_data_frame$geographic_origin[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
873 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
874 write_data_frame(output_data_dir, "reef.tabular", reef_table_data_frame);
adaf89535d2e Uploaded
greg
parents:
diff changeset
875
adaf89535d2e Uploaded
greg
parents:
diff changeset
876 # Output the file needed for populating the sample table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
877 sample_table_data_frame <- data.frame(matrix(ncol=20, nrow=num_rows));
adaf89535d2e Uploaded
greg
parents:
diff changeset
878 colnames(sample_table_data_frame) <- c("affy_id", "colony_location", "collection_date", "user_specimen_id",
adaf89535d2e Uploaded
greg
parents:
diff changeset
879 "registry_id", "depth", "dna_extraction_method", "dna_concentration",
adaf89535d2e Uploaded
greg
parents:
diff changeset
880 "public", "public_after_date", "percent_missing_data_coral",
1
a690e0382ce4 Uploaded
greg
parents: 0
diff changeset
881 "percent_missing_data_sym", "percent_acerv_coral",
a690e0382ce4 Uploaded
greg
parents: 0
diff changeset
882 "percent_reference_sym", "percent_apalm_coral",
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
883 "percent_alternative_sym", "percent_heterozygous_coral",
adaf89535d2e Uploaded
greg
parents:
diff changeset
884 "percent_heterozygous_sym", "field_call", "bcoral_genet_id");
adaf89535d2e Uploaded
greg
parents:
diff changeset
885 for (i in 1:num_rows) {
adaf89535d2e Uploaded
greg
parents:
diff changeset
886 sample_table_data_frame$affy_id[i] <- sample_prep_data_frame$affy_id[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
887 sample_table_data_frame$colony_location[i] <- sample_prep_data_frame$colony_location[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
888 sample_table_data_frame$collection_date[i] <- sample_prep_data_frame$collection_date[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
889 sample_table_data_frame$user_specimen_id[i] <- sample_prep_data_frame$user_specimen_id[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
890 sample_table_data_frame$registry_id[i] <- sample_prep_data_frame$registry_id[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
891 sample_table_data_frame$depth[i] <- sample_prep_data_frame$depth[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
892 sample_table_data_frame$dna_extraction_method[i] <- sample_prep_data_frame$dna_extraction_method[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
893 sample_table_data_frame$dna_concentration[i] <- sample_prep_data_frame$dna_concentration[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
894 sample_table_data_frame$public[i] <- sample_prep_data_frame$public[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
895 sample_table_data_frame$public_after_date[i] <- sample_prep_data_frame$public_after_date[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
896 sample_table_data_frame$percent_missing_data_coral[i] <- sample_prep_data_frame$percent_missing_data_coral[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
897 sample_table_data_frame$percent_missing_data_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE;
1
a690e0382ce4 Uploaded
greg
parents: 0
diff changeset
898 sample_table_data_frame$percent_acerv_coral[i] <- sample_prep_data_frame$percent_acerv_coral[i];
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
899 sample_table_data_frame$percent_reference_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE;
1
a690e0382ce4 Uploaded
greg
parents: 0
diff changeset
900 sample_table_data_frame$percent_apalm_coral[i] <- sample_prep_data_frame$percent_apalm_coral[i];
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
901 sample_table_data_frame$percent_alternative_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE;
adaf89535d2e Uploaded
greg
parents:
diff changeset
902 sample_table_data_frame$percent_heterozygous_coral[i] <- sample_prep_data_frame$percent_heterozygous_coral[i];
adaf89535d2e Uploaded
greg
parents:
diff changeset
903 sample_table_data_frame$percent_heterozygous_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE;
adaf89535d2e Uploaded
greg
parents:
diff changeset
904 sample_table_data_frame$field_call[i] <- sample_prep_data_frame$field_call[i];
9
3c42de11ea1d Uploaded
greg
parents: 8
diff changeset
905 sample_table_data_frame$bcoral_genet_id[i] <- sample_prep_data_frame$bcoral_genet_id[i];
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
906 }
adaf89535d2e Uploaded
greg
parents:
diff changeset
907 write_data_frame(output_data_dir, "sample.tabular", sample_table_data_frame);
adaf89535d2e Uploaded
greg
parents:
diff changeset
908
adaf89535d2e Uploaded
greg
parents:
diff changeset
909 # Output the file needed for populating the taxonomy table.
adaf89535d2e Uploaded
greg
parents:
diff changeset
910 # taxonomy_table_data_frame looks like this:
adaf89535d2e Uploaded
greg
parents:
diff changeset
911 # genetic_coral_species_call affy_id genus_name species_name
adaf89535d2e Uploaded
greg
parents:
diff changeset
912 # A.palmata a550962-4368120-060520-500_A05.CEL Acropora palmata
adaf89535d2e Uploaded
greg
parents:
diff changeset
913 taxonomy_table_data_frame <- stag_db_report %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
914 select(genetic_coral_species_call, affy_id) %>%
2
9d0ddd9d62c5 Uploaded
greg
parents: 1
diff changeset
915 mutate(genus_name = ifelse(grepl("^A.*", genetic_coral_species_call), "Acropora",ifelse(!is.na(genetic_coral_species_call), "other", NA))) %>%
0
adaf89535d2e Uploaded
greg
parents:
diff changeset
916 mutate(species_name = ifelse(genetic_coral_species_call == "A.palmata", "palmata", "other")) %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
917 mutate(species_name = ifelse(genetic_coral_species_call == "A.cervicornis", "cervicornis", species_name)) %>%
adaf89535d2e Uploaded
greg
parents:
diff changeset
918 mutate(species_name = ifelse(genetic_coral_species_call == "A.prolifera", "prolifera", species_name));
adaf89535d2e Uploaded
greg
parents:
diff changeset
919 colnames(taxonomy_table_data_frame) <- c("genetic_coral_species_call", "affy_id", "genus_name", "species_name");
adaf89535d2e Uploaded
greg
parents:
diff changeset
920 write_data_frame(output_data_dir, "taxonomy.tabular", taxonomy_table_data_frame);
adaf89535d2e Uploaded
greg
parents:
diff changeset
921 time_elapsed(start_time);