Mercurial > repos > greg > coral_multilocus_genotype
comparison coral_multilocus_genotype.R @ 0:adaf89535d2e draft
Uploaded
author | greg |
---|---|
date | Thu, 15 Aug 2019 10:02:15 -0400 |
parents | |
children | a690e0382ce4 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:adaf89535d2e |
---|---|
1 #!/usr/bin/env Rscript | |
2 | |
3 suppressPackageStartupMessages(library("adegenet")) | |
4 suppressPackageStartupMessages(library("ape")) | |
5 suppressPackageStartupMessages(library("data.table")) | |
6 suppressPackageStartupMessages(library("dbplyr")) | |
7 suppressPackageStartupMessages(library("dplyr")) | |
8 suppressPackageStartupMessages(library("ggplot2")) | |
9 suppressPackageStartupMessages(library("knitr")) | |
10 suppressPackageStartupMessages(library("maps")) | |
11 suppressPackageStartupMessages(library("mapproj")) | |
12 suppressPackageStartupMessages(library("optparse")) | |
13 suppressPackageStartupMessages(library("poppr")) | |
14 suppressPackageStartupMessages(library("RColorBrewer")) | |
15 suppressPackageStartupMessages(library("RPostgres")) | |
16 suppressPackageStartupMessages(library("SNPRelate")) | |
17 suppressPackageStartupMessages(library("tidyr")) | |
18 suppressPackageStartupMessages(library("vcfR")) | |
19 suppressPackageStartupMessages(library("vegan")) | |
20 suppressPackageStartupMessages(library("yarrr")) | |
21 theme_set(theme_bw()) | |
22 | |
23 DEFAULT_MISSING_NUMERIC_VALUE <- -9.000000; | |
24 | |
25 option_list <- list( | |
26 make_option(c("--database_connection_string"), action="store", dest="database_connection_string", help="Corals (stag) database connection string"), | |
27 make_option(c("--input_affy_metadata"), action="store", dest="input_affy_metadata", help="Affymetrix 96 well plate input file"), | |
28 make_option(c("--input_pop_info"), action="store", dest="input_pop_info", help="Population information input file"), | |
29 make_option(c("--input_vcf"), action="store", dest="input_vcf", help="VCF input file"), | |
30 make_option(c("--output_nj_phylogeny_tree"), action="store", dest="output_nj_phylogeny_tree", default=NULL, help="Flag to plot neighbor-joining phylogeny tree"), | |
31 make_option(c("--output_stag_db_report"), action="store", dest="output_stag_db_report", help="Flag to output stag db report file") | |
32 ) | |
33 | |
34 parser <- OptionParser(usage="%prog [options] file", option_list=option_list); | |
35 args <- parse_args(parser, positional_arguments=TRUE); | |
36 opt <- args$options; | |
37 | |
38 get_file_path = function(dir, file_name) { | |
39 file_path = paste(dir, file_name, sep="/"); | |
40 return(file_path); | |
41 } | |
42 | |
43 get_database_connection <- function(db_conn_string) { | |
44 # Instantiate database connection. | |
45 # The connection string has this format: | |
46 # postgresql://user:password@host/dbname | |
47 conn_items <- strsplit(db_conn_string, "://")[[1]]; | |
48 string_needed <- conn_items[2]; | |
49 items_needed <- strsplit(string_needed, "@")[[1]]; | |
50 user_pass_string <- items_needed[1]; | |
51 host_dbname_string <- items_needed[2]; | |
52 user_pass_items <- strsplit(user_pass_string, ":")[[1]]; | |
53 host_dbname_items <- strsplit(host_dbname_string, "/")[[1]]; | |
54 user <- user_pass_items[1]; | |
55 pass <- user_pass_items[2]; | |
56 host <- host_dbname_items[1]; | |
57 dbname <- host_dbname_items[2]; | |
58 conn <- DBI::dbConnect(RPostgres::Postgres(), host=host, port="5432", dbname=dbname, user=user, password=pass); | |
59 return (conn); | |
60 } | |
61 | |
62 time_elapsed <- function(start_time) { | |
63 cat("Elapsed time: ", proc.time() - start_time, "\n\n"); | |
64 } | |
65 | |
66 time_start <- function(msg) { | |
67 start_time <- proc.time(); | |
68 cat(msg, "...\n"); | |
69 return(start_time); | |
70 } | |
71 | |
72 write_data_frame <- function(dir, file_name, data_frame) { | |
73 cat("\nWriting file: ", file_name, "\n"); | |
74 file_path <- get_file_path(dir, file_name); | |
75 write.table(data_frame, file=file_path, quote=FALSE, row.names=FALSE, sep="\t"); | |
76 } | |
77 | |
78 # Prepare for processing. | |
79 output_data_dir = "output_data_dir"; | |
80 output_plots_dir = "output_plots_dir"; | |
81 # Read in VCF input file. | |
82 start_time <- time_start("Reading VCF input"); | |
83 vcf <- read.vcfR(opt$input_vcf); | |
84 time_elapsed(start_time); | |
85 | |
86 # Convert VCF file into a genind for the Poppr package. | |
87 start_time <- time_start("Converting VCF data to a genind object"); | |
88 genind_obj <- vcfR2genind(vcf); | |
89 time_elapsed(start_time); | |
90 | |
91 # Add population information to the genind object. | |
92 population_info_data_table <- read.table(opt$input_pop_info, check.names=FALSE, header=F, na.strings=c("", "NA"), stringsAsFactors=FALSE, sep="\t", quote=""); | |
93 colnames(population_info_data_table) <- c("row_id", "affy_id", "user_specimen_id", "region"); | |
94 #write_data_frame(output_data_dir, "population_info_data_table", population_info_data_table); | |
95 genind_obj@pop <- as.factor(population_info_data_table$region); | |
96 strata(genind_obj) <- data.frame(pop(genind_obj)); | |
97 | |
98 # Convert genind object to a genclone object. | |
99 start_time <- time_start("Converting the genind object to a genclone object"); | |
100 genind_clone <- as.genclone(genind_obj); | |
101 time_elapsed(start_time); | |
102 | |
103 # Calculate the bitwise distance between individuals. | |
104 start_time <- time_start("Calculating the bitwise distance between individuals"); | |
105 bitwise_distance <- bitwise.dist(genind_clone); | |
106 time_elapsed(start_time); | |
107 | |
108 # Multilocus genotypes (threshold of 3.2%). | |
109 mlg.filter(genind_clone, distance=bitwise_distance) <- 0.032; | |
110 m <- mlg.table(genind_clone, background=TRUE, color=TRUE); | |
111 | |
112 # Create list of MLGs. | |
113 mlg_ids <- mlg.id(genind_clone); | |
114 | |
115 # Read user's Affymetrix 96 well plate tabular file. | |
116 affy_metadata_data_frame <- read.table(opt$input_affy_metadata, header=FALSE, stringsAsFactors=FALSE, sep="\t", na.strings=c("", "NA"), quote=""); | |
117 colnames(affy_metadata_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", | |
118 "region", "latitude", "longitude", "geographic_origin", "colony_location", | |
119 "depth", "disease_resist", "bleach_resist", "mortality","tle", | |
120 "spawning", "collector_last_name", "collector_first_name", "organization", "collection_date", | |
121 "email", "seq_facility", "array_version", "public", "public_after_date", | |
122 "sperm_motility", "healing_time", "dna_extraction_method", "dna_concentration", "registry_id", | |
123 "result_folder_name", "plate_barcode"); | |
124 affy_metadata_data_frame$user_specimen_id <- as.character(affy_metadata_data_frame$user_specimen_id); | |
125 user_specimen_ids <- as.character(affy_metadata_data_frame$user_specimen_id); | |
126 # The specimen_id_field_call_data_table looks like this: | |
127 # user_specimen_ids V2 | |
128 # 1090 prolifera | |
129 # 1091 prolifera | |
130 specimen_id_field_call_data_table <- data.table(user_specimen_ids, affy_metadata_data_frame$field_call); | |
131 # Rename the user_specimen_ids column. | |
132 setnames(specimen_id_field_call_data_table, c("user_specimen_ids"), c("user_specimen_id")); | |
133 # Rename the V2 column. | |
134 setnames(specimen_id_field_call_data_table, c("V2"), c("field_call")); | |
135 | |
136 # Connect to database. | |
137 conn <- get_database_connection(opt$database_connection_string); | |
138 # Import the sample table. | |
139 sample_table <- tbl(conn, "sample"); | |
140 # Import the genotype table. | |
141 genotype_table <- tbl(conn, "genotype"); | |
142 # Select columns from the sample table and the | |
143 # genotype table joined by genotype_id. | |
144 sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, bcoral_genet_id, genotype_id); | |
145 smlg <- sample_table_columns %>% | |
146 left_join(genotype_table %>% | |
147 select("id", "coral_mlg_clonal_id", "coral_mlg_rep_sample_id", "genetic_coral_species_call"), | |
148 by=c("genotype_id"="id")); | |
149 # Name the columns. | |
150 smlg_data_frame <- as.data.frame(smlg); | |
151 colnames(smlg_data_frame) <- c("user_specimen_id", "affy_id", "bcoral_genet_id", "genotype_id", | |
152 "coral_mlg_clonal_id", "coral_mlg_rep_sample_id", "genetic_coral_species_call"); | |
153 # Missing GT in samples submitted. | |
154 start_time <- time_start("Discovering missing GT in samples"); | |
155 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); | |
156 missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); | |
157 missing_gt <- (missing_gt / nrow(vcf)) * 100; | |
158 missing_gt_data_frame <- data.frame(missing_gt); | |
159 # The specimen_id_field_call_data_table looks like this: | |
160 # rn missing_gt | |
161 # a100000-4368120-060520-256_I07.CEL 0.06092608 | |
162 # a100000-4368120-060520-256_K07.CEL 0.05077173 | |
163 missing_gt_data_table <-setDT(missing_gt_data_frame, keep.rownames=TRUE)[]; | |
164 # Rename the rn column. | |
165 setnames(missing_gt_data_table, c("rn"), c("affy_id")); | |
166 # Rename the missing_gt column. | |
167 setnames(missing_gt_data_table, c("missing_gt"), c("percent_missing_data_coral")); | |
168 # Round data to two digits. | |
169 missing_gt_data_table$percent_missing_data_coral <- round(missing_gt_data_table$percent_missing_data_coral, digits=2); | |
170 time_elapsed(start_time); | |
171 | |
172 # Heterozygous alleles. | |
173 start_time <- time_start("Discovering heterozygous alleles"); | |
174 heterozygous_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))}); | |
175 heterozygous_alleles <- (heterozygous_alleles / nrow(vcf)) * 100; | |
176 heterozygous_alleles_data_frame <- data.frame(heterozygous_alleles); | |
177 # The heterozygous_alleles_data_table looks like this: | |
178 # rn heterozygous_alleles | |
179 # a100000-4368120-060520-256_I07.CEL 73.94903 | |
180 # a100000-4368120-060520-256_K07.CEL 74.40089 | |
181 heterozygous_alleles_data_table <- setDT(heterozygous_alleles_data_frame, keep.rownames=TRUE)[]; | |
182 # Rename the rn column. | |
183 setnames(heterozygous_alleles_data_table, c("rn"), c("affy_id")); | |
184 # Rename the heterozygous_alleles column. | |
185 setnames(heterozygous_alleles_data_table, c("heterozygous_alleles"), c("percent_heterozygous_coral")); | |
186 # Round data to two digits. | |
187 heterozygous_alleles_data_table$percent_heterozygous_coral <- round(heterozygous_alleles_data_table$percent_heterozygous_coral, digits=2); | |
188 time_elapsed(start_time); | |
189 | |
190 # Reference alleles. | |
191 start_time <- time_start("Discovering reference alleles"); | |
192 reference_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))}); | |
193 reference_alleles <- (reference_alleles / nrow(vcf)) * 100; | |
194 reference_alleles_data_frame <- data.frame(reference_alleles); | |
195 # The reference_alleles_data_table looks like this: | |
196 # rn reference_alleles | |
197 # a100000-4368120-060520-256_I07.CEL 11.60642 | |
198 # a100000-4368120-060520-256_K07.CEL 11.45918 | |
199 reference_alleles_data_table <- setDT(reference_alleles_data_frame, keep.rownames=TRUE)[]; | |
200 # Rename the rn column. | |
201 setnames(reference_alleles_data_table, c("rn"), c("affy_id")); | |
202 # Rename the reference_alleles column. | |
203 setnames(reference_alleles_data_table, c("reference_alleles"), c("percent_reference_coral")); | |
204 # Round data to two digits. | |
205 reference_alleles_data_table$percent_reference_coral <- round(reference_alleles_data_table$percent_reference_coral, digits=2); | |
206 time_elapsed(start_time); | |
207 | |
208 # Alternative alleles | |
209 start_time <- time_start("Discovering alternative alleles"); | |
210 alternative_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))}); | |
211 alternative_alleles <- (alternative_alleles / nrow(vcf)) * 100; | |
212 alternative_alleles_data_frame <- data.frame(alternative_alleles); | |
213 # The alternative_alleles_data_table looks like this: | |
214 # rn alternative_alleles | |
215 # a100000-4368120-060520-256_I07.CEL 14.38363 | |
216 # a100000-4368120-060520-256_K07.CEL 14.08916 | |
217 alternative_alleles_data_table <- setDT(alternative_alleles_data_frame, keep.rownames=TRUE)[]; | |
218 # Rename the rn column. | |
219 setnames(alternative_alleles_data_table, c("rn"), c("affy_id")); | |
220 # Rename the alternative_alleles column. | |
221 setnames(alternative_alleles_data_table, c("alternative_alleles"), c("percent_alternative_coral")); | |
222 # Round data to two digits. | |
223 alternative_alleles_data_table$percent_alternative_coral <- round(alternative_alleles_data_table$percent_alternative_coral, digits=2); | |
224 time_elapsed(start_time); | |
225 | |
226 # The mlg_ids_data_table looks like this: | |
227 # mlg_ids | |
228 # a550962-4368120-060520-500_M23.CEL | |
229 # a550962-4368120-060520-256_A19.CEL | |
230 mlg_ids_data_table <- data.table(mlg_ids, keep.rownames=TRUE); | |
231 # Rename the mlg_ids column. | |
232 setnames(mlg_ids_data_table, c("mlg_ids"), c("affy_id")); | |
233 | |
234 # sample_mlg_tibble looks like this: | |
235 # A tibble: 262 x 3 | |
236 # Groups: group [?] | |
237 # group affy_id coral_mlg_clonal_id coral_mlg_rep_sample_id | |
238 # <int> <chr> <chr> <chr> | |
239 # 1 a550962-4368.CEL NA 13905 | |
240 sample_mlg_tibble <- mlg_ids_data_table %>% | |
241 group_by(row_number()) %>% | |
242 dplyr::rename(group="row_number()") %>% | |
243 unnest (affy_id) %>% | |
244 # Join with mlg table. | |
245 left_join(smlg_data_frame %>% | |
246 select("affy_id","coral_mlg_clonal_id", "coral_mlg_rep_sample_id"), | |
247 by="affy_id"); | |
248 | |
249 # If found in database, group members on previous mlg id. | |
250 uniques <- unique(sample_mlg_tibble[c("group", "coral_mlg_clonal_id")]); | |
251 uniques <- uniques[!is.na(uniques$coral_mlg_clonal_id),]; | |
252 na.mlg <- which(is.na(sample_mlg_tibble$coral_mlg_clonal_id)); | |
253 na.group <- sample_mlg_tibble$group[na.mlg]; | |
254 sample_mlg_tibble$coral_mlg_clonal_id[na.mlg] <- uniques$coral_mlg_clonal_id[match(na.group, uniques$group)]; | |
255 | |
256 # Find out if the sample mlg matched a previous genotyped sample. | |
257 # sample_mlg_match_tibble looks like this: | |
258 # A tibble: 262 x 4 | |
259 # Groups: group [230] | |
260 # group affy_id coral_mlg_clonal_id db_match | |
261 # <int> <chr> <chr> <chr> | |
262 # 1 a550962-436.CEL NA no_match | |
263 sample_mlg_match_tibble <- sample_mlg_tibble %>% | |
264 group_by(group) %>% | |
265 mutate(db_match = ifelse(is.na(coral_mlg_clonal_id), "no_match", "match")); | |
266 | |
267 # Create new mlg id for samples with no matches in the database. | |
268 none <- unique(sample_mlg_match_tibble[c("group", "coral_mlg_clonal_id")]); | |
269 none <- none[is.na(none$coral_mlg_clonal_id),]; | |
270 na.mlg2 <- which(is.na(sample_mlg_match_tibble$coral_mlg_clonal_id)); | |
271 n.g <- sample_mlg_match_tibble$group[na.mlg2]; | |
272 ct <- length(unique(n.g)); | |
273 | |
274 # List of new group ids, the sequence starts at the number of | |
275 # ids present in sample_mlg_match_tibble$coral_mlg_clonal_ids | |
276 # plus 1. | |
277 n.g_ids <- sprintf("HG%04d", seq((sum(!is.na(unique(sample_mlg_match_tibble["coral_mlg_clonal_id"]))) + 1), by=1, length=ct)); | |
278 | |
279 # Assign the new id iteratively for all that have NA. | |
280 for (i in 1:length(na.mlg2)) { | |
281 sample_mlg_match_tibble$coral_mlg_clonal_id[na.mlg2[i]] <- n.g_ids[match(sample_mlg_match_tibble$group[na.mlg2[i]], unique(n.g))]; | |
282 } | |
283 | |
284 # Subset population_info_data_table for all samples. | |
285 # affy_id_user_specimen_id_vector looks like this: | |
286 # affy_id user_specimen_id | |
287 # a100000-432.CEL 13704 | |
288 affy_id_user_specimen_id_vector <- population_info_data_table[c(2, 3)]; | |
289 | |
290 # Merge data frames for final table. | |
291 start_time <- time_start("Merging data frames"); | |
292 stag_db_report <- specimen_id_field_call_data_table %>% | |
293 left_join(affy_id_user_specimen_id_vector %>% | |
294 select("affy_id", "user_specimen_id"), | |
295 by="user_specimen_id") %>% | |
296 mutate(db_record = ifelse(affy_id %in% smlg_data_frame$affy_id, "genotyped", "new")) %>% | |
297 filter(db_record=="new") %>% | |
298 left_join(sample_mlg_match_tibble %>% | |
299 select("affy_id", "coral_mlg_clonal_id", "db_match"), | |
300 by="affy_id") %>% | |
301 left_join(missing_gt_data_table %>% | |
302 select("affy_id", "percent_missing_data_coral"), | |
303 by="affy_id") %>% | |
304 left_join(heterozygous_alleles_data_table %>% | |
305 select("affy_id", "percent_heterozygous_coral"), | |
306 by="affy_id") %>% | |
307 left_join(reference_alleles_data_table %>% | |
308 select("affy_id", "percent_reference_coral"), | |
309 by="affy_id") %>% | |
310 left_join(alternative_alleles_data_table %>% | |
311 select("affy_id", "percent_alternative_coral"), | |
312 by="affy_id") %>% | |
313 mutate(db_match = ifelse(is.na(db_match), "failed", db_match))%>% | |
314 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% | |
315 mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 40 & percent_alternative_coral <= 44.99, "A.palmata","other")) %>% | |
316 mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 45 & percent_alternative_coral <= 51, "A.cervicornis", genetic_coral_species_call)) %>% | |
317 mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>% | |
318 ungroup() %>% | |
319 select(-group,-db_record); | |
320 time_elapsed(start_time); | |
321 | |
322 start_time <- time_start("Writing csv output"); | |
323 write.csv(stag_db_report, file=opt$output_stag_db_report, quote=FALSE); | |
324 time_elapsed(start_time); | |
325 | |
326 # Representative clone for genotype table. | |
327 start_time <- time_start("Creating representative clone for genotype table"); | |
328 no_dup_genotypes_genind <- clonecorrect(genind_clone, strata = ~pop.genind_obj.); | |
329 id_rep <- mlg.id(no_dup_genotypes_genind); | |
330 id_data_table <- data.table(id_rep, keep.rownames=TRUE); | |
331 # Rename the id_rep column. | |
332 setnames(id_data_table, c("id_rep"), c("affy_id")); | |
333 time_elapsed(start_time); | |
334 | |
335 # Table of alleles for the new samples subset to new plate data. | |
336 # Create vector indicating number of individuals desired from | |
337 # affy_id column of stag_db_report data table. | |
338 i <- ifelse(is.na(stag_db_report[3]), "", stag_db_report[[3]]); | |
339 i <- i[!apply(i== "", 1, all),]; | |
340 | |
341 # Subset VCF to the user samples. | |
342 start_time <- time_start("Subsetting vcf to the user samples"); | |
343 l <- length(i)+1; | |
344 #n <- ncol(vcf@gt); | |
345 #s <- n - l; | |
346 svcf <- vcf[, 1:l]; | |
347 write.vcf(svcf, "subset.vcf.gz"); | |
348 vcf.fn <- "subset.vcf.gz"; | |
349 snpgdsVCF2GDS(vcf.fn, "test3.gds", method="biallelic.only"); | |
350 genofile <- snpgdsOpen(filename="test3.gds", readonly=FALSE); | |
351 gds_array <- read.gdsn(index.gdsn(genofile, "sample.id")); | |
352 # gds_array looks like this: | |
353 # [1] "a550962-4368120-060520-500_A03.CEL" "a550962-4368120-060520-500_A05.CEL" | |
354 gds_data_frame <- data.frame(gds_array); | |
355 # gds_data_frame looks like this: | |
356 # gds_array | |
357 # a550962-4368120-060520-500_A03.CEL | |
358 # a550962-4368120-060520-500_A05.CEL | |
359 gds_data_table <- setDT(gds_data_frame, keep.rownames=FALSE)[]; | |
360 # Rename the gds_array column. | |
361 setnames(gds_data_table, c("gds_array"), c("affy_id")); | |
362 # affy_id_region_list looks like this: | |
363 # affy_id region | |
364 # a100000-4368120-060520-256_I07.CEL USVI | |
365 # a100000-4368120-060520-256_K07.CEL USVI | |
366 affy_id_region_list <- population_info_data_table[c(2,3,4)]; | |
367 gds_data_table_join <- gds_data_table %>% | |
368 left_join(affy_id_region_list %>% | |
369 select("affy_id", "user_specimen_id","region"), | |
370 by='affy_id')%>% | |
371 drop_na(); | |
372 samp.annot <- data.frame(pop.group=c(gds_data_table_join$region)); | |
373 add.gdsn(genofile, "sample.annot", samp.annot); | |
374 # population_code looks like this: | |
375 # [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733 | |
376 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 | |
377 population_code <- read.gdsn(index.gdsn(genofile, path="sample.annot/pop.group")); | |
378 pop.group <- as.factor(read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))); | |
379 # pop.group looks like this: | |
380 # [1] 18.361733 18.361733 18.361733 18.361733 18.361733 18.361733 | |
381 # [7] 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 25.11844009 | |
382 time_elapsed(start_time); | |
383 | |
384 # Distance matrix calculation and sample labels change to user specimen ids. | |
385 start_time <- time_start("Calculating distance matrix"); | |
386 ibs <- snpgdsIBS(genofile, num.thread=2, autosome.only=FALSE); | |
387 ibs$sample.id <-gds_data_table_join$user_specimen_id; | |
388 time_elapsed(start_time); | |
389 | |
390 # Cluster analysis on the genome-wide IBS pairwise distance matrix. | |
391 start_time <- time_start("Clustering the genome-wide IBS pairwise distance matrix"); | |
392 set.seed(100); | |
393 par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2); | |
394 ibs.hc <- snpgdsHCluster(ibs); | |
395 time_elapsed(start_time); | |
396 | |
397 # cols looks like this: | |
398 # blue1 red green pink orange blue2 | |
399 # "#0C5BB0FF" "#EE0011FF" "#15983DFF" "#EC579AFF" "#FA6B09FF" "#149BEDFF" | |
400 # green2 yellow turquoise poop | |
401 # "#A1C720FF" "#FEC10BFF" "#16A08CFF" "#9A703EFF" | |
402 cols <- piratepal("basel"); | |
403 set.seed(999); | |
404 | |
405 # Generate plots. | |
406 # Default clustering. | |
407 start_time <- time_start("Creating ibs_default.pdf"); | |
408 # Start PDF device driver. | |
409 dev.new(width=40, height=20); | |
410 file_path = get_file_path(output_plots_dir, "ibs_default.pdf"); | |
411 pdf(file=file_path, width=40, height=20); | |
412 rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15); | |
413 snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular", yaxis.kinship=FALSE); | |
414 abline(h = 0.032, lty = 2); | |
415 legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2); | |
416 dev.off() | |
417 time_elapsed(start_time); | |
418 | |
419 # Color cluster by region. | |
420 start_time <- time_start("Creating ibs_region.pdf"); | |
421 # Start PDF device driver. | |
422 dev.new(width=40, height=20); | |
423 file_path = get_file_path(output_plots_dir, "ibs_region.pdf"); | |
424 pdf(file=file_path, width=40, height=20); | |
425 race <- as.factor(population_code); | |
426 rv2 <- snpgdsCutTree(ibs.hc, samp.group=race,col.list=cols, pch.list=15); | |
427 snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular", yaxis.kinship=FALSE); | |
428 legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2); | |
429 dev.off() | |
430 time_elapsed(start_time); | |
431 | |
432 # Missing data barplot. | |
433 start_time <- time_start("Creating missing_data.pdf"); | |
434 population_info_data_table$miss <- stag_db_report$percent_missing_data_coral[match(missing_gt_data_frame$affy_id, stag_db_report$affy_id)]; | |
435 test2 <- which(!is.na(population_info_data_table$miss)); | |
436 miss96 <- population_info_data_table$miss[test2]; | |
437 name96 <- population_info_data_table$user_specimen_id[test2]; | |
438 # Start PDF device driver. | |
439 dev.new(width=20, height=10); | |
440 file_path = get_file_path(output_plots_dir, "missing_data.pdf"); | |
441 pdf(file=file_path, width=20, height=10); | |
442 par(mar = c(8, 4, 4, 2)); | |
443 x <- barplot(miss96, las=2, col=cols, ylim=c(0, 3), cex.axis=0.8, space=0.8, ylab="Missingness (%)", xaxt="n"); | |
444 text(cex=0.8, x=x-0.25, y=-.05, name96, xpd=TRUE, srt=60, adj=1); | |
445 dev.off() | |
446 time_elapsed(start_time); | |
447 | |
448 # Sample MLG on a map. | |
449 start_time <- time_start("Creating mlg_map.pdf"); | |
450 # Get the lattitude and longitude boundaries for rendering | |
451 # the map. Tese boundaries will restrict the map to focus | |
452 # (i.e., zoom) on the region of the world map from which | |
453 # the samples were taken. | |
454 max_latitude <- max(affy_metadata_data_frame$latitude, na.rm=TRUE); | |
455 min_latitude <- min(affy_metadata_data_frame$latitude, na.rm=TRUE); | |
456 latitude_range_vector <- c(min_latitude-3, max_latitude+3); | |
457 max_longitude <- max(affy_metadata_data_frame$longitude, na.rm=TRUE); | |
458 min_longitude <- min(affy_metadata_data_frame$longitude, na.rm=TRUE); | |
459 longitude_range_vector <- c(min_longitude-3, max_longitude+3); | |
460 # Get the palette colors for rendering plots. | |
461 colors <- length(unique(stag_db_report$coral_mlg_clonal_id)); | |
462 # Get a color palette. | |
463 palette <- colorRampPalette(piratepal("basel")); | |
464 # Start PDF device driver. | |
465 dev.new(width=20, height=20); | |
466 file_path = get_file_path(output_plots_dir, "mlg_map.pdf"); | |
467 pdf(file=file_path, width=20, height=20); | |
468 world_data = map_data("world"); | |
469 # Add the coral_mlg_clonal_id column from the stag_db_report | |
470 # data fram to the affy_metadata_data_frame. | |
471 affy_metadata_data_frame$mlg <- stag_db_report$coral_mlg_clonal_id; | |
472 # Get the number of colors needed from the palette for plotting | |
473 # the sample locations on the world map. | |
474 num_colors = length(unique(affy_metadata_data_frame$mlg)); | |
475 # Get a color palette. | |
476 palette = colorRampPalette(piratepal("basel")); | |
477 ggplot() + | |
478 geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), fill="white", colour="#7f7f7f") + | |
479 coord_quickmap(xlim=longitude_range_vector, ylim=latitude_range_vector) + | |
480 geom_point(data=affy_metadata_data_frame, aes(x=longitude, y=latitude, group=mlg, colour=mlg), alpha=.7, size=3) + | |
481 scale_color_manual(values=palette(num_colors)) + | |
482 theme(legend.position="bottom") + | |
483 guides(color=guide_legend(nrow=8, byrow=F)); | |
484 | |
485 # Sample MLG on a map for each region. | |
486 for (i in unique(affy_metadata_data_frame$region)) { | |
487 m <- i; | |
488 num_colors_2 = length(unique(affy_metadata_data_frame$mlg[which(affy_metadata_data_frame$region == m)])); | |
489 max_latitude_region <- max(affy_metadata_data_frame$latitude[which(affy_metadata_data_frame$region == m)],na.rm=TRUE); | |
490 min_latitude_region <- min(affy_metadata_data_frame$latitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE); | |
491 latitude_range_vector_region <- c(min_latitude_region-0.5, max_latitude_region+0.5); | |
492 max_longitude_region <- max(affy_metadata_data_frame$longitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE); | |
493 min_longitude_region <- min(affy_metadata_data_frame$longitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE); | |
494 longitude_range_vector_region <- c(min_longitude_region-0.5, max_longitude_region+0.5); | |
495 print(ggplot() + | |
496 geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), | |
497 fill="grey", colour="#7f7f7f") + | |
498 coord_quickmap(xlim=longitude_range_vector_region, ylim=latitude_range_vector_region, clip = "on") + | |
499 geom_point(data=affy_metadata_data_frame[which(affy_metadata_data_frame$region == m),], aes(x=longitude, y=latitude, | |
500 group=mlg, colour=mlg), alpha=.5, size=3) + | |
501 scale_color_manual(values=palette(num_colors_2)) + | |
502 theme(legend.position="bottom") + labs(title=paste("MLG assignments for", m)) + | |
503 guides(color=guide_legend(nrow=8, byrow=F))); | |
504 } | |
505 dev.off() | |
506 time_elapsed(start_time); | |
507 | |
508 if (!is.null(opt$output_nj_phylogeny_tree)) { | |
509 # Create a phylogeny tree of samples based on distance matrices. | |
510 # Start PDF device driver. | |
511 start_time <- time_start("Creating nj_phylogeny_tree.pdf"); | |
512 # Table of alleles for the new samples subset to new plate data. | |
513 # Create vector indicating number of individuals desired from | |
514 # affy_id column of stag_db_report data table. | |
515 i <- ifelse(is.na(stag_db_report[1]), "", stag_db_report[[1]]); | |
516 i <- i[!apply(i== "", 1, all),]; | |
517 sample_alleles_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE]; | |
518 dev.new(width=40, height=80); | |
519 file_path = get_file_path(output_plots_dir, "nj_phylogeny_tree.pdf"); | |
520 pdf(file=file_path, width=40, height=80); | |
521 # Organize branches by clade. | |
522 nj_phylogeny_tree <- sample_alleles_vector %>% | |
523 aboot(dist=provesti.dist, sample=100, tree="nj", cutoff=50, quiet=TRUE, showtree = FALSE) %>% | |
524 ladderize(); | |
525 nj_phylogeny_tree$tip.label <- stag_db_report$user_specimen_id[match(nj_phylogeny_tree$tip.label, stag_db_report$affy_id)]; | |
526 plot.phylo(nj_phylogeny_tree, tip.color=cols[sample_alleles_vector$pop], label.offset=0.0025, cex=0.6, font=2, lwd=4, align.tip.label=F, no.margin=T); | |
527 # Add a scale bar showing 5% difference. | |
528 add.scale.bar(0, 0.95, length=0.05, cex=0.65, lwd=2); | |
529 nodelabels(nj_phylogeny_tree$node.label, cex=.5, adj=c(1.5, -0.1), frame="n", font=3, xpd=TRUE); | |
530 legend("topright", legend=c(levels(sample_alleles_vector$pop)), text.col=cols, xpd=T, cex=0.8); | |
531 dev.off() | |
532 time_elapsed(start_time); | |
533 } | |
534 | |
535 # Generate a pie chart for each sample with a genotype. | |
536 # Store the numerical and user_specimen_id values from | |
537 # stag_db_report for the charts (user_specimen_id names | |
538 # will be used to label each chart). | |
539 start_time <- time_start("Creating percent_breakdown.pdf"); | |
540 stag_db_report_data_table <- stag_db_report[c(-2, -3, -4)]; | |
541 # Remove NA and NaN values. | |
542 stag_db_report_data_table <- na.omit(stag_db_report_data_table); | |
543 # Translate to N (i.e., number of samples with a genotype) | |
544 # columns and 5 rows. | |
545 translated_stag_db_report_data_table <- t(stag_db_report_data_table); | |
546 translated_stag_db_report_matrix <- as.matrix(translated_stag_db_report_data_table[-1,]); | |
547 # Set the storage mode of the matrix to numeric. In some | |
548 # cases this could result in the following: | |
549 # Warning message: | |
550 # In mde(x) : NAs introduced by coercion | |
551 mode(translated_stag_db_report_matrix) <- "numeric"; | |
552 # Remove NA and NaN values that may have been introduced | |
553 # by coercion. | |
554 translated_stag_db_report_matrix <- na.omit(translated_stag_db_report_matrix); | |
555 tsdbrm_row_means <- rowMeans(translated_stag_db_report_matrix, na.rm=TRUE); | |
556 dev.new(width=10, height=7); | |
557 file_path = get_file_path(output_plots_dir, "percent_breakdown.pdf"); | |
558 pdf(file=file_path, width=10, height=7); | |
559 # Average pie of all samples. | |
560 labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(tsdbrm_row_means, 1), "%)", sep=""); | |
561 col <- c("GREY", "#006DDB", "#24FF24", "#920000"); | |
562 main <- "Average breakdown of SNP assignments across all samples"; | |
563 pie(tsdbrm_row_means, labels=labels, radius=0.60, col=col, main=main, cex.main=.75); | |
564 par(mfrow=c(3, 2)); | |
565 col <- c("GREY", "#006DDB", "#24FF24", "#920000"); | |
566 # Generate a pie chart for each sample with genotypes. | |
567 for (i in 1:ncol(translated_stag_db_report_matrix)) { | |
568 tmp_labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep=""); | |
569 main <- paste("Breakdown of SNP assignments for", translated_stag_db_report_data_table[1, i]); | |
570 pie(translated_stag_db_report_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); | |
571 } | |
572 dev.off() | |
573 time_elapsed(start_time); | |
574 | |
575 # close GDS file. | |
576 snpgdsClose(genofile); | |
577 | |
578 # Prepare to output data frames for input to a downstream | |
579 # tool that will use them to update the stag database. | |
580 start_time <- time_start("Building data frames for insertion into database tables"); | |
581 # sample_prep_data_frame looks like this (split across comment lines): | |
582 # user_specimen_id field_call bcoral_genet_id bsym_genet_id reef | |
583 # test_002 prolifera NA NA JohnsonsReef | |
584 # region latitude longitude geographic_origin colony_location | |
585 # Bahamas 18.36173 -64.77430 Reef NA | |
586 # depth disease_resist bleach_resist | |
587 # 5 NA N | |
588 # mortality tle spawning collector_last_name collector_first_name organization | |
589 # NA NA False Kitchen Sheila Penn State | |
590 # collection_date email seq_facility array_version public | |
591 # 2018-11-08 k89@psu.edu Affymetrix 1 True | |
592 # public_after_date sperm_motility healing_time dna_extraction_method | |
593 # NA -9 -9 NA | |
594 # dna_concentration registry_id result_folder_name plate_barcode mlg | |
595 # NA NA PRO100175_PSU175_SAX_b02 P9SR10074 HG0227 | |
596 # affy_id percent_missing_data_coral percent_heterozygous_coral | |
597 # a550962-436.CEL 1.06 19.10 | |
598 # percent_reference_coral percent_alternative_coral | |
599 # 40.10459 39.73396 | |
600 sample_prep_data_frame <- affy_metadata_data_frame %>% | |
601 left_join(stag_db_report %>% | |
602 select("user_specimen_id", "affy_id", "percent_missing_data_coral", "percent_heterozygous_coral", | |
603 "percent_reference_coral", "percent_alternative_coral"), | |
604 by='user_specimen_id'); | |
605 # Get the number of rows for all data frames. | |
606 num_rows <- nrow(sample_prep_data_frame); | |
607 # Set the column names so that we can extract only those columns | |
608 # needed for the sample table. | |
609 colnames(sample_prep_data_frame) <- c("user_specimen_id", "field_call", "bcoral_genet_id", "bsym_genet_id", "reef", | |
610 "region", "latitude", "longitude", "geographic_origin", "colony_location", | |
611 "depth", "disease_resist", "bleach_resist", "mortality", "tle", | |
612 "spawning", "collector_last_name", "collector_first_name", "organization", | |
613 "collection_date", "email", "seq_facility", "array_version", "public", | |
614 "public_after_date", "sperm_motility", "healing_time", "dna_extraction_method", | |
615 "dna_concentration", "registry_id", "result_folder_name", "plate_barcode", | |
616 "mlg", "affy_id", "percent_missing_data_coral", "percent_heterozygous_coral", | |
617 "percent_reference_coral", "percent_alternative_coral"); | |
618 | |
619 # Output the data frame for updating the alleles table. | |
620 # Subset to only the new plate data. | |
621 i <- ifelse(is.na(stag_db_report[3]), "", stag_db_report[[3]]); | |
622 # Create a vector indicating the number of individuals desired | |
623 # from the affy_id collumn in the report_user data table. | |
624 i <- i[!apply(i=="", 1, all),]; | |
625 # Subset the genclone object to the user data. | |
626 allele_vector <- genind_clone[i, mlg.reset=FALSE, drop=FALSE]; | |
627 # Convert the subset genclone to a data frame. | |
628 allele_data_frame <- genind2df(allele_vector, sep=""); | |
629 allele_data_frame <- allele_data_frame %>% | |
630 select(-pop); | |
631 # Allele string for Allele.table in database. | |
632 allele_table_data_frame <- unite(allele_data_frame, alleles, 1:19696, sep=" ", remove=TRUE); | |
633 allele_table_data_frame <- setDT(allele_table_data_frame, keep.rownames=TRUE)[]; | |
634 setnames(allele_table_data_frame, c("rn"), c("affy_id")); | |
635 # write.csv(concat_sample_alleles,file=paste("Seed_genotype_alleles.csv",sep = ""),quote=FALSE,row.names=FALSE); | |
636 write_data_frame(output_data_dir, "allele.tabular", allele_table_data_frame); | |
637 | |
638 # Output the data frame for updating the experiment table. | |
639 experiment_table_data_frame <- data.frame(matrix(ncol=4, nrow=num_rows)); | |
640 colnames(experiment_table_data_frame) <- c("seq_facility", "array_version", "result_folder_name", "plate_barcode"); | |
641 for (i in 1:num_rows) { | |
642 experiment_table_data_frame$seq_facility[i] <- sample_prep_data_frame$seq_facility[i]; | |
643 experiment_table_data_frame$array_version[i] <- sample_prep_data_frame$array_version[i]; | |
644 experiment_table_data_frame$result_folder_name[i] <- sample_prep_data_frame$result_folder_name[i]; | |
645 experiment_table_data_frame$plate_barcode[i] <- sample_prep_data_frame$plate_barcode[i]; | |
646 } | |
647 write_data_frame(output_data_dir, "experiment.tabular", experiment_table_data_frame); | |
648 | |
649 # Output the data frame for updating the colony table. | |
650 # The geographic_origin value is used for deciding into which table | |
651 # to insert the latitude and longitude values. If the geographic_origin | |
652 # is "reef", the values will be inserted into the reef table, and if it is | |
653 # "colony", the values will be inserted into the colony table. We insert | |
654 # these values in both data frames so that the downstream tool that parses | |
655 # them can determine the appropriate table. | |
656 colony_table_data_frame <- data.frame(matrix(ncol=4, nrow=num_rows)); | |
657 colnames(colony_table_data_frame) <- c("latitude", "longitude", "depth", "geographic_origin"); | |
658 for (i in 1:num_rows) { | |
659 colony_table_data_frame$latitude[i] <- sample_prep_data_frame$latitude[i]; | |
660 colony_table_data_frame$longitude[i] <- sample_prep_data_frame$longitude[i]; | |
661 colony_table_data_frame$depth[i] <- sample_prep_data_frame$depth[i]; | |
662 colony_table_data_frame$geographic_origin[i] <- sample_prep_data_frame$geographic_origin[i]; | |
663 } | |
664 write_data_frame(output_data_dir, "colony.tabular", colony_table_data_frame); | |
665 | |
666 # Output the data frame for populating the genotype table. | |
667 # Combine with previously genotyped samples. | |
668 # prep_genotype_tibble looks like this: | |
669 # A tibble: 220 x 7 | |
670 # Groups: group [?] | |
671 # group affy_id coral_mlg_clona… user_specimen_id db_match | |
672 # <int> <chr> <chr> <chr> <chr> | |
673 # 1 a10000… 13905 HG0048 match | |
674 # genetic_coral_species_call coral_mlg_rep_sample_id | |
675 # <chr> <chr> | |
676 # A.palmata 1104 | |
677 prep_genotype_tibble <- id_data_table %>% | |
678 group_by(row_number()) %>% | |
679 dplyr::rename(group='row_number()') %>% | |
680 unnest(affy_id) %>% | |
681 left_join(smlg_data_frame %>% | |
682 select("affy_id", "coral_mlg_rep_sample_id", "coral_mlg_clonal_id", "user_specimen_id", | |
683 "genetic_coral_species_call", "bcoral_genet_id"), | |
684 by='affy_id'); | |
685 # Confirm that the representative mlg is the same between runs. | |
686 uniques2 <- unique(prep_genotype_tibble[c("group", "coral_mlg_rep_sample_id")]); | |
687 uniques2 <- uniques2[!is.na(uniques2$coral_mlg_rep_sample_id),]; | |
688 na.mlg3 <- which(is.na(prep_genotype_tibble$coral_mlg_rep_sample_id)); | |
689 na.group2 <- prep_genotype_tibble$group[na.mlg3]; | |
690 prep_genotype_tibble$coral_mlg_rep_sample_id[na.mlg3] <- uniques2$coral_mlg_rep_sample_id[match(na.group2, uniques2$group)]; | |
691 # Transform the representative mlg column with new genotyped samples. | |
692 # representative_mlg_tibble looks like this: | |
693 # A tibble: 220 x 5 | |
694 # affy_id coral_mlg_rep_sa… coral_mlg_clona… user_specimen_id | |
695 # <chr> <chr> <chr> <chr> | |
696 # a100000-… 13905 HG0048 13905 | |
697 # genetic_coral_species_call bcoral_genet_id | |
698 # <chr> <chr> | |
699 # A.palmata C1651 | |
700 representative_mlg_tibble <- prep_genotype_tibble %>% | |
701 mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_rep_sample_id), affy_id, coral_mlg_rep_sample_id)) %>% | |
702 ungroup() %>% | |
703 select(-group); | |
704 # prep_genotype_table_tibble looks like this: | |
705 # affy_id coral_mlg_clonal_id user_specimen_id db_match | |
706 # a550962...CEL HG0120 1090 match | |
707 # genetic_coral_species_call coral_mlg_rep_sample_id | |
708 # A.palmata 1104 | |
709 prep_genotype_table_tibble <- stag_db_report %>% | |
710 select("affy_id", "coral_mlg_clonal_id", "user_specimen_id", "db_match", "genetic_coral_species_call") %>% | |
711 left_join(representative_mlg_tibble %>% | |
712 select("affy_id", "coral_mlg_rep_sample_id"), | |
713 by='affy_id'); | |
714 # genotype_table_tibble looks like this: | |
715 # affy_id coral_mlg_clonal_id user_specimen_id db_match | |
716 # a550962-436.CEL HG0120 1090 match | |
717 # genetic_coral_species_call coral_mlg_rep_sample_id bcoral_genet_id | |
718 # A.palmata 1104 <NA> | |
719 genotype_table_tibble <- prep_genotype_table_tibble %>% | |
720 left_join(affy_metadata_data_frame %>% | |
721 select("user_specimen_id", "bcoral_genet_id"), | |
722 by='user_specimen_id'); | |
723 write_data_frame(output_data_dir, "genotype.tabular", genotype_table_tibble); | |
724 | |
725 # Output the file needed for populating the person table. | |
726 person_table_data_frame <- data.frame(matrix(ncol=4, nrow=num_rows)); | |
727 colnames(person_table_data_frame) <- c("last_name", "first_name", "organization", "email"); | |
728 # person_table_data_frame looks like this: | |
729 # last_name first_name organization email | |
730 # Kitchen Sheila Penn State s89@psu.edu | |
731 for (i in 1:num_rows) { | |
732 person_table_data_frame$last_name[i] <- sample_prep_data_frame$collector_last_name[i]; | |
733 person_table_data_frame$first_name[i] <- sample_prep_data_frame$collector_first_name[i]; | |
734 person_table_data_frame$organization[i] <- sample_prep_data_frame$organization[i]; | |
735 person_table_data_frame$email[i] <- sample_prep_data_frame$email[i]; | |
736 } | |
737 write_data_frame(output_data_dir, "person.tabular", person_table_data_frame); | |
738 | |
739 # Output the file needed for populating the phenotype table. | |
740 phenotype_table_data_frame <- data.frame(matrix(ncol=7, nrow=num_rows)); | |
741 colnames(phenotype_table_data_frame) <- c("disease_resist", "bleach_resist", "mortality", "tle", | |
742 "spawning", "sperm_motility", "healing_time"); | |
743 # phenotype_table_data_frame looks like this: | |
744 # disease_resist bleach_resist mortality tle spawning sperm_motility healing_time | |
745 # NA NA NA NA False NA NA | |
746 for (i in 1:num_rows) { | |
747 phenotype_table_data_frame$disease_resist[i] <- sample_prep_data_frame$disease_resist[i]; | |
748 phenotype_table_data_frame$bleach_resist[i] <- sample_prep_data_frame$bleach_resist[i]; | |
749 phenotype_table_data_frame$mortality[i] <- sample_prep_data_frame$mortality[i]; | |
750 phenotype_table_data_frame$tle[i] <- sample_prep_data_frame$tle[i]; | |
751 phenotype_table_data_frame$spawning[i] <- sample_prep_data_frame$spawning[i]; | |
752 phenotype_table_data_frame$sperm_motility[i] <- sample_prep_data_frame$sperm_motility[i]; | |
753 phenotype_table_data_frame$healing_time[i] <- sample_prep_data_frame$healing_time[i]; | |
754 } | |
755 write_data_frame(output_data_dir, "phenotype.tabular", phenotype_table_data_frame); | |
756 | |
757 # Output the file needed for populating the reef table. | |
758 reef_table_data_frame <- data.frame(matrix(ncol=5, nrow=num_rows)); | |
759 colnames(reef_table_data_frame) <- c("name", "region", "latitude", "longitude", "geographic_origin"); | |
760 # The geographic_origin value is used for deciding into which table | |
761 # to insert the latitude and longitude values. If the geographic_origin | |
762 # is "reef", the values will be inserted into the reef table, and if it is | |
763 # "colony", the values will be inserted into the colony table. We insert | |
764 # these values in both data frames so that the downstream tool that parses | |
765 # them can determine the appropriate table. | |
766 # reef_table_data_frame looks like this: | |
767 # name region latitude longitude geographic_origin | |
768 # JohnsonsReef Bahamas 18.361733 -64.7743 Reef | |
769 for (i in 1:num_rows) { | |
770 reef_table_data_frame$name[i] <- sample_prep_data_frame$reef[i]; | |
771 reef_table_data_frame$region[i] <- sample_prep_data_frame$region[i]; | |
772 reef_table_data_frame$latitude[i] <- sample_prep_data_frame$latitude[i]; | |
773 reef_table_data_frame$longitude[i] <- sample_prep_data_frame$longitude[i]; | |
774 reef_table_data_frame$geographic_origin[i] <- sample_prep_data_frame$geographic_origin[i]; | |
775 } | |
776 write_data_frame(output_data_dir, "reef.tabular", reef_table_data_frame); | |
777 | |
778 # Output the file needed for populating the sample table. | |
779 sample_table_data_frame <- data.frame(matrix(ncol=20, nrow=num_rows)); | |
780 colnames(sample_table_data_frame) <- c("affy_id", "colony_location", "collection_date", "user_specimen_id", | |
781 "registry_id", "depth", "dna_extraction_method", "dna_concentration", | |
782 "public", "public_after_date", "percent_missing_data_coral", | |
783 "percent_missing_data_sym", "percent_reference_coral", | |
784 "percent_reference_sym", "percent_alternative_coral", | |
785 "percent_alternative_sym", "percent_heterozygous_coral", | |
786 "percent_heterozygous_sym", "field_call", "bcoral_genet_id"); | |
787 for (i in 1:num_rows) { | |
788 sample_table_data_frame$affy_id[i] <- sample_prep_data_frame$affy_id[i]; | |
789 sample_table_data_frame$colony_location[i] <- sample_prep_data_frame$colony_location[i]; | |
790 sample_table_data_frame$collection_date[i] <- sample_prep_data_frame$collection_date[i]; | |
791 sample_table_data_frame$user_specimen_id[i] <- sample_prep_data_frame$user_specimen_id[i]; | |
792 sample_table_data_frame$registry_id[i] <- sample_prep_data_frame$registry_id[i]; | |
793 sample_table_data_frame$depth[i] <- sample_prep_data_frame$depth[i]; | |
794 sample_table_data_frame$dna_extraction_method[i] <- sample_prep_data_frame$dna_extraction_method[i]; | |
795 sample_table_data_frame$dna_concentration[i] <- sample_prep_data_frame$dna_concentration[i]; | |
796 sample_table_data_frame$public[i] <- sample_prep_data_frame$public[i]; | |
797 sample_table_data_frame$public_after_date[i] <- sample_prep_data_frame$public_after_date[i]; | |
798 sample_table_data_frame$percent_missing_data_coral[i] <- sample_prep_data_frame$percent_missing_data_coral[i]; | |
799 sample_table_data_frame$percent_missing_data_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE; | |
800 sample_table_data_frame$percent_reference_coral[i] <- sample_prep_data_frame$percent_reference_coral[i]; | |
801 sample_table_data_frame$percent_reference_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE; | |
802 sample_table_data_frame$percent_alternative_coral[i] <- sample_prep_data_frame$percent_alternative_coral[i]; | |
803 sample_table_data_frame$percent_alternative_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE; | |
804 sample_table_data_frame$percent_heterozygous_coral[i] <- sample_prep_data_frame$percent_heterozygous_coral[i]; | |
805 sample_table_data_frame$percent_heterozygous_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE; | |
806 sample_table_data_frame$field_call[i] <- sample_prep_data_frame$field_call[i]; | |
807 sample_table_data_frame$bcoral_genet_id[i] <- sample_prep_data_frame$bcoral_genet_id[i]; | |
808 } | |
809 write_data_frame(output_data_dir, "sample.tabular", sample_table_data_frame); | |
810 | |
811 # Output the file needed for populating the taxonomy table. | |
812 # taxonomy_table_data_frame looks like this: | |
813 # genetic_coral_species_call affy_id genus_name species_name | |
814 # A.palmata a550962-4368120-060520-500_A05.CEL Acropora palmata | |
815 taxonomy_table_data_frame <- stag_db_report %>% | |
816 select(genetic_coral_species_call, affy_id) %>% | |
817 mutate(genus_name = ifelse(genetic_coral_species_call == genetic_coral_species_call[grep("^A.*", genetic_coral_species_call)], "Acropora", "other")) %>% | |
818 mutate(species_name = ifelse(genetic_coral_species_call == "A.palmata", "palmata", "other")) %>% | |
819 mutate(species_name = ifelse(genetic_coral_species_call == "A.cervicornis", "cervicornis", species_name)) %>% | |
820 mutate(species_name = ifelse(genetic_coral_species_call == "A.prolifera", "prolifera", species_name)); | |
821 colnames(taxonomy_table_data_frame) <- c("genetic_coral_species_call", "affy_id", "genus_name", "species_name"); | |
822 write_data_frame(output_data_dir, "taxonomy.tabular", taxonomy_table_data_frame); | |
823 time_elapsed(start_time); |