comparison coral_multilocus_genotype.R @ 1:a690e0382ce4 draft

Uploaded
author greg
date Fri, 20 Sep 2019 09:17:28 -0400
parents adaf89535d2e
children 9d0ddd9d62c5
comparison
equal deleted inserted replaced
0:adaf89535d2e 1:a690e0382ce4
198 # a100000-4368120-060520-256_K07.CEL 11.45918 198 # a100000-4368120-060520-256_K07.CEL 11.45918
199 reference_alleles_data_table <- setDT(reference_alleles_data_frame, keep.rownames=TRUE)[]; 199 reference_alleles_data_table <- setDT(reference_alleles_data_frame, keep.rownames=TRUE)[];
200 # Rename the rn column. 200 # Rename the rn column.
201 setnames(reference_alleles_data_table, c("rn"), c("affy_id")); 201 setnames(reference_alleles_data_table, c("rn"), c("affy_id"));
202 # Rename the reference_alleles column. 202 # Rename the reference_alleles column.
203 setnames(reference_alleles_data_table, c("reference_alleles"), c("percent_reference_coral")); 203 setnames(reference_alleles_data_table, c("reference_alleles"), c("percent_acerv_coral"));
204 # Round data to two digits. 204 # Round data to two digits.
205 reference_alleles_data_table$percent_reference_coral <- round(reference_alleles_data_table$percent_reference_coral, digits=2); 205 reference_alleles_data_table$percent_acerv_coral <- round(reference_alleles_data_table$percent_acerv_coral, digits=2);
206 time_elapsed(start_time); 206 time_elapsed(start_time);
207 207
208 # Alternative alleles 208 # Alternative alleles
209 start_time <- time_start("Discovering alternative alleles"); 209 start_time <- time_start("Discovering alternative alleles");
210 alternative_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))}); 210 alternative_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))});
216 # a100000-4368120-060520-256_K07.CEL 14.08916 216 # a100000-4368120-060520-256_K07.CEL 14.08916
217 alternative_alleles_data_table <- setDT(alternative_alleles_data_frame, keep.rownames=TRUE)[]; 217 alternative_alleles_data_table <- setDT(alternative_alleles_data_frame, keep.rownames=TRUE)[];
218 # Rename the rn column. 218 # Rename the rn column.
219 setnames(alternative_alleles_data_table, c("rn"), c("affy_id")); 219 setnames(alternative_alleles_data_table, c("rn"), c("affy_id"));
220 # Rename the alternative_alleles column. 220 # Rename the alternative_alleles column.
221 setnames(alternative_alleles_data_table, c("alternative_alleles"), c("percent_alternative_coral")); 221 setnames(alternative_alleles_data_table, c("alternative_alleles"), c("percent_apalm_coral"));
222 # Round data to two digits. 222 # Round data to two digits.
223 alternative_alleles_data_table$percent_alternative_coral <- round(alternative_alleles_data_table$percent_alternative_coral, digits=2); 223 alternative_alleles_data_table$percent_apalm_coral <- round(alternative_alleles_data_table$percent_apalm_coral, digits=2);
224 time_elapsed(start_time); 224 time_elapsed(start_time);
225 225
226 # The mlg_ids_data_table looks like this: 226 # The mlg_ids_data_table looks like this:
227 # mlg_ids 227 # mlg_ids
228 # a550962-4368120-060520-500_M23.CEL 228 # a550962-4368120-060520-500_M23.CEL
303 by="affy_id") %>% 303 by="affy_id") %>%
304 left_join(heterozygous_alleles_data_table %>% 304 left_join(heterozygous_alleles_data_table %>%
305 select("affy_id", "percent_heterozygous_coral"), 305 select("affy_id", "percent_heterozygous_coral"),
306 by="affy_id") %>% 306 by="affy_id") %>%
307 left_join(reference_alleles_data_table %>% 307 left_join(reference_alleles_data_table %>%
308 select("affy_id", "percent_reference_coral"), 308 select("affy_id", "percent_acerv_coral"),
309 by="affy_id") %>% 309 by="affy_id") %>%
310 left_join(alternative_alleles_data_table %>% 310 left_join(alternative_alleles_data_table %>%
311 select("affy_id", "percent_alternative_coral"), 311 select("affy_id", "percent_apalm_coral"),
312 by="affy_id") %>% 312 by="affy_id") %>%
313 mutate(db_match = ifelse(is.na(db_match), "failed", db_match))%>% 313 mutate(db_match = ifelse(is.na(db_match), "failed", db_match))%>%
314 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% 314 mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>%
315 mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 40 & percent_alternative_coral <= 44.99, "A.palmata","other")) %>% 315 mutate(genetic_coral_species_call = ifelse(percent_apalm_coral >= 40 & percent_apalm_coral <= 44.99, "A.palmata","other")) %>%
316 mutate(genetic_coral_species_call = ifelse(percent_alternative_coral >= 45 & percent_alternative_coral <= 51, "A.cervicornis", genetic_coral_species_call)) %>% 316 mutate(genetic_coral_species_call = ifelse(percent_apalm_coral >= 45 & percent_apalm_coral <= 51, "A.cervicornis", genetic_coral_species_call)) %>%
317 mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>% 317 mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>%
318 ungroup() %>% 318 ungroup() %>%
319 select(-group,-db_record); 319 select(-group,-db_record);
320 time_elapsed(start_time); 320 time_elapsed(start_time);
321 321
593 # NA -9 -9 NA 593 # NA -9 -9 NA
594 # dna_concentration registry_id result_folder_name plate_barcode mlg 594 # dna_concentration registry_id result_folder_name plate_barcode mlg
595 # NA NA PRO100175_PSU175_SAX_b02 P9SR10074 HG0227 595 # NA NA PRO100175_PSU175_SAX_b02 P9SR10074 HG0227
596 # affy_id percent_missing_data_coral percent_heterozygous_coral 596 # affy_id percent_missing_data_coral percent_heterozygous_coral
597 # a550962-436.CEL 1.06 19.10 597 # a550962-436.CEL 1.06 19.10
598 # percent_reference_coral percent_alternative_coral 598 # percent_acerv_coral percent_apalm_coral
599 # 40.10459 39.73396 599 # 40.10459 39.73396
600 sample_prep_data_frame <- affy_metadata_data_frame %>% 600 sample_prep_data_frame <- affy_metadata_data_frame %>%
601 left_join(stag_db_report %>% 601 left_join(stag_db_report %>%
602 select("user_specimen_id", "affy_id", "percent_missing_data_coral", "percent_heterozygous_coral", 602 select("user_specimen_id", "affy_id", "percent_missing_data_coral", "percent_heterozygous_coral",
603 "percent_reference_coral", "percent_alternative_coral"), 603 "percent_acerv_coral", "percent_apalm_coral"),
604 by='user_specimen_id'); 604 by='user_specimen_id');
605 # Get the number of rows for all data frames. 605 # Get the number of rows for all data frames.
606 num_rows <- nrow(sample_prep_data_frame); 606 num_rows <- nrow(sample_prep_data_frame);
607 # Set the column names so that we can extract only those columns 607 # Set the column names so that we can extract only those columns
608 # needed for the sample table. 608 # needed for the sample table.
612 "spawning", "collector_last_name", "collector_first_name", "organization", 612 "spawning", "collector_last_name", "collector_first_name", "organization",
613 "collection_date", "email", "seq_facility", "array_version", "public", 613 "collection_date", "email", "seq_facility", "array_version", "public",
614 "public_after_date", "sperm_motility", "healing_time", "dna_extraction_method", 614 "public_after_date", "sperm_motility", "healing_time", "dna_extraction_method",
615 "dna_concentration", "registry_id", "result_folder_name", "plate_barcode", 615 "dna_concentration", "registry_id", "result_folder_name", "plate_barcode",
616 "mlg", "affy_id", "percent_missing_data_coral", "percent_heterozygous_coral", 616 "mlg", "affy_id", "percent_missing_data_coral", "percent_heterozygous_coral",
617 "percent_reference_coral", "percent_alternative_coral"); 617 "percent_acerv_coral", "percent_apalm_coral");
618 618
619 # Output the data frame for updating the alleles table. 619 # Output the data frame for updating the alleles table.
620 # Subset to only the new plate data. 620 # Subset to only the new plate data.
621 i <- ifelse(is.na(stag_db_report[3]), "", stag_db_report[[3]]); 621 i <- ifelse(is.na(stag_db_report[3]), "", stag_db_report[[3]]);
622 # Create a vector indicating the number of individuals desired 622 # Create a vector indicating the number of individuals desired
778 # Output the file needed for populating the sample table. 778 # Output the file needed for populating the sample table.
779 sample_table_data_frame <- data.frame(matrix(ncol=20, nrow=num_rows)); 779 sample_table_data_frame <- data.frame(matrix(ncol=20, nrow=num_rows));
780 colnames(sample_table_data_frame) <- c("affy_id", "colony_location", "collection_date", "user_specimen_id", 780 colnames(sample_table_data_frame) <- c("affy_id", "colony_location", "collection_date", "user_specimen_id",
781 "registry_id", "depth", "dna_extraction_method", "dna_concentration", 781 "registry_id", "depth", "dna_extraction_method", "dna_concentration",
782 "public", "public_after_date", "percent_missing_data_coral", 782 "public", "public_after_date", "percent_missing_data_coral",
783 "percent_missing_data_sym", "percent_reference_coral", 783 "percent_missing_data_sym", "percent_acerv_coral",
784 "percent_reference_sym", "percent_alternative_coral", 784 "percent_reference_sym", "percent_apalm_coral",
785 "percent_alternative_sym", "percent_heterozygous_coral", 785 "percent_alternative_sym", "percent_heterozygous_coral",
786 "percent_heterozygous_sym", "field_call", "bcoral_genet_id"); 786 "percent_heterozygous_sym", "field_call", "bcoral_genet_id");
787 for (i in 1:num_rows) { 787 for (i in 1:num_rows) {
788 sample_table_data_frame$affy_id[i] <- sample_prep_data_frame$affy_id[i]; 788 sample_table_data_frame$affy_id[i] <- sample_prep_data_frame$affy_id[i];
789 sample_table_data_frame$colony_location[i] <- sample_prep_data_frame$colony_location[i]; 789 sample_table_data_frame$colony_location[i] <- sample_prep_data_frame$colony_location[i];
795 sample_table_data_frame$dna_concentration[i] <- sample_prep_data_frame$dna_concentration[i]; 795 sample_table_data_frame$dna_concentration[i] <- sample_prep_data_frame$dna_concentration[i];
796 sample_table_data_frame$public[i] <- sample_prep_data_frame$public[i]; 796 sample_table_data_frame$public[i] <- sample_prep_data_frame$public[i];
797 sample_table_data_frame$public_after_date[i] <- sample_prep_data_frame$public_after_date[i]; 797 sample_table_data_frame$public_after_date[i] <- sample_prep_data_frame$public_after_date[i];
798 sample_table_data_frame$percent_missing_data_coral[i] <- sample_prep_data_frame$percent_missing_data_coral[i]; 798 sample_table_data_frame$percent_missing_data_coral[i] <- sample_prep_data_frame$percent_missing_data_coral[i];
799 sample_table_data_frame$percent_missing_data_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE; 799 sample_table_data_frame$percent_missing_data_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE;
800 sample_table_data_frame$percent_reference_coral[i] <- sample_prep_data_frame$percent_reference_coral[i]; 800 sample_table_data_frame$percent_acerv_coral[i] <- sample_prep_data_frame$percent_acerv_coral[i];
801 sample_table_data_frame$percent_reference_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE; 801 sample_table_data_frame$percent_reference_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE;
802 sample_table_data_frame$percent_alternative_coral[i] <- sample_prep_data_frame$percent_alternative_coral[i]; 802 sample_table_data_frame$percent_apalm_coral[i] <- sample_prep_data_frame$percent_apalm_coral[i];
803 sample_table_data_frame$percent_alternative_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE; 803 sample_table_data_frame$percent_alternative_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE;
804 sample_table_data_frame$percent_heterozygous_coral[i] <- sample_prep_data_frame$percent_heterozygous_coral[i]; 804 sample_table_data_frame$percent_heterozygous_coral[i] <- sample_prep_data_frame$percent_heterozygous_coral[i];
805 sample_table_data_frame$percent_heterozygous_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE; 805 sample_table_data_frame$percent_heterozygous_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE;
806 sample_table_data_frame$field_call[i] <- sample_prep_data_frame$field_call[i]; 806 sample_table_data_frame$field_call[i] <- sample_prep_data_frame$field_call[i];
807 sample_table_data_frame$bcoral_genet_id[i] <- sample_prep_data_frame$bcoral_genet_id[i]; 807 sample_table_data_frame$bcoral_genet_id[i] <- sample_prep_data_frame$bcoral_genet_id[i];