# HG changeset patch # User greg # Date 1570193230 14400 # Node ID 9d0ddd9d62c566d45cd931a45422bc7d279850be # Parent a690e0382ce482b44a5016c12f736b46525e4f93 Uploaded diff -r a690e0382ce4 -r 9d0ddd9d62c5 coral_multilocus_genotype.R --- a/coral_multilocus_genotype.R Fri Sep 20 09:17:28 2019 -0400 +++ b/coral_multilocus_genotype.R Fri Oct 04 08:47:10 2019 -0400 @@ -139,6 +139,8 @@ sample_table <- tbl(conn, "sample"); # Import the genotype table. genotype_table <- tbl(conn, "genotype"); +# Import the probe_annotation table. +probe_annotation_table <- tbl(conn, "probe_annotation"); # Select columns from the sample table and the # genotype table joined by genotype_id. sample_table_columns <- sample_table %>% select(user_specimen_id, affy_id, bcoral_genet_id, genotype_id); @@ -149,7 +151,7 @@ # Name the columns. smlg_data_frame <- as.data.frame(smlg); colnames(smlg_data_frame) <- c("user_specimen_id", "affy_id", "bcoral_genet_id", "genotype_id", - "coral_mlg_clonal_id", "coral_mlg_rep_sample_id", "genetic_coral_species_call"); + "coral_mlg_clonal_id", "coral_mlg_rep_sample_id", "genetic_coral_species_call"); # Missing GT in samples submitted. start_time <- time_start("Discovering missing GT in samples"); gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); @@ -169,10 +171,33 @@ missing_gt_data_table$percent_missing_data_coral <- round(missing_gt_data_table$percent_missing_data_coral, digits=2); time_elapsed(start_time); -# Heterozygous alleles. +# Subset genotypes for the fixed SNPs by probe id. +# Select columns from the probe_annotation table. +probe_annotation_table_columns <- probe_annotation_table %>% select(probe_set_id, custid, fixed_status, acerv_allele); +# Convert to data frame. +fixed_snp_data_frame <- as.data.frame(probe_annotation_table_columns); +# Name the columns. +colnames(fixed_snp_data_frame) <- c("probe_set_id", "custid", "fixed_status", "acerv_allele"); +# Filter unwanted rows. +fixed_snp_data_frame <- subset(fixed_snp_data_frame, fixed_snp_data_frame$fixed_status=="keep"); +gt_fixed <- gt[rownames(gt) %in% fixed_snp_data_frame$probe_set_id, ]; + +# Missing GT in fixed SNPs. +missing_gt_fixed <- apply(gt_fixed, MARGIN=2, function(x){ sum(is.na(x))}); +missing_gt_fixed <- (missing_gt_fixed / nrow(gt_fixed)) * 100; +missing_gt_fixed_data_frame <- data.frame(missing_gt_fixed); +missing_gt_fixed_data_table <- setDT(missing_gt_fixed_data_frame, keep.rownames=TRUE)[]; +# Rename the rn column. +setnames(missing_gt_fixed_data_table, c("rn"), c("affy_id")); +# Rename the missing_gt column. +setnames(missing_gt_fixed_data_table, c("missing_gt_fixed"), c("percent_missing_data_fixed")); +# Round data to two digits. +missing_gt_fixed_data_table$percent_missing_data_fixed <- round(missing_gt_fixed_data_table$percent_missing_data_fixed, digits=2); + +# Heterozygous alleles for fixed SNPs. start_time <- time_start("Discovering heterozygous alleles"); -heterozygous_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))}); -heterozygous_alleles <- (heterozygous_alleles / nrow(vcf)) * 100; +heterozygous_alleles <- apply(gt_fixed, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/1", x))))}); +heterozygous_alleles <- (heterozygous_alleles / nrow(gt_fixed)) * 100; heterozygous_alleles_data_frame <- data.frame(heterozygous_alleles); # The heterozygous_alleles_data_table looks like this: # rn heterozygous_alleles @@ -187,40 +212,56 @@ heterozygous_alleles_data_table$percent_heterozygous_coral <- round(heterozygous_alleles_data_table$percent_heterozygous_coral, digits=2); time_elapsed(start_time); -# Reference alleles. +# Create list of Acerv reference and alternative probes. +rAC <- subset(fixed_snp_data_frame, fixed_snp_data_frame$acerv_allele=="reference"); +aAC <- subset(fixed_snp_data_frame, fixed_snp_data_frame$acerv_allele=="alternative"); + +# Subset probes for the reference and alternative SNPs in Acerv. +ref_ac <- gt_fixed[rownames(gt_fixed) %in% rAC$probe,]; +alt_ac <- gt_fixed[rownames(gt_fixed) %in% aAC$probe,]; + +# Reference alleles for each species. +reference_alleles_ac <- apply(ref_ac, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))}); +reference_alleles_ap <- apply(alt_ac, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))}); + +# Alternative alleles for each species. +alternative_alleles_ac <- apply(alt_ac, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))}); +alternative_alleles_ap <- apply(ref_ac, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))}); + +# Apalm alleles. start_time <- time_start("Discovering reference alleles"); -reference_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("0/0", x))))}); -reference_alleles <- (reference_alleles / nrow(vcf)) * 100; -reference_alleles_data_frame <- data.frame(reference_alleles); +ap_sum <- rowSums(cbind(reference_alleles_ap,alternative_alleles_ap)); +ap_alleles <- (ap_sum / nrow(gt_fixed)) * 100; +ap_alleles_data_frame <- data.frame(ap_alleles); # The reference_alleles_data_table looks like this: # rn reference_alleles # a100000-4368120-060520-256_I07.CEL 11.60642 # a100000-4368120-060520-256_K07.CEL 11.45918 -reference_alleles_data_table <- setDT(reference_alleles_data_frame, keep.rownames=TRUE)[]; +ap_alleles_data_table <- setDT(ap_alleles_data_frame, keep.rownames=TRUE)[]; # Rename the rn column. -setnames(reference_alleles_data_table, c("rn"), c("affy_id")); +setnames(ap_alleles_data_table, c("rn"), c("affy_id")); # Rename the reference_alleles column. -setnames(reference_alleles_data_table, c("reference_alleles"), c("percent_acerv_coral")); +setnames(ap_alleles_data_table, c("ap_alleles"), c("percent_apalm_coral")); # Round data to two digits. -reference_alleles_data_table$percent_acerv_coral <- round(reference_alleles_data_table$percent_acerv_coral, digits=2); +ap_alleles_data_table$percent_apalm_coral <- round(ap_alleles_data_table$percent_apalm_coral, digits=2); time_elapsed(start_time); -# Alternative alleles +# Acerv alleles. start_time <- time_start("Discovering alternative alleles"); -alternative_alleles <- apply(gt, MARGIN=2, function(x) {sum(lengths(regmatches(x, gregexpr("1/1", x))))}); -alternative_alleles <- (alternative_alleles / nrow(vcf)) * 100; -alternative_alleles_data_frame <- data.frame(alternative_alleles); +ac_sum <- rowSums(cbind(reference_alleles_ac,alternative_alleles_ac)); +ac_alleles <- (ac_sum / nrow(gt_fixed)) * 100; +ac_alleles_data_frame <- data.frame(ac_alleles); # The alternative_alleles_data_table looks like this: # rn alternative_alleles # a100000-4368120-060520-256_I07.CEL 14.38363 # a100000-4368120-060520-256_K07.CEL 14.08916 -alternative_alleles_data_table <- setDT(alternative_alleles_data_frame, keep.rownames=TRUE)[]; +ac_alleles_data_table <- setDT(ac_alleles_data_frame, keep.rownames=TRUE)[]; # Rename the rn column. -setnames(alternative_alleles_data_table, c("rn"), c("affy_id")); +setnames(ac_alleles_data_table, c("rn"), c("affy_id")); # Rename the alternative_alleles column. -setnames(alternative_alleles_data_table, c("alternative_alleles"), c("percent_apalm_coral")); +setnames(ac_alleles_data_table, c("ac_alleles"), c("percent_acerv_coral")); # Round data to two digits. -alternative_alleles_data_table$percent_apalm_coral <- round(alternative_alleles_data_table$percent_apalm_coral, digits=2); +ac_alleles_data_table$percent_acerv_coral <- round(ac_alleles_data_table$percent_acerv_coral, digits=2); time_elapsed(start_time); # The mlg_ids_data_table looks like this: @@ -301,22 +342,25 @@ left_join(missing_gt_data_table %>% select("affy_id", "percent_missing_data_coral"), by="affy_id") %>% + left_join(missing_gt_fixed_data_table %>% + select("affy_id", "percent_missing_data_fixed"), + by="affy_id") %>% left_join(heterozygous_alleles_data_table %>% select("affy_id", "percent_heterozygous_coral"), by="affy_id") %>% - left_join(reference_alleles_data_table %>% + left_join(ac_alleles_data_table %>% select("affy_id", "percent_acerv_coral"), by="affy_id") %>% - left_join(alternative_alleles_data_table %>% + left_join(ap_alleles_data_table %>% select("affy_id", "percent_apalm_coral"), by="affy_id") %>% mutate(db_match = ifelse(is.na(db_match), "failed", db_match))%>% mutate(coral_mlg_clonal_id = ifelse(is.na(coral_mlg_clonal_id), "failed", coral_mlg_clonal_id)) %>% - mutate(genetic_coral_species_call = ifelse(percent_apalm_coral >= 40 & percent_apalm_coral <= 44.99, "A.palmata","other")) %>% - mutate(genetic_coral_species_call = ifelse(percent_apalm_coral >= 45 & percent_apalm_coral <= 51, "A.cervicornis", genetic_coral_species_call)) %>% + mutate(genetic_coral_species_call = ifelse(percent_apalm_coral >= 85 & percent_acerv_coral <= 10, "A.palmata", "other")) %>% + mutate(genetic_coral_species_call = ifelse(percent_acerv_coral >= 85 & percent_apalm_coral <= 10, "A.cervicornis", genetic_coral_species_call)) %>% mutate(genetic_coral_species_call = ifelse(percent_heterozygous_coral > 40, "A.prolifera", genetic_coral_species_call)) %>% ungroup() %>% - select(-group,-db_record); + select(-group, -db_record); time_elapsed(start_time); start_time <- time_start("Writing csv output"); @@ -325,7 +369,7 @@ # Representative clone for genotype table. start_time <- time_start("Creating representative clone for genotype table"); -no_dup_genotypes_genind <- clonecorrect(genind_clone, strata = ~pop.genind_obj.); +no_dup_genotypes_genind <- clonecorrect(genind_clone, strata=~pop.genind_obj.); id_rep <- mlg.id(no_dup_genotypes_genind); id_data_table <- data.table(id_rep, keep.rownames=TRUE); # Rename the id_rep column. @@ -336,13 +380,11 @@ # Create vector indicating number of individuals desired from # affy_id column of stag_db_report data table. i <- ifelse(is.na(stag_db_report[3]), "", stag_db_report[[3]]); -i <- i[!apply(i== "", 1, all),]; +i <- i[!apply(i== "", 1, all), ]; # Subset VCF to the user samples. start_time <- time_start("Subsetting vcf to the user samples"); -l <- length(i)+1; -#n <- ncol(vcf@gt); -#s <- n - l; +l <- length(i) + 1; svcf <- vcf[, 1:l]; write.vcf(svcf, "subset.vcf.gz"); vcf.fn <- "subset.vcf.gz"; @@ -366,7 +408,7 @@ affy_id_region_list <- population_info_data_table[c(2,3,4)]; gds_data_table_join <- gds_data_table %>% left_join(affy_id_region_list %>% - select("affy_id", "user_specimen_id","region"), + select("affy_id", "user_specimen_id", "region"), by='affy_id')%>% drop_na(); samp.annot <- data.frame(pop.group=c(gds_data_table_join$region)); @@ -390,7 +432,7 @@ # Cluster analysis on the genome-wide IBS pairwise distance matrix. start_time <- time_start("Clustering the genome-wide IBS pairwise distance matrix"); set.seed(100); -par(cex=0.6, cex.lab=1, cex.axis=1.5,cex.main=2); +par(cex=0.6, cex.lab=1, cex.axis=1.5, cex.main=2); ibs.hc <- snpgdsHCluster(ibs); time_elapsed(start_time); @@ -411,7 +453,7 @@ pdf(file=file_path, width=40, height=20); rv <- snpgdsCutTree(ibs.hc, col.list=cols, pch.list=15); snpgdsDrawTree(rv, main="Color by Cluster", leaflab="perpendicular", yaxis.kinship=FALSE); -abline(h = 0.032, lty = 2); +abline(h=0.032, lty=2); legend("topleft", legend=levels(rv$samp.group), xpd=T, col=cols[1:nlevels(rv$samp.group)], pch=15, ncol=4, cex=1.2); dev.off() time_elapsed(start_time); @@ -423,7 +465,7 @@ file_path = get_file_path(output_plots_dir, "ibs_region.pdf"); pdf(file=file_path, width=40, height=20); race <- as.factor(population_code); -rv2 <- snpgdsCutTree(ibs.hc, samp.group=race,col.list=cols, pch.list=15); +rv2 <- snpgdsCutTree(ibs.hc, samp.group=race, col.list=cols, pch.list=15); snpgdsDrawTree(rv2, main="Color by Region", leaflab="perpendicular", yaxis.kinship=FALSE); legend("topleft", legend=levels(race), xpd=T, col=cols[1:nlevels(race)], pch=15, ncol=4, cex=1.2); dev.off() @@ -484,23 +526,23 @@ # Sample MLG on a map for each region. for (i in unique(affy_metadata_data_frame$region)) { - m <- i; - num_colors_2 = length(unique(affy_metadata_data_frame$mlg[which(affy_metadata_data_frame$region == m)])); - max_latitude_region <- max(affy_metadata_data_frame$latitude[which(affy_metadata_data_frame$region == m)],na.rm=TRUE); - min_latitude_region <- min(affy_metadata_data_frame$latitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE); - latitude_range_vector_region <- c(min_latitude_region-0.5, max_latitude_region+0.5); - max_longitude_region <- max(affy_metadata_data_frame$longitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE); - min_longitude_region <- min(affy_metadata_data_frame$longitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE); - longitude_range_vector_region <- c(min_longitude_region-0.5, max_longitude_region+0.5); - print(ggplot() + - geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), - fill="grey", colour="#7f7f7f") + - coord_quickmap(xlim=longitude_range_vector_region, ylim=latitude_range_vector_region, clip = "on") + - geom_point(data=affy_metadata_data_frame[which(affy_metadata_data_frame$region == m),], aes(x=longitude, y=latitude, - group=mlg, colour=mlg), alpha=.5, size=3) + - scale_color_manual(values=palette(num_colors_2)) + - theme(legend.position="bottom") + labs(title=paste("MLG assignments for", m)) + - guides(color=guide_legend(nrow=8, byrow=F))); + m <- i; + num_colors_2 = length(unique(affy_metadata_data_frame$mlg[which(affy_metadata_data_frame$region == m)])); + max_latitude_region <- max(affy_metadata_data_frame$latitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE); + min_latitude_region <- min(affy_metadata_data_frame$latitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE); + latitude_range_vector_region <- c(min_latitude_region-0.5, max_latitude_region+0.5); + max_longitude_region <- max(affy_metadata_data_frame$longitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE); + min_longitude_region <- min(affy_metadata_data_frame$longitude[which(affy_metadata_data_frame$region == m)], na.rm=TRUE); + longitude_range_vector_region <- c(min_longitude_region-0.5, max_longitude_region+0.5); + print(ggplot() + + geom_map(data=world_data, map=world_data, aes(x=long, y=lat, group=group, map_id=region), + fill="grey", colour="#7f7f7f") + + coord_quickmap(xlim=longitude_range_vector_region, ylim=latitude_range_vector_region, clip = "on") + + geom_point(data=affy_metadata_data_frame[which(affy_metadata_data_frame$region == m),], + aes(x=longitude, y=latitude, group=mlg, colour=mlg), alpha=.5, size=3) + + scale_color_manual(values=palette(num_colors_2)) + + theme(legend.position="bottom") + labs(title=paste("MLG assignments for", m)) + + guides(color=guide_legend(nrow=8, byrow=F))); } dev.off() time_elapsed(start_time); @@ -537,7 +579,7 @@ # stag_db_report for the charts (user_specimen_id names # will be used to label each chart). start_time <- time_start("Creating percent_breakdown.pdf"); -stag_db_report_data_table <- stag_db_report[c(-2, -3, -4)]; +stag_db_report_data_table <- stag_db_report[c(-2, -3, -4,-5,-6)]; # Remove NA and NaN values. stag_db_report_data_table <- na.omit(stag_db_report_data_table); # Translate to N (i.e., number of samples with a genotype) @@ -552,20 +594,16 @@ # Remove NA and NaN values that may have been introduced # by coercion. translated_stag_db_report_matrix <- na.omit(translated_stag_db_report_matrix); -tsdbrm_row_means <- rowMeans(translated_stag_db_report_matrix, na.rm=TRUE); +translated_stag_db_report_matrix<-translated_stag_db_report_matrix[-c(1),]; +#tsdbrm_row_means <- rowMeans(translated_stag_db_report_matrix, na.rm=TRUE); dev.new(width=10, height=7); file_path = get_file_path(output_plots_dir, "percent_breakdown.pdf"); pdf(file=file_path, width=10, height=7); -# Average pie of all samples. -labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(tsdbrm_row_means, 1), "%)", sep=""); -col <- c("GREY", "#006DDB", "#24FF24", "#920000"); -main <- "Average breakdown of SNP assignments across all samples"; -pie(tsdbrm_row_means, labels=labels, radius=0.60, col=col, main=main, cex.main=.75); par(mfrow=c(3, 2)); -col <- c("GREY", "#006DDB", "#24FF24", "#920000"); +col <- c("#A6A6A6","#FFA626","#EB0ACF", "#80FF00" ); # Generate a pie chart for each sample with genotypes. for (i in 1:ncol(translated_stag_db_report_matrix)) { - tmp_labels <- paste(c("missing data", "mixed", "reference", "alternative"), " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep=""); + tmp_labels <- paste(c("no call", "heterozygous", "A. cervicornis", "A. palmata"), " (", round(translated_stag_db_report_matrix[,i], 1), "%)", sep=""); main <- paste("Breakdown of SNP assignments for", translated_stag_db_report_data_table[1, i]); pie(translated_stag_db_report_matrix[,i], labels=tmp_labels, radius=0.90, col=col, main=main, cex.main=.85, cex=0.75); } @@ -814,7 +852,7 @@ # A.palmata a550962-4368120-060520-500_A05.CEL Acropora palmata taxonomy_table_data_frame <- stag_db_report %>% select(genetic_coral_species_call, affy_id) %>% - mutate(genus_name = ifelse(genetic_coral_species_call == genetic_coral_species_call[grep("^A.*", genetic_coral_species_call)], "Acropora", "other")) %>% + mutate(genus_name = ifelse(grepl("^A.*", genetic_coral_species_call), "Acropora",ifelse(!is.na(genetic_coral_species_call), "other", NA))) %>% mutate(species_name = ifelse(genetic_coral_species_call == "A.palmata", "palmata", "other")) %>% mutate(species_name = ifelse(genetic_coral_species_call == "A.cervicornis", "cervicornis", species_name)) %>% mutate(species_name = ifelse(genetic_coral_species_call == "A.prolifera", "prolifera", species_name));