comparison coral_multilocus_genotype.R @ 9:3c42de11ea1d draft default tip

Uploaded
author greg
date Wed, 24 Nov 2021 20:17:10 +0000
parents 33d759858625
children
comparison
equal deleted inserted replaced
8:33d759858625 9:3c42de11ea1d
173 select("id", "coral_mlg_clonal_id", "coral_mlg_rep_sample_id", "genetic_coral_species_call"), 173 select("id", "coral_mlg_clonal_id", "coral_mlg_rep_sample_id", "genetic_coral_species_call"),
174 by=c("genotype_id"="id")); 174 by=c("genotype_id"="id"));
175 # Name the columns. 175 # Name the columns.
176 smlg_data_frame <- as.data.frame(smlg); 176 smlg_data_frame <- as.data.frame(smlg);
177 colnames(smlg_data_frame) <- c("user_specimen_id", "affy_id", "bcoral_genet_id", "genotype_id", 177 colnames(smlg_data_frame) <- c("user_specimen_id", "affy_id", "bcoral_genet_id", "genotype_id",
178 "coral_mlg_clonal_id", "coral_mlg_rep_sample_id", "genetic_coral_species_call"); 178 "coral_mlg_clonal_id", "coral_mlg_rep_sample_id", "genetic_coral_species_call");
179 log_data_frame("smlg_data_frame", smlg_data_frame); 179 log_data_frame("smlg_data_frame", smlg_data_frame);
180 # Missing GT in samples submitted. 180 # Missing GT in samples submitted.
181 start_time <- time_start("Discovering missing GT in samples"); 181 start_time <- time_start("Discovering missing GT in samples");
182 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE); 182 gt <- extract.gt(vcf, element="GT", as.numeric=FALSE);
183 missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))}); 183 missing_gt <- apply(gt, MARGIN=2, function(x){ sum(is.na(x))});
774 select("coral_mlg_rep_sample_id", "coral_mlg_clonal_id"), 774 select("coral_mlg_rep_sample_id", "coral_mlg_clonal_id"),
775 by='coral_mlg_clonal_id') %>% 775 by='coral_mlg_clonal_id') %>%
776 mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_rep_sample_id.x),coral_mlg_rep_sample_id.y,coral_mlg_rep_sample_id.x)) %>% 776 mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_rep_sample_id.x),coral_mlg_rep_sample_id.y,coral_mlg_rep_sample_id.x)) %>%
777 ungroup() %>% 777 ungroup() %>%
778 dplyr::select(-coral_mlg_rep_sample_id.x,-coral_mlg_rep_sample_id.y, -group.x,-group.y) %>% 778 dplyr::select(-coral_mlg_rep_sample_id.x,-coral_mlg_rep_sample_id.y, -group.x,-group.y) %>%
779 distinct(); 779 group_by(coral_mlg_clonal_id) %>%
780 arrange(coral_mlg_rep_sample_id) %>%
781 slice(1);
780 782
781 # Confirm that the representative mlg is the same between runs. 783 # Confirm that the representative mlg is the same between runs.
782 uniques2 <- unique(prep_genotype_tibble[c("group", "coral_mlg_rep_sample_id")]); 784 uniques2 <- unique(prep_genotype_tibble[c("group", "coral_mlg_rep_sample_id")]);
783 uniques2 <- uniques2[!is.na(uniques2$coral_mlg_rep_sample_id),]; 785 uniques2 <- uniques2[!is.na(uniques2$coral_mlg_rep_sample_id),];
784 na.mlg3 <- which(is.na(prep_genotype_tibble$coral_mlg_rep_sample_id)); 786 na.mlg3 <- which(is.na(prep_genotype_tibble$coral_mlg_rep_sample_id));
794 # <chr> <chr> 796 # <chr> <chr>
795 # A.palmata C1651 797 # A.palmata C1651
796 representative_mlg_tibble <- prep_genotype_tibble %>% 798 representative_mlg_tibble <- prep_genotype_tibble %>%
797 mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_rep_sample_id) & (db_match =="no_match"), affy_id, coral_mlg_rep_sample_id)) %>% 799 mutate(coral_mlg_rep_sample_id=ifelse(is.na(coral_mlg_rep_sample_id) & (db_match =="no_match"), affy_id, coral_mlg_rep_sample_id)) %>%
798 ungroup() %>% 800 ungroup() %>%
799 select(-group)%>% 801 select(-group);
800 distinct();
801 # prep_genotype_table_tibble looks like this: 802 # prep_genotype_table_tibble looks like this:
802 # affy_id coral_mlg_clonal_id user_specimen_id db_match 803 # affy_id coral_mlg_clonal_id user_specimen_id db_match
803 # a550962...CEL HG0120 1090 match 804 # a550962...CEL HG0120 1090 match
804 # genetic_coral_species_call coral_mlg_rep_sample_id 805 # genetic_coral_species_call coral_mlg_rep_sample_id
805 # A.palmata 1104 806 # A.palmata 1104
806 prep_genotype_table_tibble <- stag_db_report %>% 807 prep_genotype_table_tibble <- stag_db_report %>%
807 select("affy_id", "coral_mlg_clonal_id", "user_specimen_id", "db_match", "genetic_coral_species_call") %>% 808 select("affy_id", "coral_mlg_clonal_id", "user_specimen_id", "db_match", "genetic_coral_species_call") %>%
808 left_join(representative_mlg_tibble %>% 809 left_join(representative_mlg_tibble %>%
809 select("affy_id", "coral_mlg_rep_sample_id"), 810 select("coral_mlg_rep_sample_id", "coral_mlg_clonal_id"),
810 by='affy_id'); 811 by='coral_mlg_clonal_id');
811 # genotype_table_tibble looks like this: 812 # genotype_table_tibble looks like this:
812 # affy_id coral_mlg_clonal_id user_specimen_id db_match 813 # affy_id coral_mlg_clonal_id user_specimen_id db_match
813 # a550962-436.CEL HG0120 1090 match 814 # a550962-436.CEL HG0120 1090 match
814 # genetic_coral_species_call coral_mlg_rep_sample_id bcoral_genet_id 815 # genetic_coral_species_call coral_mlg_rep_sample_id bcoral_genet_id
815 # A.palmata 1104 <NA> 816 # A.palmata 1104 <NA>
816 genotype_table_tibble <- prep_genotype_table_tibble %>% 817 genotype_table_tibble <- prep_genotype_table_tibble %>%
817 left_join(affy_metadata_data_frame %>% 818 left_join(affy_metadata_data_frame %>%
818 select("user_specimen_id", "bcoral_genet_id"), 819 select("user_specimen_id", "bcoral_genet_id"),
819 by='user_specimen_id') %>% 820 by='user_specimen_id');
820 drop_na(coral_mlg_rep_sample_id);
821 write_data_frame(output_data_dir, "genotype.tabular", genotype_table_tibble); 821 write_data_frame(output_data_dir, "genotype.tabular", genotype_table_tibble);
822 822
823 # Output the file needed for populating the person table. 823 # Output the file needed for populating the person table.
824 person_table_data_frame <- data.frame(matrix(ncol=4, nrow=num_rows)); 824 person_table_data_frame <- data.frame(matrix(ncol=4, nrow=num_rows));
825 colnames(person_table_data_frame) <- c("last_name", "first_name", "organization", "email"); 825 colnames(person_table_data_frame) <- c("last_name", "first_name", "organization", "email");
900 sample_table_data_frame$percent_apalm_coral[i] <- sample_prep_data_frame$percent_apalm_coral[i]; 900 sample_table_data_frame$percent_apalm_coral[i] <- sample_prep_data_frame$percent_apalm_coral[i];
901 sample_table_data_frame$percent_alternative_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE; 901 sample_table_data_frame$percent_alternative_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE;
902 sample_table_data_frame$percent_heterozygous_coral[i] <- sample_prep_data_frame$percent_heterozygous_coral[i]; 902 sample_table_data_frame$percent_heterozygous_coral[i] <- sample_prep_data_frame$percent_heterozygous_coral[i];
903 sample_table_data_frame$percent_heterozygous_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE; 903 sample_table_data_frame$percent_heterozygous_sym[i] <- DEFAULT_MISSING_NUMERIC_VALUE;
904 sample_table_data_frame$field_call[i] <- sample_prep_data_frame$field_call[i]; 904 sample_table_data_frame$field_call[i] <- sample_prep_data_frame$field_call[i];
905 sample_table_data_frame$bcoral_genet_id[i] <- sample_prep_data_frame$bcoral_genet_id[i]; 905 sample_table_data_frame$bcoral_genet_id[i] <- sample_prep_data_frame$bcoral_genet_id[i];
906 } 906 }
907 write_data_frame(output_data_dir, "sample.tabular", sample_table_data_frame); 907 write_data_frame(output_data_dir, "sample.tabular", sample_table_data_frame);
908 908
909 # Output the file needed for populating the taxonomy table. 909 # Output the file needed for populating the taxonomy table.
910 # taxonomy_table_data_frame looks like this: 910 # taxonomy_table_data_frame looks like this: