Mercurial > repos > ufz > phi_toolkit_report
comparison report.Rmd @ 1:3a7f73d638ba draft default tip
planemo upload for repository https://github.com/Helmholtz-UFZ/ufz-galaxy-tools/blob/main/tools/phi-toolkit commit 368e8a7322c9763c648637263d4695abc146be13
author | ufz |
---|---|
date | Tue, 22 Jul 2025 11:09:24 +0000 |
parents | 315c2ed31af1 |
children |
comparison
equal
deleted
inserted
replaced
0:315c2ed31af1 | 1:3a7f73d638ba |
---|---|
23 | 23 |
24 ------------------------------------------------------------------------ | 24 ------------------------------------------------------------------------ |
25 | 25 |
26 ```{r setup_env, include=FALSE} | 26 ```{r setup_env, include=FALSE} |
27 knitr::opts_chunk$set(echo = FALSE) | 27 knitr::opts_chunk$set(echo = FALSE) |
28 knitr::opts_chunk$set(dev = "svglite") # set output device to svg | |
28 | 29 |
29 cat("params$outdir:", params$outdir, "\n") | 30 cat("params$outdir:", params$outdir, "\n") |
30 ``` | 31 ``` |
31 | 32 |
32 ```{r setup_libraries, message=FALSE, warning=FALSE, echo=FALSE, results='asis'} | 33 ```{r setup_libraries, message=FALSE, warning=FALSE, echo=FALSE, results='asis'} |
33 # Define required packages | 34 # Define required packages |
34 required_packages <- c("tidyverse", "janitor", "here", | 35 # Note: update version_command if changed here! |
35 "kableExtra", "gmoviz", "circlize", | 36 required_packages <- c( |
36 "GenomicRanges", "patchwork", "fs", | 37 "tidyverse", "janitor", "here", |
37 "tools", "scales", "formattable", | 38 "kableExtra", "gmoviz", "circlize", |
38 "pdftools", "base64") | 39 "GenomicRanges", "patchwork", "fs", |
40 "tools", "scales", "formattable", | |
41 "pdftools", "base64" | |
42 ) | |
39 | 43 |
40 # Load required packages | 44 # Load required packages |
41 invisible(lapply(required_packages, library, character.only = TRUE)) | 45 invisible(lapply(required_packages, library, character.only = TRUE)) |
42 ``` | 46 ``` |
43 | 47 |
44 ```{r helper_functions, echo=FALSE} | 48 ```{r helper_functions, echo=FALSE} |
45 log_file <- "debug.log" | 49 log_file <- "debug.log" |
46 | 50 |
47 log_debug <- function(message) { | 51 log_debug <- function(message) { |
48 if (!exists("log_initialized") || !log_initialized) { | 52 if (!exists("log_initialized") || !log_initialized) { |
49 cat(paste0(Sys.time(), " - DEBUG: ", message, "\n"), file = log_file, append = FALSE) | 53 cat(paste0(Sys.time(), " - DEBUG: ", message, "\n"), file = log_file, append = FALSE) |
50 assign("log_initialized", TRUE, envir = .GlobalEnv) | 54 assign("log_initialized", TRUE, envir = .GlobalEnv) |
51 } else { | 55 } else { |
52 cat(paste0(Sys.time(), " - DEBUG: ", message, "\n"), file = log_file, append = TRUE) | 56 cat(paste0(Sys.time(), " - DEBUG: ", message, "\n"), file = log_file, append = TRUE) |
53 } | 57 } |
54 } | 58 } |
55 | 59 |
56 load_file <- function(path) { | 60 load_file <- function(path) { |
57 log_debug(paste("Attempting to load:", path)) | 61 log_debug(paste("Attempting to load:", path)) |
58 if (file.exists(path)) { | 62 if (file.exists(path)) { |
59 ext <- tools::file_ext(path) | 63 ext <- tools::file_ext(path) |
60 if (ext %in% c("tsv", "csv")) { | 64 if (ext %in% c("tsv", "csv")) { |
61 data <- read_delim(path, delim = ifelse(ext == "csv", ",", "\t"), show_col_types = FALSE) %>% clean_names | 65 data <- read_delim(path, delim = ifelse(ext == "csv", ",", "\t"), show_col_types = FALSE) %>% janitor::clean_names() |
62 log_debug(paste("Loaded", nrow(data), "rows from", path)) | 66 log_debug(paste("Loaded", nrow(data), "rows from", path)) |
63 data | 67 data |
64 } else if (ext == "fna") { | 68 } else if (ext == "fna") { |
65 data <- Biostrings::readDNAStringSet(path) | 69 data <- Biostrings::readDNAStringSet(path) |
66 log_debug(paste("Loaded", length(data), "sequences from", path)) | 70 log_debug(paste("Loaded", length(data), "sequences from", path)) |
67 data | 71 data |
72 } else { | |
73 log_debug(paste("Skipping", path, "- unsupported file type")) | |
74 NULL | |
75 } | |
68 } else { | 76 } else { |
69 log_debug(paste("Skipping", path, "- unsupported file type")) | 77 log_debug(paste("File does not exist:", path)) |
70 NULL | 78 NULL |
71 } | 79 } |
72 } else { | |
73 log_debug(paste("File does not exist:", path)) | |
74 NULL | |
75 } | |
76 } | 80 } |
77 | 81 |
78 get_file_info <- function(path, loaded_data) { | 82 get_file_info <- function(path, loaded_data) { |
79 log_debug(paste("Processing file info for:", path)) | 83 log_debug(paste("Processing file info for:", path)) |
80 if (file.exists(path)) { | 84 if (file.exists(path)) { |
81 ext <- tools::file_ext(path) | 85 ext <- tools::file_ext(path) |
82 if (ext %in% c("tsv", "csv", "fna")) { | 86 if (ext %in% c("tsv", "csv", "fna")) { |
83 data <- loaded_data[[basename(path)]] | 87 data <- loaded_data[[basename(path)]] |
84 rows <- if(ext == "fna") length(data) else nrow(data) | 88 rows <- if (ext == "fna") length(data) else nrow(data) |
85 tibble(exists = TRUE, rows = rows, size = file.size(path), path = path) | 89 tibble::tibble(exists = TRUE, rows = rows, size = file.size(path), path = path) |
90 } else { | |
91 tibble::tibble(exists = TRUE, rows = NA_integer_, size = file.size(path), path = path) | |
92 } | |
86 } else { | 93 } else { |
87 tibble(exists = TRUE, rows = NA_integer_, size = file.size(path), path = path) | 94 tibble::tibble(exists = FALSE, rows = NA_integer_, size = NA_real_, path = NA_character_) |
88 } | 95 } |
89 } else { | |
90 tibble(exists = FALSE, rows = NA_integer_, size = NA_real_, path = NA_character_) | |
91 } | |
92 } | 96 } |
93 | 97 |
94 process_genome_folder <- function(folder, host_analyses_dir, virus_analyses_dir) { | 98 process_genome_folder <- function(folder, host_analyses_dir, virus_analyses_dir) { |
95 log_debug(paste("Processing folder:", folder)) | 99 log_debug(paste("Processing folder:", folder)) |
96 genome_name <- basename(folder) | 100 genome_name <- basename(folder) |
97 | 101 |
98 paths <- list( | 102 paths <- list( |
99 genomad = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus_summary.tsv")), | 103 genomad = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus_summary.tsv")), |
100 genomad_phages = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus.fna")), | 104 genomad_phages = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus.fna")), |
101 genomad_annotations = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus_genes.tsv")), | 105 genomad_annotations = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus_genes.tsv")), |
102 defense_finder = file.path(host_analyses_dir, "defense-finder", genome_name, paste0(genome_name, "_defense_finder_systems.tsv")), | 106 defense_finder = file.path(host_analyses_dir, "defense-finder", genome_name, paste0(genome_name, "_defense_finder_systems.tsv")), |
103 checkv = file.path(virus_analyses_dir, "checkv", genome_name, "quality_summary.tsv"), | 107 checkv = file.path(virus_analyses_dir, "checkv", genome_name, "quality_summary.tsv"), |
104 iphop = file.path(virus_analyses_dir, "iphop", genome_name, "Host_prediction_to_genome_m90.csv"), | 108 iphop = file.path(virus_analyses_dir, "iphop", genome_name, "Host_prediction_to_genome_m90.csv"), |
105 drep = file.path(virus_analyses_dir, "drep_compare", genome_name, "data_tables", "Cdb.csv"), | 109 drep = file.path(virus_analyses_dir, "drep_compare", genome_name, "data_tables", "Cdb.csv"), |
106 phatyp = file.path(virus_analyses_dir, "phatyp", genome_name, "phatyp.csv"), | 110 phatyp = file.path(virus_analyses_dir, "phatyp", genome_name, "phatyp.csv"), |
107 abricate = file.path(virus_analyses_dir, "abricate", genome_name, paste0(genome_name, "_virus_vfdb.tsv")), | 111 abricate = file.path(virus_analyses_dir, "abricate", genome_name, paste0(genome_name, "_virus_vfdb.tsv")), |
108 vibrant = file.path(virus_analyses_dir, "vibrant", genome_name, | 112 vibrant = file.path( |
109 paste0("VIBRANT_", genome_name, "_virus"), | 113 virus_analyses_dir, "vibrant", genome_name, |
110 paste0("VIBRANT_results_", genome_name, "_virus"), | 114 paste0("VIBRANT_", genome_name, "_virus"), |
111 paste0("VIBRANT_AMG_individuals_", genome_name, "_virus.tsv")) | 115 paste0("VIBRANT_results_", genome_name, "_virus"), |
112 ) | 116 paste0("VIBRANT_AMG_individuals_", genome_name, "_virus.tsv") |
113 | 117 ) |
114 loaded_data <- map(paths, load_file) | 118 ) |
115 file_info <- map_dfr(paths, ~get_file_info(.x, loaded_data), .id = "file_type") | 119 |
116 | 120 loaded_data <- map(paths, load_file) |
117 virus_count <- if(!is.null(loaded_data$genomad)) { | 121 file_info <- purrr::map_dfr(paths, ~ get_file_info(.x, loaded_data), .id = "file_type") |
118 count <- sum(loaded_data$genomad$virus_score > 0.5, na.rm = TRUE) | 122 |
119 log_debug(paste("Virus count:", count)) | 123 virus_count <- if (!is.null(loaded_data$genomad)) { |
120 count | 124 count <- sum(loaded_data$genomad$virus_score > 0.5, na.rm = TRUE) |
121 } else { | 125 log_debug(paste("Virus count:", count)) |
122 log_debug("No genomad summary found, virus count set to 0") | 126 count |
123 0 | 127 } else { |
124 } | 128 log_debug("No genomad summary found, virus count set to 0") |
125 | 129 0 |
126 log_debug("Returning results from process_genome_folder") | 130 } |
127 list(file_info = file_info, virus_count = virus_count, loaded_data = loaded_data) | 131 |
132 log_debug("Returning results from process_genome_folder") | |
133 list(file_info = file_info, virus_count = virus_count, loaded_data = loaded_data) | |
128 } | 134 } |
129 ``` | 135 ``` |
130 | 136 |
131 | 137 |
132 ```{r compile_results, message=FALSE, warning=FALSE, echo=FALSE} | 138 ```{r compile_results, message=FALSE, warning=FALSE, echo=FALSE} |
133 compile_results <- function() { | 139 compile_results <- function() { |
134 base_dir <- params$outdir | 140 base_dir <- params$outdir |
135 log_debug(paste("Base directory:", base_dir)) | 141 log_debug(paste("Base directory:", base_dir)) |
136 | 142 |
137 host_analyses_dir <- file.path(base_dir, "host_analyses") | 143 host_analyses_dir <- file.path(base_dir, "host_analyses") |
138 virus_analyses_dir <- file.path(base_dir, "virus_analyses") | 144 virus_analyses_dir <- file.path(base_dir, "virus_analyses") |
139 | 145 |
140 # List all sample-level directories from all tools under virus_analyses | 146 # List all sample-level directories from all tools under virus_analyses |
141 tool_dirs <- list.dirs(virus_analyses_dir, full.names = TRUE, recursive = FALSE) | 147 tool_dirs <- list.dirs(virus_analyses_dir, full.names = TRUE, recursive = FALSE) |
142 | 148 |
143 genome_folders <- list.dirs(file.path(base_dir, "host_analyses", "genomad"), | 149 genome_folders <- list.dirs(file.path(base_dir, "host_analyses", "genomad"), |
144 full.names = TRUE, recursive = FALSE) | 150 full.names = TRUE, recursive = FALSE |
145 | 151 ) |
146 # cat(length(genome_folders), "sample(s) processed\n") | 152 |
147 | 153 # cat(length(genome_folders), "sample(s) processed\n") |
148 log_debug("Processing genome folders") | 154 |
149 genome_data <- map(genome_folders, process_genome_folder, | 155 log_debug("Processing genome folders") |
150 host_analyses_dir = host_analyses_dir, | 156 genome_data <- map(genome_folders, process_genome_folder, |
151 virus_analyses_dir = virus_analyses_dir) %>% | 157 host_analyses_dir = host_analyses_dir, |
152 set_names(basename(genome_folders)) %>% | 158 virus_analyses_dir = virus_analyses_dir |
153 compact() | 159 ) %>% |
154 | 160 purrr::set_names(basename(genome_folders)) %>% |
155 log_debug("Creating summary dataframe") | 161 compact() |
156 summary_df <- map_dfr(genome_data, ~{ | 162 |
157 file_info <- .x$file_info | 163 log_debug("Creating summary dataframe") |
158 tibble( | 164 summary_df <- purrr::map_dfr(genome_data, ~ { |
159 Sample = basename(file_info$path[1]), | 165 file_info <- .x$file_info |
160 Virus_Count = .x$virus_count, | 166 tibble::tibble( |
161 geNomad = file_info$exists[file_info$file_type == "genomad"], | 167 Sample = basename(file_info$path[1]), |
162 CheckV = file_info$exists[file_info$file_type == "checkv"], | 168 Virus_Count = .x$virus_count, |
163 VIBRANT = file_info$exists[file_info$file_type == "vibrant"], | 169 geNomad = file_info$exists[file_info$file_type == "genomad"], |
164 dRep = file_info$exists[file_info$file_type == "drep"], | 170 CheckV = file_info$exists[file_info$file_type == "checkv"], |
165 iPHOP = file_info$exists[file_info$file_type == "iphop"], | 171 VIBRANT = file_info$exists[file_info$file_type == "vibrant"], |
166 PhaTYP = file_info$exists[file_info$file_type == "phatyp"], | 172 dRep = file_info$exists[file_info$file_type == "drep"], |
167 Defense_Finder = file_info$exists[file_info$file_type == "defense_finder"], | 173 iPHOP = file_info$exists[file_info$file_type == "iphop"], |
168 geNomad_Path = file_info$path[file_info$file_type == "genomad"], | 174 PhaTYP = file_info$exists[file_info$file_type == "phatyp"], |
169 CheckV_Path = file_info$path[file_info$file_type == "checkv"], | 175 Defense_Finder = file_info$exists[file_info$file_type == "defense_finder"], |
170 VIBRANT_Path = file_info$path[file_info$file_type == "vibrant"], | 176 geNomad_Path = file_info$path[file_info$file_type == "genomad"], |
171 dRep_Path = file_info$path[file_info$file_type == "drep"], | 177 CheckV_Path = file_info$path[file_info$file_type == "checkv"], |
172 PhaTYP_Path = file_info$path[file_info$file_type == "phatyp"], | 178 VIBRANT_Path = file_info$path[file_info$file_type == "vibrant"], |
173 Defense_Finder_Path = file_info$path[file_info$file_type == "defense_finder"], | 179 dRep_Path = file_info$path[file_info$file_type == "drep"], |
174 Virus_Contigs = ifelse(file_info$exists[file_info$file_type == "genomad_phages"], | 180 PhaTYP_Path = file_info$path[file_info$file_type == "phatyp"], |
175 file_info$rows[file_info$file_type == "genomad_phages"], | 181 Defense_Finder_Path = file_info$path[file_info$file_type == "defense_finder"], |
176 0) | 182 Virus_Contigs = ifelse(file_info$exists[file_info$file_type == "genomad_phages"], |
177 ) | 183 file_info$rows[file_info$file_type == "genomad_phages"], |
178 }) %>% | 184 0 |
179 mutate(across(ends_with("_Path"), ~ifelse(is.na(.), "Not available", as.character(.)))) | 185 ) |
180 | 186 ) |
181 host_genomes_fasta <- list.files( | 187 }) %>% |
182 path = file.path(params$outdir, "genomes"), | 188 dplyr::mutate(dplyr::across(ends_with("_Path"), ~ ifelse(is.na(.), "Not available", as.character(.)))) |
183 pattern = "\\.fna$", | 189 |
184 full.names = TRUE | 190 host_genomes_fasta <- list.files( |
185 ) | 191 path = file.path(params$outdir, "genomes"), |
186 | 192 pattern = "\\.fna$", |
187 host_genomes_paths <- tibble( | 193 full.names = TRUE |
188 name = tools::file_path_sans_ext(basename(host_genomes_fasta)), | 194 ) |
189 path = host_genomes_fasta | 195 |
190 ) | 196 host_genomes_paths <- tibble::tibble( |
191 | 197 name = tools::file_path_sans_ext(basename(host_genomes_fasta)), |
192 data_gtdbtk_host <- read_tsv( | 198 path = host_genomes_fasta |
193 file.path(params$outdir, "host_analyses/gtdbtk/gtdbtk.bac120.summary.tsv"), | 199 ) |
194 show_col_types = FALSE | 200 |
195 ) %>% clean_names() | 201 data_gtdbtk_host <- read_tsv( |
196 | 202 file.path(params$outdir, "host_analyses/gtdbtk/gtdbtk.bac120.summary.tsv"), |
197 data_checkm_host <- read_tsv( | 203 show_col_types = FALSE |
198 file.path(params$outdir, "host_analyses/checkm2/quality_report.tsv"), | 204 ) %>% janitor::clean_names() |
199 show_col_types = FALSE | 205 |
200 ) %>% clean_names() | 206 data_checkm_host <- read_tsv( |
201 | 207 file.path(params$outdir, "host_analyses/checkm2/quality_report.tsv"), |
202 log_debug("Returning summary dataframe, genome data, and host data") | 208 show_col_types = FALSE |
203 | 209 ) %>% janitor::clean_names() |
204 log_debug(paste("summary_df dimensions:", nrow(summary_df), "rows,", ncol(summary_df), "columns")) | 210 |
205 log_debug(paste("summary_df column names:", paste(colnames(summary_df), collapse = ", "))) | 211 log_debug("Returning summary dataframe, genome data, and host data") |
206 log_debug(paste("genome_data length:", length(genome_data))) | 212 |
207 log_debug(paste("genome_data names:", paste(names(genome_data), collapse = ", "))) | 213 log_debug(paste("summary_df dimensions:", nrow(summary_df), "rows,", ncol(summary_df), "columns")) |
208 log_debug(paste("host_genomes_paths dimensions:", nrow(host_genomes_paths), "rows,", ncol(host_genomes_paths), "columns")) | 214 log_debug(paste("summary_df column names:", paste(colnames(summary_df), collapse = ", "))) |
209 log_debug(paste("host_genomes_paths column names:", paste(colnames(host_genomes_paths), collapse = ", "))) | 215 log_debug(paste("genome_data length:", length(genome_data))) |
210 log_debug(paste("data_gtdbtk_host dimensions:", nrow(data_gtdbtk_host), "rows,", ncol(data_gtdbtk_host), "columns")) | 216 log_debug(paste("genome_data names:", paste(names(genome_data), collapse = ", "))) |
211 log_debug(paste("data_gtdbtk_host column names:", paste(colnames(data_gtdbtk_host), collapse = ", "))) | 217 log_debug(paste("host_genomes_paths dimensions:", nrow(host_genomes_paths), "rows,", ncol(host_genomes_paths), "columns")) |
212 log_debug(paste("data_checkm_host dimensions:", nrow(data_checkm_host), "rows,", ncol(data_checkm_host), "columns")) | 218 log_debug(paste("host_genomes_paths column names:", paste(colnames(host_genomes_paths), collapse = ", "))) |
213 log_debug(paste("data_checkm_host column names:", paste(colnames(data_checkm_host), collapse = ", "))) | 219 log_debug(paste("data_gtdbtk_host dimensions:", nrow(data_gtdbtk_host), "rows,", ncol(data_gtdbtk_host), "columns")) |
214 | 220 log_debug(paste("data_gtdbtk_host column names:", paste(colnames(data_gtdbtk_host), collapse = ", "))) |
215 list( | 221 log_debug(paste("data_checkm_host dimensions:", nrow(data_checkm_host), "rows,", ncol(data_checkm_host), "columns")) |
216 summary = summary_df, | 222 log_debug(paste("data_checkm_host column names:", paste(colnames(data_checkm_host), collapse = ", "))) |
217 genome_data = genome_data, | 223 |
218 host_genomes_paths = host_genomes_paths, | 224 list( |
219 data_gtdbtk_host = data_gtdbtk_host, | 225 summary = summary_df, |
220 data_checkm_host = data_checkm_host | 226 genome_data = genome_data, |
221 ) | 227 host_genomes_paths = host_genomes_paths, |
228 data_gtdbtk_host = data_gtdbtk_host, | |
229 data_checkm_host = data_checkm_host | |
230 ) | |
222 } | 231 } |
223 ``` | 232 ``` |
224 | 233 |
225 ```{r run_main_function, echo=FALSE, message=FALSE, warning=FALSE} | 234 ```{r run_main_function, echo=FALSE, message=FALSE, warning=FALSE} |
226 log_debug("Starting execution of main function") | 235 log_debug("Starting execution of main function") |
227 result <- compile_results() | 236 result <- compile_results() |
228 | 237 |
229 if (is.null(result)) { | 238 if (is.null(result)) { |
230 log_debug("Main function execution failed") | 239 log_debug("Main function execution failed") |
231 stop("Main function execution failed") | 240 stop("Main function execution failed") |
232 } | 241 } |
233 | 242 |
234 summary_df <- result$summary | 243 summary_df <- result$summary |
235 genome_data <- result$genome_data | 244 genome_data <- result$genome_data |
236 host_genomes_paths <- result$host_genomes_paths | 245 host_genomes_paths <- result$host_genomes_paths |
237 data_gtdbtk_host <- result$data_gtdbtk_host | 246 data_gtdbtk_host <- result$data_gtdbtk_host |
238 data_checkm_host <- result$data_checkm_host | 247 data_checkm_host <- result$data_checkm_host |
239 log_debug("Data extracted successfully") | 248 log_debug("Data extracted successfully") |
240 | 249 |
241 # Remove any extensions from names in data gtdbtk and checm2 | |
242 data_gtdbtk_host <- data_gtdbtk_host %>% | |
243 mutate(user_genome = str_remove(user_genome, "\\.[^.]+$")) | |
244 | |
245 data_checkm_host <- data_checkm_host %>% | |
246 mutate(name = str_remove(name, "\\.[^.]+$")) | |
247 | |
248 result$summary <- result$summary %>% | 250 result$summary <- result$summary %>% |
249 mutate(Sample = str_remove(Sample, "_virus_summary.tsv")) | 251 dplyr::mutate(Sample = stringr::str_remove(Sample, "_virus_summary.tsv")) |
250 ``` | 252 ``` |
251 | 253 |
252 # Summary {.tabset .tabset-fade} | 254 # Summary {.tabset .tabset-fade} |
253 | |
254 ## Overview Table | |
255 | 255 |
256 This table provides sample-by-sample information on detected viruses and key host genome statistics. It includes taxonomy, virus count, genome quality classification, CheckM2 metrics (completeness and contamination), and genome assembly statistics such as size and N50. | 256 This table provides sample-by-sample information on detected viruses and key host genome statistics. It includes taxonomy, virus count, genome quality classification, CheckM2 metrics (completeness and contamination), and genome assembly statistics such as size and N50. |
257 | 257 |
258 ```{r render_table, message=FALSE, warning=FALSE, echo=FALSE, results='asis'} | 258 ```{r render_table, message=FALSE, warning=FALSE, echo=FALSE, results='asis'} |
259 data <- result$summary | 259 data <- result$summary |
260 | 260 |
261 log_debug("Assigning checkm2 host data") | 261 log_debug("Assigning checkm2 host data") |
262 checkm_host_data <- data_checkm_host %>% clean_names() %>% | 262 checkm_host_data <- data_checkm_host %>% |
263 select(name, completeness, contamination, | 263 janitor::clean_names() %>% |
264 contig_n50, genome_size) | 264 dplyr::select( |
265 name, completeness, contamination, | |
266 contig_n50, genome_size | |
267 ) | |
265 | 268 |
266 log_debug("Assigning GTDB-Tk host data") | 269 log_debug("Assigning GTDB-Tk host data") |
267 gtdbtk_data <- data_gtdbtk_host %>% | 270 gtdbtk_data <- data_gtdbtk_host %>% |
268 select(user_genome, classification) | 271 dplyr::select(user_genome, classification) |
269 | 272 |
270 log_debug("Defining color-blind friendly palette") | 273 log_debug("Defining color-blind friendly palette") |
271 cb_friendly_colors <- list( | 274 cb_friendly_colors <- list( |
272 green = "#009E73", | 275 green = "#009E73", |
273 blue = "#0072B2", | 276 blue = "#0072B2", |
274 orange = "#E69F00", | 277 orange = "#E69F00", |
275 red = "#D55E00", | 278 red = "#D55E00", |
276 grey = "#999999" | 279 grey = "#999999" |
277 ) | 280 ) |
278 | 281 |
279 log_debug("Defining function to color cells") | 282 log_debug("Defining function to color cells") |
280 color_cell <- function(values, color_true = cb_friendly_colors$green, | 283 color_cell <- function(values, color_true = cb_friendly_colors$green, |
281 color_false = cb_friendly_colors$red) { | 284 color_false = cb_friendly_colors$red) { |
282 ifelse(values, | 285 ifelse(values, |
283 cell_spec("Yes", color = "white", bold = TRUE, background = color_true), | 286 kableExtra::cell_spec("Yes", color = "white", bold = TRUE, background = color_true), |
284 cell_spec("No", color = "white", bold = TRUE, background = color_false)) | 287 kableExtra::cell_spec("No", color = "white", bold = TRUE, background = color_false) |
288 ) | |
285 } | 289 } |
286 | 290 |
287 log_debug("Defining function to create bar plot") | 291 log_debug("Defining function to create bar plot") |
288 create_bar_plot <- function(values, max_value, color = cb_friendly_colors$grey) { | 292 create_bar_plot <- function(values, max_value, color = cb_friendly_colors$grey) { |
289 sapply(values, function(value) { | 293 sapply(values, function(value) { |
290 if(is.na(value) || !is.numeric(value)) { | 294 if (is.na(value) || !is.numeric(value)) { |
291 return("N/A") | 295 return("N/A") |
292 } | 296 } |
293 bar_width <- min(max(value, 0), max_value) / max_value * 100 | 297 bar_width <- min(max(value, 0), max_value) / max_value * 100 |
294 sprintf('<div style="background-color: %s; width: %f%%; height: 10px;"></div>%.1f%%', | 298 sprintf( |
295 color, bar_width, value) | 299 '<div style="background-color: %s; width: %f%%; height: 10px;"></div>%.1f%%', |
296 }) | 300 color, bar_width, value |
301 ) | |
302 }) | |
297 } | 303 } |
298 | 304 |
299 log_debug("Defining function to format large numbers") | 305 log_debug("Defining function to format large numbers") |
300 format_large_number <- function(x) { | 306 format_large_number <- function(x) { |
301 sapply(x, function(value) { | 307 sapply(x, function(value) { |
302 if (is.na(value) || !is.numeric(value)) { | 308 if (is.na(value) || !is.numeric(value)) { |
303 return("N/A") | 309 return("N/A") |
304 } else if (value < 1000) { | 310 } else if (value < 1000) { |
305 return(as.character(value)) | 311 return(as.character(value)) |
306 } else if (value < 1e6) { | 312 } else if (value < 1e6) { |
307 return(paste0(round(value / 1e3, 1), "K")) | 313 return(paste0(round(value / 1e3, 1), "K")) |
308 } else if (value < 1e9) { | 314 } else if (value < 1e9) { |
309 return(paste0(round(value / 1e6, 1), "M")) | 315 return(paste0(round(value / 1e6, 1), "M")) |
310 } else { | 316 } else { |
311 return(paste0(round(value / 1e9, 1), "G")) | 317 return(paste0(round(value / 1e9, 1), "G")) |
312 } | 318 } |
313 }) | 319 }) |
314 } | 320 } |
315 | 321 |
316 log_debug("Defining function to extract last known taxonomy level") | 322 log_debug("Defining function to extract last known taxonomy level") |
317 extract_last_known_taxonomy <- function(classification) { | 323 extract_last_known_taxonomy <- function(classification) { |
318 if (is.na(classification) || classification == "") { | 324 if (is.na(classification) || classification == "") { |
325 return(list(level = "Unknown", name = "Unknown")) | |
326 } | |
327 | |
328 parts <- strsplit(classification, ";")[[1]] | |
329 for (i in length(parts):1) { | |
330 level <- sub("^[a-z]__", "", parts[i]) | |
331 if (level != "") { | |
332 prefix <- sub("__.*$", "", parts[i]) | |
333 return(list(level = prefix, name = level)) | |
334 } | |
335 } | |
319 return(list(level = "Unknown", name = "Unknown")) | 336 return(list(level = "Unknown", name = "Unknown")) |
320 } | |
321 | |
322 parts <- strsplit(classification, ";")[[1]] | |
323 for (i in length(parts):1) { | |
324 level <- sub("^[a-z]__", "", parts[i]) | |
325 if (level != "") { | |
326 prefix <- sub("__.*$", "", parts[i]) | |
327 return(list(level = prefix, name = level)) | |
328 } | |
329 } | |
330 return(list(level = "Unknown", name = "Unknown")) | |
331 } | 337 } |
332 | 338 |
333 log_debug("Defining function to format taxonomy") | 339 log_debug("Defining function to format taxonomy") |
334 format_taxonomy <- function(classification) { | 340 format_taxonomy <- function(classification) { |
335 result <- extract_last_known_taxonomy(classification) | 341 result <- extract_last_known_taxonomy(classification) |
336 if (result$level == "Unknown") { | 342 if (result$level == "Unknown") { |
337 return("Unknown") | 343 return("Unknown") |
338 } else if (result$level == "s") { | 344 } else if (result$level == "s") { |
339 return(paste0("<i>", result$name, "</i>")) | 345 return(paste0("<i>", result$name, "</i>")) |
340 } else { | 346 } else { |
341 genus <- str_replace_all(result$name, "_", " ") | 347 genus <- str_replace_all(result$name, "_", " ") |
342 return(paste0("<i>", genus, "</i> sp.")) | 348 return(paste0("<i>", genus, "</i> sp.")) |
343 } | 349 } |
344 } | 350 } |
345 | 351 |
346 log_debug("Defining function to calculate quality score and determine genome quality class") | 352 log_debug("Defining function to calculate quality score and determine genome quality class") |
347 calculate_quality_score_and_class <- function(completeness, contamination) { | 353 calculate_quality_score_and_class <- function(completeness, contamination) { |
348 if (is.na(completeness) || is.na(contamination)) { | 354 if (is.na(completeness) || is.na(contamination)) { |
349 return(list( | 355 return(list( |
350 score = cell_spec("N/A", color = "white", bold = TRUE, background = cb_friendly_colors$grey), | 356 score = kableExtra::cell_spec("N/A", color = "white", bold = TRUE, background = cb_friendly_colors$grey), |
351 class = cell_spec("Unknown", color = "white", bold = TRUE, background = cb_friendly_colors$grey), | 357 class = kableExtra::cell_spec("Unknown", color = "white", bold = TRUE, background = cb_friendly_colors$grey), |
352 numeric_score = NA | 358 numeric_score = NA |
353 )) | 359 )) |
354 } | 360 } |
355 | 361 |
356 quality_score <- completeness - (5 * contamination) | 362 quality_score <- completeness - (5 * contamination) |
357 formatted_score <- sprintf("%.1f", quality_score) | 363 formatted_score <- sprintf("%.1f", quality_score) |
358 | 364 |
359 if (completeness > 90 && contamination < 5) { | 365 if (completeness > 90 && contamination < 5) { |
360 class <- "High-quality draft" | 366 class <- "High-quality draft" |
361 color <- cb_friendly_colors$green | 367 color <- cb_friendly_colors$green |
362 } else if (completeness >= 50 && contamination < 10) { | 368 } else if (completeness >= 50 && contamination < 10) { |
363 class <- "Medium-quality draft" | 369 class <- "Medium-quality draft" |
364 color <- cb_friendly_colors$blue | 370 color <- cb_friendly_colors$blue |
365 } else { | 371 } else { |
366 class <- "Low-quality draft" | 372 class <- "Low-quality draft" |
367 color <- cb_friendly_colors$red | 373 color <- cb_friendly_colors$red |
368 } | 374 } |
369 | 375 |
370 list( | 376 list( |
371 score = cell_spec(formatted_score, color = "white", bold = TRUE, background = color), | 377 score = kableExtra::cell_spec(formatted_score, color = "white", bold = TRUE, background = color), |
372 class = cell_spec(class, color = "white", bold = TRUE, background = color), | 378 class = kableExtra::cell_spec(class, color = "white", bold = TRUE, background = color), |
373 numeric_score = quality_score | 379 numeric_score = quality_score |
374 ) | 380 ) |
375 } | 381 } |
376 | 382 |
377 log_debug("Preparing the data") | 383 log_debug("Preparing the data") |
378 | 384 |
379 table_data <- data %>% | 385 table_data <- data %>% |
380 #mutate(Sample = basename(Sample) %>% trim_sample_name()) %>% | 386 # dplyr::mutate(Sample = basename(Sample) %>% trim_sample_name()) %>% |
381 mutate(Sample = basename(Sample)) %>% | 387 dplyr::mutate(Sample = basename(Sample)) %>% |
382 left_join(checkm_host_data, by = c("Sample" = "name")) %>% | 388 dplyr::left_join(checkm_host_data, by = c("Sample" = "name")) %>% |
383 left_join(gtdbtk_data, by = c("Sample" = "user_genome")) %>% | 389 dplyr::left_join(gtdbtk_data, by = c("Sample" = "user_genome")) %>% |
384 mutate( | 390 dplyr::mutate( |
385 quality_data = pmap(list(as.numeric(completeness), | 391 quality_data = purrr::pmap( |
386 as.numeric(contamination)), | 392 list( |
387 calculate_quality_score_and_class), | 393 as.numeric(completeness), |
388 Quality_Score = map_chr(quality_data, ~.$score), | 394 as.numeric(contamination) |
389 Genome_Quality = map_chr(quality_data, ~.$class), | 395 ), |
390 Quality_Score_Numeric = map_dbl(quality_data, ~.$numeric_score), | 396 calculate_quality_score_and_class |
391 Virus_Count_Numeric = as.numeric(Virus_Count), | 397 ), |
392 Virus_Count = cell_spec( | 398 Quality_Score = purrr::map_chr(quality_data, ~ .$score), |
393 Virus_Count, | 399 Genome_Quality = purrr::map_chr(quality_data, ~ .$class), |
394 color = "white", | 400 Quality_Score_Numeric = purrr::map_dbl(quality_data, ~ .$numeric_score), |
395 bold = TRUE, | 401 Virus_Count_Numeric = as.numeric(Virus_Count), |
396 background = case_when( | 402 Virus_Count = kableExtra::cell_spec( |
397 Virus_Count == 0 ~ cb_friendly_colors$red, | 403 Virus_Count, |
398 Virus_Count == 1 ~ cb_friendly_colors$blue, | 404 color = "white", |
399 Virus_Count > 1 ~ cb_friendly_colors$green | 405 bold = TRUE, |
400 ) | 406 background = dplyr::case_when( |
401 ), | 407 Virus_Count == 0 ~ cb_friendly_colors$red, |
402 Completeness_Numeric = as.numeric(completeness), | 408 Virus_Count == 1 ~ cb_friendly_colors$blue, |
403 Completeness = create_bar_plot(as.numeric(completeness), 100), | 409 Virus_Count > 1 ~ cb_friendly_colors$green |
404 Contamination = create_bar_plot(as.numeric(contamination), 100), | 410 ) |
405 `N50 (contigs)` = format_large_number(as.numeric(contig_n50)), | 411 ), |
406 `Genome size (bp)` = format_large_number(as.numeric(genome_size)), | 412 Completeness_Numeric = as.numeric(completeness), |
407 `GTDB Taxonomy` = sapply(classification, format_taxonomy) | 413 Completeness = create_bar_plot(as.numeric(completeness), 100), |
408 ) %>% | 414 Contamination = create_bar_plot(as.numeric(contamination), 100), |
409 mutate(`#` = row_number()) %>% | 415 `N50 (contigs)` = format_large_number(as.numeric(contig_n50)), |
410 select(`#`, Sample, `GTDB Taxonomy`, Virus_Count, | 416 `Genome size (bp)` = format_large_number(as.numeric(genome_size)), |
411 Quality_Score, Genome_Quality, Completeness, Contamination, | 417 `GTDB Taxonomy` = sapply(classification, format_taxonomy) |
412 `Genome size (bp)`, `N50 (contigs)`) | 418 ) %>% |
419 dplyr::mutate(`#` = row_number()) %>% | |
420 dplyr::select( | |
421 `#`, Sample, `GTDB Taxonomy`, Virus_Count, | |
422 Quality_Score, Genome_Quality, Completeness, Contamination, | |
423 `Genome size (bp)`, `N50 (contigs)` | |
424 ) | |
413 | 425 |
414 log_debug("Creating the table") | 426 log_debug("Creating the table") |
415 kbl(table_data, escape = FALSE, | 427 kableExtra::kbl(table_data, |
416 align = c("c", "l", "l", "c", rep("c", 2), rep("r", 2), rep("r", 2))) %>% | 428 escape = FALSE, |
417 kable_paper(full_width = TRUE) %>% | 429 align = c("c", "l", "l", "c", rep("c", 2), rep("r", 2), rep("r", 2)) |
418 column_spec(1, bold = TRUE, width = "2em") %>% | 430 ) %>% |
419 column_spec(2:3, bold = TRUE) %>% | 431 kableExtra::kable_paper(full_width = TRUE) %>% |
420 column_spec(4:5, width = "5em") %>% | 432 kableExtra::column_spec(1, bold = TRUE, width = "2em") %>% |
421 column_spec(6:7, width = "60px") %>% | 433 kableExtra::column_spec(2:3, bold = TRUE) %>% |
422 column_spec(8:9, width = "4em") %>% | 434 kableExtra::column_spec(4:5, width = "5em") %>% |
423 add_header_above(c(" " = 4, "Host Genome Quality" = 2, "CheckM Metrics" = 2, | 435 kableExtra::column_spec(6:7, width = "60px") %>% |
424 "Statistics" = 2)) %>% | 436 kableExtra::column_spec(8:9, width = "4em") %>% |
425 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), | 437 kableExtra::add_header_above(c( |
426 font_size = 9, | 438 " " = 4, "Host Genome Quality" = 2, "CheckM Metrics" = 2, |
427 html_font = "Arial", | 439 "Statistics" = 2 |
428 position = "left") %>% | 440 )) %>% |
429 row_spec(0, bold = TRUE, color = "white", background = "#333333") %>% | 441 kableExtra::kable_styling( |
430 row_spec(0, extra_css = "border-bottom: 2px solid #000000;") %>% | 442 bootstrap_options = c("striped", "hover", "condensed", "responsive"), |
431 column_spec(9, extra_css = "border-right: 2px solid #000000;") %>% | 443 font_size = 9, |
432 scroll_box(width = "100%", height = "100%", | 444 html_font = "Arial", |
433 extra_css = "overflow-x: auto; border: 1px solid #ccc; border-radius: 4px;") | 445 position = "left" |
446 ) %>% | |
447 kableExtra::row_spec(0, bold = TRUE, color = "white", background = "#333333") %>% | |
448 kableExtra::row_spec(0, extra_css = "border-bottom: 2px solid #000000;") %>% | |
449 kableExtra::column_spec(9, extra_css = "border-right: 2px solid #000000;") %>% | |
450 kableExtra::scroll_box( | |
451 width = "100%", height = "100%", | |
452 extra_css = "overflow-x: auto; border: 1px solid #ccc; border-radius: 4px;" | |
453 ) | |
434 ``` | 454 ``` |
435 | |
436 ## Tools Documentation | |
437 | |
438 The following tools are utilized in this workflow. Each tool name below is a link to its respective documentation. | |
439 | |
440 **Host-analyses** | |
441 | |
442 - [**CheckM2 v1.1.0**](https://github.com/chklovski/CheckM2): Assesses the quality of the host. Most useful when working with assembled genomes. | |
443 | |
444 - [**GTDB-Tk v2.3.2**](https://ecogenomics.github.io/GTDBTk/index.html): Assigns a taxonomy to the host genome. | |
445 | |
446 - [**Defense-Finder v2.0.0, models 2.0.2**](https://ecogenomics.github.io/GTDBTk/index.html): Detects known anti-phage systems in the host. | |
447 | |
448 - [**geNomad v1.7.1**](https://portal.nersc.gov/genomad/): Predicts and annotates proviruses. | |
449 | |
450 **Virus-analyses** | |
451 | |
452 - [**CheckV v1.0.1**](https://pypi.org/project/checkv/): Evaluates the quality of viral genomes. | |
453 | |
454 - [**dRep v3.4.5**](https://drep.readthedocs.io/en/latest/): Compares viral genomes within the same host. | |
455 | |
456 - [**Abricate v1.0.1**](https://github.com/tseemann/abricate): Identifies virulence genes in the prophage genomes with the [VFDB database](https://www.mgc.ac.cn/VFs/). | |
457 | |
458 - [**iPHOP v1.3.3**](https://bitbucket.org/srouxjgi/iphop/src/main/): Predicts other potential hosts of viral genomes. | |
459 | |
460 - [**VIBRANT v1.2.1**](https://github.com/AnantharamanLab/VIBRANT): Used to identify Auxiliary Metabolic Genes in the prophages. | |
461 | |
462 ## Workflow | |
463 | |
464 The workflow begins with the input of bacterial genomes by the user. These are processed by the **host-analyses** tools. Prophage prediction is | |
465 performed by **geNomad** only. Afterward, prophages identified by **geNomad** are processed by the **virus-analyses** tools. | |
466 | |
467 If more than one prophage is recovered in the same sample, **dRep** is used to compare and determine if the viruses are identical or different within the same host. | |
468 | |
469 *PLACEHOLDER FOR PIPELINE* | |
470 | |
471 ## R Session Info | |
472 | |
473 Information about the R session used to render this markdown document. | |
474 | |
475 ```{r} | |
476 sessionInfo() | |
477 ``` | |
478 | |
479 | 455 |
480 # Results {.tabset .tabset-fade} | 456 # Results {.tabset .tabset-fade} |
481 | 457 |
482 ```{r} | 458 ```{r} |
483 # Creating combined_unique object | 459 # Creating combined_unique object |
484 | 460 |
485 combined_unique <- bind_rows( | 461 combined_unique <- bind_rows( |
486 checkm_host_data %>% | 462 checkm_host_data %>% |
487 # select(bin_id) %>% | 463 # dplyr::select(bin_id) %>% |
488 # dplyr::rename(Sample = bin_id), | 464 # dplyr::rename(Sample = bin_id), |
489 select(name) %>% | 465 dplyr::select(name) %>% |
490 dplyr::rename(Sample = name), | 466 dplyr::rename(Sample = name), |
491 | 467 data %>% |
492 data %>% | 468 # dplyr::mutate(Sample = stringr::str_remove(Sample, "_virus_summary.tsv")) %>% |
493 #mutate(Sample = str_remove(Sample, "_virus_summary.tsv")) %>% | 469 dplyr::select(Sample) |
494 select(Sample) | |
495 ) %>% | 470 ) %>% |
496 distinct(Sample) %>% | 471 distinct(Sample) %>% |
497 arrange(Sample) | 472 arrange(Sample) |
498 | 473 |
499 log_debug(paste("combined_unique samples:", paste(combined_unique$Sample, collapse = ", "))) | 474 log_debug(paste("combined_unique samples:", paste(combined_unique$Sample, collapse = ", "))) |
500 ``` | 475 ``` |
501 | 476 |
502 | 477 |
503 | 478 |
504 ```{r main_workflow, fig.width=6, fig.height=6, out.height="100%", out.width='100%', dpi=300, fig.align='center', warning=FALSE, message=FALSE, results='asis'} | 479 ```{r main_workflow, fig.width=6, fig.height=6, out.height="100%", out.width='100%', dpi=300, fig.align='center', warning=FALSE, message=FALSE, results='asis'} |
505 # Process proviruses data | 480 # Process proviruses data |
506 process_proviruses <- function(data_genomad) { | 481 process_proviruses <- function(data_genomad) { |
507 proviruses <- data_genomad %>% | 482 proviruses <- data_genomad %>% |
508 dplyr::filter(topology == "Provirus") %>% | 483 dplyr::filter(topology == "Provirus") %>% |
509 dplyr::mutate(contig = sub("\\|provirus_.*", "", seq_name)) %>% # take everything before "|provirus" | 484 dplyr::mutate(contig = sub("\\|provirus_.*", "", seq_name)) %>% # take everything before "|provirus" |
510 dplyr::mutate(contig = paste0("c", as.numeric(factor(contig)))) %>% # map them to c_1, c_2, ... | 485 dplyr::mutate(contig = paste0("c", as.numeric(factor(contig)))) %>% # map them to c_1, c_2, ... |
511 dplyr::select(seq_name, coordinates, length, contig, virus_score, n_hallmarks) | 486 dplyr::select(seq_name, coordinates, length, contig, virus_score, n_hallmarks) |
512 | 487 |
513 proviruses <- proviruses %>% | 488 proviruses <- proviruses %>% |
514 tidyr::separate(coordinates, into = c("start", "end"), sep = "-") | 489 tidyr::separate(coordinates, into = c("start", "end"), sep = "-") |
515 | 490 |
516 proviruses$start <- as.integer(proviruses$start) | 491 proviruses$start <- as.integer(proviruses$start) |
517 proviruses$end <- as.integer(proviruses$end) | 492 proviruses$end <- as.integer(proviruses$end) |
518 | 493 |
519 proviruses_gr_features <- GRanges(seqnames = proviruses$contig, | 494 proviruses_gr_features <- GRanges( |
520 ranges = IRanges(start = proviruses$start, | 495 seqnames = proviruses$contig, |
521 end = proviruses$end)) | 496 ranges = IRanges( |
522 proviruses_gr_features$length <- proviruses_gr_features %>% ranges %>% width | 497 start = proviruses$start, |
523 proviruses_gr_features$score <- as.numeric(proviruses$virus_score) | 498 end = proviruses$end |
524 proviruses_gr_features$n_hallmarks <- as.numeric(proviruses$n_hallmarks) | 499 ) |
525 | 500 ) |
526 proviruses_gr_features$n_hallmarks_pos <- | 501 proviruses_gr_features$length <- proviruses_gr_features %>% |
527 abs(start(proviruses_gr_features) - end(proviruses_gr_features)) / 2 | 502 ranges() %>% |
528 | 503 width() |
529 return(proviruses_gr_features) | 504 proviruses_gr_features$score <- as.numeric(proviruses$virus_score) |
505 proviruses_gr_features$n_hallmarks <- as.numeric(proviruses$n_hallmarks) | |
506 | |
507 proviruses_gr_features$n_hallmarks_pos <- | |
508 abs(start(proviruses_gr_features) - end(proviruses_gr_features)) / 2 | |
509 | |
510 return(proviruses_gr_features) | |
530 } | 511 } |
531 | 512 |
532 plot_genome_ideogram <- function(genome_current, proviruses_gr_features) { | 513 plot_genome_ideogram <- function(genome_current, proviruses_gr_features) { |
533 fasta_file_path <- file.path(params$outdir, "genomes", paste0(genome_current, ".fna")) | 514 fasta_file_path <- file.path(params$outdir, "genomes", paste0(genome_current, ".fna")) |
534 #cat(fasta_file_path, "\n\n") | 515 # cat(fasta_file_path, "\n\n") |
535 genome_ideogram <- getIdeogramData(fasta_file = fasta_file_path) | 516 genome_ideogram <- gmoviz::getIdeogramData(fasta_file = fasta_file_path) |
536 | 517 |
537 # Replace any seqlevel to c_1, c_2, c_3, ... | 518 # Replace any seqlevel to c_1, c_2, c_3, ... |
538 new_seqlevels <- paste0("c", seq_along(seqlevels(genome_ideogram))) | 519 new_seqlevels <- paste0("c", seq_along(GenomeInfoDb::seqlevels(genome_ideogram))) |
539 names(new_seqlevels) <- seqlevels(genome_ideogram) | 520 names(new_seqlevels) <- GenomeInfoDb::seqlevels(genome_ideogram) |
540 genome_ideogram <- GenomeInfoDb::renameSeqlevels(genome_ideogram, new_seqlevels) | 521 genome_ideogram <- GenomeInfoDb::renameSeqlevels(genome_ideogram, new_seqlevels) |
541 colours <- rep("#a58bc5", length(seqlevels(genome_ideogram))) | 522 colours <- rep("#a58bc5", length(GenomeInfoDb::seqlevels(genome_ideogram))) |
542 | 523 |
543 par(mar = c(2, 2, 2, 2)) # minimal margins around the plot | 524 par(mar = c(2, 2, 2, 2)) # minimal margins around the plot |
544 | 525 |
545 gmovizInitialise(genome_ideogram, | 526 gmoviz::gmovizInitialise(genome_ideogram, |
546 sector_colours = colours, | 527 sector_colours = colours, |
547 sector_border_colours = colours, | 528 sector_border_colours = colours, |
548 sector_labels = FALSE | 529 sector_labels = FALSE |
549 ) | 530 ) |
550 | 531 |
551 for (i in 1:length(proviruses_gr_features)) { | 532 for (i in 1:length(proviruses_gr_features)) { |
552 name <- as.character(seqnames(proviruses_gr_features[i])) | 533 name <- as.character(seqnames(proviruses_gr_features[i])) |
553 start <- as.numeric(start(proviruses_gr_features[i])) | 534 start <- as.numeric(start(proviruses_gr_features[i])) |
554 end <- as.numeric(end(proviruses_gr_features[i])) | 535 end <- as.numeric(end(proviruses_gr_features[i])) |
555 region <- data.frame(start = start, end = end) | 536 region <- data.frame(start = start, end = end) |
556 circos.genomicRect(seqnames = name, | 537 circlize::circos.genomicRect( |
557 region, | 538 seqnames = name, |
558 ytop = .5, | 539 region, |
559 ybottom = 0, | 540 ytop = .5, |
560 track.index = 1, | 541 ybottom = 0, |
561 sector.index = name, | 542 track.index = 1, |
562 border = "#e9d27d", | 543 sector.index = name, |
563 col = "#e9d27d") | 544 border = "#e9d27d", |
564 } | 545 col = "#e9d27d" |
565 | 546 ) |
566 length <- as.numeric(proviruses_gr_features$length) | 547 } |
567 length <- ifelse(length > 1000000, | 548 |
568 paste0(round(length/1000000, 2), "mb"), | 549 length <- as.numeric(proviruses_gr_features$length) |
569 paste0(round(length/1000, 2), "kb")) | 550 length <- ifelse(length > 1000000, |
570 labels <- paste0(as.character(seqnames(proviruses_gr_features)), " (", length, ")") | 551 paste0(round(length / 1000000, 2), "mb"), |
571 circos.labels(sectors = as.character(seqnames(proviruses_gr_features)), | 552 paste0(round(length / 1000, 2), "kb") |
572 x = as.numeric(start(proviruses_gr_features)), | 553 ) |
573 labels, | 554 labels <- paste0(as.character(seqnames(proviruses_gr_features)), " (", length, ")") |
574 facing = "clockwise") | 555 circlize::circos.labels( |
556 sectors = as.character(seqnames(proviruses_gr_features)), | |
557 x = as.numeric(start(proviruses_gr_features)), | |
558 labels, | |
559 facing = "clockwise" | |
560 ) | |
575 } | 561 } |
576 | 562 |
577 process_sample <- function(sample, combined_unique, host_genomes_paths, genome_data) { | 563 process_sample <- function(sample, combined_unique, host_genomes_paths, genome_data) { |
578 genome_current <- sample # Add this line | 564 genome_current <- sample # Add this line |
579 tryCatch({ | 565 tryCatch( |
580 log_debug(paste("Starting to process sample:", sample)) | 566 { |
581 | 567 log_debug(paste("Starting to process sample:", sample)) |
582 # Check if sample exists in genome_data | 568 |
583 if (!(sample %in% names(genome_data))) { | 569 # Check if sample exists in genome_data |
584 log_debug(paste("Sample", sample, "not found in genome_data")) | 570 if (!(sample %in% names(genome_data))) { |
585 cat(paste("Error: Sample", sample, "not found in genome_data\n\n")) | 571 log_debug(paste("Sample", sample, "not found in genome_data")) |
586 return() | 572 cat(paste("Error: Sample", sample, "not found in genome_data\n\n")) |
587 } | 573 return() |
588 | 574 } |
589 cat(paste("## ", sample, "{.tabset .tabset-fade} \n\n")) | 575 |
590 | 576 cat(paste("## ", sample, "{.tabset .tabset-fade} \n\n")) |
591 host_genome_path <- host_genomes_paths$path[host_genomes_paths$name == sample] | 577 |
592 if (length(host_genome_path) == 0) { | 578 host_genome_path <- host_genomes_paths$path[host_genomes_paths$name == sample] |
593 log_debug(paste("Host genome path not found for sample:", sample)) | 579 if (length(host_genome_path) == 0) { |
594 cat(paste("Error: Host genome path not found for sample", sample, "\n\n")) | 580 log_debug(paste("Host genome path not found for sample:", sample)) |
595 return() | 581 cat(paste("Error: Host genome path not found for sample", sample, "\n\n")) |
596 } | 582 return() |
597 | 583 } |
598 host_genome_ideogram <- tryCatch({ | 584 |
599 getIdeogramData(fasta_file = host_genome_path) | 585 host_genome_ideogram <- tryCatch( |
600 }, error = function(e) { | 586 { |
601 log_debug(paste("Error loading host genome ideogram for sample", sample, ":", conditionMessage(e))) | 587 gmoviz::getIdeogramData(fasta_file = host_genome_path) |
602 NULL | 588 }, |
603 }) | 589 error = function(e) { |
604 | 590 log_debug(paste("Error loading host genome ideogram for sample", sample, ":", conditionMessage(e))) |
605 if (is.null(host_genome_ideogram)) { | 591 NULL |
606 cat(paste("Error: Unable to load host genome ideogram for sample", sample, "\n\n")) | 592 } |
607 return() | 593 ) |
608 } | 594 |
609 | 595 if (is.null(host_genome_ideogram)) { |
610 sample_data <- genome_data[[sample]]$loaded_data | 596 cat(paste("Error: Unable to load host genome ideogram for sample", sample, "\n\n")) |
611 genomad_summary <- sample_data$genomad | 597 return() |
612 genomad_annotation <- sample_data$genomad_annotations | 598 } |
613 checkv_data <- sample_data$checkv | 599 |
614 defense_finder_data <- sample_data$defense_finder | 600 sample_data <- genome_data[[sample]]$loaded_data |
615 abricate_data <- sample_data$abricate | 601 genomad_summary <- sample_data$genomad |
616 iphop_data <- sample_data$iphop | 602 genomad_annotation <- sample_data$genomad_annotations |
617 vibrant_data <- sample_data$vibrant | 603 checkv_data <- sample_data$checkv |
618 | 604 defense_finder_data <- sample_data$defense_finder |
619 cat("### Host Genome\n\n") | 605 abricate_data <- sample_data$abricate |
620 | 606 iphop_data <- sample_data$iphop |
621 cat("**GTDB-Tk taxonomy**: \n\n") | 607 vibrant_data <- sample_data$vibrant |
622 data_gtdbtk_host %>% filter(user_genome == sample) %>% | 608 |
623 select(classification) %>% | 609 cat("### Host Genome\n\n") |
624 kbl() %>% | 610 |
625 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | 611 cat("**GTDB-Tk taxonomy**: \n\n") |
626 kable_paper("striped", full_width = TRUE) %>% | 612 data_gtdbtk_host %>% |
627 scroll_box(width = "100%", height = "100%") %>% | 613 dplyr::filter(user_genome == sample) %>% |
628 cat() | 614 dplyr::select(classification) %>% |
629 | 615 kableExtra::kbl() %>% |
630 # Cat checkm summary for this genome | 616 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% |
631 cat("**CheckM2 Summary**:\n\n") | 617 kableExtra::kable_paper("striped", full_width = TRUE) %>% |
632 #checkm_summary <- data_checkm_host %>% filter(`bin_id` == sample) | 618 kableExtra::scroll_box(width = "100%", height = "100%") %>% |
633 checkm_summary <- data_checkm_host %>% filter(`name` == sample) | 619 cat() |
634 checkm_summary %>% clean_names %>% | 620 |
635 #select(number_contigs, n50_contigs, completeness, contamination, strain_heterogeneity) %>% | 621 # Cat checkm summary for this genome |
636 select(total_contigs, contig_n50, completeness, contamination) %>% | 622 cat("**CheckM2 Summary**:\n\n") |
637 kbl() %>% | 623 # checkm_summary <- data_checkm_host %>% dplyr::filter(`bin_id` == sample) |
638 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | 624 checkm_summary <- data_checkm_host %>% dplyr::filter(`name` == sample) |
639 kable_paper("striped", full_width = TRUE) %>% | 625 checkm_summary %>% |
640 scroll_box(width = "100%", height = "100%") %>% | 626 janitor::clean_names() %>% |
641 cat() | 627 # dplyr::select(number_contigs, n50_contigs, completeness, contamination, strain_heterogeneity) %>% |
642 | 628 dplyr::select(total_contigs, contig_n50, completeness, contamination) %>% |
643 # Display defense-finder as a table | 629 kableExtra::kbl() %>% |
644 if (!is.null(defense_finder_data) && nrow(defense_finder_data) > 0) { | 630 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% |
645 cat("**Defense-Finder Systems**:\n\n") | 631 kableExtra::kable_paper("striped", full_width = TRUE) %>% |
646 | 632 kableExtra::scroll_box(width = "100%", height = "100%") %>% |
647 defense_finder_data %>% | 633 cat() |
648 select(sys_id, type, subtype, sys_beg, sys_end, protein_in_syst, genes_count, name_of_profiles_in_sys) %>% | 634 |
649 kbl() %>% | 635 # Display defense-finder as a table |
650 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | 636 if (!is.null(defense_finder_data) && nrow(defense_finder_data) > 0) { |
651 kable_paper("striped", full_width = TRUE) %>% | 637 cat("**Defense-Finder Systems**:\n\n") |
652 scroll_box(width = "100%", height = "100%") %>% | 638 |
639 defense_finder_data %>% | |
640 dplyr::select(sys_id, type, subtype, sys_beg, sys_end, protein_in_syst, genes_count, name_of_profiles_in_sys) %>% | |
641 kableExtra::kbl() %>% | |
642 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
643 kableExtra::kable_paper("striped", full_width = TRUE) %>% | |
644 kableExtra::scroll_box(width = "100%", height = "100%") %>% | |
645 cat() | |
646 } else { | |
647 cat("No Defense-Finder systems detected.\n\n") | |
648 } | |
649 | |
650 if (is.null(genomad_summary) || nrow(genomad_summary) == 0) { | |
651 q | |
652 log_debug(paste("No geNomad summary data found for sample:", sample)) | |
653 return() | |
654 } | |
655 | |
656 if (length(GenomeInfoDb::seqlevels(host_genome_ideogram)) == 1) { | |
657 host_genome_size <- sum(width(host_genome_ideogram)) | |
658 } else { | |
659 virus_containing_contigs <- unique(sub("\\|.*", "", genomad_summary$seq_name)) | |
660 virus_containing_contigs <- paste0("c_", as.numeric(factor(virus_containing_contigs))) | |
661 filtered_host_genome <- subset_and_update_ideogram(host_genome_ideogram, virus_containing_contigs) | |
662 host_genome_size <- sum(width(filtered_host_genome)) | |
663 } | |
664 | |
665 # Process proviruses | |
666 proviruses_gr_features <- process_proviruses(genomad_summary) | |
667 | |
668 cat("**Genomad and CheckV Summary**:\n\n") | |
669 genomad_summary %>% | |
670 dplyr::select(seq_name, taxonomy, topology, coordinates, length) %>% | |
671 dplyr::left_join( | |
672 checkv_data %>% dplyr::select(contig_id, gene_count, viral_genes, checkv_quality, miuvig_quality), | |
673 by = c("seq_name" = "contig_id") | |
674 ) %>% | |
675 dplyr::select(seq_name, length, gene_count, viral_genes, checkv_quality, miuvig_quality, taxonomy, topology, coordinates) %>% | |
676 kableExtra::kbl() %>% | |
677 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
678 kableExtra::kable_paper("striped", full_width = TRUE) %>% | |
679 kableExtra::scroll_box(width = "100%", height = "100%") %>% | |
680 cat() | |
681 | |
682 cat("**Host Genome Ideogram with Phages**:\n\n") | |
683 plot_genome_ideogram(sample, proviruses_gr_features) | |
684 cat('In this circular plot, **"c"** indicates the contig, and the number that follows (e.g., **c1**) represents the contig number. | |
685 If multiple contigs are present in the genome, each will be shown with a distinct label (e.g., **c1**, **c2**, etc.).\n\n') | |
686 cat("\n\n") | |
687 | |
688 # Process phage genomes | |
689 cat("### Prophages {.tabset .tabset-fade} \n\n") | |
690 cat("**Select prophage to show: ** \n\n") | |
691 for (i in seq_len(nrow(genomad_summary))) { | |
692 log_debug(paste("Processing phage", i, "of", nrow(genomad_summary), "for sample", sample)) | |
693 process_phage(genomad_summary[i, ], genomad_summary, genomad_annotation, checkv_data, host_genome_size) | |
694 } | |
695 | |
696 # Plot dREP if applicable | |
697 if (nrow(genomad_summary) > 1) { | |
698 cat("### vOTUs\n\n") | |
699 plot_drep(sample, genomad_summary) | |
700 } | |
701 | |
702 # Creating table with Abricate data | |
703 if (nrow(abricate_data) > 0) { | |
704 cat("### Virulence Genes {.tabset .tabset-fade} \n\n") | |
705 cat("Screening of virulence genes present in the prophage contigs. \n\n") | |
706 abricate_data %>% | |
707 dplyr::select(-number_file) %>% | |
708 kableExtra::kbl() %>% | |
709 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
710 kableExtra::kable_paper("striped", full_width = TRUE) %>% | |
711 kableExtra::scroll_box(width = "100%", height = "100%") %>% | |
712 cat() | |
713 cat("\n\n") | |
714 } | |
715 | |
716 # Creating table with iPHOP | |
717 if (nrow(iphop_data) > 0) { | |
718 cat("### Prophage-Host Prediction {.tabset .tabset-fade} \n\n") | |
719 cat("Prediction of potential hosts for the prophage contigs. \n\n") | |
720 iphop_data %>% | |
721 kableExtra::kbl() %>% | |
722 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
723 kableExtra::kable_paper("striped", full_width = TRUE) %>% | |
724 kableExtra::scroll_box(width = "100%", height = "100%") %>% | |
725 cat() | |
726 cat("\n\n") | |
727 } | |
728 | |
729 # Creating table with VIBRANT AMGs | |
730 if (nrow(vibrant_data) > 0) { | |
731 cat("### AMG Predictions {.tabset .tabset-fade} \n\n") | |
732 cat("Prediction of auxiliary metabolic genes in the prophage contigs. \n\n") | |
733 vibrant_data %>% | |
734 kableExtra::kbl() %>% | |
735 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
736 kableExtra::kable_paper("striped", full_width = TRUE) %>% | |
737 kableExtra::scroll_box(width = "100%", height = "100%") %>% | |
738 cat() | |
739 cat("\n\n") | |
740 } | |
741 | |
742 log_debug(paste("Finished processing sample:", sample)) | |
743 }, | |
744 error = function(e) { | |
745 log_debug(paste("Error in process_sample for", sample, ":", conditionMessage(e))) | |
746 cat(paste("Error processing sample", sample, ":", conditionMessage(e), "\n\n")) | |
747 } | |
748 ) | |
749 } | |
750 | |
751 process_phage <- function(virus, genomad_summary, genomad_annotation, checkv_data, host_genome_size) { | |
752 cat(paste("#### Phage ID:", virus$seq_name, " {.tabset .tabset-fade} \n\n")) | |
753 | |
754 current_contig <- sub("\\|.*", "", virus$seq_name) | |
755 | |
756 provirus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", virus$seq_name)) | |
757 provirus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", virus$seq_name)) | |
758 virus_length <- provirus_end - provirus_start + 1 | |
759 | |
760 current_contig_base <- sub("\\|provirus_.*", "", virus$seq_name) | |
761 current_provirus_range <- sub(".*\\|provirus_", "", virus$seq_name) | |
762 current_annotations <- genomad_annotation[grepl(paste0(current_contig_base, "\\|provirus_", current_provirus_range, "_"), | |
763 genomad_annotation$gene, | |
764 fixed = FALSE | |
765 ), ] %>% | |
766 dplyr::mutate(arrow_pos = ifelse(strand == -1, "start", "end")) | |
767 | |
768 | |
769 cat("\n\n**Phage-Host Genome Ideogram:**\n\n") | |
770 | |
771 plot_phage_circos(virus, genomad_summary, current_annotations, virus_length, host_genome_size, provirus_start, provirus_end, checkv_data) | |
772 | |
773 cat("\n\n") | |
774 cat("\n\n**Genes Annotation (geNomad):**\n\n") | |
775 | |
776 current_annotations %>% | |
777 dplyr::select(gene, length, marker, annotation_accessions, annotation_description) %>% | |
778 kableExtra::kbl() %>% | |
779 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
780 kableExtra::kable_paper() %>% | |
653 cat() | 781 cat() |
654 } else { | |
655 cat("No Defense-Finder systems detected.\n\n") | |
656 } | |
657 | |
658 if (is.null(genomad_summary) || nrow(genomad_summary) == 0) { | |
659 log_debug(paste("No geNomad summary data found for sample:", sample)) | |
660 return() | |
661 } | |
662 | |
663 if (length(seqlevels(host_genome_ideogram)) == 1) { | |
664 host_genome_size <- sum(width(host_genome_ideogram)) | |
665 } else { | |
666 virus_containing_contigs <- unique(sub("\\|.*", "", genomad_summary$seq_name)) | |
667 virus_containing_contigs <- paste0("c_", as.numeric(factor(virus_containing_contigs))) | |
668 filtered_host_genome <- subset_and_update_ideogram(host_genome_ideogram, virus_containing_contigs) | |
669 host_genome_size <- sum(width(filtered_host_genome)) | |
670 } | |
671 | |
672 # Process proviruses | |
673 proviruses_gr_features <- process_proviruses(genomad_summary) | |
674 | |
675 cat("**Genomad and CheckV Summary**:\n\n") | |
676 genomad_summary %>% | |
677 select(seq_name, taxonomy, topology, coordinates, length) %>% | |
678 left_join( | |
679 checkv_data %>% select(contig_id, gene_count, viral_genes, checkv_quality, miuvig_quality), | |
680 by = c("seq_name" = "contig_id")) %>% | |
681 select(seq_name, length, gene_count, viral_genes, checkv_quality, miuvig_quality, taxonomy, topology, coordinates) %>% | |
682 kbl() %>% | |
683 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
684 kable_paper("striped", full_width = TRUE) %>% | |
685 scroll_box(width = "100%", height = "100%") %>% | |
686 cat() | |
687 | |
688 cat("**Host Genome Ideogram with Phages**:\n\n") | |
689 plot_genome_ideogram(sample, proviruses_gr_features) | |
690 cat('In this circular plot, **"c"** indicates the contig, and the number that follows (e.g., **c1**) represents the contig number. | |
691 If multiple contigs are present in the genome, each will be shown with a distinct label (e.g., **c1**, **c2**, etc.).\n\n') | |
692 cat("\n\n") | 782 cat("\n\n") |
693 | |
694 # Process phage genomes | |
695 cat("### Prophages {.tabset .tabset-fade} \n\n") | |
696 cat("**Select prophage to show: ** \n\n") | |
697 for (i in seq_len(nrow(genomad_summary))) { | |
698 log_debug(paste("Processing phage", i, "of", nrow(genomad_summary), "for sample", sample)) | |
699 process_phage(genomad_summary[i, ], genomad_summary, genomad_annotation, checkv_data, host_genome_size) | |
700 } | |
701 | |
702 # Plot dREP if applicable | |
703 if (nrow(genomad_summary) > 1) { | |
704 cat("### vOTUs\n\n") | |
705 plot_drep(sample, genomad_summary) | |
706 } | |
707 | |
708 # Creating table with Abricate data | |
709 if (nrow(abricate_data) > 0) { | |
710 cat("### Virulence Genes {.tabset .tabset-fade} \n\n") | |
711 cat("Screening of virulence genes present in the prophage contigs. \n\n") | |
712 abricate_data %>% select(-number_file) %>% | |
713 kbl() %>% | |
714 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
715 kable_paper("striped", full_width = TRUE) %>% | |
716 scroll_box(width = "100%", height = "100%") %>% cat() | |
717 cat("\n\n") | |
718 } | |
719 | |
720 # Creating table with iPHOP | |
721 if (nrow(iphop_data) > 0) { | |
722 cat("### Prophage-Host Prediction {.tabset .tabset-fade} \n\n") | |
723 cat("Prediction of potential hosts for the prophage contigs. \n\n") | |
724 iphop_data %>% | |
725 kbl() %>% | |
726 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
727 kable_paper("striped", full_width = TRUE) %>% | |
728 scroll_box(width = "100%", height = "100%") %>% cat() | |
729 cat("\n\n") | |
730 } | |
731 | |
732 # Creating table with VIBRANT AMGs | |
733 if (nrow(vibrant_data) > 0) { | |
734 cat("### AMG Predictions {.tabset .tabset-fade} \n\n") | |
735 cat("Prediction of auxiliary metabolic genes in the prophage contigs. \n\n") | |
736 vibrant_data %>% | |
737 kbl() %>% | |
738 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
739 kable_paper("striped", full_width = TRUE) %>% | |
740 scroll_box(width = "100%", height = "100%") %>% cat() | |
741 cat("\n\n") | |
742 } | |
743 | |
744 log_debug(paste("Finished processing sample:", sample)) | |
745 }, error = function(e) { | |
746 log_debug(paste("Error in process_sample for", sample, ":", conditionMessage(e))) | |
747 cat(paste("Error processing sample", sample, ":", conditionMessage(e), "\n\n")) | |
748 }) | |
749 } | |
750 | |
751 process_phage <- function(virus, genomad_summary, genomad_annotation, checkv_data, host_genome_size) { | |
752 cat(paste("#### Phage ID:", virus$seq_name, " {.tabset .tabset-fade} \n\n")) | |
753 | |
754 current_contig <- sub("\\|.*", "", virus$seq_name) | |
755 | |
756 provirus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", virus$seq_name)) | |
757 provirus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", virus$seq_name)) | |
758 virus_length <- provirus_end - provirus_start + 1 | |
759 | |
760 current_contig_base <- sub("\\|provirus_.*", "", virus$seq_name) | |
761 current_provirus_range <- sub(".*\\|provirus_", "", virus$seq_name) | |
762 current_annotations <- genomad_annotation[grepl(paste0(current_contig_base, "\\|provirus_", current_provirus_range, "_"), | |
763 genomad_annotation$gene, fixed = FALSE), ] %>% | |
764 mutate(arrow_pos = ifelse(strand == -1, "start", "end")) | |
765 | |
766 | |
767 cat("\n\n**Phage–Host Genome Ideogram:**\n\n") | |
768 | |
769 plot_phage_circos(virus, genomad_summary, current_annotations, virus_length, host_genome_size, provirus_start, provirus_end, checkv_data) | |
770 | |
771 cat("\n\n") | |
772 cat("\n\n**Genes Annotation (geNomad):**\n\n") | |
773 | |
774 current_annotations %>% | |
775 select(gene, length, marker, annotation_accessions, annotation_description) %>% | |
776 kbl() %>% | |
777 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | |
778 kable_paper() %>% | |
779 cat() | |
780 cat("\n\n") | |
781 } | 783 } |
782 | 784 |
783 plot_phage_circos <- function(virus, genomad_summary, current_annotations, virus_length, host_genome_size, provirus_start, provirus_end, checkv_data) { | 785 plot_phage_circos <- function(virus, genomad_summary, current_annotations, virus_length, host_genome_size, provirus_start, provirus_end, checkv_data) { |
784 tryCatch({ | 786 tryCatch( |
785 log_debug("Starting plot_phage_circos function") | 787 { |
786 log_debug(paste("Current virus:", virus$seq_name)) | 788 log_debug("Starting plot_phage_circos function") |
787 log_debug(paste("Virus length:", virus_length)) | 789 log_debug(paste("Current virus:", virus$seq_name)) |
788 log_debug(paste("Host genome size:", host_genome_size)) | 790 log_debug(paste("Virus length:", virus_length)) |
789 log_debug(paste("Provirus start:", provirus_start)) | 791 log_debug(paste("Host genome size:", host_genome_size)) |
790 log_debug(paste("Provirus end:", provirus_end)) | 792 log_debug(paste("Provirus start:", provirus_start)) |
791 | 793 log_debug(paste("Provirus end:", provirus_end)) |
792 # Check for NA or invalid values in input parameters | 794 |
793 if (is.na(virus_length) || virus_length <= 0) { | 795 # Check for NA or invalid values in input parameters |
794 log_debug("Error: Invalid virus length") | 796 if (is.na(virus_length) || virus_length <= 0) { |
795 return(NULL) | 797 log_debug("Error: Invalid virus length", virus_length) |
796 } | 798 return(NULL) |
797 if (is.na(host_genome_size) || host_genome_size <= 0) { | 799 } |
798 log_debug("Error: Invalid host genome size") | 800 if (is.na(host_genome_size) || host_genome_size <= 0) { |
799 return(NULL) | 801 log_debug("Error: Invalid host genome size") |
800 } | 802 return(NULL) |
801 if (is.na(provirus_start) || provirus_start < 0) { | 803 } |
802 log_debug("Error: Invalid provirus start position") | 804 if (is.na(provirus_start) || provirus_start < 0) { |
803 return(NULL) | 805 log_debug("Error: Invalid provirus start position") |
804 } | 806 return(NULL) |
805 if (is.na(provirus_end) || provirus_end <= provirus_start) { | 807 } |
806 log_debug("Error: Invalid provirus end position") | 808 if (is.na(provirus_end) || provirus_end <= provirus_start) { |
807 return(NULL) | 809 log_debug("Error: Invalid provirus end position") |
808 } | 810 return(NULL) |
809 | 811 } |
810 # Extract contig information | 812 |
811 current_contig <- sub("\\|.*", "", virus$seq_name) | 813 # Extract contig information |
812 log_debug(paste("Current contig:", current_contig)) | 814 current_contig <- sub("\\|.*", "", virus$seq_name) |
813 | 815 log_debug(paste("Current contig:", current_contig)) |
814 contig_viruses <- genomad_summary[grepl(paste0("^", current_contig), genomad_summary$seq_name), ] | 816 |
815 if (nrow(contig_viruses) == 0) { | 817 contig_viruses <- genomad_summary[grepl(paste0("^", current_contig), genomad_summary$seq_name), ] |
816 log_debug("Error: No viruses found for the current contig") | 818 if (nrow(contig_viruses) == 0) { |
817 return(NULL) | 819 log_debug("Error: No viruses found for the current contig") |
818 } | 820 return(NULL) |
819 | 821 } |
820 contig_length <- max(as.numeric(sub(".*_(\\d+)$", "\\1", contig_viruses$seq_name))) | 822 |
821 if (is.na(contig_length) || contig_length <= 0) { | 823 contig_length <- max(as.numeric(sub(".*_(\\d+)$", "\\1", contig_viruses$seq_name))) |
822 contig_length <- virus_length # Use virus length as fallback if contig length is invalid | 824 if (is.na(contig_length) || contig_length <= 0) { |
823 log_debug(paste("Using virus length as contig length:", contig_length)) | 825 contig_length <- virus_length # Use virus length as fallback if contig length is invalid |
824 } else { | 826 log_debug(paste("Using virus length as contig length:", contig_length)) |
825 log_debug(paste("Contig length:", contig_length)) | 827 } else { |
826 } | 828 log_debug(paste("Contig length:", contig_length)) |
827 | 829 } |
828 if (provirus_end > contig_length) { | 830 |
829 log_debug("Error: Provirus end position exceeds contig length") | 831 if (provirus_end > contig_length) { |
830 return(NULL) | 832 log_debug("Error: Provirus end position exceeds contig length") |
831 } | 833 return(NULL) |
832 | 834 } |
833 log_debug("Clearing circos") | 835 |
834 circos.clear() | 836 log_debug("Clearing circos") |
835 | 837 circlize::circos.clear() |
836 log_debug("Setting circos parameters") | 838 |
837 circos.par(start.degree = 180, gap.degree = 10, track.margin = c(0.01, 0.01)) | 839 log_debug("Setting circos parameters") |
838 | 840 circlize::circos.par(start.degree = 180, gap.degree = 10, track.margin = c(0.01, 0.01)) |
839 main_color <- "#a58bc5" | 841 |
840 zoom_color <- "#e9d27d" | 842 main_color <- "#a58bc5" |
841 | 843 zoom_color <- "#e9d27d" |
842 zoom_start <- (provirus_start / contig_length) * 100 | 844 |
843 zoom_end <- (provirus_end / contig_length) * 100 | 845 zoom_start <- (provirus_start / contig_length) * 100 |
844 | 846 zoom_end <- (provirus_end / contig_length) * 100 |
845 log_debug(paste("Zoom start:", zoom_start)) | 847 |
846 log_debug(paste("Zoom end:", zoom_end)) | 848 log_debug(paste("Zoom start:", zoom_start)) |
847 | 849 log_debug(paste("Zoom end:", zoom_end)) |
848 log_debug("Initializing circos") | 850 |
849 circos.initialize(factors = c("Zoom", "Main"), xlim = c(0, 100)) | 851 log_debug("Initializing circos") |
850 | 852 circlize::circos.initialize(factors = c("Zoom", "Main"), xlim = c(0, 100)) |
851 format_genome_labels <- function(x) { | 853 |
852 ifelse(x >= 1e6, paste0(round(x / 1e6, 2), " Mb"), | 854 format_genome_labels <- function(x) { |
853 ifelse(x >= 1e3, paste0(round(x / 1e3, 2), " Kb"), | 855 ifelse(x >= 1e6, paste0(round(x / 1e6, 2), " Mb"), |
854 paste0(x, " bp"))) | 856 ifelse(x >= 1e3, paste0(round(x / 1e3, 2), " Kb"), |
855 } | 857 paste0(x, " bp") |
856 | 858 ) |
857 log_debug("Adding link") | 859 ) |
858 tryCatch({ | 860 } |
859 circos.link("Main", c(zoom_start, zoom_end), "Zoom", c(0, 100), | 861 |
860 rou1 = 0.8, | 862 log_debug("Adding link") |
861 rou2 = 0.97, | 863 tryCatch( |
862 h.ratio = 0.55, # width? | 864 { |
863 lty = 2, | 865 circlize::circos.link("Main", c(zoom_start, zoom_end), "Zoom", c(0, 100), |
864 lwd = 0.5, | 866 rou1 = 0.8, |
865 h2 = 1, | 867 rou2 = 0.97, |
866 col = "grey99", border = "grey80") | 868 h.ratio = 0.55, # width? |
867 }, error = function(e) { | 869 lty = 2, |
868 log_debug(paste("Error in circos.link:", e$message)) | 870 lwd = 0.5, |
869 }) | 871 h2 = 1, |
870 | 872 col = "grey99", border = "grey80" |
871 log_debug("Adding zoom track") | 873 ) |
872 circos.track(factors = "Zoom", ylim = c(0, 1), track.height = 0.15, | 874 }, |
873 panel.fun = function(x, y) { | 875 error = function(e) { |
874 circos.rect(0, 0, 100, 1, col = zoom_color, border = NA) | 876 log_debug(paste("Error in circlize::circos.link:", e$message)) |
875 axis_labels <- seq(0, virus_length, length.out = 6) | 877 } |
876 axis_positions <- seq(0, 100, length.out = 6) | 878 ) |
877 circos.axis(h = "top", major.at = axis_positions, | 879 |
878 labels = format_genome_labels(axis_labels), | 880 log_debug("Adding zoom track") |
879 labels.cex = 0.7, direction = "outside") | 881 circlize::circos.track( |
880 | 882 factors = "Zoom", ylim = c(0, 1), track.height = 0.15, |
881 for (i in 1:nrow(current_annotations)) { | 883 panel.fun = function(x, y) { |
882 gene_start <- current_annotations$start[i] | 884 circlize::circos.rect(0, 0, 100, 1, col = zoom_color, border = NA) |
883 gene_end <- current_annotations$end[i] | 885 axis_labels <- seq(0, virus_length, length.out = 6) |
884 arrow_start <- (gene_start - provirus_start) / virus_length * 100 | 886 axis_positions <- seq(0, 100, length.out = 6) |
885 arrow_end <- (gene_end - provirus_start) / virus_length * 100 | 887 circlize::circos.axis( |
886 | 888 h = "top", major.at = axis_positions, |
887 circos.arrow(arrow_start, arrow_end, y1 = 0, y2 = 1, | 889 labels = format_genome_labels(axis_labels), |
888 arrow.head.width = 0.75, arrow.head.length = cm_x(0.1), | 890 labels.cex = 0.7, direction = "outside" |
889 arrow.position = current_annotations$arrow_pos[i], | 891 ) |
890 col = ifelse(is.na(current_annotations$annotation_description[i]), "grey", "#7fbfff"), | 892 |
891 border = ifelse(is.na(current_annotations$annotation_description[i]), "grey20", "darkblue")) | 893 for (i in 1:nrow(current_annotations)) { |
892 } | 894 gene_start <- current_annotations$start[i] |
893 }, bg.border = NA) | 895 gene_end <- current_annotations$end[i] |
894 | 896 arrow_start <- (gene_start - provirus_start) / virus_length * 100 |
895 log_debug("Adding main track") | 897 arrow_end <- (gene_end - provirus_start) / virus_length * 100 |
896 circos.track(factors = "Main", ylim = c(0, 1), track.height = 0.1, | 898 |
897 panel.fun = function(x, y) { | 899 circlize::circos.arrow(arrow_start, arrow_end, |
898 circos.rect(xleft = 0, ybottom = 0, xright = 100, ytop = 1, col = main_color, border = NA) | 900 arrow.head.width = 0.75, arrow.head.length = cm_x(0.1), |
899 | 901 arrow.position = current_annotations$arrow_pos[i], |
900 for (i in 1:nrow(contig_viruses)) { | 902 col = ifelse(is.na(current_annotations$annotation_description[i]), "grey", "#7fbfff"), |
901 virus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", contig_viruses$seq_name[i])) | 903 border = ifelse(is.na(current_annotations$annotation_description[i]), "grey20", "darkblue") |
902 virus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", contig_viruses$seq_name[i])) | 904 ) |
903 | 905 } |
904 virus_start_percent <- (virus_start / contig_length) * 100 | 906 }, bg.border = NA |
905 virus_end_percent <- (virus_end / contig_length) * 100 | 907 ) |
906 | 908 |
907 rect_color <- if (contig_viruses$seq_name[i] == virus$seq_name) zoom_color else adjustcolor(zoom_color, alpha.f = 0.7) | 909 log_debug("Adding main track") |
908 | 910 circlize::circos.track( |
909 circos.rect(xleft = virus_start_percent, ybottom = 0, | 911 factors = "Main", ylim = c(0, 1), track.height = 0.1, |
910 xright = virus_end_percent, ytop = 1, | 912 panel.fun = function(x, y) { |
911 col = rect_color, border = NA) | 913 circlize::circos.rect(xleft = 0, ybottom = 0, xright = 100, ytop = 1, col = main_color, border = NA) |
912 } | 914 |
913 | 915 for (i in 1:nrow(contig_viruses)) { |
914 axis_labels <- seq(0, contig_length, length.out = 6) | 916 virus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", contig_viruses$seq_name[i])) |
915 axis_positions <- seq(0, 100, length.out = 6) | 917 virus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", contig_viruses$seq_name[i])) |
916 circos.axis(h = "top", major.at = axis_positions, | 918 |
917 labels = format_genome_labels(axis_labels), | 919 virus_start_percent <- (virus_start / contig_length) * 100 |
918 labels.cex = 0.7, direction = "outside") | 920 virus_end_percent <- (virus_end / contig_length) * 100 |
919 }, bg.border = NA) | 921 |
920 | 922 rect_color <- if (contig_viruses$seq_name[i] == virus$seq_name) zoom_color else adjustcolor(zoom_color, alpha.f = 0.7) |
921 log_debug("Locating phage positions") | 923 |
922 phage_positions <- sapply(1:nrow(contig_viruses), function(i) { | 924 circlize::circos.rect( |
923 virus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", contig_viruses$seq_name[i])) | 925 xleft = virus_start_percent, ybottom = 0, |
924 virus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", contig_viruses$seq_name[i])) | 926 xright = virus_end_percent, ytop = 1, |
925 ((virus_start + virus_end) / 2 / contig_length) * 100 | 927 col = rect_color, border = NA |
926 }) | 928 ) |
927 | 929 } |
928 log_debug("Annotating names on phage positions") | 930 |
929 | 931 axis_labels <- seq(0, contig_length, length.out = 6) |
930 # Extract start and end positions from sequence names | 932 axis_positions <- seq(0, 100, length.out = 6) |
931 start_positions <- as.numeric(sub(".*provirus_([0-9]+)_.*", "\\1", contig_viruses$seq_name)) | 933 circlize::circos.axis( |
932 end_positions <- as.numeric(sub(".*provirus_[0-9]+_([0-9]+)", "\\1", contig_viruses$seq_name)) | 934 h = "top", major.at = axis_positions, |
933 | 935 labels = format_genome_labels(axis_labels), |
934 # Create phage labels with the desired format | 936 labels.cex = 0.7, direction = "outside" |
935 phage_labels <- paste0(round(contig_viruses$length / 1e3, 2), " Kb") | 937 ) |
936 | 938 }, bg.border = NA |
937 # Apply labels to circos plot | 939 ) |
938 circos.labels( | 940 |
939 sectors = "Main", | 941 log_debug("Locating phage positions") |
940 x = phage_positions, | 942 phage_positions <- sapply(1:nrow(contig_viruses), function(i) { |
941 labels = phage_labels, | 943 virus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", contig_viruses$seq_name[i])) |
942 facing = "reverse.clockwise", | 944 virus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", contig_viruses$seq_name[i])) |
943 niceFacing = TRUE, | 945 ((virus_start + virus_end) / 2 / contig_length) * 100 |
944 col = "black", | 946 }) |
945 cex = 0.6, | 947 |
946 side = "inside", | 948 log_debug("Annotating names on phage positions") |
947 connection_height = 0.02, | 949 |
948 line_col = "gray" | 950 # Extract start and end positions from sequence names |
949 ) | 951 start_positions <- as.numeric(sub(".*provirus_([0-9]+)_.*", "\\1", contig_viruses$seq_name)) |
950 | 952 end_positions <- as.numeric(sub(".*provirus_[0-9]+_([0-9]+)", "\\1", contig_viruses$seq_name)) |
951 center_x <- 50 | 953 |
952 virus_name <- virus$seq_name | 954 # Create phage labels with the desired format |
953 taxonomy <- virus$taxonomy | 955 phage_labels <- paste0(round(contig_viruses$length / 1e3, 2), " Kb") |
954 | 956 |
955 log_debug("Adding taxonomy and virus name to the plot") | 957 # Apply labels to circos plot |
956 circos.text(x = center_x, y = -0.2, labels = taxonomy, | 958 circlize::circos.labels( |
959 sectors = "Main", | |
960 x = phage_positions, | |
961 labels = phage_labels, | |
962 facing = "reverse.clockwise", | |
963 niceFacing = TRUE, | |
964 col = "black", | |
965 cex = 0.6, | |
966 side = "inside", | |
967 connection_height = 0.02, | |
968 line_col = "gray" | |
969 ) | |
970 | |
971 center_x <- 50 | |
972 virus_name <- virus$seq_name | |
973 taxonomy <- virus$taxonomy | |
974 | |
975 log_debug("Adding taxonomy and virus name to the plot") | |
976 circlize::circos.text( | |
977 x = center_x, y = -0.2, labels = taxonomy, | |
957 sector.index = "Zoom", track.index = 1, | 978 sector.index = "Zoom", track.index = 1, |
958 facing = "bending.inside", niceFacing = TRUE, | 979 facing = "bending.inside", niceFacing = TRUE, |
959 adj = c(0.5, 0.7), cex = 0.8) | 980 adj = c(0.5, 0.7), cex = 0.8 |
960 | 981 ) |
961 checkv_info <- checkv_data[checkv_data$contig_id == virus$seq_name, ] | 982 |
962 if (nrow(checkv_info) > 0) { | 983 checkv_info <- checkv_data[checkv_data$contig_id == virus$seq_name, ] |
963 checkv_quality <- checkv_info$checkv_quality | 984 if (nrow(checkv_info) > 0) { |
964 gene_count <- checkv_info$gene_count | 985 checkv_quality <- checkv_info$checkv_quality |
965 viral_genes <- checkv_info$viral_genes | 986 gene_count <- checkv_info$gene_count |
966 host_genes <- checkv_info$host_genes | 987 viral_genes <- checkv_info$viral_genes |
967 miuvig_quality <- checkv_info$miuvig_quality | 988 host_genes <- checkv_info$host_genes |
968 completeness <- checkv_info$completeness | 989 miuvig_quality <- checkv_info$miuvig_quality |
969 completeness_method <- checkv_info$completeness_method | 990 completeness <- checkv_info$completeness |
970 contamination <- checkv_info$contamination | 991 completeness_method <- checkv_info$completeness_method |
971 | 992 contamination <- checkv_info$contamination |
972 circos.text( | 993 |
973 x = center_x, y = -0.5, | 994 circlize::circos.text( |
974 labels = paste("CheckV Quality:", checkv_quality, " - miuvig Quality:", miuvig_quality), | 995 x = center_x, y = -0.5, |
975 sector.index = "Zoom", track.index = 2, | 996 labels = paste("CheckV Quality:", checkv_quality, " - miuvig Quality:", miuvig_quality), |
976 facing = "bending.inside", niceFacing = TRUE, | 997 sector.index = "Zoom", track.index = 2, |
977 adj = c(0.5, 0), cex = 0.7 | 998 facing = "bending.inside", niceFacing = TRUE, |
978 ) | 999 adj = c(0.5, 0), cex = 0.7 |
979 | 1000 ) |
980 circos.text( | 1001 |
981 x = center_x, y = -1.5, | 1002 circlize::circos.text( |
982 labels = paste("Gene Count:", gene_count, " - Viral Genes:", viral_genes, " - Host Genes:", host_genes), | 1003 x = center_x, y = -1.5, |
983 sector.index = "Zoom", track.index = 2, | 1004 labels = paste("Gene Count:", gene_count, " - Viral Genes:", viral_genes, " - Host Genes:", host_genes), |
984 facing = "bending.inside", niceFacing = TRUE, | 1005 sector.index = "Zoom", track.index = 2, |
985 adj = c(0.5, 0), cex = 0.7 | 1006 facing = "bending.inside", niceFacing = TRUE, |
986 ) | 1007 adj = c(0.5, 0), cex = 0.7 |
987 | 1008 ) |
988 circos.text( | 1009 |
989 x = center_x, y = -2.5, | 1010 circlize::circos.text( |
990 labels = paste("Completeness:", completeness, " - Contamination:", contamination), | 1011 x = center_x, y = -2.5, |
991 sector.index = "Zoom", track.index = 2, | 1012 labels = paste("Completeness:", completeness, " - Contamination:", contamination), |
992 facing = "bending.inside", niceFacing = TRUE, | 1013 sector.index = "Zoom", track.index = 2, |
993 adj = c(0.5, 0), cex = 0.7 | 1014 facing = "bending.inside", niceFacing = TRUE, |
994 ) | 1015 adj = c(0.5, 0), cex = 0.7 |
995 } | 1016 ) |
996 | 1017 } |
997 log_debug("Adding legend") | 1018 |
998 # Add legend | 1019 log_debug("Adding legend") |
999 legend("topright", | 1020 # Add legend |
1000 legend = c("Annotated gene", "Unknown gene"), | 1021 legend("topright", |
1001 fill = c("#7fbfff", "grey"), | 1022 legend = c("Annotated gene", "Unknown gene"), |
1002 border = c("darkblue", "grey20"), | 1023 fill = c("#7fbfff", "grey"), |
1003 cex = 0.8, | 1024 border = c("darkblue", "grey20"), |
1004 bty = "n") | 1025 cex = 0.8, |
1005 | 1026 bty = "n" |
1006 log_debug("Clearing circos") | 1027 ) |
1007 circos.clear() | 1028 |
1008 log_debug("Finished plot_phage_circos function successfully") | 1029 log_debug("Clearing circos") |
1009 }, error = function(e) { | 1030 circlize::circos.clear() |
1010 log_debug(paste("Error in plot_phage_circos:", e$message)) | 1031 log_debug("Finished plot_phage_circos function successfully") |
1011 circos.clear() | 1032 }, |
1012 }) | 1033 error = function(e) { |
1034 log_debug(paste("Error in plot_phage_circos:", e$message)) | |
1035 circlize::circos.clear() | |
1036 } | |
1037 ) | |
1013 } | 1038 } |
1014 | 1039 |
1015 plot_drep <- function(sample, genomad_summary) { | 1040 plot_drep <- function(sample, genomad_summary) { |
1016 drep_file_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "data_tables", "Cdb.csv") | 1041 drep_file_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "data_tables", "Cdb.csv") |
1017 drep_data <- read_csv(drep_file_path) %>% clean_names() | 1042 drep_data <- read_csv(drep_file_path) %>% janitor::clean_names() |
1018 drep_data <- cbind(genomad_summary$seq_name, drep_data) | 1043 drep_data <- cbind(genomad_summary$seq_name, drep_data) |
1019 | 1044 |
1020 cat("When more than 1 phage is detected in the host genome, we perform a clustering step using the tool dRep.\n\n") | 1045 cat("When more than 1 phage is detected in the host genome, we perform a clustering step using the tool dRep.\n\n") |
1021 cat("A threshold of 0.95 was applied to the ANI similarity index to define clusters of virus operational taxonomic units (vOTUs).") | 1046 cat("A threshold of 0.95 was applied to the ANI similarity index to define clusters of virus operational taxonomic units (vOTUs).") |
1022 | 1047 |
1023 cat("\n\n**Final cluster designations**\n\n") | 1048 cat("\n\n**Final cluster designations**\n\n") |
1024 drep_data %>% | 1049 drep_data %>% |
1025 kbl() %>% | 1050 kableExtra::kbl() %>% |
1026 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% | 1051 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% |
1027 kable_paper("striped", full_width = TRUE) %>% | 1052 kableExtra::kable_paper("striped", full_width = TRUE) %>% |
1028 cat() | 1053 cat() |
1029 | 1054 |
1030 # Insert the PDF plot | 1055 # Insert the PDF plot |
1031 plot_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "figures", "Primary_clustering_dendrogram.pdf") | 1056 plot_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "figures", "Primary_clustering_dendrogram.pdf") |
1032 png_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "figures", "Primary_clustering_dendrogram.png") | 1057 png_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "figures", "Primary_clustering_dendrogram.png") |
1033 | 1058 |
1034 if (file.exists(plot_path)) { | 1059 if (file.exists(plot_path)) { |
1035 pdftools::pdf_convert(plot_path, format = "png", filenames = png_path, verbose = FALSE, dpi=150) | 1060 pdftools::pdf_convert(plot_path, format = "png", filenames = png_path, verbose = FALSE, dpi = 150) |
1036 base64_str <- base64enc::dataURI(file = png_path, mime = "image/png") | 1061 base64_str <- base64enc::dataURI(file = png_path, mime = "image/png") |
1037 cat("**Primary clustering plot**\n\n") | 1062 cat("**Primary clustering plot**\n\n") |
1038 cat(sprintf( | 1063 cat(sprintf( |
1039 '<div style="text-align: center;"><img src="%s" style="max-width: 100%%; height: auto; border:1px solid #ddd; border-radius:4px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); margin: 1em 0;"></div>',base64_str | 1064 '<div style="text-align: center;"><img src="%s" style="max-width: 100%%; height: auto; border:1px solid #ddd; border-radius:4px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); margin: 1em 0;"></div>', base64_str |
1040 )) | 1065 )) |
1041 } else { | 1066 } else { |
1042 cat("**No dRep clustering plot found.**\n\n") | 1067 cat("**No dRep clustering plot found.**\n\n") |
1043 } | 1068 } |
1044 | |
1045 } | 1069 } |
1046 | 1070 |
1047 subset_and_update_ideogram <- function(ideogram, contigs) { | 1071 subset_and_update_ideogram <- function(ideogram, contigs) { |
1048 filtered <- ideogram[seqnames(ideogram) %in% contigs] | 1072 filtered <- ideogram[seqnames(ideogram) %in% contigs] |
1049 seqlevels(filtered) <- contigs | 1073 GenomeInfoDb::seqlevels(filtered) <- contigs |
1050 seqinfo(filtered) <- seqinfo(filtered)[contigs] | 1074 seqinfo(filtered) <- seqinfo(filtered)[contigs] |
1051 filtered | 1075 filtered |
1052 } | 1076 } |
1053 | 1077 |
1054 render_all_samples <- function(test_mode = FALSE) { | 1078 render_all_samples <- function(test_mode = FALSE) { |
1055 if (test_mode) { | 1079 if (test_mode) { |
1056 if (nrow(combined_unique) > 0) { | 1080 if (nrow(combined_unique) > 0) { |
1057 cat("**Select sample to show:** \n\n\n") | 1081 cat("**Select sample to show:** \n\n\n") |
1058 current_sample <- combined_unique$Sample[6] | 1082 current_sample <- combined_unique$Sample[6] |
1059 process_sample(current_sample, combined_unique, host_genomes_paths, genome_data) | 1083 process_sample(current_sample, combined_unique, host_genomes_paths, genome_data) |
1084 } else { | |
1085 print("No samples can be further analysed.") | |
1086 } | |
1060 } else { | 1087 } else { |
1061 print("No samples can be further analysed.") | 1088 cat("**Select sample to show:** \n\n\n") |
1062 } | 1089 for (i in seq_len(nrow(combined_unique))) { |
1063 } else { | 1090 current_sample <- combined_unique$Sample[i] |
1064 cat("**Select sample to show:** \n\n\n") | 1091 process_sample(current_sample, combined_unique, host_genomes_paths, genome_data) |
1065 for (i in seq_len(nrow(combined_unique))) { | 1092 } |
1066 current_sample <- combined_unique$Sample[i] | 1093 } |
1067 process_sample(current_sample, combined_unique, host_genomes_paths, genome_data) | |
1068 } | |
1069 } | |
1070 } | 1094 } |
1071 | 1095 |
1072 # Execute the main function | 1096 # Execute the main function |
1073 # Test mode processes one sample only | 1097 # Test mode processes one sample only |
1074 render_all_samples(test_mode = F) | 1098 render_all_samples(test_mode = F) |
1075 ``` | 1099 ``` |
1076 | |
1077 # Citation | |
1078 | |
1079 Cite this work: XXXXX | |
1080 | |
1081 |