comparison report.Rmd @ 1:3a7f73d638ba draft default tip

planemo upload for repository https://github.com/Helmholtz-UFZ/ufz-galaxy-tools/blob/main/tools/phi-toolkit commit 368e8a7322c9763c648637263d4695abc146be13
author ufz
date Tue, 22 Jul 2025 11:09:24 +0000
parents 315c2ed31af1
children
comparison
equal deleted inserted replaced
0:315c2ed31af1 1:3a7f73d638ba
23 23
24 ------------------------------------------------------------------------ 24 ------------------------------------------------------------------------
25 25
26 ```{r setup_env, include=FALSE} 26 ```{r setup_env, include=FALSE}
27 knitr::opts_chunk$set(echo = FALSE) 27 knitr::opts_chunk$set(echo = FALSE)
28 knitr::opts_chunk$set(dev = "svglite") # set output device to svg
28 29
29 cat("params$outdir:", params$outdir, "\n") 30 cat("params$outdir:", params$outdir, "\n")
30 ``` 31 ```
31 32
32 ```{r setup_libraries, message=FALSE, warning=FALSE, echo=FALSE, results='asis'} 33 ```{r setup_libraries, message=FALSE, warning=FALSE, echo=FALSE, results='asis'}
33 # Define required packages 34 # Define required packages
34 required_packages <- c("tidyverse", "janitor", "here", 35 # Note: update version_command if changed here!
35 "kableExtra", "gmoviz", "circlize", 36 required_packages <- c(
36 "GenomicRanges", "patchwork", "fs", 37 "tidyverse", "janitor", "here",
37 "tools", "scales", "formattable", 38 "kableExtra", "gmoviz", "circlize",
38 "pdftools", "base64") 39 "GenomicRanges", "patchwork", "fs",
40 "tools", "scales", "formattable",
41 "pdftools", "base64"
42 )
39 43
40 # Load required packages 44 # Load required packages
41 invisible(lapply(required_packages, library, character.only = TRUE)) 45 invisible(lapply(required_packages, library, character.only = TRUE))
42 ``` 46 ```
43 47
44 ```{r helper_functions, echo=FALSE} 48 ```{r helper_functions, echo=FALSE}
45 log_file <- "debug.log" 49 log_file <- "debug.log"
46 50
47 log_debug <- function(message) { 51 log_debug <- function(message) {
48 if (!exists("log_initialized") || !log_initialized) { 52 if (!exists("log_initialized") || !log_initialized) {
49 cat(paste0(Sys.time(), " - DEBUG: ", message, "\n"), file = log_file, append = FALSE) 53 cat(paste0(Sys.time(), " - DEBUG: ", message, "\n"), file = log_file, append = FALSE)
50 assign("log_initialized", TRUE, envir = .GlobalEnv) 54 assign("log_initialized", TRUE, envir = .GlobalEnv)
51 } else { 55 } else {
52 cat(paste0(Sys.time(), " - DEBUG: ", message, "\n"), file = log_file, append = TRUE) 56 cat(paste0(Sys.time(), " - DEBUG: ", message, "\n"), file = log_file, append = TRUE)
53 } 57 }
54 } 58 }
55 59
56 load_file <- function(path) { 60 load_file <- function(path) {
57 log_debug(paste("Attempting to load:", path)) 61 log_debug(paste("Attempting to load:", path))
58 if (file.exists(path)) { 62 if (file.exists(path)) {
59 ext <- tools::file_ext(path) 63 ext <- tools::file_ext(path)
60 if (ext %in% c("tsv", "csv")) { 64 if (ext %in% c("tsv", "csv")) {
61 data <- read_delim(path, delim = ifelse(ext == "csv", ",", "\t"), show_col_types = FALSE) %>% clean_names 65 data <- read_delim(path, delim = ifelse(ext == "csv", ",", "\t"), show_col_types = FALSE) %>% janitor::clean_names()
62 log_debug(paste("Loaded", nrow(data), "rows from", path)) 66 log_debug(paste("Loaded", nrow(data), "rows from", path))
63 data 67 data
64 } else if (ext == "fna") { 68 } else if (ext == "fna") {
65 data <- Biostrings::readDNAStringSet(path) 69 data <- Biostrings::readDNAStringSet(path)
66 log_debug(paste("Loaded", length(data), "sequences from", path)) 70 log_debug(paste("Loaded", length(data), "sequences from", path))
67 data 71 data
72 } else {
73 log_debug(paste("Skipping", path, "- unsupported file type"))
74 NULL
75 }
68 } else { 76 } else {
69 log_debug(paste("Skipping", path, "- unsupported file type")) 77 log_debug(paste("File does not exist:", path))
70 NULL 78 NULL
71 } 79 }
72 } else {
73 log_debug(paste("File does not exist:", path))
74 NULL
75 }
76 } 80 }
77 81
78 get_file_info <- function(path, loaded_data) { 82 get_file_info <- function(path, loaded_data) {
79 log_debug(paste("Processing file info for:", path)) 83 log_debug(paste("Processing file info for:", path))
80 if (file.exists(path)) { 84 if (file.exists(path)) {
81 ext <- tools::file_ext(path) 85 ext <- tools::file_ext(path)
82 if (ext %in% c("tsv", "csv", "fna")) { 86 if (ext %in% c("tsv", "csv", "fna")) {
83 data <- loaded_data[[basename(path)]] 87 data <- loaded_data[[basename(path)]]
84 rows <- if(ext == "fna") length(data) else nrow(data) 88 rows <- if (ext == "fna") length(data) else nrow(data)
85 tibble(exists = TRUE, rows = rows, size = file.size(path), path = path) 89 tibble::tibble(exists = TRUE, rows = rows, size = file.size(path), path = path)
90 } else {
91 tibble::tibble(exists = TRUE, rows = NA_integer_, size = file.size(path), path = path)
92 }
86 } else { 93 } else {
87 tibble(exists = TRUE, rows = NA_integer_, size = file.size(path), path = path) 94 tibble::tibble(exists = FALSE, rows = NA_integer_, size = NA_real_, path = NA_character_)
88 } 95 }
89 } else {
90 tibble(exists = FALSE, rows = NA_integer_, size = NA_real_, path = NA_character_)
91 }
92 } 96 }
93 97
94 process_genome_folder <- function(folder, host_analyses_dir, virus_analyses_dir) { 98 process_genome_folder <- function(folder, host_analyses_dir, virus_analyses_dir) {
95 log_debug(paste("Processing folder:", folder)) 99 log_debug(paste("Processing folder:", folder))
96 genome_name <- basename(folder) 100 genome_name <- basename(folder)
97 101
98 paths <- list( 102 paths <- list(
99 genomad = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus_summary.tsv")), 103 genomad = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus_summary.tsv")),
100 genomad_phages = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus.fna")), 104 genomad_phages = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus.fna")),
101 genomad_annotations = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus_genes.tsv")), 105 genomad_annotations = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus_genes.tsv")),
102 defense_finder = file.path(host_analyses_dir, "defense-finder", genome_name, paste0(genome_name, "_defense_finder_systems.tsv")), 106 defense_finder = file.path(host_analyses_dir, "defense-finder", genome_name, paste0(genome_name, "_defense_finder_systems.tsv")),
103 checkv = file.path(virus_analyses_dir, "checkv", genome_name, "quality_summary.tsv"), 107 checkv = file.path(virus_analyses_dir, "checkv", genome_name, "quality_summary.tsv"),
104 iphop = file.path(virus_analyses_dir, "iphop", genome_name, "Host_prediction_to_genome_m90.csv"), 108 iphop = file.path(virus_analyses_dir, "iphop", genome_name, "Host_prediction_to_genome_m90.csv"),
105 drep = file.path(virus_analyses_dir, "drep_compare", genome_name, "data_tables", "Cdb.csv"), 109 drep = file.path(virus_analyses_dir, "drep_compare", genome_name, "data_tables", "Cdb.csv"),
106 phatyp = file.path(virus_analyses_dir, "phatyp", genome_name, "phatyp.csv"), 110 phatyp = file.path(virus_analyses_dir, "phatyp", genome_name, "phatyp.csv"),
107 abricate = file.path(virus_analyses_dir, "abricate", genome_name, paste0(genome_name, "_virus_vfdb.tsv")), 111 abricate = file.path(virus_analyses_dir, "abricate", genome_name, paste0(genome_name, "_virus_vfdb.tsv")),
108 vibrant = file.path(virus_analyses_dir, "vibrant", genome_name, 112 vibrant = file.path(
109 paste0("VIBRANT_", genome_name, "_virus"), 113 virus_analyses_dir, "vibrant", genome_name,
110 paste0("VIBRANT_results_", genome_name, "_virus"), 114 paste0("VIBRANT_", genome_name, "_virus"),
111 paste0("VIBRANT_AMG_individuals_", genome_name, "_virus.tsv")) 115 paste0("VIBRANT_results_", genome_name, "_virus"),
112 ) 116 paste0("VIBRANT_AMG_individuals_", genome_name, "_virus.tsv")
113 117 )
114 loaded_data <- map(paths, load_file) 118 )
115 file_info <- map_dfr(paths, ~get_file_info(.x, loaded_data), .id = "file_type") 119
116 120 loaded_data <- map(paths, load_file)
117 virus_count <- if(!is.null(loaded_data$genomad)) { 121 file_info <- purrr::map_dfr(paths, ~ get_file_info(.x, loaded_data), .id = "file_type")
118 count <- sum(loaded_data$genomad$virus_score > 0.5, na.rm = TRUE) 122
119 log_debug(paste("Virus count:", count)) 123 virus_count <- if (!is.null(loaded_data$genomad)) {
120 count 124 count <- sum(loaded_data$genomad$virus_score > 0.5, na.rm = TRUE)
121 } else { 125 log_debug(paste("Virus count:", count))
122 log_debug("No genomad summary found, virus count set to 0") 126 count
123 0 127 } else {
124 } 128 log_debug("No genomad summary found, virus count set to 0")
125 129 0
126 log_debug("Returning results from process_genome_folder") 130 }
127 list(file_info = file_info, virus_count = virus_count, loaded_data = loaded_data) 131
132 log_debug("Returning results from process_genome_folder")
133 list(file_info = file_info, virus_count = virus_count, loaded_data = loaded_data)
128 } 134 }
129 ``` 135 ```
130 136
131 137
132 ```{r compile_results, message=FALSE, warning=FALSE, echo=FALSE} 138 ```{r compile_results, message=FALSE, warning=FALSE, echo=FALSE}
133 compile_results <- function() { 139 compile_results <- function() {
134 base_dir <- params$outdir 140 base_dir <- params$outdir
135 log_debug(paste("Base directory:", base_dir)) 141 log_debug(paste("Base directory:", base_dir))
136 142
137 host_analyses_dir <- file.path(base_dir, "host_analyses") 143 host_analyses_dir <- file.path(base_dir, "host_analyses")
138 virus_analyses_dir <- file.path(base_dir, "virus_analyses") 144 virus_analyses_dir <- file.path(base_dir, "virus_analyses")
139 145
140 # List all sample-level directories from all tools under virus_analyses 146 # List all sample-level directories from all tools under virus_analyses
141 tool_dirs <- list.dirs(virus_analyses_dir, full.names = TRUE, recursive = FALSE) 147 tool_dirs <- list.dirs(virus_analyses_dir, full.names = TRUE, recursive = FALSE)
142 148
143 genome_folders <- list.dirs(file.path(base_dir, "host_analyses", "genomad"), 149 genome_folders <- list.dirs(file.path(base_dir, "host_analyses", "genomad"),
144 full.names = TRUE, recursive = FALSE) 150 full.names = TRUE, recursive = FALSE
145 151 )
146 # cat(length(genome_folders), "sample(s) processed\n") 152
147 153 # cat(length(genome_folders), "sample(s) processed\n")
148 log_debug("Processing genome folders") 154
149 genome_data <- map(genome_folders, process_genome_folder, 155 log_debug("Processing genome folders")
150 host_analyses_dir = host_analyses_dir, 156 genome_data <- map(genome_folders, process_genome_folder,
151 virus_analyses_dir = virus_analyses_dir) %>% 157 host_analyses_dir = host_analyses_dir,
152 set_names(basename(genome_folders)) %>% 158 virus_analyses_dir = virus_analyses_dir
153 compact() 159 ) %>%
154 160 purrr::set_names(basename(genome_folders)) %>%
155 log_debug("Creating summary dataframe") 161 compact()
156 summary_df <- map_dfr(genome_data, ~{ 162
157 file_info <- .x$file_info 163 log_debug("Creating summary dataframe")
158 tibble( 164 summary_df <- purrr::map_dfr(genome_data, ~ {
159 Sample = basename(file_info$path[1]), 165 file_info <- .x$file_info
160 Virus_Count = .x$virus_count, 166 tibble::tibble(
161 geNomad = file_info$exists[file_info$file_type == "genomad"], 167 Sample = basename(file_info$path[1]),
162 CheckV = file_info$exists[file_info$file_type == "checkv"], 168 Virus_Count = .x$virus_count,
163 VIBRANT = file_info$exists[file_info$file_type == "vibrant"], 169 geNomad = file_info$exists[file_info$file_type == "genomad"],
164 dRep = file_info$exists[file_info$file_type == "drep"], 170 CheckV = file_info$exists[file_info$file_type == "checkv"],
165 iPHOP = file_info$exists[file_info$file_type == "iphop"], 171 VIBRANT = file_info$exists[file_info$file_type == "vibrant"],
166 PhaTYP = file_info$exists[file_info$file_type == "phatyp"], 172 dRep = file_info$exists[file_info$file_type == "drep"],
167 Defense_Finder = file_info$exists[file_info$file_type == "defense_finder"], 173 iPHOP = file_info$exists[file_info$file_type == "iphop"],
168 geNomad_Path = file_info$path[file_info$file_type == "genomad"], 174 PhaTYP = file_info$exists[file_info$file_type == "phatyp"],
169 CheckV_Path = file_info$path[file_info$file_type == "checkv"], 175 Defense_Finder = file_info$exists[file_info$file_type == "defense_finder"],
170 VIBRANT_Path = file_info$path[file_info$file_type == "vibrant"], 176 geNomad_Path = file_info$path[file_info$file_type == "genomad"],
171 dRep_Path = file_info$path[file_info$file_type == "drep"], 177 CheckV_Path = file_info$path[file_info$file_type == "checkv"],
172 PhaTYP_Path = file_info$path[file_info$file_type == "phatyp"], 178 VIBRANT_Path = file_info$path[file_info$file_type == "vibrant"],
173 Defense_Finder_Path = file_info$path[file_info$file_type == "defense_finder"], 179 dRep_Path = file_info$path[file_info$file_type == "drep"],
174 Virus_Contigs = ifelse(file_info$exists[file_info$file_type == "genomad_phages"], 180 PhaTYP_Path = file_info$path[file_info$file_type == "phatyp"],
175 file_info$rows[file_info$file_type == "genomad_phages"], 181 Defense_Finder_Path = file_info$path[file_info$file_type == "defense_finder"],
176 0) 182 Virus_Contigs = ifelse(file_info$exists[file_info$file_type == "genomad_phages"],
177 ) 183 file_info$rows[file_info$file_type == "genomad_phages"],
178 }) %>% 184 0
179 mutate(across(ends_with("_Path"), ~ifelse(is.na(.), "Not available", as.character(.)))) 185 )
180 186 )
181 host_genomes_fasta <- list.files( 187 }) %>%
182 path = file.path(params$outdir, "genomes"), 188 dplyr::mutate(dplyr::across(ends_with("_Path"), ~ ifelse(is.na(.), "Not available", as.character(.))))
183 pattern = "\\.fna$", 189
184 full.names = TRUE 190 host_genomes_fasta <- list.files(
185 ) 191 path = file.path(params$outdir, "genomes"),
186 192 pattern = "\\.fna$",
187 host_genomes_paths <- tibble( 193 full.names = TRUE
188 name = tools::file_path_sans_ext(basename(host_genomes_fasta)), 194 )
189 path = host_genomes_fasta 195
190 ) 196 host_genomes_paths <- tibble::tibble(
191 197 name = tools::file_path_sans_ext(basename(host_genomes_fasta)),
192 data_gtdbtk_host <- read_tsv( 198 path = host_genomes_fasta
193 file.path(params$outdir, "host_analyses/gtdbtk/gtdbtk.bac120.summary.tsv"), 199 )
194 show_col_types = FALSE 200
195 ) %>% clean_names() 201 data_gtdbtk_host <- read_tsv(
196 202 file.path(params$outdir, "host_analyses/gtdbtk/gtdbtk.bac120.summary.tsv"),
197 data_checkm_host <- read_tsv( 203 show_col_types = FALSE
198 file.path(params$outdir, "host_analyses/checkm2/quality_report.tsv"), 204 ) %>% janitor::clean_names()
199 show_col_types = FALSE 205
200 ) %>% clean_names() 206 data_checkm_host <- read_tsv(
201 207 file.path(params$outdir, "host_analyses/checkm2/quality_report.tsv"),
202 log_debug("Returning summary dataframe, genome data, and host data") 208 show_col_types = FALSE
203 209 ) %>% janitor::clean_names()
204 log_debug(paste("summary_df dimensions:", nrow(summary_df), "rows,", ncol(summary_df), "columns")) 210
205 log_debug(paste("summary_df column names:", paste(colnames(summary_df), collapse = ", "))) 211 log_debug("Returning summary dataframe, genome data, and host data")
206 log_debug(paste("genome_data length:", length(genome_data))) 212
207 log_debug(paste("genome_data names:", paste(names(genome_data), collapse = ", "))) 213 log_debug(paste("summary_df dimensions:", nrow(summary_df), "rows,", ncol(summary_df), "columns"))
208 log_debug(paste("host_genomes_paths dimensions:", nrow(host_genomes_paths), "rows,", ncol(host_genomes_paths), "columns")) 214 log_debug(paste("summary_df column names:", paste(colnames(summary_df), collapse = ", ")))
209 log_debug(paste("host_genomes_paths column names:", paste(colnames(host_genomes_paths), collapse = ", "))) 215 log_debug(paste("genome_data length:", length(genome_data)))
210 log_debug(paste("data_gtdbtk_host dimensions:", nrow(data_gtdbtk_host), "rows,", ncol(data_gtdbtk_host), "columns")) 216 log_debug(paste("genome_data names:", paste(names(genome_data), collapse = ", ")))
211 log_debug(paste("data_gtdbtk_host column names:", paste(colnames(data_gtdbtk_host), collapse = ", "))) 217 log_debug(paste("host_genomes_paths dimensions:", nrow(host_genomes_paths), "rows,", ncol(host_genomes_paths), "columns"))
212 log_debug(paste("data_checkm_host dimensions:", nrow(data_checkm_host), "rows,", ncol(data_checkm_host), "columns")) 218 log_debug(paste("host_genomes_paths column names:", paste(colnames(host_genomes_paths), collapse = ", ")))
213 log_debug(paste("data_checkm_host column names:", paste(colnames(data_checkm_host), collapse = ", "))) 219 log_debug(paste("data_gtdbtk_host dimensions:", nrow(data_gtdbtk_host), "rows,", ncol(data_gtdbtk_host), "columns"))
214 220 log_debug(paste("data_gtdbtk_host column names:", paste(colnames(data_gtdbtk_host), collapse = ", ")))
215 list( 221 log_debug(paste("data_checkm_host dimensions:", nrow(data_checkm_host), "rows,", ncol(data_checkm_host), "columns"))
216 summary = summary_df, 222 log_debug(paste("data_checkm_host column names:", paste(colnames(data_checkm_host), collapse = ", ")))
217 genome_data = genome_data, 223
218 host_genomes_paths = host_genomes_paths, 224 list(
219 data_gtdbtk_host = data_gtdbtk_host, 225 summary = summary_df,
220 data_checkm_host = data_checkm_host 226 genome_data = genome_data,
221 ) 227 host_genomes_paths = host_genomes_paths,
228 data_gtdbtk_host = data_gtdbtk_host,
229 data_checkm_host = data_checkm_host
230 )
222 } 231 }
223 ``` 232 ```
224 233
225 ```{r run_main_function, echo=FALSE, message=FALSE, warning=FALSE} 234 ```{r run_main_function, echo=FALSE, message=FALSE, warning=FALSE}
226 log_debug("Starting execution of main function") 235 log_debug("Starting execution of main function")
227 result <- compile_results() 236 result <- compile_results()
228 237
229 if (is.null(result)) { 238 if (is.null(result)) {
230 log_debug("Main function execution failed") 239 log_debug("Main function execution failed")
231 stop("Main function execution failed") 240 stop("Main function execution failed")
232 } 241 }
233 242
234 summary_df <- result$summary 243 summary_df <- result$summary
235 genome_data <- result$genome_data 244 genome_data <- result$genome_data
236 host_genomes_paths <- result$host_genomes_paths 245 host_genomes_paths <- result$host_genomes_paths
237 data_gtdbtk_host <- result$data_gtdbtk_host 246 data_gtdbtk_host <- result$data_gtdbtk_host
238 data_checkm_host <- result$data_checkm_host 247 data_checkm_host <- result$data_checkm_host
239 log_debug("Data extracted successfully") 248 log_debug("Data extracted successfully")
240 249
241 # Remove any extensions from names in data gtdbtk and checm2
242 data_gtdbtk_host <- data_gtdbtk_host %>%
243 mutate(user_genome = str_remove(user_genome, "\\.[^.]+$"))
244
245 data_checkm_host <- data_checkm_host %>%
246 mutate(name = str_remove(name, "\\.[^.]+$"))
247
248 result$summary <- result$summary %>% 250 result$summary <- result$summary %>%
249 mutate(Sample = str_remove(Sample, "_virus_summary.tsv")) 251 dplyr::mutate(Sample = stringr::str_remove(Sample, "_virus_summary.tsv"))
250 ``` 252 ```
251 253
252 # Summary {.tabset .tabset-fade} 254 # Summary {.tabset .tabset-fade}
253
254 ## Overview Table
255 255
256 This table provides sample-by-sample information on detected viruses and key host genome statistics. It includes taxonomy, virus count, genome quality classification, CheckM2 metrics (completeness and contamination), and genome assembly statistics such as size and N50. 256 This table provides sample-by-sample information on detected viruses and key host genome statistics. It includes taxonomy, virus count, genome quality classification, CheckM2 metrics (completeness and contamination), and genome assembly statistics such as size and N50.
257 257
258 ```{r render_table, message=FALSE, warning=FALSE, echo=FALSE, results='asis'} 258 ```{r render_table, message=FALSE, warning=FALSE, echo=FALSE, results='asis'}
259 data <- result$summary 259 data <- result$summary
260 260
261 log_debug("Assigning checkm2 host data") 261 log_debug("Assigning checkm2 host data")
262 checkm_host_data <- data_checkm_host %>% clean_names() %>% 262 checkm_host_data <- data_checkm_host %>%
263 select(name, completeness, contamination, 263 janitor::clean_names() %>%
264 contig_n50, genome_size) 264 dplyr::select(
265 name, completeness, contamination,
266 contig_n50, genome_size
267 )
265 268
266 log_debug("Assigning GTDB-Tk host data") 269 log_debug("Assigning GTDB-Tk host data")
267 gtdbtk_data <- data_gtdbtk_host %>% 270 gtdbtk_data <- data_gtdbtk_host %>%
268 select(user_genome, classification) 271 dplyr::select(user_genome, classification)
269 272
270 log_debug("Defining color-blind friendly palette") 273 log_debug("Defining color-blind friendly palette")
271 cb_friendly_colors <- list( 274 cb_friendly_colors <- list(
272 green = "#009E73", 275 green = "#009E73",
273 blue = "#0072B2", 276 blue = "#0072B2",
274 orange = "#E69F00", 277 orange = "#E69F00",
275 red = "#D55E00", 278 red = "#D55E00",
276 grey = "#999999" 279 grey = "#999999"
277 ) 280 )
278 281
279 log_debug("Defining function to color cells") 282 log_debug("Defining function to color cells")
280 color_cell <- function(values, color_true = cb_friendly_colors$green, 283 color_cell <- function(values, color_true = cb_friendly_colors$green,
281 color_false = cb_friendly_colors$red) { 284 color_false = cb_friendly_colors$red) {
282 ifelse(values, 285 ifelse(values,
283 cell_spec("Yes", color = "white", bold = TRUE, background = color_true), 286 kableExtra::cell_spec("Yes", color = "white", bold = TRUE, background = color_true),
284 cell_spec("No", color = "white", bold = TRUE, background = color_false)) 287 kableExtra::cell_spec("No", color = "white", bold = TRUE, background = color_false)
288 )
285 } 289 }
286 290
287 log_debug("Defining function to create bar plot") 291 log_debug("Defining function to create bar plot")
288 create_bar_plot <- function(values, max_value, color = cb_friendly_colors$grey) { 292 create_bar_plot <- function(values, max_value, color = cb_friendly_colors$grey) {
289 sapply(values, function(value) { 293 sapply(values, function(value) {
290 if(is.na(value) || !is.numeric(value)) { 294 if (is.na(value) || !is.numeric(value)) {
291 return("N/A") 295 return("N/A")
292 } 296 }
293 bar_width <- min(max(value, 0), max_value) / max_value * 100 297 bar_width <- min(max(value, 0), max_value) / max_value * 100
294 sprintf('<div style="background-color: %s; width: %f%%; height: 10px;"></div>%.1f%%', 298 sprintf(
295 color, bar_width, value) 299 '<div style="background-color: %s; width: %f%%; height: 10px;"></div>%.1f%%',
296 }) 300 color, bar_width, value
301 )
302 })
297 } 303 }
298 304
299 log_debug("Defining function to format large numbers") 305 log_debug("Defining function to format large numbers")
300 format_large_number <- function(x) { 306 format_large_number <- function(x) {
301 sapply(x, function(value) { 307 sapply(x, function(value) {
302 if (is.na(value) || !is.numeric(value)) { 308 if (is.na(value) || !is.numeric(value)) {
303 return("N/A") 309 return("N/A")
304 } else if (value < 1000) { 310 } else if (value < 1000) {
305 return(as.character(value)) 311 return(as.character(value))
306 } else if (value < 1e6) { 312 } else if (value < 1e6) {
307 return(paste0(round(value / 1e3, 1), "K")) 313 return(paste0(round(value / 1e3, 1), "K"))
308 } else if (value < 1e9) { 314 } else if (value < 1e9) {
309 return(paste0(round(value / 1e6, 1), "M")) 315 return(paste0(round(value / 1e6, 1), "M"))
310 } else { 316 } else {
311 return(paste0(round(value / 1e9, 1), "G")) 317 return(paste0(round(value / 1e9, 1), "G"))
312 } 318 }
313 }) 319 })
314 } 320 }
315 321
316 log_debug("Defining function to extract last known taxonomy level") 322 log_debug("Defining function to extract last known taxonomy level")
317 extract_last_known_taxonomy <- function(classification) { 323 extract_last_known_taxonomy <- function(classification) {
318 if (is.na(classification) || classification == "") { 324 if (is.na(classification) || classification == "") {
325 return(list(level = "Unknown", name = "Unknown"))
326 }
327
328 parts <- strsplit(classification, ";")[[1]]
329 for (i in length(parts):1) {
330 level <- sub("^[a-z]__", "", parts[i])
331 if (level != "") {
332 prefix <- sub("__.*$", "", parts[i])
333 return(list(level = prefix, name = level))
334 }
335 }
319 return(list(level = "Unknown", name = "Unknown")) 336 return(list(level = "Unknown", name = "Unknown"))
320 }
321
322 parts <- strsplit(classification, ";")[[1]]
323 for (i in length(parts):1) {
324 level <- sub("^[a-z]__", "", parts[i])
325 if (level != "") {
326 prefix <- sub("__.*$", "", parts[i])
327 return(list(level = prefix, name = level))
328 }
329 }
330 return(list(level = "Unknown", name = "Unknown"))
331 } 337 }
332 338
333 log_debug("Defining function to format taxonomy") 339 log_debug("Defining function to format taxonomy")
334 format_taxonomy <- function(classification) { 340 format_taxonomy <- function(classification) {
335 result <- extract_last_known_taxonomy(classification) 341 result <- extract_last_known_taxonomy(classification)
336 if (result$level == "Unknown") { 342 if (result$level == "Unknown") {
337 return("Unknown") 343 return("Unknown")
338 } else if (result$level == "s") { 344 } else if (result$level == "s") {
339 return(paste0("<i>", result$name, "</i>")) 345 return(paste0("<i>", result$name, "</i>"))
340 } else { 346 } else {
341 genus <- str_replace_all(result$name, "_", " ") 347 genus <- str_replace_all(result$name, "_", " ")
342 return(paste0("<i>", genus, "</i> sp.")) 348 return(paste0("<i>", genus, "</i> sp."))
343 } 349 }
344 } 350 }
345 351
346 log_debug("Defining function to calculate quality score and determine genome quality class") 352 log_debug("Defining function to calculate quality score and determine genome quality class")
347 calculate_quality_score_and_class <- function(completeness, contamination) { 353 calculate_quality_score_and_class <- function(completeness, contamination) {
348 if (is.na(completeness) || is.na(contamination)) { 354 if (is.na(completeness) || is.na(contamination)) {
349 return(list( 355 return(list(
350 score = cell_spec("N/A", color = "white", bold = TRUE, background = cb_friendly_colors$grey), 356 score = kableExtra::cell_spec("N/A", color = "white", bold = TRUE, background = cb_friendly_colors$grey),
351 class = cell_spec("Unknown", color = "white", bold = TRUE, background = cb_friendly_colors$grey), 357 class = kableExtra::cell_spec("Unknown", color = "white", bold = TRUE, background = cb_friendly_colors$grey),
352 numeric_score = NA 358 numeric_score = NA
353 )) 359 ))
354 } 360 }
355 361
356 quality_score <- completeness - (5 * contamination) 362 quality_score <- completeness - (5 * contamination)
357 formatted_score <- sprintf("%.1f", quality_score) 363 formatted_score <- sprintf("%.1f", quality_score)
358 364
359 if (completeness > 90 && contamination < 5) { 365 if (completeness > 90 && contamination < 5) {
360 class <- "High-quality draft" 366 class <- "High-quality draft"
361 color <- cb_friendly_colors$green 367 color <- cb_friendly_colors$green
362 } else if (completeness >= 50 && contamination < 10) { 368 } else if (completeness >= 50 && contamination < 10) {
363 class <- "Medium-quality draft" 369 class <- "Medium-quality draft"
364 color <- cb_friendly_colors$blue 370 color <- cb_friendly_colors$blue
365 } else { 371 } else {
366 class <- "Low-quality draft" 372 class <- "Low-quality draft"
367 color <- cb_friendly_colors$red 373 color <- cb_friendly_colors$red
368 } 374 }
369 375
370 list( 376 list(
371 score = cell_spec(formatted_score, color = "white", bold = TRUE, background = color), 377 score = kableExtra::cell_spec(formatted_score, color = "white", bold = TRUE, background = color),
372 class = cell_spec(class, color = "white", bold = TRUE, background = color), 378 class = kableExtra::cell_spec(class, color = "white", bold = TRUE, background = color),
373 numeric_score = quality_score 379 numeric_score = quality_score
374 ) 380 )
375 } 381 }
376 382
377 log_debug("Preparing the data") 383 log_debug("Preparing the data")
378 384
379 table_data <- data %>% 385 table_data <- data %>%
380 #mutate(Sample = basename(Sample) %>% trim_sample_name()) %>% 386 # dplyr::mutate(Sample = basename(Sample) %>% trim_sample_name()) %>%
381 mutate(Sample = basename(Sample)) %>% 387 dplyr::mutate(Sample = basename(Sample)) %>%
382 left_join(checkm_host_data, by = c("Sample" = "name")) %>% 388 dplyr::left_join(checkm_host_data, by = c("Sample" = "name")) %>%
383 left_join(gtdbtk_data, by = c("Sample" = "user_genome")) %>% 389 dplyr::left_join(gtdbtk_data, by = c("Sample" = "user_genome")) %>%
384 mutate( 390 dplyr::mutate(
385 quality_data = pmap(list(as.numeric(completeness), 391 quality_data = purrr::pmap(
386 as.numeric(contamination)), 392 list(
387 calculate_quality_score_and_class), 393 as.numeric(completeness),
388 Quality_Score = map_chr(quality_data, ~.$score), 394 as.numeric(contamination)
389 Genome_Quality = map_chr(quality_data, ~.$class), 395 ),
390 Quality_Score_Numeric = map_dbl(quality_data, ~.$numeric_score), 396 calculate_quality_score_and_class
391 Virus_Count_Numeric = as.numeric(Virus_Count), 397 ),
392 Virus_Count = cell_spec( 398 Quality_Score = purrr::map_chr(quality_data, ~ .$score),
393 Virus_Count, 399 Genome_Quality = purrr::map_chr(quality_data, ~ .$class),
394 color = "white", 400 Quality_Score_Numeric = purrr::map_dbl(quality_data, ~ .$numeric_score),
395 bold = TRUE, 401 Virus_Count_Numeric = as.numeric(Virus_Count),
396 background = case_when( 402 Virus_Count = kableExtra::cell_spec(
397 Virus_Count == 0 ~ cb_friendly_colors$red, 403 Virus_Count,
398 Virus_Count == 1 ~ cb_friendly_colors$blue, 404 color = "white",
399 Virus_Count > 1 ~ cb_friendly_colors$green 405 bold = TRUE,
400 ) 406 background = dplyr::case_when(
401 ), 407 Virus_Count == 0 ~ cb_friendly_colors$red,
402 Completeness_Numeric = as.numeric(completeness), 408 Virus_Count == 1 ~ cb_friendly_colors$blue,
403 Completeness = create_bar_plot(as.numeric(completeness), 100), 409 Virus_Count > 1 ~ cb_friendly_colors$green
404 Contamination = create_bar_plot(as.numeric(contamination), 100), 410 )
405 `N50 (contigs)` = format_large_number(as.numeric(contig_n50)), 411 ),
406 `Genome size (bp)` = format_large_number(as.numeric(genome_size)), 412 Completeness_Numeric = as.numeric(completeness),
407 `GTDB Taxonomy` = sapply(classification, format_taxonomy) 413 Completeness = create_bar_plot(as.numeric(completeness), 100),
408 ) %>% 414 Contamination = create_bar_plot(as.numeric(contamination), 100),
409 mutate(`#` = row_number()) %>% 415 `N50 (contigs)` = format_large_number(as.numeric(contig_n50)),
410 select(`#`, Sample, `GTDB Taxonomy`, Virus_Count, 416 `Genome size (bp)` = format_large_number(as.numeric(genome_size)),
411 Quality_Score, Genome_Quality, Completeness, Contamination, 417 `GTDB Taxonomy` = sapply(classification, format_taxonomy)
412 `Genome size (bp)`, `N50 (contigs)`) 418 ) %>%
419 dplyr::mutate(`#` = row_number()) %>%
420 dplyr::select(
421 `#`, Sample, `GTDB Taxonomy`, Virus_Count,
422 Quality_Score, Genome_Quality, Completeness, Contamination,
423 `Genome size (bp)`, `N50 (contigs)`
424 )
413 425
414 log_debug("Creating the table") 426 log_debug("Creating the table")
415 kbl(table_data, escape = FALSE, 427 kableExtra::kbl(table_data,
416 align = c("c", "l", "l", "c", rep("c", 2), rep("r", 2), rep("r", 2))) %>% 428 escape = FALSE,
417 kable_paper(full_width = TRUE) %>% 429 align = c("c", "l", "l", "c", rep("c", 2), rep("r", 2), rep("r", 2))
418 column_spec(1, bold = TRUE, width = "2em") %>% 430 ) %>%
419 column_spec(2:3, bold = TRUE) %>% 431 kableExtra::kable_paper(full_width = TRUE) %>%
420 column_spec(4:5, width = "5em") %>% 432 kableExtra::column_spec(1, bold = TRUE, width = "2em") %>%
421 column_spec(6:7, width = "60px") %>% 433 kableExtra::column_spec(2:3, bold = TRUE) %>%
422 column_spec(8:9, width = "4em") %>% 434 kableExtra::column_spec(4:5, width = "5em") %>%
423 add_header_above(c(" " = 4, "Host Genome Quality" = 2, "CheckM Metrics" = 2, 435 kableExtra::column_spec(6:7, width = "60px") %>%
424 "Statistics" = 2)) %>% 436 kableExtra::column_spec(8:9, width = "4em") %>%
425 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 437 kableExtra::add_header_above(c(
426 font_size = 9, 438 " " = 4, "Host Genome Quality" = 2, "CheckM Metrics" = 2,
427 html_font = "Arial", 439 "Statistics" = 2
428 position = "left") %>% 440 )) %>%
429 row_spec(0, bold = TRUE, color = "white", background = "#333333") %>% 441 kableExtra::kable_styling(
430 row_spec(0, extra_css = "border-bottom: 2px solid #000000;") %>% 442 bootstrap_options = c("striped", "hover", "condensed", "responsive"),
431 column_spec(9, extra_css = "border-right: 2px solid #000000;") %>% 443 font_size = 9,
432 scroll_box(width = "100%", height = "100%", 444 html_font = "Arial",
433 extra_css = "overflow-x: auto; border: 1px solid #ccc; border-radius: 4px;") 445 position = "left"
446 ) %>%
447 kableExtra::row_spec(0, bold = TRUE, color = "white", background = "#333333") %>%
448 kableExtra::row_spec(0, extra_css = "border-bottom: 2px solid #000000;") %>%
449 kableExtra::column_spec(9, extra_css = "border-right: 2px solid #000000;") %>%
450 kableExtra::scroll_box(
451 width = "100%", height = "100%",
452 extra_css = "overflow-x: auto; border: 1px solid #ccc; border-radius: 4px;"
453 )
434 ``` 454 ```
435
436 ## Tools Documentation
437
438 The following tools are utilized in this workflow. Each tool name below is a link to its respective documentation.
439
440 **Host-analyses**
441
442 - [**CheckM2 v1.1.0**](https://github.com/chklovski/CheckM2): Assesses the quality of the host. Most useful when working with assembled genomes.
443
444 - [**GTDB-Tk v2.3.2**](https://ecogenomics.github.io/GTDBTk/index.html): Assigns a taxonomy to the host genome.
445
446 - [**Defense-Finder v2.0.0, models 2.0.2**](https://ecogenomics.github.io/GTDBTk/index.html): Detects known anti-phage systems in the host.
447
448 - [**geNomad v1.7.1**](https://portal.nersc.gov/genomad/): Predicts and annotates proviruses.
449
450 **Virus-analyses**
451
452 - [**CheckV v1.0.1**](https://pypi.org/project/checkv/): Evaluates the quality of viral genomes.
453
454 - [**dRep v3.4.5**](https://drep.readthedocs.io/en/latest/): Compares viral genomes within the same host.
455
456 - [**Abricate v1.0.1**](https://github.com/tseemann/abricate): Identifies virulence genes in the prophage genomes with the [VFDB database](https://www.mgc.ac.cn/VFs/).
457
458 - [**iPHOP v1.3.3**](https://bitbucket.org/srouxjgi/iphop/src/main/): Predicts other potential hosts of viral genomes.
459
460 - [**VIBRANT v1.2.1**](https://github.com/AnantharamanLab/VIBRANT): Used to identify Auxiliary Metabolic Genes in the prophages.
461
462 ## Workflow
463
464 The workflow begins with the input of bacterial genomes by the user. These are processed by the **host-analyses** tools. Prophage prediction is
465 performed by **geNomad** only. Afterward, prophages identified by **geNomad** are processed by the **virus-analyses** tools.
466
467 If more than one prophage is recovered in the same sample, **dRep** is used to compare and determine if the viruses are identical or different within the same host.
468
469 *PLACEHOLDER FOR PIPELINE*
470
471 ## R Session Info
472
473 Information about the R session used to render this markdown document.
474
475 ```{r}
476 sessionInfo()
477 ```
478
479 455
480 # Results {.tabset .tabset-fade} 456 # Results {.tabset .tabset-fade}
481 457
482 ```{r} 458 ```{r}
483 # Creating combined_unique object 459 # Creating combined_unique object
484 460
485 combined_unique <- bind_rows( 461 combined_unique <- bind_rows(
486 checkm_host_data %>% 462 checkm_host_data %>%
487 # select(bin_id) %>% 463 # dplyr::select(bin_id) %>%
488 # dplyr::rename(Sample = bin_id), 464 # dplyr::rename(Sample = bin_id),
489 select(name) %>% 465 dplyr::select(name) %>%
490 dplyr::rename(Sample = name), 466 dplyr::rename(Sample = name),
491 467 data %>%
492 data %>% 468 # dplyr::mutate(Sample = stringr::str_remove(Sample, "_virus_summary.tsv")) %>%
493 #mutate(Sample = str_remove(Sample, "_virus_summary.tsv")) %>% 469 dplyr::select(Sample)
494 select(Sample)
495 ) %>% 470 ) %>%
496 distinct(Sample) %>% 471 distinct(Sample) %>%
497 arrange(Sample) 472 arrange(Sample)
498 473
499 log_debug(paste("combined_unique samples:", paste(combined_unique$Sample, collapse = ", "))) 474 log_debug(paste("combined_unique samples:", paste(combined_unique$Sample, collapse = ", ")))
500 ``` 475 ```
501 476
502 477
503 478
504 ```{r main_workflow, fig.width=6, fig.height=6, out.height="100%", out.width='100%', dpi=300, fig.align='center', warning=FALSE, message=FALSE, results='asis'} 479 ```{r main_workflow, fig.width=6, fig.height=6, out.height="100%", out.width='100%', dpi=300, fig.align='center', warning=FALSE, message=FALSE, results='asis'}
505 # Process proviruses data 480 # Process proviruses data
506 process_proviruses <- function(data_genomad) { 481 process_proviruses <- function(data_genomad) {
507 proviruses <- data_genomad %>% 482 proviruses <- data_genomad %>%
508 dplyr::filter(topology == "Provirus") %>% 483 dplyr::filter(topology == "Provirus") %>%
509 dplyr::mutate(contig = sub("\\|provirus_.*", "", seq_name)) %>% # take everything before "|provirus" 484 dplyr::mutate(contig = sub("\\|provirus_.*", "", seq_name)) %>% # take everything before "|provirus"
510 dplyr::mutate(contig = paste0("c", as.numeric(factor(contig)))) %>% # map them to c_1, c_2, ... 485 dplyr::mutate(contig = paste0("c", as.numeric(factor(contig)))) %>% # map them to c_1, c_2, ...
511 dplyr::select(seq_name, coordinates, length, contig, virus_score, n_hallmarks) 486 dplyr::select(seq_name, coordinates, length, contig, virus_score, n_hallmarks)
512 487
513 proviruses <- proviruses %>% 488 proviruses <- proviruses %>%
514 tidyr::separate(coordinates, into = c("start", "end"), sep = "-") 489 tidyr::separate(coordinates, into = c("start", "end"), sep = "-")
515 490
516 proviruses$start <- as.integer(proviruses$start) 491 proviruses$start <- as.integer(proviruses$start)
517 proviruses$end <- as.integer(proviruses$end) 492 proviruses$end <- as.integer(proviruses$end)
518 493
519 proviruses_gr_features <- GRanges(seqnames = proviruses$contig, 494 proviruses_gr_features <- GRanges(
520 ranges = IRanges(start = proviruses$start, 495 seqnames = proviruses$contig,
521 end = proviruses$end)) 496 ranges = IRanges(
522 proviruses_gr_features$length <- proviruses_gr_features %>% ranges %>% width 497 start = proviruses$start,
523 proviruses_gr_features$score <- as.numeric(proviruses$virus_score) 498 end = proviruses$end
524 proviruses_gr_features$n_hallmarks <- as.numeric(proviruses$n_hallmarks) 499 )
525 500 )
526 proviruses_gr_features$n_hallmarks_pos <- 501 proviruses_gr_features$length <- proviruses_gr_features %>%
527 abs(start(proviruses_gr_features) - end(proviruses_gr_features)) / 2 502 ranges() %>%
528 503 width()
529 return(proviruses_gr_features) 504 proviruses_gr_features$score <- as.numeric(proviruses$virus_score)
505 proviruses_gr_features$n_hallmarks <- as.numeric(proviruses$n_hallmarks)
506
507 proviruses_gr_features$n_hallmarks_pos <-
508 abs(start(proviruses_gr_features) - end(proviruses_gr_features)) / 2
509
510 return(proviruses_gr_features)
530 } 511 }
531 512
532 plot_genome_ideogram <- function(genome_current, proviruses_gr_features) { 513 plot_genome_ideogram <- function(genome_current, proviruses_gr_features) {
533 fasta_file_path <- file.path(params$outdir, "genomes", paste0(genome_current, ".fna")) 514 fasta_file_path <- file.path(params$outdir, "genomes", paste0(genome_current, ".fna"))
534 #cat(fasta_file_path, "\n\n") 515 # cat(fasta_file_path, "\n\n")
535 genome_ideogram <- getIdeogramData(fasta_file = fasta_file_path) 516 genome_ideogram <- gmoviz::getIdeogramData(fasta_file = fasta_file_path)
536 517
537 # Replace any seqlevel to c_1, c_2, c_3, ... 518 # Replace any seqlevel to c_1, c_2, c_3, ...
538 new_seqlevels <- paste0("c", seq_along(seqlevels(genome_ideogram))) 519 new_seqlevels <- paste0("c", seq_along(GenomeInfoDb::seqlevels(genome_ideogram)))
539 names(new_seqlevels) <- seqlevels(genome_ideogram) 520 names(new_seqlevels) <- GenomeInfoDb::seqlevels(genome_ideogram)
540 genome_ideogram <- GenomeInfoDb::renameSeqlevels(genome_ideogram, new_seqlevels) 521 genome_ideogram <- GenomeInfoDb::renameSeqlevels(genome_ideogram, new_seqlevels)
541 colours <- rep("#a58bc5", length(seqlevels(genome_ideogram))) 522 colours <- rep("#a58bc5", length(GenomeInfoDb::seqlevels(genome_ideogram)))
542 523
543 par(mar = c(2, 2, 2, 2)) # minimal margins around the plot 524 par(mar = c(2, 2, 2, 2)) # minimal margins around the plot
544 525
545 gmovizInitialise(genome_ideogram, 526 gmoviz::gmovizInitialise(genome_ideogram,
546 sector_colours = colours, 527 sector_colours = colours,
547 sector_border_colours = colours, 528 sector_border_colours = colours,
548 sector_labels = FALSE 529 sector_labels = FALSE
549 ) 530 )
550 531
551 for (i in 1:length(proviruses_gr_features)) { 532 for (i in 1:length(proviruses_gr_features)) {
552 name <- as.character(seqnames(proviruses_gr_features[i])) 533 name <- as.character(seqnames(proviruses_gr_features[i]))
553 start <- as.numeric(start(proviruses_gr_features[i])) 534 start <- as.numeric(start(proviruses_gr_features[i]))
554 end <- as.numeric(end(proviruses_gr_features[i])) 535 end <- as.numeric(end(proviruses_gr_features[i]))
555 region <- data.frame(start = start, end = end) 536 region <- data.frame(start = start, end = end)
556 circos.genomicRect(seqnames = name, 537 circlize::circos.genomicRect(
557 region, 538 seqnames = name,
558 ytop = .5, 539 region,
559 ybottom = 0, 540 ytop = .5,
560 track.index = 1, 541 ybottom = 0,
561 sector.index = name, 542 track.index = 1,
562 border = "#e9d27d", 543 sector.index = name,
563 col = "#e9d27d") 544 border = "#e9d27d",
564 } 545 col = "#e9d27d"
565 546 )
566 length <- as.numeric(proviruses_gr_features$length) 547 }
567 length <- ifelse(length > 1000000, 548
568 paste0(round(length/1000000, 2), "mb"), 549 length <- as.numeric(proviruses_gr_features$length)
569 paste0(round(length/1000, 2), "kb")) 550 length <- ifelse(length > 1000000,
570 labels <- paste0(as.character(seqnames(proviruses_gr_features)), " (", length, ")") 551 paste0(round(length / 1000000, 2), "mb"),
571 circos.labels(sectors = as.character(seqnames(proviruses_gr_features)), 552 paste0(round(length / 1000, 2), "kb")
572 x = as.numeric(start(proviruses_gr_features)), 553 )
573 labels, 554 labels <- paste0(as.character(seqnames(proviruses_gr_features)), " (", length, ")")
574 facing = "clockwise") 555 circlize::circos.labels(
556 sectors = as.character(seqnames(proviruses_gr_features)),
557 x = as.numeric(start(proviruses_gr_features)),
558 labels,
559 facing = "clockwise"
560 )
575 } 561 }
576 562
577 process_sample <- function(sample, combined_unique, host_genomes_paths, genome_data) { 563 process_sample <- function(sample, combined_unique, host_genomes_paths, genome_data) {
578 genome_current <- sample # Add this line 564 genome_current <- sample # Add this line
579 tryCatch({ 565 tryCatch(
580 log_debug(paste("Starting to process sample:", sample)) 566 {
581 567 log_debug(paste("Starting to process sample:", sample))
582 # Check if sample exists in genome_data 568
583 if (!(sample %in% names(genome_data))) { 569 # Check if sample exists in genome_data
584 log_debug(paste("Sample", sample, "not found in genome_data")) 570 if (!(sample %in% names(genome_data))) {
585 cat(paste("Error: Sample", sample, "not found in genome_data\n\n")) 571 log_debug(paste("Sample", sample, "not found in genome_data"))
586 return() 572 cat(paste("Error: Sample", sample, "not found in genome_data\n\n"))
587 } 573 return()
588 574 }
589 cat(paste("## ", sample, "{.tabset .tabset-fade} \n\n")) 575
590 576 cat(paste("## ", sample, "{.tabset .tabset-fade} \n\n"))
591 host_genome_path <- host_genomes_paths$path[host_genomes_paths$name == sample] 577
592 if (length(host_genome_path) == 0) { 578 host_genome_path <- host_genomes_paths$path[host_genomes_paths$name == sample]
593 log_debug(paste("Host genome path not found for sample:", sample)) 579 if (length(host_genome_path) == 0) {
594 cat(paste("Error: Host genome path not found for sample", sample, "\n\n")) 580 log_debug(paste("Host genome path not found for sample:", sample))
595 return() 581 cat(paste("Error: Host genome path not found for sample", sample, "\n\n"))
596 } 582 return()
597 583 }
598 host_genome_ideogram <- tryCatch({ 584
599 getIdeogramData(fasta_file = host_genome_path) 585 host_genome_ideogram <- tryCatch(
600 }, error = function(e) { 586 {
601 log_debug(paste("Error loading host genome ideogram for sample", sample, ":", conditionMessage(e))) 587 gmoviz::getIdeogramData(fasta_file = host_genome_path)
602 NULL 588 },
603 }) 589 error = function(e) {
604 590 log_debug(paste("Error loading host genome ideogram for sample", sample, ":", conditionMessage(e)))
605 if (is.null(host_genome_ideogram)) { 591 NULL
606 cat(paste("Error: Unable to load host genome ideogram for sample", sample, "\n\n")) 592 }
607 return() 593 )
608 } 594
609 595 if (is.null(host_genome_ideogram)) {
610 sample_data <- genome_data[[sample]]$loaded_data 596 cat(paste("Error: Unable to load host genome ideogram for sample", sample, "\n\n"))
611 genomad_summary <- sample_data$genomad 597 return()
612 genomad_annotation <- sample_data$genomad_annotations 598 }
613 checkv_data <- sample_data$checkv 599
614 defense_finder_data <- sample_data$defense_finder 600 sample_data <- genome_data[[sample]]$loaded_data
615 abricate_data <- sample_data$abricate 601 genomad_summary <- sample_data$genomad
616 iphop_data <- sample_data$iphop 602 genomad_annotation <- sample_data$genomad_annotations
617 vibrant_data <- sample_data$vibrant 603 checkv_data <- sample_data$checkv
618 604 defense_finder_data <- sample_data$defense_finder
619 cat("### Host Genome\n\n") 605 abricate_data <- sample_data$abricate
620 606 iphop_data <- sample_data$iphop
621 cat("**GTDB-Tk taxonomy**: \n\n") 607 vibrant_data <- sample_data$vibrant
622 data_gtdbtk_host %>% filter(user_genome == sample) %>% 608
623 select(classification) %>% 609 cat("### Host Genome\n\n")
624 kbl() %>% 610
625 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% 611 cat("**GTDB-Tk taxonomy**: \n\n")
626 kable_paper("striped", full_width = TRUE) %>% 612 data_gtdbtk_host %>%
627 scroll_box(width = "100%", height = "100%") %>% 613 dplyr::filter(user_genome == sample) %>%
628 cat() 614 dplyr::select(classification) %>%
629 615 kableExtra::kbl() %>%
630 # Cat checkm summary for this genome 616 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
631 cat("**CheckM2 Summary**:\n\n") 617 kableExtra::kable_paper("striped", full_width = TRUE) %>%
632 #checkm_summary <- data_checkm_host %>% filter(`bin_id` == sample) 618 kableExtra::scroll_box(width = "100%", height = "100%") %>%
633 checkm_summary <- data_checkm_host %>% filter(`name` == sample) 619 cat()
634 checkm_summary %>% clean_names %>% 620
635 #select(number_contigs, n50_contigs, completeness, contamination, strain_heterogeneity) %>% 621 # Cat checkm summary for this genome
636 select(total_contigs, contig_n50, completeness, contamination) %>% 622 cat("**CheckM2 Summary**:\n\n")
637 kbl() %>% 623 # checkm_summary <- data_checkm_host %>% dplyr::filter(`bin_id` == sample)
638 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% 624 checkm_summary <- data_checkm_host %>% dplyr::filter(`name` == sample)
639 kable_paper("striped", full_width = TRUE) %>% 625 checkm_summary %>%
640 scroll_box(width = "100%", height = "100%") %>% 626 janitor::clean_names() %>%
641 cat() 627 # dplyr::select(number_contigs, n50_contigs, completeness, contamination, strain_heterogeneity) %>%
642 628 dplyr::select(total_contigs, contig_n50, completeness, contamination) %>%
643 # Display defense-finder as a table 629 kableExtra::kbl() %>%
644 if (!is.null(defense_finder_data) && nrow(defense_finder_data) > 0) { 630 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
645 cat("**Defense-Finder Systems**:\n\n") 631 kableExtra::kable_paper("striped", full_width = TRUE) %>%
646 632 kableExtra::scroll_box(width = "100%", height = "100%") %>%
647 defense_finder_data %>% 633 cat()
648 select(sys_id, type, subtype, sys_beg, sys_end, protein_in_syst, genes_count, name_of_profiles_in_sys) %>% 634
649 kbl() %>% 635 # Display defense-finder as a table
650 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% 636 if (!is.null(defense_finder_data) && nrow(defense_finder_data) > 0) {
651 kable_paper("striped", full_width = TRUE) %>% 637 cat("**Defense-Finder Systems**:\n\n")
652 scroll_box(width = "100%", height = "100%") %>% 638
639 defense_finder_data %>%
640 dplyr::select(sys_id, type, subtype, sys_beg, sys_end, protein_in_syst, genes_count, name_of_profiles_in_sys) %>%
641 kableExtra::kbl() %>%
642 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
643 kableExtra::kable_paper("striped", full_width = TRUE) %>%
644 kableExtra::scroll_box(width = "100%", height = "100%") %>%
645 cat()
646 } else {
647 cat("No Defense-Finder systems detected.\n\n")
648 }
649
650 if (is.null(genomad_summary) || nrow(genomad_summary) == 0) {
651 q
652 log_debug(paste("No geNomad summary data found for sample:", sample))
653 return()
654 }
655
656 if (length(GenomeInfoDb::seqlevels(host_genome_ideogram)) == 1) {
657 host_genome_size <- sum(width(host_genome_ideogram))
658 } else {
659 virus_containing_contigs <- unique(sub("\\|.*", "", genomad_summary$seq_name))
660 virus_containing_contigs <- paste0("c_", as.numeric(factor(virus_containing_contigs)))
661 filtered_host_genome <- subset_and_update_ideogram(host_genome_ideogram, virus_containing_contigs)
662 host_genome_size <- sum(width(filtered_host_genome))
663 }
664
665 # Process proviruses
666 proviruses_gr_features <- process_proviruses(genomad_summary)
667
668 cat("**Genomad and CheckV Summary**:\n\n")
669 genomad_summary %>%
670 dplyr::select(seq_name, taxonomy, topology, coordinates, length) %>%
671 dplyr::left_join(
672 checkv_data %>% dplyr::select(contig_id, gene_count, viral_genes, checkv_quality, miuvig_quality),
673 by = c("seq_name" = "contig_id")
674 ) %>%
675 dplyr::select(seq_name, length, gene_count, viral_genes, checkv_quality, miuvig_quality, taxonomy, topology, coordinates) %>%
676 kableExtra::kbl() %>%
677 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
678 kableExtra::kable_paper("striped", full_width = TRUE) %>%
679 kableExtra::scroll_box(width = "100%", height = "100%") %>%
680 cat()
681
682 cat("**Host Genome Ideogram with Phages**:\n\n")
683 plot_genome_ideogram(sample, proviruses_gr_features)
684 cat('In this circular plot, **"c"** indicates the contig, and the number that follows (e.g., **c1**) represents the contig number.
685 If multiple contigs are present in the genome, each will be shown with a distinct label (e.g., **c1**, **c2**, etc.).\n\n')
686 cat("\n\n")
687
688 # Process phage genomes
689 cat("### Prophages {.tabset .tabset-fade} \n\n")
690 cat("**Select prophage to show: ** \n\n")
691 for (i in seq_len(nrow(genomad_summary))) {
692 log_debug(paste("Processing phage", i, "of", nrow(genomad_summary), "for sample", sample))
693 process_phage(genomad_summary[i, ], genomad_summary, genomad_annotation, checkv_data, host_genome_size)
694 }
695
696 # Plot dREP if applicable
697 if (nrow(genomad_summary) > 1) {
698 cat("### vOTUs\n\n")
699 plot_drep(sample, genomad_summary)
700 }
701
702 # Creating table with Abricate data
703 if (nrow(abricate_data) > 0) {
704 cat("### Virulence Genes {.tabset .tabset-fade} \n\n")
705 cat("Screening of virulence genes present in the prophage contigs. \n\n")
706 abricate_data %>%
707 dplyr::select(-number_file) %>%
708 kableExtra::kbl() %>%
709 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
710 kableExtra::kable_paper("striped", full_width = TRUE) %>%
711 kableExtra::scroll_box(width = "100%", height = "100%") %>%
712 cat()
713 cat("\n\n")
714 }
715
716 # Creating table with iPHOP
717 if (nrow(iphop_data) > 0) {
718 cat("### Prophage-Host Prediction {.tabset .tabset-fade} \n\n")
719 cat("Prediction of potential hosts for the prophage contigs. \n\n")
720 iphop_data %>%
721 kableExtra::kbl() %>%
722 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
723 kableExtra::kable_paper("striped", full_width = TRUE) %>%
724 kableExtra::scroll_box(width = "100%", height = "100%") %>%
725 cat()
726 cat("\n\n")
727 }
728
729 # Creating table with VIBRANT AMGs
730 if (nrow(vibrant_data) > 0) {
731 cat("### AMG Predictions {.tabset .tabset-fade} \n\n")
732 cat("Prediction of auxiliary metabolic genes in the prophage contigs. \n\n")
733 vibrant_data %>%
734 kableExtra::kbl() %>%
735 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
736 kableExtra::kable_paper("striped", full_width = TRUE) %>%
737 kableExtra::scroll_box(width = "100%", height = "100%") %>%
738 cat()
739 cat("\n\n")
740 }
741
742 log_debug(paste("Finished processing sample:", sample))
743 },
744 error = function(e) {
745 log_debug(paste("Error in process_sample for", sample, ":", conditionMessage(e)))
746 cat(paste("Error processing sample", sample, ":", conditionMessage(e), "\n\n"))
747 }
748 )
749 }
750
751 process_phage <- function(virus, genomad_summary, genomad_annotation, checkv_data, host_genome_size) {
752 cat(paste("#### Phage ID:", virus$seq_name, " {.tabset .tabset-fade} \n\n"))
753
754 current_contig <- sub("\\|.*", "", virus$seq_name)
755
756 provirus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", virus$seq_name))
757 provirus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", virus$seq_name))
758 virus_length <- provirus_end - provirus_start + 1
759
760 current_contig_base <- sub("\\|provirus_.*", "", virus$seq_name)
761 current_provirus_range <- sub(".*\\|provirus_", "", virus$seq_name)
762 current_annotations <- genomad_annotation[grepl(paste0(current_contig_base, "\\|provirus_", current_provirus_range, "_"),
763 genomad_annotation$gene,
764 fixed = FALSE
765 ), ] %>%
766 dplyr::mutate(arrow_pos = ifelse(strand == -1, "start", "end"))
767
768
769 cat("\n\n**Phage-Host Genome Ideogram:**\n\n")
770
771 plot_phage_circos(virus, genomad_summary, current_annotations, virus_length, host_genome_size, provirus_start, provirus_end, checkv_data)
772
773 cat("\n\n")
774 cat("\n\n**Genes Annotation (geNomad):**\n\n")
775
776 current_annotations %>%
777 dplyr::select(gene, length, marker, annotation_accessions, annotation_description) %>%
778 kableExtra::kbl() %>%
779 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
780 kableExtra::kable_paper() %>%
653 cat() 781 cat()
654 } else {
655 cat("No Defense-Finder systems detected.\n\n")
656 }
657
658 if (is.null(genomad_summary) || nrow(genomad_summary) == 0) {
659 log_debug(paste("No geNomad summary data found for sample:", sample))
660 return()
661 }
662
663 if (length(seqlevels(host_genome_ideogram)) == 1) {
664 host_genome_size <- sum(width(host_genome_ideogram))
665 } else {
666 virus_containing_contigs <- unique(sub("\\|.*", "", genomad_summary$seq_name))
667 virus_containing_contigs <- paste0("c_", as.numeric(factor(virus_containing_contigs)))
668 filtered_host_genome <- subset_and_update_ideogram(host_genome_ideogram, virus_containing_contigs)
669 host_genome_size <- sum(width(filtered_host_genome))
670 }
671
672 # Process proviruses
673 proviruses_gr_features <- process_proviruses(genomad_summary)
674
675 cat("**Genomad and CheckV Summary**:\n\n")
676 genomad_summary %>%
677 select(seq_name, taxonomy, topology, coordinates, length) %>%
678 left_join(
679 checkv_data %>% select(contig_id, gene_count, viral_genes, checkv_quality, miuvig_quality),
680 by = c("seq_name" = "contig_id")) %>%
681 select(seq_name, length, gene_count, viral_genes, checkv_quality, miuvig_quality, taxonomy, topology, coordinates) %>%
682 kbl() %>%
683 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
684 kable_paper("striped", full_width = TRUE) %>%
685 scroll_box(width = "100%", height = "100%") %>%
686 cat()
687
688 cat("**Host Genome Ideogram with Phages**:\n\n")
689 plot_genome_ideogram(sample, proviruses_gr_features)
690 cat('In this circular plot, **"c"** indicates the contig, and the number that follows (e.g., **c1**) represents the contig number.
691 If multiple contigs are present in the genome, each will be shown with a distinct label (e.g., **c1**, **c2**, etc.).\n\n')
692 cat("\n\n") 782 cat("\n\n")
693
694 # Process phage genomes
695 cat("### Prophages {.tabset .tabset-fade} \n\n")
696 cat("**Select prophage to show: ** \n\n")
697 for (i in seq_len(nrow(genomad_summary))) {
698 log_debug(paste("Processing phage", i, "of", nrow(genomad_summary), "for sample", sample))
699 process_phage(genomad_summary[i, ], genomad_summary, genomad_annotation, checkv_data, host_genome_size)
700 }
701
702 # Plot dREP if applicable
703 if (nrow(genomad_summary) > 1) {
704 cat("### vOTUs\n\n")
705 plot_drep(sample, genomad_summary)
706 }
707
708 # Creating table with Abricate data
709 if (nrow(abricate_data) > 0) {
710 cat("### Virulence Genes {.tabset .tabset-fade} \n\n")
711 cat("Screening of virulence genes present in the prophage contigs. \n\n")
712 abricate_data %>% select(-number_file) %>%
713 kbl() %>%
714 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
715 kable_paper("striped", full_width = TRUE) %>%
716 scroll_box(width = "100%", height = "100%") %>% cat()
717 cat("\n\n")
718 }
719
720 # Creating table with iPHOP
721 if (nrow(iphop_data) > 0) {
722 cat("### Prophage-Host Prediction {.tabset .tabset-fade} \n\n")
723 cat("Prediction of potential hosts for the prophage contigs. \n\n")
724 iphop_data %>%
725 kbl() %>%
726 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
727 kable_paper("striped", full_width = TRUE) %>%
728 scroll_box(width = "100%", height = "100%") %>% cat()
729 cat("\n\n")
730 }
731
732 # Creating table with VIBRANT AMGs
733 if (nrow(vibrant_data) > 0) {
734 cat("### AMG Predictions {.tabset .tabset-fade} \n\n")
735 cat("Prediction of auxiliary metabolic genes in the prophage contigs. \n\n")
736 vibrant_data %>%
737 kbl() %>%
738 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
739 kable_paper("striped", full_width = TRUE) %>%
740 scroll_box(width = "100%", height = "100%") %>% cat()
741 cat("\n\n")
742 }
743
744 log_debug(paste("Finished processing sample:", sample))
745 }, error = function(e) {
746 log_debug(paste("Error in process_sample for", sample, ":", conditionMessage(e)))
747 cat(paste("Error processing sample", sample, ":", conditionMessage(e), "\n\n"))
748 })
749 }
750
751 process_phage <- function(virus, genomad_summary, genomad_annotation, checkv_data, host_genome_size) {
752 cat(paste("#### Phage ID:", virus$seq_name, " {.tabset .tabset-fade} \n\n"))
753
754 current_contig <- sub("\\|.*", "", virus$seq_name)
755
756 provirus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", virus$seq_name))
757 provirus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", virus$seq_name))
758 virus_length <- provirus_end - provirus_start + 1
759
760 current_contig_base <- sub("\\|provirus_.*", "", virus$seq_name)
761 current_provirus_range <- sub(".*\\|provirus_", "", virus$seq_name)
762 current_annotations <- genomad_annotation[grepl(paste0(current_contig_base, "\\|provirus_", current_provirus_range, "_"),
763 genomad_annotation$gene, fixed = FALSE), ] %>%
764 mutate(arrow_pos = ifelse(strand == -1, "start", "end"))
765
766
767 cat("\n\n**Phage–Host Genome Ideogram:**\n\n")
768
769 plot_phage_circos(virus, genomad_summary, current_annotations, virus_length, host_genome_size, provirus_start, provirus_end, checkv_data)
770
771 cat("\n\n")
772 cat("\n\n**Genes Annotation (geNomad):**\n\n")
773
774 current_annotations %>%
775 select(gene, length, marker, annotation_accessions, annotation_description) %>%
776 kbl() %>%
777 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
778 kable_paper() %>%
779 cat()
780 cat("\n\n")
781 } 783 }
782 784
783 plot_phage_circos <- function(virus, genomad_summary, current_annotations, virus_length, host_genome_size, provirus_start, provirus_end, checkv_data) { 785 plot_phage_circos <- function(virus, genomad_summary, current_annotations, virus_length, host_genome_size, provirus_start, provirus_end, checkv_data) {
784 tryCatch({ 786 tryCatch(
785 log_debug("Starting plot_phage_circos function") 787 {
786 log_debug(paste("Current virus:", virus$seq_name)) 788 log_debug("Starting plot_phage_circos function")
787 log_debug(paste("Virus length:", virus_length)) 789 log_debug(paste("Current virus:", virus$seq_name))
788 log_debug(paste("Host genome size:", host_genome_size)) 790 log_debug(paste("Virus length:", virus_length))
789 log_debug(paste("Provirus start:", provirus_start)) 791 log_debug(paste("Host genome size:", host_genome_size))
790 log_debug(paste("Provirus end:", provirus_end)) 792 log_debug(paste("Provirus start:", provirus_start))
791 793 log_debug(paste("Provirus end:", provirus_end))
792 # Check for NA or invalid values in input parameters 794
793 if (is.na(virus_length) || virus_length <= 0) { 795 # Check for NA or invalid values in input parameters
794 log_debug("Error: Invalid virus length") 796 if (is.na(virus_length) || virus_length <= 0) {
795 return(NULL) 797 log_debug("Error: Invalid virus length", virus_length)
796 } 798 return(NULL)
797 if (is.na(host_genome_size) || host_genome_size <= 0) { 799 }
798 log_debug("Error: Invalid host genome size") 800 if (is.na(host_genome_size) || host_genome_size <= 0) {
799 return(NULL) 801 log_debug("Error: Invalid host genome size")
800 } 802 return(NULL)
801 if (is.na(provirus_start) || provirus_start < 0) { 803 }
802 log_debug("Error: Invalid provirus start position") 804 if (is.na(provirus_start) || provirus_start < 0) {
803 return(NULL) 805 log_debug("Error: Invalid provirus start position")
804 } 806 return(NULL)
805 if (is.na(provirus_end) || provirus_end <= provirus_start) { 807 }
806 log_debug("Error: Invalid provirus end position") 808 if (is.na(provirus_end) || provirus_end <= provirus_start) {
807 return(NULL) 809 log_debug("Error: Invalid provirus end position")
808 } 810 return(NULL)
809 811 }
810 # Extract contig information 812
811 current_contig <- sub("\\|.*", "", virus$seq_name) 813 # Extract contig information
812 log_debug(paste("Current contig:", current_contig)) 814 current_contig <- sub("\\|.*", "", virus$seq_name)
813 815 log_debug(paste("Current contig:", current_contig))
814 contig_viruses <- genomad_summary[grepl(paste0("^", current_contig), genomad_summary$seq_name), ] 816
815 if (nrow(contig_viruses) == 0) { 817 contig_viruses <- genomad_summary[grepl(paste0("^", current_contig), genomad_summary$seq_name), ]
816 log_debug("Error: No viruses found for the current contig") 818 if (nrow(contig_viruses) == 0) {
817 return(NULL) 819 log_debug("Error: No viruses found for the current contig")
818 } 820 return(NULL)
819 821 }
820 contig_length <- max(as.numeric(sub(".*_(\\d+)$", "\\1", contig_viruses$seq_name))) 822
821 if (is.na(contig_length) || contig_length <= 0) { 823 contig_length <- max(as.numeric(sub(".*_(\\d+)$", "\\1", contig_viruses$seq_name)))
822 contig_length <- virus_length # Use virus length as fallback if contig length is invalid 824 if (is.na(contig_length) || contig_length <= 0) {
823 log_debug(paste("Using virus length as contig length:", contig_length)) 825 contig_length <- virus_length # Use virus length as fallback if contig length is invalid
824 } else { 826 log_debug(paste("Using virus length as contig length:", contig_length))
825 log_debug(paste("Contig length:", contig_length)) 827 } else {
826 } 828 log_debug(paste("Contig length:", contig_length))
827 829 }
828 if (provirus_end > contig_length) { 830
829 log_debug("Error: Provirus end position exceeds contig length") 831 if (provirus_end > contig_length) {
830 return(NULL) 832 log_debug("Error: Provirus end position exceeds contig length")
831 } 833 return(NULL)
832 834 }
833 log_debug("Clearing circos") 835
834 circos.clear() 836 log_debug("Clearing circos")
835 837 circlize::circos.clear()
836 log_debug("Setting circos parameters") 838
837 circos.par(start.degree = 180, gap.degree = 10, track.margin = c(0.01, 0.01)) 839 log_debug("Setting circos parameters")
838 840 circlize::circos.par(start.degree = 180, gap.degree = 10, track.margin = c(0.01, 0.01))
839 main_color <- "#a58bc5" 841
840 zoom_color <- "#e9d27d" 842 main_color <- "#a58bc5"
841 843 zoom_color <- "#e9d27d"
842 zoom_start <- (provirus_start / contig_length) * 100 844
843 zoom_end <- (provirus_end / contig_length) * 100 845 zoom_start <- (provirus_start / contig_length) * 100
844 846 zoom_end <- (provirus_end / contig_length) * 100
845 log_debug(paste("Zoom start:", zoom_start)) 847
846 log_debug(paste("Zoom end:", zoom_end)) 848 log_debug(paste("Zoom start:", zoom_start))
847 849 log_debug(paste("Zoom end:", zoom_end))
848 log_debug("Initializing circos") 850
849 circos.initialize(factors = c("Zoom", "Main"), xlim = c(0, 100)) 851 log_debug("Initializing circos")
850 852 circlize::circos.initialize(factors = c("Zoom", "Main"), xlim = c(0, 100))
851 format_genome_labels <- function(x) { 853
852 ifelse(x >= 1e6, paste0(round(x / 1e6, 2), " Mb"), 854 format_genome_labels <- function(x) {
853 ifelse(x >= 1e3, paste0(round(x / 1e3, 2), " Kb"), 855 ifelse(x >= 1e6, paste0(round(x / 1e6, 2), " Mb"),
854 paste0(x, " bp"))) 856 ifelse(x >= 1e3, paste0(round(x / 1e3, 2), " Kb"),
855 } 857 paste0(x, " bp")
856 858 )
857 log_debug("Adding link") 859 )
858 tryCatch({ 860 }
859 circos.link("Main", c(zoom_start, zoom_end), "Zoom", c(0, 100), 861
860 rou1 = 0.8, 862 log_debug("Adding link")
861 rou2 = 0.97, 863 tryCatch(
862 h.ratio = 0.55, # width? 864 {
863 lty = 2, 865 circlize::circos.link("Main", c(zoom_start, zoom_end), "Zoom", c(0, 100),
864 lwd = 0.5, 866 rou1 = 0.8,
865 h2 = 1, 867 rou2 = 0.97,
866 col = "grey99", border = "grey80") 868 h.ratio = 0.55, # width?
867 }, error = function(e) { 869 lty = 2,
868 log_debug(paste("Error in circos.link:", e$message)) 870 lwd = 0.5,
869 }) 871 h2 = 1,
870 872 col = "grey99", border = "grey80"
871 log_debug("Adding zoom track") 873 )
872 circos.track(factors = "Zoom", ylim = c(0, 1), track.height = 0.15, 874 },
873 panel.fun = function(x, y) { 875 error = function(e) {
874 circos.rect(0, 0, 100, 1, col = zoom_color, border = NA) 876 log_debug(paste("Error in circlize::circos.link:", e$message))
875 axis_labels <- seq(0, virus_length, length.out = 6) 877 }
876 axis_positions <- seq(0, 100, length.out = 6) 878 )
877 circos.axis(h = "top", major.at = axis_positions, 879
878 labels = format_genome_labels(axis_labels), 880 log_debug("Adding zoom track")
879 labels.cex = 0.7, direction = "outside") 881 circlize::circos.track(
880 882 factors = "Zoom", ylim = c(0, 1), track.height = 0.15,
881 for (i in 1:nrow(current_annotations)) { 883 panel.fun = function(x, y) {
882 gene_start <- current_annotations$start[i] 884 circlize::circos.rect(0, 0, 100, 1, col = zoom_color, border = NA)
883 gene_end <- current_annotations$end[i] 885 axis_labels <- seq(0, virus_length, length.out = 6)
884 arrow_start <- (gene_start - provirus_start) / virus_length * 100 886 axis_positions <- seq(0, 100, length.out = 6)
885 arrow_end <- (gene_end - provirus_start) / virus_length * 100 887 circlize::circos.axis(
886 888 h = "top", major.at = axis_positions,
887 circos.arrow(arrow_start, arrow_end, y1 = 0, y2 = 1, 889 labels = format_genome_labels(axis_labels),
888 arrow.head.width = 0.75, arrow.head.length = cm_x(0.1), 890 labels.cex = 0.7, direction = "outside"
889 arrow.position = current_annotations$arrow_pos[i], 891 )
890 col = ifelse(is.na(current_annotations$annotation_description[i]), "grey", "#7fbfff"), 892
891 border = ifelse(is.na(current_annotations$annotation_description[i]), "grey20", "darkblue")) 893 for (i in 1:nrow(current_annotations)) {
892 } 894 gene_start <- current_annotations$start[i]
893 }, bg.border = NA) 895 gene_end <- current_annotations$end[i]
894 896 arrow_start <- (gene_start - provirus_start) / virus_length * 100
895 log_debug("Adding main track") 897 arrow_end <- (gene_end - provirus_start) / virus_length * 100
896 circos.track(factors = "Main", ylim = c(0, 1), track.height = 0.1, 898
897 panel.fun = function(x, y) { 899 circlize::circos.arrow(arrow_start, arrow_end,
898 circos.rect(xleft = 0, ybottom = 0, xright = 100, ytop = 1, col = main_color, border = NA) 900 arrow.head.width = 0.75, arrow.head.length = cm_x(0.1),
899 901 arrow.position = current_annotations$arrow_pos[i],
900 for (i in 1:nrow(contig_viruses)) { 902 col = ifelse(is.na(current_annotations$annotation_description[i]), "grey", "#7fbfff"),
901 virus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", contig_viruses$seq_name[i])) 903 border = ifelse(is.na(current_annotations$annotation_description[i]), "grey20", "darkblue")
902 virus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", contig_viruses$seq_name[i])) 904 )
903 905 }
904 virus_start_percent <- (virus_start / contig_length) * 100 906 }, bg.border = NA
905 virus_end_percent <- (virus_end / contig_length) * 100 907 )
906 908
907 rect_color <- if (contig_viruses$seq_name[i] == virus$seq_name) zoom_color else adjustcolor(zoom_color, alpha.f = 0.7) 909 log_debug("Adding main track")
908 910 circlize::circos.track(
909 circos.rect(xleft = virus_start_percent, ybottom = 0, 911 factors = "Main", ylim = c(0, 1), track.height = 0.1,
910 xright = virus_end_percent, ytop = 1, 912 panel.fun = function(x, y) {
911 col = rect_color, border = NA) 913 circlize::circos.rect(xleft = 0, ybottom = 0, xright = 100, ytop = 1, col = main_color, border = NA)
912 } 914
913 915 for (i in 1:nrow(contig_viruses)) {
914 axis_labels <- seq(0, contig_length, length.out = 6) 916 virus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", contig_viruses$seq_name[i]))
915 axis_positions <- seq(0, 100, length.out = 6) 917 virus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", contig_viruses$seq_name[i]))
916 circos.axis(h = "top", major.at = axis_positions, 918
917 labels = format_genome_labels(axis_labels), 919 virus_start_percent <- (virus_start / contig_length) * 100
918 labels.cex = 0.7, direction = "outside") 920 virus_end_percent <- (virus_end / contig_length) * 100
919 }, bg.border = NA) 921
920 922 rect_color <- if (contig_viruses$seq_name[i] == virus$seq_name) zoom_color else adjustcolor(zoom_color, alpha.f = 0.7)
921 log_debug("Locating phage positions") 923
922 phage_positions <- sapply(1:nrow(contig_viruses), function(i) { 924 circlize::circos.rect(
923 virus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", contig_viruses$seq_name[i])) 925 xleft = virus_start_percent, ybottom = 0,
924 virus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", contig_viruses$seq_name[i])) 926 xright = virus_end_percent, ytop = 1,
925 ((virus_start + virus_end) / 2 / contig_length) * 100 927 col = rect_color, border = NA
926 }) 928 )
927 929 }
928 log_debug("Annotating names on phage positions") 930
929 931 axis_labels <- seq(0, contig_length, length.out = 6)
930 # Extract start and end positions from sequence names 932 axis_positions <- seq(0, 100, length.out = 6)
931 start_positions <- as.numeric(sub(".*provirus_([0-9]+)_.*", "\\1", contig_viruses$seq_name)) 933 circlize::circos.axis(
932 end_positions <- as.numeric(sub(".*provirus_[0-9]+_([0-9]+)", "\\1", contig_viruses$seq_name)) 934 h = "top", major.at = axis_positions,
933 935 labels = format_genome_labels(axis_labels),
934 # Create phage labels with the desired format 936 labels.cex = 0.7, direction = "outside"
935 phage_labels <- paste0(round(contig_viruses$length / 1e3, 2), " Kb") 937 )
936 938 }, bg.border = NA
937 # Apply labels to circos plot 939 )
938 circos.labels( 940
939 sectors = "Main", 941 log_debug("Locating phage positions")
940 x = phage_positions, 942 phage_positions <- sapply(1:nrow(contig_viruses), function(i) {
941 labels = phage_labels, 943 virus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", contig_viruses$seq_name[i]))
942 facing = "reverse.clockwise", 944 virus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", contig_viruses$seq_name[i]))
943 niceFacing = TRUE, 945 ((virus_start + virus_end) / 2 / contig_length) * 100
944 col = "black", 946 })
945 cex = 0.6, 947
946 side = "inside", 948 log_debug("Annotating names on phage positions")
947 connection_height = 0.02, 949
948 line_col = "gray" 950 # Extract start and end positions from sequence names
949 ) 951 start_positions <- as.numeric(sub(".*provirus_([0-9]+)_.*", "\\1", contig_viruses$seq_name))
950 952 end_positions <- as.numeric(sub(".*provirus_[0-9]+_([0-9]+)", "\\1", contig_viruses$seq_name))
951 center_x <- 50 953
952 virus_name <- virus$seq_name 954 # Create phage labels with the desired format
953 taxonomy <- virus$taxonomy 955 phage_labels <- paste0(round(contig_viruses$length / 1e3, 2), " Kb")
954 956
955 log_debug("Adding taxonomy and virus name to the plot") 957 # Apply labels to circos plot
956 circos.text(x = center_x, y = -0.2, labels = taxonomy, 958 circlize::circos.labels(
959 sectors = "Main",
960 x = phage_positions,
961 labels = phage_labels,
962 facing = "reverse.clockwise",
963 niceFacing = TRUE,
964 col = "black",
965 cex = 0.6,
966 side = "inside",
967 connection_height = 0.02,
968 line_col = "gray"
969 )
970
971 center_x <- 50
972 virus_name <- virus$seq_name
973 taxonomy <- virus$taxonomy
974
975 log_debug("Adding taxonomy and virus name to the plot")
976 circlize::circos.text(
977 x = center_x, y = -0.2, labels = taxonomy,
957 sector.index = "Zoom", track.index = 1, 978 sector.index = "Zoom", track.index = 1,
958 facing = "bending.inside", niceFacing = TRUE, 979 facing = "bending.inside", niceFacing = TRUE,
959 adj = c(0.5, 0.7), cex = 0.8) 980 adj = c(0.5, 0.7), cex = 0.8
960 981 )
961 checkv_info <- checkv_data[checkv_data$contig_id == virus$seq_name, ] 982
962 if (nrow(checkv_info) > 0) { 983 checkv_info <- checkv_data[checkv_data$contig_id == virus$seq_name, ]
963 checkv_quality <- checkv_info$checkv_quality 984 if (nrow(checkv_info) > 0) {
964 gene_count <- checkv_info$gene_count 985 checkv_quality <- checkv_info$checkv_quality
965 viral_genes <- checkv_info$viral_genes 986 gene_count <- checkv_info$gene_count
966 host_genes <- checkv_info$host_genes 987 viral_genes <- checkv_info$viral_genes
967 miuvig_quality <- checkv_info$miuvig_quality 988 host_genes <- checkv_info$host_genes
968 completeness <- checkv_info$completeness 989 miuvig_quality <- checkv_info$miuvig_quality
969 completeness_method <- checkv_info$completeness_method 990 completeness <- checkv_info$completeness
970 contamination <- checkv_info$contamination 991 completeness_method <- checkv_info$completeness_method
971 992 contamination <- checkv_info$contamination
972 circos.text( 993
973 x = center_x, y = -0.5, 994 circlize::circos.text(
974 labels = paste("CheckV Quality:", checkv_quality, " - miuvig Quality:", miuvig_quality), 995 x = center_x, y = -0.5,
975 sector.index = "Zoom", track.index = 2, 996 labels = paste("CheckV Quality:", checkv_quality, " - miuvig Quality:", miuvig_quality),
976 facing = "bending.inside", niceFacing = TRUE, 997 sector.index = "Zoom", track.index = 2,
977 adj = c(0.5, 0), cex = 0.7 998 facing = "bending.inside", niceFacing = TRUE,
978 ) 999 adj = c(0.5, 0), cex = 0.7
979 1000 )
980 circos.text( 1001
981 x = center_x, y = -1.5, 1002 circlize::circos.text(
982 labels = paste("Gene Count:", gene_count, " - Viral Genes:", viral_genes, " - Host Genes:", host_genes), 1003 x = center_x, y = -1.5,
983 sector.index = "Zoom", track.index = 2, 1004 labels = paste("Gene Count:", gene_count, " - Viral Genes:", viral_genes, " - Host Genes:", host_genes),
984 facing = "bending.inside", niceFacing = TRUE, 1005 sector.index = "Zoom", track.index = 2,
985 adj = c(0.5, 0), cex = 0.7 1006 facing = "bending.inside", niceFacing = TRUE,
986 ) 1007 adj = c(0.5, 0), cex = 0.7
987 1008 )
988 circos.text( 1009
989 x = center_x, y = -2.5, 1010 circlize::circos.text(
990 labels = paste("Completeness:", completeness, " - Contamination:", contamination), 1011 x = center_x, y = -2.5,
991 sector.index = "Zoom", track.index = 2, 1012 labels = paste("Completeness:", completeness, " - Contamination:", contamination),
992 facing = "bending.inside", niceFacing = TRUE, 1013 sector.index = "Zoom", track.index = 2,
993 adj = c(0.5, 0), cex = 0.7 1014 facing = "bending.inside", niceFacing = TRUE,
994 ) 1015 adj = c(0.5, 0), cex = 0.7
995 } 1016 )
996 1017 }
997 log_debug("Adding legend") 1018
998 # Add legend 1019 log_debug("Adding legend")
999 legend("topright", 1020 # Add legend
1000 legend = c("Annotated gene", "Unknown gene"), 1021 legend("topright",
1001 fill = c("#7fbfff", "grey"), 1022 legend = c("Annotated gene", "Unknown gene"),
1002 border = c("darkblue", "grey20"), 1023 fill = c("#7fbfff", "grey"),
1003 cex = 0.8, 1024 border = c("darkblue", "grey20"),
1004 bty = "n") 1025 cex = 0.8,
1005 1026 bty = "n"
1006 log_debug("Clearing circos") 1027 )
1007 circos.clear() 1028
1008 log_debug("Finished plot_phage_circos function successfully") 1029 log_debug("Clearing circos")
1009 }, error = function(e) { 1030 circlize::circos.clear()
1010 log_debug(paste("Error in plot_phage_circos:", e$message)) 1031 log_debug("Finished plot_phage_circos function successfully")
1011 circos.clear() 1032 },
1012 }) 1033 error = function(e) {
1034 log_debug(paste("Error in plot_phage_circos:", e$message))
1035 circlize::circos.clear()
1036 }
1037 )
1013 } 1038 }
1014 1039
1015 plot_drep <- function(sample, genomad_summary) { 1040 plot_drep <- function(sample, genomad_summary) {
1016 drep_file_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "data_tables", "Cdb.csv") 1041 drep_file_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "data_tables", "Cdb.csv")
1017 drep_data <- read_csv(drep_file_path) %>% clean_names() 1042 drep_data <- read_csv(drep_file_path) %>% janitor::clean_names()
1018 drep_data <- cbind(genomad_summary$seq_name, drep_data) 1043 drep_data <- cbind(genomad_summary$seq_name, drep_data)
1019 1044
1020 cat("When more than 1 phage is detected in the host genome, we perform a clustering step using the tool dRep.\n\n") 1045 cat("When more than 1 phage is detected in the host genome, we perform a clustering step using the tool dRep.\n\n")
1021 cat("A threshold of 0.95 was applied to the ANI similarity index to define clusters of virus operational taxonomic units (vOTUs).") 1046 cat("A threshold of 0.95 was applied to the ANI similarity index to define clusters of virus operational taxonomic units (vOTUs).")
1022 1047
1023 cat("\n\n**Final cluster designations**\n\n") 1048 cat("\n\n**Final cluster designations**\n\n")
1024 drep_data %>% 1049 drep_data %>%
1025 kbl() %>% 1050 kableExtra::kbl() %>%
1026 kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% 1051 kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
1027 kable_paper("striped", full_width = TRUE) %>% 1052 kableExtra::kable_paper("striped", full_width = TRUE) %>%
1028 cat() 1053 cat()
1029 1054
1030 # Insert the PDF plot 1055 # Insert the PDF plot
1031 plot_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "figures", "Primary_clustering_dendrogram.pdf") 1056 plot_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "figures", "Primary_clustering_dendrogram.pdf")
1032 png_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "figures", "Primary_clustering_dendrogram.png") 1057 png_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "figures", "Primary_clustering_dendrogram.png")
1033 1058
1034 if (file.exists(plot_path)) { 1059 if (file.exists(plot_path)) {
1035 pdftools::pdf_convert(plot_path, format = "png", filenames = png_path, verbose = FALSE, dpi=150) 1060 pdftools::pdf_convert(plot_path, format = "png", filenames = png_path, verbose = FALSE, dpi = 150)
1036 base64_str <- base64enc::dataURI(file = png_path, mime = "image/png") 1061 base64_str <- base64enc::dataURI(file = png_path, mime = "image/png")
1037 cat("**Primary clustering plot**\n\n") 1062 cat("**Primary clustering plot**\n\n")
1038 cat(sprintf( 1063 cat(sprintf(
1039 '<div style="text-align: center;"><img src="%s" style="max-width: 100%%; height: auto; border:1px solid #ddd; border-radius:4px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); margin: 1em 0;"></div>',base64_str 1064 '<div style="text-align: center;"><img src="%s" style="max-width: 100%%; height: auto; border:1px solid #ddd; border-radius:4px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); margin: 1em 0;"></div>', base64_str
1040 )) 1065 ))
1041 } else { 1066 } else {
1042 cat("**No dRep clustering plot found.**\n\n") 1067 cat("**No dRep clustering plot found.**\n\n")
1043 } 1068 }
1044
1045 } 1069 }
1046 1070
1047 subset_and_update_ideogram <- function(ideogram, contigs) { 1071 subset_and_update_ideogram <- function(ideogram, contigs) {
1048 filtered <- ideogram[seqnames(ideogram) %in% contigs] 1072 filtered <- ideogram[seqnames(ideogram) %in% contigs]
1049 seqlevels(filtered) <- contigs 1073 GenomeInfoDb::seqlevels(filtered) <- contigs
1050 seqinfo(filtered) <- seqinfo(filtered)[contigs] 1074 seqinfo(filtered) <- seqinfo(filtered)[contigs]
1051 filtered 1075 filtered
1052 } 1076 }
1053 1077
1054 render_all_samples <- function(test_mode = FALSE) { 1078 render_all_samples <- function(test_mode = FALSE) {
1055 if (test_mode) { 1079 if (test_mode) {
1056 if (nrow(combined_unique) > 0) { 1080 if (nrow(combined_unique) > 0) {
1057 cat("**Select sample to show:** \n\n\n") 1081 cat("**Select sample to show:** \n\n\n")
1058 current_sample <- combined_unique$Sample[6] 1082 current_sample <- combined_unique$Sample[6]
1059 process_sample(current_sample, combined_unique, host_genomes_paths, genome_data) 1083 process_sample(current_sample, combined_unique, host_genomes_paths, genome_data)
1084 } else {
1085 print("No samples can be further analysed.")
1086 }
1060 } else { 1087 } else {
1061 print("No samples can be further analysed.") 1088 cat("**Select sample to show:** \n\n\n")
1062 } 1089 for (i in seq_len(nrow(combined_unique))) {
1063 } else { 1090 current_sample <- combined_unique$Sample[i]
1064 cat("**Select sample to show:** \n\n\n") 1091 process_sample(current_sample, combined_unique, host_genomes_paths, genome_data)
1065 for (i in seq_len(nrow(combined_unique))) { 1092 }
1066 current_sample <- combined_unique$Sample[i] 1093 }
1067 process_sample(current_sample, combined_unique, host_genomes_paths, genome_data)
1068 }
1069 }
1070 } 1094 }
1071 1095
1072 # Execute the main function 1096 # Execute the main function
1073 # Test mode processes one sample only 1097 # Test mode processes one sample only
1074 render_all_samples(test_mode = F) 1098 render_all_samples(test_mode = F)
1075 ``` 1099 ```
1076
1077 # Citation
1078
1079 Cite this work: XXXXX
1080
1081