Mercurial > repos > ufz > phi_toolkit_report
view report.Rmd @ 1:3a7f73d638ba draft default tip
planemo upload for repository https://github.com/Helmholtz-UFZ/ufz-galaxy-tools/blob/main/tools/phi-toolkit commit 368e8a7322c9763c648637263d4695abc146be13
author | ufz |
---|---|
date | Tue, 22 Jul 2025 11:09:24 +0000 |
parents | 315c2ed31af1 |
children |
line wrap: on
line source
--- title: "PHI Prophage-Host Interaction Toolkit report" subtitle: "Toolkit for the Detection, Comparison, and Annotation of Prophages in Bacterial Genomes." date: "`r format(Sys.Date(), '%B %d, %Y')`" output: html_document: theme: flatly toc: yes toc_float: false number_sections: yes code_folding: none fig_width: 12 fig_height: 8 fig_caption: true df_print: paged editor_options: markdown: wrap: 72 params: outdir: "data" --- ------------------------------------------------------------------------ ```{r setup_env, include=FALSE} knitr::opts_chunk$set(echo = FALSE) knitr::opts_chunk$set(dev = "svglite") # set output device to svg cat("params$outdir:", params$outdir, "\n") ``` ```{r setup_libraries, message=FALSE, warning=FALSE, echo=FALSE, results='asis'} # Define required packages # Note: update version_command if changed here! required_packages <- c( "tidyverse", "janitor", "here", "kableExtra", "gmoviz", "circlize", "GenomicRanges", "patchwork", "fs", "tools", "scales", "formattable", "pdftools", "base64" ) # Load required packages invisible(lapply(required_packages, library, character.only = TRUE)) ``` ```{r helper_functions, echo=FALSE} log_file <- "debug.log" log_debug <- function(message) { if (!exists("log_initialized") || !log_initialized) { cat(paste0(Sys.time(), " - DEBUG: ", message, "\n"), file = log_file, append = FALSE) assign("log_initialized", TRUE, envir = .GlobalEnv) } else { cat(paste0(Sys.time(), " - DEBUG: ", message, "\n"), file = log_file, append = TRUE) } } load_file <- function(path) { log_debug(paste("Attempting to load:", path)) if (file.exists(path)) { ext <- tools::file_ext(path) if (ext %in% c("tsv", "csv")) { data <- read_delim(path, delim = ifelse(ext == "csv", ",", "\t"), show_col_types = FALSE) %>% janitor::clean_names() log_debug(paste("Loaded", nrow(data), "rows from", path)) data } else if (ext == "fna") { data <- Biostrings::readDNAStringSet(path) log_debug(paste("Loaded", length(data), "sequences from", path)) data } else { log_debug(paste("Skipping", path, "- unsupported file type")) NULL } } else { log_debug(paste("File does not exist:", path)) NULL } } get_file_info <- function(path, loaded_data) { log_debug(paste("Processing file info for:", path)) if (file.exists(path)) { ext <- tools::file_ext(path) if (ext %in% c("tsv", "csv", "fna")) { data <- loaded_data[[basename(path)]] rows <- if (ext == "fna") length(data) else nrow(data) tibble::tibble(exists = TRUE, rows = rows, size = file.size(path), path = path) } else { tibble::tibble(exists = TRUE, rows = NA_integer_, size = file.size(path), path = path) } } else { tibble::tibble(exists = FALSE, rows = NA_integer_, size = NA_real_, path = NA_character_) } } process_genome_folder <- function(folder, host_analyses_dir, virus_analyses_dir) { log_debug(paste("Processing folder:", folder)) genome_name <- basename(folder) paths <- list( genomad = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus_summary.tsv")), genomad_phages = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus.fna")), genomad_annotations = file.path(host_analyses_dir, "genomad", genome_name, paste0(genome_name, "_summary"), paste0(genome_name, "_virus_genes.tsv")), defense_finder = file.path(host_analyses_dir, "defense-finder", genome_name, paste0(genome_name, "_defense_finder_systems.tsv")), checkv = file.path(virus_analyses_dir, "checkv", genome_name, "quality_summary.tsv"), iphop = file.path(virus_analyses_dir, "iphop", genome_name, "Host_prediction_to_genome_m90.csv"), drep = file.path(virus_analyses_dir, "drep_compare", genome_name, "data_tables", "Cdb.csv"), phatyp = file.path(virus_analyses_dir, "phatyp", genome_name, "phatyp.csv"), abricate = file.path(virus_analyses_dir, "abricate", genome_name, paste0(genome_name, "_virus_vfdb.tsv")), vibrant = file.path( virus_analyses_dir, "vibrant", genome_name, paste0("VIBRANT_", genome_name, "_virus"), paste0("VIBRANT_results_", genome_name, "_virus"), paste0("VIBRANT_AMG_individuals_", genome_name, "_virus.tsv") ) ) loaded_data <- map(paths, load_file) file_info <- purrr::map_dfr(paths, ~ get_file_info(.x, loaded_data), .id = "file_type") virus_count <- if (!is.null(loaded_data$genomad)) { count <- sum(loaded_data$genomad$virus_score > 0.5, na.rm = TRUE) log_debug(paste("Virus count:", count)) count } else { log_debug("No genomad summary found, virus count set to 0") 0 } log_debug("Returning results from process_genome_folder") list(file_info = file_info, virus_count = virus_count, loaded_data = loaded_data) } ``` ```{r compile_results, message=FALSE, warning=FALSE, echo=FALSE} compile_results <- function() { base_dir <- params$outdir log_debug(paste("Base directory:", base_dir)) host_analyses_dir <- file.path(base_dir, "host_analyses") virus_analyses_dir <- file.path(base_dir, "virus_analyses") # List all sample-level directories from all tools under virus_analyses tool_dirs <- list.dirs(virus_analyses_dir, full.names = TRUE, recursive = FALSE) genome_folders <- list.dirs(file.path(base_dir, "host_analyses", "genomad"), full.names = TRUE, recursive = FALSE ) # cat(length(genome_folders), "sample(s) processed\n") log_debug("Processing genome folders") genome_data <- map(genome_folders, process_genome_folder, host_analyses_dir = host_analyses_dir, virus_analyses_dir = virus_analyses_dir ) %>% purrr::set_names(basename(genome_folders)) %>% compact() log_debug("Creating summary dataframe") summary_df <- purrr::map_dfr(genome_data, ~ { file_info <- .x$file_info tibble::tibble( Sample = basename(file_info$path[1]), Virus_Count = .x$virus_count, geNomad = file_info$exists[file_info$file_type == "genomad"], CheckV = file_info$exists[file_info$file_type == "checkv"], VIBRANT = file_info$exists[file_info$file_type == "vibrant"], dRep = file_info$exists[file_info$file_type == "drep"], iPHOP = file_info$exists[file_info$file_type == "iphop"], PhaTYP = file_info$exists[file_info$file_type == "phatyp"], Defense_Finder = file_info$exists[file_info$file_type == "defense_finder"], geNomad_Path = file_info$path[file_info$file_type == "genomad"], CheckV_Path = file_info$path[file_info$file_type == "checkv"], VIBRANT_Path = file_info$path[file_info$file_type == "vibrant"], dRep_Path = file_info$path[file_info$file_type == "drep"], PhaTYP_Path = file_info$path[file_info$file_type == "phatyp"], Defense_Finder_Path = file_info$path[file_info$file_type == "defense_finder"], Virus_Contigs = ifelse(file_info$exists[file_info$file_type == "genomad_phages"], file_info$rows[file_info$file_type == "genomad_phages"], 0 ) ) }) %>% dplyr::mutate(dplyr::across(ends_with("_Path"), ~ ifelse(is.na(.), "Not available", as.character(.)))) host_genomes_fasta <- list.files( path = file.path(params$outdir, "genomes"), pattern = "\\.fna$", full.names = TRUE ) host_genomes_paths <- tibble::tibble( name = tools::file_path_sans_ext(basename(host_genomes_fasta)), path = host_genomes_fasta ) data_gtdbtk_host <- read_tsv( file.path(params$outdir, "host_analyses/gtdbtk/gtdbtk.bac120.summary.tsv"), show_col_types = FALSE ) %>% janitor::clean_names() data_checkm_host <- read_tsv( file.path(params$outdir, "host_analyses/checkm2/quality_report.tsv"), show_col_types = FALSE ) %>% janitor::clean_names() log_debug("Returning summary dataframe, genome data, and host data") log_debug(paste("summary_df dimensions:", nrow(summary_df), "rows,", ncol(summary_df), "columns")) log_debug(paste("summary_df column names:", paste(colnames(summary_df), collapse = ", "))) log_debug(paste("genome_data length:", length(genome_data))) log_debug(paste("genome_data names:", paste(names(genome_data), collapse = ", "))) log_debug(paste("host_genomes_paths dimensions:", nrow(host_genomes_paths), "rows,", ncol(host_genomes_paths), "columns")) log_debug(paste("host_genomes_paths column names:", paste(colnames(host_genomes_paths), collapse = ", "))) log_debug(paste("data_gtdbtk_host dimensions:", nrow(data_gtdbtk_host), "rows,", ncol(data_gtdbtk_host), "columns")) log_debug(paste("data_gtdbtk_host column names:", paste(colnames(data_gtdbtk_host), collapse = ", "))) log_debug(paste("data_checkm_host dimensions:", nrow(data_checkm_host), "rows,", ncol(data_checkm_host), "columns")) log_debug(paste("data_checkm_host column names:", paste(colnames(data_checkm_host), collapse = ", "))) list( summary = summary_df, genome_data = genome_data, host_genomes_paths = host_genomes_paths, data_gtdbtk_host = data_gtdbtk_host, data_checkm_host = data_checkm_host ) } ``` ```{r run_main_function, echo=FALSE, message=FALSE, warning=FALSE} log_debug("Starting execution of main function") result <- compile_results() if (is.null(result)) { log_debug("Main function execution failed") stop("Main function execution failed") } summary_df <- result$summary genome_data <- result$genome_data host_genomes_paths <- result$host_genomes_paths data_gtdbtk_host <- result$data_gtdbtk_host data_checkm_host <- result$data_checkm_host log_debug("Data extracted successfully") result$summary <- result$summary %>% dplyr::mutate(Sample = stringr::str_remove(Sample, "_virus_summary.tsv")) ``` # Summary {.tabset .tabset-fade} This table provides sample-by-sample information on detected viruses and key host genome statistics. It includes taxonomy, virus count, genome quality classification, CheckM2 metrics (completeness and contamination), and genome assembly statistics such as size and N50. ```{r render_table, message=FALSE, warning=FALSE, echo=FALSE, results='asis'} data <- result$summary log_debug("Assigning checkm2 host data") checkm_host_data <- data_checkm_host %>% janitor::clean_names() %>% dplyr::select( name, completeness, contamination, contig_n50, genome_size ) log_debug("Assigning GTDB-Tk host data") gtdbtk_data <- data_gtdbtk_host %>% dplyr::select(user_genome, classification) log_debug("Defining color-blind friendly palette") cb_friendly_colors <- list( green = "#009E73", blue = "#0072B2", orange = "#E69F00", red = "#D55E00", grey = "#999999" ) log_debug("Defining function to color cells") color_cell <- function(values, color_true = cb_friendly_colors$green, color_false = cb_friendly_colors$red) { ifelse(values, kableExtra::cell_spec("Yes", color = "white", bold = TRUE, background = color_true), kableExtra::cell_spec("No", color = "white", bold = TRUE, background = color_false) ) } log_debug("Defining function to create bar plot") create_bar_plot <- function(values, max_value, color = cb_friendly_colors$grey) { sapply(values, function(value) { if (is.na(value) || !is.numeric(value)) { return("N/A") } bar_width <- min(max(value, 0), max_value) / max_value * 100 sprintf( '<div style="background-color: %s; width: %f%%; height: 10px;"></div>%.1f%%', color, bar_width, value ) }) } log_debug("Defining function to format large numbers") format_large_number <- function(x) { sapply(x, function(value) { if (is.na(value) || !is.numeric(value)) { return("N/A") } else if (value < 1000) { return(as.character(value)) } else if (value < 1e6) { return(paste0(round(value / 1e3, 1), "K")) } else if (value < 1e9) { return(paste0(round(value / 1e6, 1), "M")) } else { return(paste0(round(value / 1e9, 1), "G")) } }) } log_debug("Defining function to extract last known taxonomy level") extract_last_known_taxonomy <- function(classification) { if (is.na(classification) || classification == "") { return(list(level = "Unknown", name = "Unknown")) } parts <- strsplit(classification, ";")[[1]] for (i in length(parts):1) { level <- sub("^[a-z]__", "", parts[i]) if (level != "") { prefix <- sub("__.*$", "", parts[i]) return(list(level = prefix, name = level)) } } return(list(level = "Unknown", name = "Unknown")) } log_debug("Defining function to format taxonomy") format_taxonomy <- function(classification) { result <- extract_last_known_taxonomy(classification) if (result$level == "Unknown") { return("Unknown") } else if (result$level == "s") { return(paste0("<i>", result$name, "</i>")) } else { genus <- str_replace_all(result$name, "_", " ") return(paste0("<i>", genus, "</i> sp.")) } } log_debug("Defining function to calculate quality score and determine genome quality class") calculate_quality_score_and_class <- function(completeness, contamination) { if (is.na(completeness) || is.na(contamination)) { return(list( score = kableExtra::cell_spec("N/A", color = "white", bold = TRUE, background = cb_friendly_colors$grey), class = kableExtra::cell_spec("Unknown", color = "white", bold = TRUE, background = cb_friendly_colors$grey), numeric_score = NA )) } quality_score <- completeness - (5 * contamination) formatted_score <- sprintf("%.1f", quality_score) if (completeness > 90 && contamination < 5) { class <- "High-quality draft" color <- cb_friendly_colors$green } else if (completeness >= 50 && contamination < 10) { class <- "Medium-quality draft" color <- cb_friendly_colors$blue } else { class <- "Low-quality draft" color <- cb_friendly_colors$red } list( score = kableExtra::cell_spec(formatted_score, color = "white", bold = TRUE, background = color), class = kableExtra::cell_spec(class, color = "white", bold = TRUE, background = color), numeric_score = quality_score ) } log_debug("Preparing the data") table_data <- data %>% # dplyr::mutate(Sample = basename(Sample) %>% trim_sample_name()) %>% dplyr::mutate(Sample = basename(Sample)) %>% dplyr::left_join(checkm_host_data, by = c("Sample" = "name")) %>% dplyr::left_join(gtdbtk_data, by = c("Sample" = "user_genome")) %>% dplyr::mutate( quality_data = purrr::pmap( list( as.numeric(completeness), as.numeric(contamination) ), calculate_quality_score_and_class ), Quality_Score = purrr::map_chr(quality_data, ~ .$score), Genome_Quality = purrr::map_chr(quality_data, ~ .$class), Quality_Score_Numeric = purrr::map_dbl(quality_data, ~ .$numeric_score), Virus_Count_Numeric = as.numeric(Virus_Count), Virus_Count = kableExtra::cell_spec( Virus_Count, color = "white", bold = TRUE, background = dplyr::case_when( Virus_Count == 0 ~ cb_friendly_colors$red, Virus_Count == 1 ~ cb_friendly_colors$blue, Virus_Count > 1 ~ cb_friendly_colors$green ) ), Completeness_Numeric = as.numeric(completeness), Completeness = create_bar_plot(as.numeric(completeness), 100), Contamination = create_bar_plot(as.numeric(contamination), 100), `N50 (contigs)` = format_large_number(as.numeric(contig_n50)), `Genome size (bp)` = format_large_number(as.numeric(genome_size)), `GTDB Taxonomy` = sapply(classification, format_taxonomy) ) %>% dplyr::mutate(`#` = row_number()) %>% dplyr::select( `#`, Sample, `GTDB Taxonomy`, Virus_Count, Quality_Score, Genome_Quality, Completeness, Contamination, `Genome size (bp)`, `N50 (contigs)` ) log_debug("Creating the table") kableExtra::kbl(table_data, escape = FALSE, align = c("c", "l", "l", "c", rep("c", 2), rep("r", 2), rep("r", 2)) ) %>% kableExtra::kable_paper(full_width = TRUE) %>% kableExtra::column_spec(1, bold = TRUE, width = "2em") %>% kableExtra::column_spec(2:3, bold = TRUE) %>% kableExtra::column_spec(4:5, width = "5em") %>% kableExtra::column_spec(6:7, width = "60px") %>% kableExtra::column_spec(8:9, width = "4em") %>% kableExtra::add_header_above(c( " " = 4, "Host Genome Quality" = 2, "CheckM Metrics" = 2, "Statistics" = 2 )) %>% kableExtra::kable_styling( bootstrap_options = c("striped", "hover", "condensed", "responsive"), font_size = 9, html_font = "Arial", position = "left" ) %>% kableExtra::row_spec(0, bold = TRUE, color = "white", background = "#333333") %>% kableExtra::row_spec(0, extra_css = "border-bottom: 2px solid #000000;") %>% kableExtra::column_spec(9, extra_css = "border-right: 2px solid #000000;") %>% kableExtra::scroll_box( width = "100%", height = "100%", extra_css = "overflow-x: auto; border: 1px solid #ccc; border-radius: 4px;" ) ``` # Results {.tabset .tabset-fade} ```{r} # Creating combined_unique object combined_unique <- bind_rows( checkm_host_data %>% # dplyr::select(bin_id) %>% # dplyr::rename(Sample = bin_id), dplyr::select(name) %>% dplyr::rename(Sample = name), data %>% # dplyr::mutate(Sample = stringr::str_remove(Sample, "_virus_summary.tsv")) %>% dplyr::select(Sample) ) %>% distinct(Sample) %>% arrange(Sample) log_debug(paste("combined_unique samples:", paste(combined_unique$Sample, collapse = ", "))) ``` ```{r main_workflow, fig.width=6, fig.height=6, out.height="100%", out.width='100%', dpi=300, fig.align='center', warning=FALSE, message=FALSE, results='asis'} # Process proviruses data process_proviruses <- function(data_genomad) { proviruses <- data_genomad %>% dplyr::filter(topology == "Provirus") %>% dplyr::mutate(contig = sub("\\|provirus_.*", "", seq_name)) %>% # take everything before "|provirus" dplyr::mutate(contig = paste0("c", as.numeric(factor(contig)))) %>% # map them to c_1, c_2, ... dplyr::select(seq_name, coordinates, length, contig, virus_score, n_hallmarks) proviruses <- proviruses %>% tidyr::separate(coordinates, into = c("start", "end"), sep = "-") proviruses$start <- as.integer(proviruses$start) proviruses$end <- as.integer(proviruses$end) proviruses_gr_features <- GRanges( seqnames = proviruses$contig, ranges = IRanges( start = proviruses$start, end = proviruses$end ) ) proviruses_gr_features$length <- proviruses_gr_features %>% ranges() %>% width() proviruses_gr_features$score <- as.numeric(proviruses$virus_score) proviruses_gr_features$n_hallmarks <- as.numeric(proviruses$n_hallmarks) proviruses_gr_features$n_hallmarks_pos <- abs(start(proviruses_gr_features) - end(proviruses_gr_features)) / 2 return(proviruses_gr_features) } plot_genome_ideogram <- function(genome_current, proviruses_gr_features) { fasta_file_path <- file.path(params$outdir, "genomes", paste0(genome_current, ".fna")) # cat(fasta_file_path, "\n\n") genome_ideogram <- gmoviz::getIdeogramData(fasta_file = fasta_file_path) # Replace any seqlevel to c_1, c_2, c_3, ... new_seqlevels <- paste0("c", seq_along(GenomeInfoDb::seqlevels(genome_ideogram))) names(new_seqlevels) <- GenomeInfoDb::seqlevels(genome_ideogram) genome_ideogram <- GenomeInfoDb::renameSeqlevels(genome_ideogram, new_seqlevels) colours <- rep("#a58bc5", length(GenomeInfoDb::seqlevels(genome_ideogram))) par(mar = c(2, 2, 2, 2)) # minimal margins around the plot gmoviz::gmovizInitialise(genome_ideogram, sector_colours = colours, sector_border_colours = colours, sector_labels = FALSE ) for (i in 1:length(proviruses_gr_features)) { name <- as.character(seqnames(proviruses_gr_features[i])) start <- as.numeric(start(proviruses_gr_features[i])) end <- as.numeric(end(proviruses_gr_features[i])) region <- data.frame(start = start, end = end) circlize::circos.genomicRect( seqnames = name, region, ytop = .5, ybottom = 0, track.index = 1, sector.index = name, border = "#e9d27d", col = "#e9d27d" ) } length <- as.numeric(proviruses_gr_features$length) length <- ifelse(length > 1000000, paste0(round(length / 1000000, 2), "mb"), paste0(round(length / 1000, 2), "kb") ) labels <- paste0(as.character(seqnames(proviruses_gr_features)), " (", length, ")") circlize::circos.labels( sectors = as.character(seqnames(proviruses_gr_features)), x = as.numeric(start(proviruses_gr_features)), labels, facing = "clockwise" ) } process_sample <- function(sample, combined_unique, host_genomes_paths, genome_data) { genome_current <- sample # Add this line tryCatch( { log_debug(paste("Starting to process sample:", sample)) # Check if sample exists in genome_data if (!(sample %in% names(genome_data))) { log_debug(paste("Sample", sample, "not found in genome_data")) cat(paste("Error: Sample", sample, "not found in genome_data\n\n")) return() } cat(paste("## ", sample, "{.tabset .tabset-fade} \n\n")) host_genome_path <- host_genomes_paths$path[host_genomes_paths$name == sample] if (length(host_genome_path) == 0) { log_debug(paste("Host genome path not found for sample:", sample)) cat(paste("Error: Host genome path not found for sample", sample, "\n\n")) return() } host_genome_ideogram <- tryCatch( { gmoviz::getIdeogramData(fasta_file = host_genome_path) }, error = function(e) { log_debug(paste("Error loading host genome ideogram for sample", sample, ":", conditionMessage(e))) NULL } ) if (is.null(host_genome_ideogram)) { cat(paste("Error: Unable to load host genome ideogram for sample", sample, "\n\n")) return() } sample_data <- genome_data[[sample]]$loaded_data genomad_summary <- sample_data$genomad genomad_annotation <- sample_data$genomad_annotations checkv_data <- sample_data$checkv defense_finder_data <- sample_data$defense_finder abricate_data <- sample_data$abricate iphop_data <- sample_data$iphop vibrant_data <- sample_data$vibrant cat("### Host Genome\n\n") cat("**GTDB-Tk taxonomy**: \n\n") data_gtdbtk_host %>% dplyr::filter(user_genome == sample) %>% dplyr::select(classification) %>% kableExtra::kbl() %>% kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% kableExtra::kable_paper("striped", full_width = TRUE) %>% kableExtra::scroll_box(width = "100%", height = "100%") %>% cat() # Cat checkm summary for this genome cat("**CheckM2 Summary**:\n\n") # checkm_summary <- data_checkm_host %>% dplyr::filter(`bin_id` == sample) checkm_summary <- data_checkm_host %>% dplyr::filter(`name` == sample) checkm_summary %>% janitor::clean_names() %>% # dplyr::select(number_contigs, n50_contigs, completeness, contamination, strain_heterogeneity) %>% dplyr::select(total_contigs, contig_n50, completeness, contamination) %>% kableExtra::kbl() %>% kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% kableExtra::kable_paper("striped", full_width = TRUE) %>% kableExtra::scroll_box(width = "100%", height = "100%") %>% cat() # Display defense-finder as a table if (!is.null(defense_finder_data) && nrow(defense_finder_data) > 0) { cat("**Defense-Finder Systems**:\n\n") defense_finder_data %>% dplyr::select(sys_id, type, subtype, sys_beg, sys_end, protein_in_syst, genes_count, name_of_profiles_in_sys) %>% kableExtra::kbl() %>% kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% kableExtra::kable_paper("striped", full_width = TRUE) %>% kableExtra::scroll_box(width = "100%", height = "100%") %>% cat() } else { cat("No Defense-Finder systems detected.\n\n") } if (is.null(genomad_summary) || nrow(genomad_summary) == 0) { q log_debug(paste("No geNomad summary data found for sample:", sample)) return() } if (length(GenomeInfoDb::seqlevels(host_genome_ideogram)) == 1) { host_genome_size <- sum(width(host_genome_ideogram)) } else { virus_containing_contigs <- unique(sub("\\|.*", "", genomad_summary$seq_name)) virus_containing_contigs <- paste0("c_", as.numeric(factor(virus_containing_contigs))) filtered_host_genome <- subset_and_update_ideogram(host_genome_ideogram, virus_containing_contigs) host_genome_size <- sum(width(filtered_host_genome)) } # Process proviruses proviruses_gr_features <- process_proviruses(genomad_summary) cat("**Genomad and CheckV Summary**:\n\n") genomad_summary %>% dplyr::select(seq_name, taxonomy, topology, coordinates, length) %>% dplyr::left_join( checkv_data %>% dplyr::select(contig_id, gene_count, viral_genes, checkv_quality, miuvig_quality), by = c("seq_name" = "contig_id") ) %>% dplyr::select(seq_name, length, gene_count, viral_genes, checkv_quality, miuvig_quality, taxonomy, topology, coordinates) %>% kableExtra::kbl() %>% kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% kableExtra::kable_paper("striped", full_width = TRUE) %>% kableExtra::scroll_box(width = "100%", height = "100%") %>% cat() cat("**Host Genome Ideogram with Phages**:\n\n") plot_genome_ideogram(sample, proviruses_gr_features) cat('In this circular plot, **"c"** indicates the contig, and the number that follows (e.g., **c1**) represents the contig number. If multiple contigs are present in the genome, each will be shown with a distinct label (e.g., **c1**, **c2**, etc.).\n\n') cat("\n\n") # Process phage genomes cat("### Prophages {.tabset .tabset-fade} \n\n") cat("**Select prophage to show: ** \n\n") for (i in seq_len(nrow(genomad_summary))) { log_debug(paste("Processing phage", i, "of", nrow(genomad_summary), "for sample", sample)) process_phage(genomad_summary[i, ], genomad_summary, genomad_annotation, checkv_data, host_genome_size) } # Plot dREP if applicable if (nrow(genomad_summary) > 1) { cat("### vOTUs\n\n") plot_drep(sample, genomad_summary) } # Creating table with Abricate data if (nrow(abricate_data) > 0) { cat("### Virulence Genes {.tabset .tabset-fade} \n\n") cat("Screening of virulence genes present in the prophage contigs. \n\n") abricate_data %>% dplyr::select(-number_file) %>% kableExtra::kbl() %>% kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% kableExtra::kable_paper("striped", full_width = TRUE) %>% kableExtra::scroll_box(width = "100%", height = "100%") %>% cat() cat("\n\n") } # Creating table with iPHOP if (nrow(iphop_data) > 0) { cat("### Prophage-Host Prediction {.tabset .tabset-fade} \n\n") cat("Prediction of potential hosts for the prophage contigs. \n\n") iphop_data %>% kableExtra::kbl() %>% kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% kableExtra::kable_paper("striped", full_width = TRUE) %>% kableExtra::scroll_box(width = "100%", height = "100%") %>% cat() cat("\n\n") } # Creating table with VIBRANT AMGs if (nrow(vibrant_data) > 0) { cat("### AMG Predictions {.tabset .tabset-fade} \n\n") cat("Prediction of auxiliary metabolic genes in the prophage contigs. \n\n") vibrant_data %>% kableExtra::kbl() %>% kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% kableExtra::kable_paper("striped", full_width = TRUE) %>% kableExtra::scroll_box(width = "100%", height = "100%") %>% cat() cat("\n\n") } log_debug(paste("Finished processing sample:", sample)) }, error = function(e) { log_debug(paste("Error in process_sample for", sample, ":", conditionMessage(e))) cat(paste("Error processing sample", sample, ":", conditionMessage(e), "\n\n")) } ) } process_phage <- function(virus, genomad_summary, genomad_annotation, checkv_data, host_genome_size) { cat(paste("#### Phage ID:", virus$seq_name, " {.tabset .tabset-fade} \n\n")) current_contig <- sub("\\|.*", "", virus$seq_name) provirus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", virus$seq_name)) provirus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", virus$seq_name)) virus_length <- provirus_end - provirus_start + 1 current_contig_base <- sub("\\|provirus_.*", "", virus$seq_name) current_provirus_range <- sub(".*\\|provirus_", "", virus$seq_name) current_annotations <- genomad_annotation[grepl(paste0(current_contig_base, "\\|provirus_", current_provirus_range, "_"), genomad_annotation$gene, fixed = FALSE ), ] %>% dplyr::mutate(arrow_pos = ifelse(strand == -1, "start", "end")) cat("\n\n**Phage-Host Genome Ideogram:**\n\n") plot_phage_circos(virus, genomad_summary, current_annotations, virus_length, host_genome_size, provirus_start, provirus_end, checkv_data) cat("\n\n") cat("\n\n**Genes Annotation (geNomad):**\n\n") current_annotations %>% dplyr::select(gene, length, marker, annotation_accessions, annotation_description) %>% kableExtra::kbl() %>% kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% kableExtra::kable_paper() %>% cat() cat("\n\n") } plot_phage_circos <- function(virus, genomad_summary, current_annotations, virus_length, host_genome_size, provirus_start, provirus_end, checkv_data) { tryCatch( { log_debug("Starting plot_phage_circos function") log_debug(paste("Current virus:", virus$seq_name)) log_debug(paste("Virus length:", virus_length)) log_debug(paste("Host genome size:", host_genome_size)) log_debug(paste("Provirus start:", provirus_start)) log_debug(paste("Provirus end:", provirus_end)) # Check for NA or invalid values in input parameters if (is.na(virus_length) || virus_length <= 0) { log_debug("Error: Invalid virus length", virus_length) return(NULL) } if (is.na(host_genome_size) || host_genome_size <= 0) { log_debug("Error: Invalid host genome size") return(NULL) } if (is.na(provirus_start) || provirus_start < 0) { log_debug("Error: Invalid provirus start position") return(NULL) } if (is.na(provirus_end) || provirus_end <= provirus_start) { log_debug("Error: Invalid provirus end position") return(NULL) } # Extract contig information current_contig <- sub("\\|.*", "", virus$seq_name) log_debug(paste("Current contig:", current_contig)) contig_viruses <- genomad_summary[grepl(paste0("^", current_contig), genomad_summary$seq_name), ] if (nrow(contig_viruses) == 0) { log_debug("Error: No viruses found for the current contig") return(NULL) } contig_length <- max(as.numeric(sub(".*_(\\d+)$", "\\1", contig_viruses$seq_name))) if (is.na(contig_length) || contig_length <= 0) { contig_length <- virus_length # Use virus length as fallback if contig length is invalid log_debug(paste("Using virus length as contig length:", contig_length)) } else { log_debug(paste("Contig length:", contig_length)) } if (provirus_end > contig_length) { log_debug("Error: Provirus end position exceeds contig length") return(NULL) } log_debug("Clearing circos") circlize::circos.clear() log_debug("Setting circos parameters") circlize::circos.par(start.degree = 180, gap.degree = 10, track.margin = c(0.01, 0.01)) main_color <- "#a58bc5" zoom_color <- "#e9d27d" zoom_start <- (provirus_start / contig_length) * 100 zoom_end <- (provirus_end / contig_length) * 100 log_debug(paste("Zoom start:", zoom_start)) log_debug(paste("Zoom end:", zoom_end)) log_debug("Initializing circos") circlize::circos.initialize(factors = c("Zoom", "Main"), xlim = c(0, 100)) format_genome_labels <- function(x) { ifelse(x >= 1e6, paste0(round(x / 1e6, 2), " Mb"), ifelse(x >= 1e3, paste0(round(x / 1e3, 2), " Kb"), paste0(x, " bp") ) ) } log_debug("Adding link") tryCatch( { circlize::circos.link("Main", c(zoom_start, zoom_end), "Zoom", c(0, 100), rou1 = 0.8, rou2 = 0.97, h.ratio = 0.55, # width? lty = 2, lwd = 0.5, h2 = 1, col = "grey99", border = "grey80" ) }, error = function(e) { log_debug(paste("Error in circlize::circos.link:", e$message)) } ) log_debug("Adding zoom track") circlize::circos.track( factors = "Zoom", ylim = c(0, 1), track.height = 0.15, panel.fun = function(x, y) { circlize::circos.rect(0, 0, 100, 1, col = zoom_color, border = NA) axis_labels <- seq(0, virus_length, length.out = 6) axis_positions <- seq(0, 100, length.out = 6) circlize::circos.axis( h = "top", major.at = axis_positions, labels = format_genome_labels(axis_labels), labels.cex = 0.7, direction = "outside" ) for (i in 1:nrow(current_annotations)) { gene_start <- current_annotations$start[i] gene_end <- current_annotations$end[i] arrow_start <- (gene_start - provirus_start) / virus_length * 100 arrow_end <- (gene_end - provirus_start) / virus_length * 100 circlize::circos.arrow(arrow_start, arrow_end, arrow.head.width = 0.75, arrow.head.length = cm_x(0.1), arrow.position = current_annotations$arrow_pos[i], col = ifelse(is.na(current_annotations$annotation_description[i]), "grey", "#7fbfff"), border = ifelse(is.na(current_annotations$annotation_description[i]), "grey20", "darkblue") ) } }, bg.border = NA ) log_debug("Adding main track") circlize::circos.track( factors = "Main", ylim = c(0, 1), track.height = 0.1, panel.fun = function(x, y) { circlize::circos.rect(xleft = 0, ybottom = 0, xright = 100, ytop = 1, col = main_color, border = NA) for (i in 1:nrow(contig_viruses)) { virus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", contig_viruses$seq_name[i])) virus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", contig_viruses$seq_name[i])) virus_start_percent <- (virus_start / contig_length) * 100 virus_end_percent <- (virus_end / contig_length) * 100 rect_color <- if (contig_viruses$seq_name[i] == virus$seq_name) zoom_color else adjustcolor(zoom_color, alpha.f = 0.7) circlize::circos.rect( xleft = virus_start_percent, ybottom = 0, xright = virus_end_percent, ytop = 1, col = rect_color, border = NA ) } axis_labels <- seq(0, contig_length, length.out = 6) axis_positions <- seq(0, 100, length.out = 6) circlize::circos.axis( h = "top", major.at = axis_positions, labels = format_genome_labels(axis_labels), labels.cex = 0.7, direction = "outside" ) }, bg.border = NA ) log_debug("Locating phage positions") phage_positions <- sapply(1:nrow(contig_viruses), function(i) { virus_start <- as.numeric(sub(".*provirus_(\\d+)_\\d+", "\\1", contig_viruses$seq_name[i])) virus_end <- as.numeric(sub(".*provirus_\\d+_(\\d+)", "\\1", contig_viruses$seq_name[i])) ((virus_start + virus_end) / 2 / contig_length) * 100 }) log_debug("Annotating names on phage positions") # Extract start and end positions from sequence names start_positions <- as.numeric(sub(".*provirus_([0-9]+)_.*", "\\1", contig_viruses$seq_name)) end_positions <- as.numeric(sub(".*provirus_[0-9]+_([0-9]+)", "\\1", contig_viruses$seq_name)) # Create phage labels with the desired format phage_labels <- paste0(round(contig_viruses$length / 1e3, 2), " Kb") # Apply labels to circos plot circlize::circos.labels( sectors = "Main", x = phage_positions, labels = phage_labels, facing = "reverse.clockwise", niceFacing = TRUE, col = "black", cex = 0.6, side = "inside", connection_height = 0.02, line_col = "gray" ) center_x <- 50 virus_name <- virus$seq_name taxonomy <- virus$taxonomy log_debug("Adding taxonomy and virus name to the plot") circlize::circos.text( x = center_x, y = -0.2, labels = taxonomy, sector.index = "Zoom", track.index = 1, facing = "bending.inside", niceFacing = TRUE, adj = c(0.5, 0.7), cex = 0.8 ) checkv_info <- checkv_data[checkv_data$contig_id == virus$seq_name, ] if (nrow(checkv_info) > 0) { checkv_quality <- checkv_info$checkv_quality gene_count <- checkv_info$gene_count viral_genes <- checkv_info$viral_genes host_genes <- checkv_info$host_genes miuvig_quality <- checkv_info$miuvig_quality completeness <- checkv_info$completeness completeness_method <- checkv_info$completeness_method contamination <- checkv_info$contamination circlize::circos.text( x = center_x, y = -0.5, labels = paste("CheckV Quality:", checkv_quality, " - miuvig Quality:", miuvig_quality), sector.index = "Zoom", track.index = 2, facing = "bending.inside", niceFacing = TRUE, adj = c(0.5, 0), cex = 0.7 ) circlize::circos.text( x = center_x, y = -1.5, labels = paste("Gene Count:", gene_count, " - Viral Genes:", viral_genes, " - Host Genes:", host_genes), sector.index = "Zoom", track.index = 2, facing = "bending.inside", niceFacing = TRUE, adj = c(0.5, 0), cex = 0.7 ) circlize::circos.text( x = center_x, y = -2.5, labels = paste("Completeness:", completeness, " - Contamination:", contamination), sector.index = "Zoom", track.index = 2, facing = "bending.inside", niceFacing = TRUE, adj = c(0.5, 0), cex = 0.7 ) } log_debug("Adding legend") # Add legend legend("topright", legend = c("Annotated gene", "Unknown gene"), fill = c("#7fbfff", "grey"), border = c("darkblue", "grey20"), cex = 0.8, bty = "n" ) log_debug("Clearing circos") circlize::circos.clear() log_debug("Finished plot_phage_circos function successfully") }, error = function(e) { log_debug(paste("Error in plot_phage_circos:", e$message)) circlize::circos.clear() } ) } plot_drep <- function(sample, genomad_summary) { drep_file_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "data_tables", "Cdb.csv") drep_data <- read_csv(drep_file_path) %>% janitor::clean_names() drep_data <- cbind(genomad_summary$seq_name, drep_data) cat("When more than 1 phage is detected in the host genome, we perform a clustering step using the tool dRep.\n\n") cat("A threshold of 0.95 was applied to the ANI similarity index to define clusters of virus operational taxonomic units (vOTUs).") cat("\n\n**Final cluster designations**\n\n") drep_data %>% kableExtra::kbl() %>% kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% kableExtra::kable_paper("striped", full_width = TRUE) %>% cat() # Insert the PDF plot plot_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "figures", "Primary_clustering_dendrogram.pdf") png_path <- file.path(params$outdir, "virus_analyses", "drep_compare", sample, "figures", "Primary_clustering_dendrogram.png") if (file.exists(plot_path)) { pdftools::pdf_convert(plot_path, format = "png", filenames = png_path, verbose = FALSE, dpi = 150) base64_str <- base64enc::dataURI(file = png_path, mime = "image/png") cat("**Primary clustering plot**\n\n") cat(sprintf( '<div style="text-align: center;"><img src="%s" style="max-width: 100%%; height: auto; border:1px solid #ddd; border-radius:4px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); margin: 1em 0;"></div>', base64_str )) } else { cat("**No dRep clustering plot found.**\n\n") } } subset_and_update_ideogram <- function(ideogram, contigs) { filtered <- ideogram[seqnames(ideogram) %in% contigs] GenomeInfoDb::seqlevels(filtered) <- contigs seqinfo(filtered) <- seqinfo(filtered)[contigs] filtered } render_all_samples <- function(test_mode = FALSE) { if (test_mode) { if (nrow(combined_unique) > 0) { cat("**Select sample to show:** \n\n\n") current_sample <- combined_unique$Sample[6] process_sample(current_sample, combined_unique, host_genomes_paths, genome_data) } else { print("No samples can be further analysed.") } } else { cat("**Select sample to show:** \n\n\n") for (i in seq_len(nrow(combined_unique))) { current_sample <- combined_unique$Sample[i] process_sample(current_sample, combined_unique, host_genomes_paths, genome_data) } } } # Execute the main function # Test mode processes one sample only render_all_samples(test_mode = F) ```