| 0 | 1 #!/usr/bin/env Rscript | 
|  | 2 library(R2HTML) | 
|  | 3 library(hwriter) | 
|  | 4 library(DT) | 
|  | 5 library(tools) | 
|  | 6 | 
|  | 7 source("htmlheader.R") | 
|  | 8 source("config.R")  # load TANDEM_RANKS | 
|  | 9 source("utils.R") | 
|  | 10 DT_OPTIONS = list(pageLength = 1000, lengthMenu = c(10, 50, 100, 1000, 5000, 10000)) | 
|  | 11 HTMLHEADER_TAREAN = gsub("PAGE_TITLE","TAREAN summary", htmlheader) | 
|  | 12 HTMLHEADER_INDEX = gsub("PAGE_TITLE","Clustering summary", htmlheader) | 
|  | 13 | 
|  | 14 WD = getwd()  # to get script directory when run from Rserve | 
|  | 15 | 
|  | 16 reformat_header = function(df){ | 
|  | 17     H = colnames(df) | 
|  | 18     H[H=="TR_score"] = "TAREAN k-mer_coverage" | 
|  | 19     H[H=="vcount"] = "|V|" | 
|  | 20     H[H=="ecount"] = "|E|" | 
|  | 21     H[H=="Genome_Proportion[%]"] = "Proportion[%]" | 
|  | 22     H[H=="Proportion_Adjusted[%]"] = "Proportion adjusted[%]" | 
|  | 23     H[H=="supercluster"] = "Super_cluster" | 
|  | 24     H[H=="size_real"] = "Number of reads" | 
|  | 25     H[H=="TR_monomer_length"] = "Consensus_length" | 
|  | 26     H[H=="TR_consensus"] = "Consensus" | 
|  | 27     H[H=="pbs_score"] = "PBS score" | 
|  | 28     H[H=="ltr_detection"] = "LTR detection" | 
|  | 29     H[H=="kmer_analysis"] = "TAREAN k-mer analysis" | 
|  | 30 | 
|  | 31    # H[H=="annotations_summary"] = "Similarity_hits" | 
|  | 32     H[H=="annotations_summary"] = "Similarity_hits_[above 0.1%]" | 
|  | 33     H[H=="annotations_summary_custom"] = "Similarity_hits_to_custom_database" | 
|  | 34     H[H=="loop_index"] = "connected_component_index C" | 
|  | 35     H[H=="pair_completeness"] = "pair_completeness_index_P" | 
|  | 36     H = gsub("_","<br>",H) | 
|  | 37     H=gsub("TR_","",H) | 
|  | 38     H = capitalize(H) | 
|  | 39     colnames(df) = H | 
|  | 40     return(df) | 
|  | 41 } | 
|  | 42 | 
|  | 43 reformat4html=function(df){ | 
|  | 44     for (n in colnames(df)){ | 
|  | 45         if (class(df[,n]) == 'character'){ | 
|  | 46             df[,n] = gsub("\n","<br>", df[,n]) | 
|  | 47         } | 
|  | 48         if (class(df[,n]) == 'numeric'){ | 
|  | 49             df[,n] = signif(df[,n],3) | 
|  | 50         } | 
|  | 51     } | 
|  | 52     return(df) | 
|  | 53 } | 
|  | 54 | 
|  | 55 capitalize = function(s){ | 
|  | 56     paste(toupper(substring(s, 1, 1)), | 
|  | 57           substring(s, 2), | 
|  | 58           sep="") | 
|  | 59 } | 
|  | 60 | 
|  | 61 | 
|  | 62 create_main_reports = function(paths, N_clustering, N_input,N_omit, merge_threshold, | 
|  | 63                                paired, consensus_files, custom_db, tarean_mode, | 
|  | 64                                HTML_LINKS, pipeline_version_info, max_memory, | 
|  | 65                                max_number_reads_for_clustering, mincln){ | 
|  | 66     ## this create main html index and also tarean report ## | 
|  | 67     ## index and tarean html reports are created always | 
|  | 68     ## extract all paths and directories | 
|  | 69     HTML_LINKS = nested2named_list(HTML_LINKS) | 
|  | 70     paths = nested2named_list(paths) | 
|  | 71     csvfile = paths[['clusters_info']] | 
|  | 72     clusters_summary_csv = paths[['clusters_summary_csv']] | 
|  | 73     profrep_classification_csv = paths[['profrep_classification_csv']] | 
|  | 74     htmlfile = paths[["tarean_report_html"]] | 
|  | 75     html_report_dt = paths[["cluster_report_html"]] | 
|  | 76     main_report = paths[["main_report_html"]] | 
|  | 77     summarized_annnotation_html = paths[["summarized_annotation_html"]] | 
|  | 78     libdir = paths[['libdir']] | 
|  | 79     clusters_dir = paths[["clusters__relative"]] | 
|  | 80     superclusters_dir = paths[['superclusters__relative']] | 
|  | 81     seqdb = paths[['sequences_db']] | 
|  | 82     hitsortdb = paths[['hitsort_db']] | 
|  | 83     connect_to_databases(seqdb, hitsortdb) | 
|  | 84     dfraw = read.table(csvfile, as.is=TRUE, header=TRUE, sep="\t", na.strings = c('None','NA')) | 
|  | 85     # table must be updated | 
|  | 86     dfraw$supercluster_best_hit = dbGetQuery(HITSORTDB, "SELECT supercluster_best_hit FROM cluster_info")[, 1] | 
|  | 87     ## columns to use | 
|  | 88     selected_cols = c("index", "size_real","size_adjusted", "vcount","ecount", | 
|  | 89                      "loop_index", "pair_completeness", | 
|  | 90                     'satellite_probability','satellite', | 
|  | 91                     'TR_score','pbs_score','ltr_detection', 'TR_monomer_length', | 
|  | 92                     'TR_consensus', "annotations_summary", "supercluster", 'tandem_rank', | 
|  | 93                     'supercluster_best_hit') | 
|  | 94 | 
|  | 95     ## some columns are added (like Graph_layout, clusters,...) | 
|  | 96     ## columns for html report | 
|  | 97     selected_cols_tarean = c( | 
|  | 98         "Cluster", | 
|  | 99         "Proportion[%]", | 
|  | 100         "Proportion_Adjusted[%]", | 
|  | 101         "size_real", | 
|  | 102         'satellite_probability', | 
|  | 103         'TR_monomer_length', | 
|  | 104         'TR_consensus', | 
|  | 105         'Graph_layout', | 
|  | 106         'kmer_analysis', | 
|  | 107         "loop_index", | 
|  | 108         "pair_completeness", | 
|  | 109         'TR_score', | 
|  | 110         "vcount", | 
|  | 111         "ecount", | 
|  | 112         'pbs_score', | 
|  | 113         "annotations_summary" | 
|  | 114     ) | 
|  | 115     selected_cols_main = c( | 
|  | 116         "Cluster", | 
|  | 117         "supercluster", | 
|  | 118         "Proportion[%]", | 
|  | 119         "Proportion_Adjusted[%]", | 
|  | 120         "size_real", | 
|  | 121         'Graph_layout', | 
|  | 122         "annotations_summary", | 
|  | 123         'ltr_detection', | 
|  | 124         'satellite_probability', | 
|  | 125         'TAREAN_annotation', | 
|  | 126         'TR_monomer_length', | 
|  | 127         'TR_consensus', | 
|  | 128         'kmer_analysis', | 
|  | 129         "loop_index", | 
|  | 130         "pair_completeness", | 
|  | 131         'TR_score', | 
|  | 132         "ecount", | 
|  | 133         "vcount" | 
|  | 134     ) | 
|  | 135 | 
|  | 136     if (custom_db){ | 
|  | 137         selected_cols_main = c(selected_cols_main, "annotations_summary_custom") | 
|  | 138         selected_cols_tarean = c(selected_cols_tarean, "annotations_summary_custom") | 
|  | 139         selected_cols = c(selected_cols, "annotations_summary_custom") | 
|  | 140     } | 
|  | 141     if (is_comparative()){ | 
|  | 142         prefix_codes = dbGetQuery(SEQDB, "SELECT * FROM prefix_codes") | 
|  | 143         species_counts = dbGetQuery(HITSORTDB, "SELECT * FROM comparative_counts") | 
|  | 144         superclusters = dbGetQuery(HITSORTDB,paste( | 
|  | 145                                                  "SELECT supercluster, cluster FROM superclusters WHERE cluster <=", | 
|  | 146                                                  nrow(species_counts)) | 
|  | 147                                    ) | 
|  | 148         species_counts = merge(superclusters, species_counts, by.x = "cluster", by.y = "clusterindex") | 
|  | 149         ## include commented header with total counts: | 
|  | 150         cat("# Total counts:\t\t", paste(prefix_codes$N, collapse="\t"),"\n#\n", | 
|  | 151             sep="", | 
|  | 152             file = paths[['comparative_analysis_counts_csv']]) | 
|  | 153 | 
|  | 154         write.table(species_counts, file = paths[['comparative_analysis_counts_csv']], | 
|  | 155                     sep = "\t", col.names = TRUE, row.names = FALSE, append=TRUE) | 
|  | 156         species_counts_formated = apply( | 
|  | 157             species_counts[, prefix_codes$prefix, drop = FALSE], | 
|  | 158             1, function(x) paste(prefix_codes$prefix, ":", x, "\n",sep='', collapse="")) | 
|  | 159         dfraw$species_counts = species_counts_formated[1:nrow(dfraw)] | 
|  | 160         selected_cols = c(selected_cols, "species_counts") | 
|  | 161         selected_cols_main = c(selected_cols_main, "species_counts") | 
|  | 162     } | 
|  | 163 | 
|  | 164 | 
|  | 165     df_report = dfraw[,selected_cols] | 
|  | 166     ## describe tandem ranks: | 
|  | 167     df_report$TAREAN_annotation = RANKS_TANDEM[as.character(df_report$tandem_rank)] | 
|  | 168     ## remove Cluster_similarity_hits | 
|  | 169     df_report_csv = reformat_df_report(df_report) | 
|  | 170     df_report_csv = df_report_csv[,!colnames(df_report_csv) %in% "Cluster_similarity_hits"] | 
|  | 171     df_report_csv$Final_annotation="" | 
|  | 172 | 
|  | 173     ## make table for profrep classification | 
|  | 174     write.table( | 
|  | 175         reformat_df_to_profrep_classification(df_report), file = profrep_classification_csv, | 
|  | 176         sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE) | 
|  | 177 | 
|  | 178     df_report$"kmer_analysis" = ifelse(dfraw$putative_tandem, hwrite("report", link = dfraw$html_tarean), "N/A") | 
|  | 179     df_report$"Graph_layout" = hwriteImage(dfraw$image_file_tmb, link = dfraw$image_file, table = FALSE) | 
|  | 180     df_report$Cluster = paste0("CL", df_report$index) | 
|  | 181     df_report$"Proportion[%]" = signif (100 * df_report$size_real / N_clustering, 2) | 
|  | 182     df_report$"Proportion_Adjusted[%]" = signif (100 * df_report$size_adjusted / N_clustering, 2) | 
|  | 183     if (!tarean_mode){ | 
|  | 184 | 
|  | 185         df_report$Cluster=sapply( | 
|  | 186             df_report$index, | 
|  | 187             function(x) hwrite(x, link = sprintf("%s/dir_CL%04d/index.html", clusters_dir, x))) | 
|  | 188 | 
|  | 189         df_report$supercluster = sapply( | 
|  | 190             df_report$supercluster, | 
|  | 191             function(x) hwrite(x, link = sprintf("%s/dir_SC%04d/index.html", superclusters_dir, x))) | 
|  | 192     } | 
|  | 193     ## TAREAN report | 
|  | 194     ## copy tarean output data help to place nad make link to it | 
|  | 195     file.copy(paste0(WD,"/style1.css"), dirname(htmlfile)) | 
|  | 196     file.copy(paste0(WD,"/documentation.html"), dirname(htmlfile)) | 
|  | 197 | 
|  | 198     tarean_html = start_html(htmlfile, HTMLHEADER_TAREAN) | 
|  | 199     tarean_html("Tandem Repeat Analyzer", HTML.title, HR=1) | 
|  | 200     tarean_html = start_html(htmlfile, HTMLHEADER_TAREAN) | 
|  | 201     tarean_html('Run statistics:', HTML.title, HR=2) | 
|  | 202     tarean_html(paste("Number of input reads:", N_input )) | 
|  | 203     tarean_html(paste("Number of analyzed reads:", N_clustering)) | 
|  | 204     tarean_html(paste("Cluster merging:",ifelse(merge_threshold == 0,"No", "Yes"))) | 
|  | 205 | 
|  | 206     ## export links to consensus sequecnes in fasta files | 
|  | 207     tarean_html("Consensus files - fasta format:", HTML.title, HR=2) | 
|  | 208     for (i in TANDEM_RANKS[TANDEM_RANKS != 0]){ ## no consensus for rank 0 | 
|  | 209         if (!is.null (consensus_files[[i]])){ | 
|  | 210             N = sum(dfraw$tandem_rank == TANDEM_RANKS[i]) | 
|  | 211             name_string = paste(names(TANDEM_RANKS)[i]," - total ", N, "found" ) | 
|  | 212             tarean_html(paste("<p>", | 
|  | 213                   hwrite(name_string, download = name_string, | 
|  | 214                          link = basename(consensus_files[[i]][[1]])), | 
|  | 215                   "<br>\n")) | 
|  | 216 | 
|  | 217         } | 
|  | 218     } | 
|  | 219     ## print link to documentation ## | 
|  | 220     tarean_html("Documentation", HTML.title, HR=2) | 
|  | 221     tarean_html(paste('<p> For the explanation of TAREAN output see', | 
|  | 222         ' <a href="documentation.html#tra" > the help section </a> <p>')) | 
|  | 223     ## HOW TO CITE section) | 
|  | 224 | 
|  | 225     ## PRINT TABLES WITH CLUSTERS | 
|  | 226     for (n in names(TANDEM_RANKS)){ | 
|  | 227       tarean_html(n, HTML.title, HR=2) | 
|  | 228       inc <- dfraw$tandem_rank == TANDEM_RANKS[n] | 
|  | 229       if (sum(inc > 0)){ | 
|  | 230           tarean_html(reformat4html( | 
|  | 231               reformat_header( | 
|  | 232                   df_report[inc, selected_cols_tarean ,drop=FALSE] | 
|  | 233               ) | 
|  | 234           ), | 
|  | 235           align = "left", digits = 3) | 
|  | 236       }else{ | 
|  | 237         tarean_html("not found") | 
|  | 238       } | 
|  | 239     } | 
|  | 240 | 
|  | 241     ## export table with all cluster | 
|  | 242     cat("",file = html_report_dt) | 
|  | 243 | 
|  | 244     DT_instance =  df_report[,selected_cols_main, drop = FALSE] %>% | 
|  | 245         reformat_header %>% reformat4html %>% datatable(escape = FALSE, options = DT_OPTIONS) %>% | 
|  | 246         formatStyle(columns = seq_along(selected_cols), "font-size" = "12px") %>% | 
|  | 247         formatStyle(columns = "Similarity<br>hits<br>[above 0.1%]", "min-width" = "500px") | 
|  | 248 | 
|  | 249     saveWidget(DT_instance, file = normalizePath(html_report_dt), | 
|  | 250                libdir=normalizePath(libdir) , selfcontained = FALSE) | 
|  | 251 | 
|  | 252     add_preamble(normalizePath(html_report_dt), | 
|  | 253                  preamble='<h2>Cluster annotation</h2> <p><a href="documentation.html#clust"> For table legend see documentation. <a> </p>') | 
|  | 254 | 
|  | 255     ## Main page - Clustering info - global information about clustering | 
|  | 256     top_clusters_prop = sum(df_report$size_real)/N_clustering | 
|  | 257     clustering_info = summary_histogram(fn = paths[["summary_histogram"]], N_clustering, N_omit, df_report$size_adjusted, | 
|  | 258                     top_clusters_prop) | 
|  | 259     index_html = start_html(main_report, HTMLHEADER_INDEX) | 
|  | 260     index_html("Clustering Summary", HTML.title, HR = 1) | 
|  | 261 | 
|  | 262     index_html(paste0('<a href="', | 
|  | 263                       paths[['summary_histogram__relative']], | 
|  | 264                       '"> <img src="', paths[['summary_histogram__relative']], | 
|  | 265                       '" width="700" border="1" >', | 
|  | 266                       ' </a>'), cat) | 
|  | 267   index_html('<p> <b> Graphical summary of the clustering results. </b> Bars represent superclusters, with their heights and widths corresponding to the numbers of reads in the superclusters (y-axis) and to their proportions in all analyzed reads (x-axis), respectively. Rectangles inside the supercluster bars represent individual clusters. If the filtering of abundant satellites was performed, the affected clusters are shown in green, and their sizes correspond to the adjusted values. Blue and pink background panels show proportions of reads that were clustered and remained single, respectively. Top clusters are on the left of the dotted line. </p><hr><br><br>',cat) | 
|  | 268 | 
|  | 269     index_html('Run information:', HTML.title, HR = 2) | 
|  | 270     index_html(paste("Number of input reads:", N_input )) | 
|  | 271     index_html(paste("Number of analyzed reads:", N_clustering)) | 
|  | 272     if (N_omit != 0){ | 
|  | 273       index_html(paste("Number of reads removed by automatic filtering of abundant putative satellites:", N_omit)) | 
|  | 274       index_html(paste("Number of remaining reads after filtering of abundant satellites:", N_clustering - N_omit )) | 
|  | 275     } | 
|  | 276 | 
|  | 277     index_html( | 
|  | 278         paste( | 
|  | 279             "Proportion of reads in top clusters :", | 
|  | 280             signif(100 * sum(df_report$size_real)/N_clustering,2), | 
|  | 281             "%" | 
|  | 282         )) | 
|  | 283     index_html(paste("Cluster merging:",ifelse(merge_threshold == 0,"No", "Yes"))) | 
|  | 284     index_html(paste("Paired-end reads:",ifelse(paired, "Yes", "No"))) | 
|  | 285     index_html("Available analyses:", HTML.title, HR=2) | 
|  | 286     index_html(paste("<p>",hwrite("Tandem repeat analysis", link = HTML_LINKS$INDEX_TO_TAREAN),"</p>"),cat) | 
|  | 287 | 
|  | 288     if (!tarean_mode){ | 
|  | 289         index_html(paste("<p>", hwrite("Cluster annotation", link = HTML_LINKS$INDEX_TO_CLUSTER_REPORT),"</p>"),cat) | 
|  | 290         index_html(paste("<p>", hwrite("Supercluster annotation", | 
|  | 291                                        link = HTML_LINKS$INDEX_TO_SUPERCLUSTER_REPORT),"</p>"),cat) | 
|  | 292         index_html(paste("<p>", hwrite("Repeat annotation summary", link = HTML_LINKS$INDEX_TO_SUMMARIZED_ANNOTATION),"</p>"),cat) | 
|  | 293     } | 
|  | 294 | 
|  | 295   if (is_comparative()) { | 
|  | 296     tryCatch({ | 
|  | 297       imagemap = plot_rect_map( | 
|  | 298         read_counts = paths[['comparative_analysis_counts_csv']], | 
|  | 299         cluster_annotation = paths[['profrep_classification_csv']], | 
|  | 300         output_file = paths[['comparative_summary_map']] | 
|  | 301       )}, | 
|  | 302       error = function(err){ | 
|  | 303         print(paste("error while plotting ", err)) | 
|  | 304       } | 
|  | 305     ) | 
|  | 306 | 
|  | 307     HTML.title("Comparative analysis - Total number of reads in clustering analysis", file = main_report) | 
|  | 308     index_html(df2html( | 
|  | 309       prefix_codes, | 
|  | 310       header = c("Code", "Total read count"), rounding_function = round), | 
|  | 311       cat | 
|  | 312       ) | 
|  | 313     HTML.title("Comparative analysis - Number of reads in individual clusters", file = main_report) | 
|  | 314 | 
|  | 315     index_html(paste0('<img src="', paths[['comparative_summary_map__relative']], | 
|  | 316                       '" usemap ="#clustermap" border="2">'), cat) | 
|  | 317 | 
|  | 318     index_html( | 
|  | 319       "Bar plot on top shows the size of individual clusters. Size of the rectangles in lower panel is proportional to the number of reads in a cluster for each species. Clusters and species were sorted using hierarchical clustering. Bars and rectangles in the plot are hyperlinked to the individual cluster reports.") | 
|  | 320     index_html(imagemap) | 
|  | 321   } | 
|  | 322 | 
|  | 323   how2cite = readLines(paths[["how_to_cite"]]) | 
|  | 324 | 
|  | 325   index_html(how2cite, cat, sep="\n") | 
|  | 326   index_html("<br><hr>", cat) | 
|  | 327   index_html('Details:', HTML.title, HR = 3) | 
|  | 328   index_html(pipeline_version_info %>% preformatted, cat) | 
|  | 329   index_html(paste0("Minimal number of reads in cluster to be considered top cluster : ", mincln)) | 
|  | 330   index_html(paste0("Reserved Memory : ", round(max_memory/(1024*1024)), "G")) | 
|  | 331   index_html(paste0("Maximum number of processable reads with the reserved memory : ", max_number_reads_for_clustering)) | 
|  | 332 | 
|  | 333 | 
|  | 334   ## export to csv | 
|  | 335   clustering_info$Number_of_analyzed_reads = N_clustering | 
|  | 336   write.table(t(as.data.frame(clustering_info)), | 
|  | 337               file = clusters_summary_csv, sep="\t", col.names = FALSE) | 
|  | 338   cat("\n", file = clusters_summary_csv, append = TRUE) | 
|  | 339   write.table( | 
|  | 340     df_report_csv, file = clusters_summary_csv, | 
|  | 341     sep = "\t", col.names = TRUE, row.names = FALSE, quote = TRUE, append=TRUE) | 
|  | 342 } | 
|  | 343 | 
|  | 344 dummy_function = function(){ | 
|  | 345     print("dummy function") | 
|  | 346 } | 
|  | 347 | 
|  | 348 reformat_df_report = function(df_report){ | 
|  | 349     # for printing to csv  - this should be consise | 
|  | 350     df_report$TR_consensus = gsub("(<pre>)|(</pre>)","",df_report$TR_consensus) | 
|  | 351     df_report$tandem_rank = NULL | 
|  | 352     ## make suitable order and rename | 
|  | 353     if ("annotations_summary_custom" %in% colnames(df_report)){ | 
|  | 354         custom = "annotations_summary_custom" | 
|  | 355     }else{ | 
|  | 356         custom=character() | 
|  | 357     } | 
|  | 358     df_out = df_report[,c('index', | 
|  | 359                           'supercluster', | 
|  | 360                           'size_real', | 
|  | 361                           'size_adjusted', | 
|  | 362                           'supercluster_best_hit', | 
|  | 363                           'TAREAN_annotation', | 
|  | 364                           'annotations_summary', | 
|  | 365                           custom) | 
|  | 366                        ] | 
|  | 367 | 
|  | 368     colnames(df_out) = c('Cluster', | 
|  | 369                          'Supercluster', | 
|  | 370                          'Size', | 
|  | 371                          'Size_adjusted', | 
|  | 372                          'Automatic_annotation', | 
|  | 373                          'TAREAN_annotation', | 
|  | 374                          'Cluster_similarity_hits', | 
|  | 375                          custom) | 
|  | 376     return(df_out) | 
|  | 377 } | 
|  | 378 | 
|  | 379 reformat_df_to_profrep_classification = function(df_report){ | 
|  | 380     CL = df_report$index | 
|  | 381     best_hit = df_report$supercluster_best_hit | 
|  | 382     ## format conversion(in order): | 
|  | 383     replacement = list( | 
|  | 384         c("/", "|"), | 
|  | 385         c("Ty1_copia", "Ty1/copia"), | 
|  | 386         c("Ty3_gypsy", "Ty3/gypsy"), | 
|  | 387         c("TatIV_Ogre", "TatIV/Ogre"), | 
|  | 388         c("Ogre_Tat", "Ogre/Tat"), | 
|  | 389         c("EnSpm_CACTA", "EnSpm/CACTA"), | 
|  | 390         c("MuDR_Mutator", "MuDR/Mutator"), | 
|  | 391         c("PIF_Harbinger", "PIF/Harbinger"), | 
|  | 392         c("Tc1/Mariner", "Tc1/Mariner"), | 
|  | 393         c("All|", "") | 
|  | 394     ) | 
|  | 395     for (i in replacement){ | 
|  | 396         best_hit = gsub(i[1], i[2], best_hit, fixed = TRUE) | 
|  | 397     } | 
|  | 398     best_hit = gsub("^All", "", best_hit, fixed = FALSE) | 
|  | 399     best_hit = ifelse(best_hit == "", paste0("unknown_CL", CL), best_hit) | 
|  | 400     output = data.frame(Cluster = CL, classification = best_hit, stringsAsFactors = FALSE) | 
|  | 401     return(output) | 
|  | 402 } |