Mercurial > repos > mingchen0919 > rmarkdown_fastqc_report
view fastqc_report.Rmd @ 19:8c79e5b7cfc0 draft default tip
add boxplot for per base sequence quality
author | mingchen0919 |
---|---|
date | Thu, 09 Nov 2017 09:49:46 -0500 |
parents | 8635a4cee6dd |
children |
line wrap: on
line source
--- title: 'Short reads evaluation with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)' output: html_document: number_sections: true toc: true theme: cosmo highlight: tango --- ```{r setup, include=FALSE, warning=FALSE, message=FALSE} knitr::opts_chunk$set( echo = ECHO, error = TRUE ) ``` # Fastqc Evaluation ## Evaluation of reads before trimming ```{r} if ('READS_1' == 'None') { stop("No pre-trimming reads provided!") } else { ## run fastqc evaluation fastqc_command = paste0('fastqc ') %>% (function(x) { ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x) }) %>% (function(x) { ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x) }) %>% (function(x) { paste0(x, '-o REPORT_DIR ') }) fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1') system(fastqc_command_reads_1, intern = TRUE) # Original html report reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1) original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1)) unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR') reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/') # fastqc_data.txt file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt') fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1)) # summary.txt file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt') summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1)) tags$ul( tags$li(original_html), tags$li(fastqc_data), tags$li(summary_data) ) } ``` ## Evaluation of reads after trimming ```{r} if ('READS_2' == 'None') { stop("No pre-trimming reads provided!") } else { ## run fastqc evaluation fastqc_command = paste0('fastqc ') %>% (function(x) { ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x) }) %>% (function(x) { ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x) }) %>% (function(x) { paste0(x, '-o REPORT_DIR ') }) fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1') system(fastqc_command_reads_2, intern = TRUE) # Original html report reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1) original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2)) unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR') reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/') # fastqc_data.txt file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt') fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2)) # summary.txt file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt') summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2)) tags$ul( tags$li(original_html), tags$li(fastqc_data), tags$li(summary_data) ) } ``` # Fastqc output visualization ## Overview ```{r} reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1] reads_2_summary = read.csv('REPORT_DIR/reads_2_summary.txt', header = FALSE, sep = '\t')[, 1] combined_summary = cbind(reads_1_summary, reads_2_summary) names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)')) combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)' combined_summary[combined_summary == 'WARN'] = 'WARN (!)' knitr::kable(combined_summary) ``` ## Visualization by data module {.tabset} * Define a function to extract outputs for each module from fastqc output ```{r 'function definition'} extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") { f = readLines(fastqc_data) start_line = grep(module_name, f) end_module_lines = grep('END_MODULE', f) end_line = end_module_lines[which(end_module_lines > start_line)[1]] module_data = f[(start_line+1):(end_line-1)] writeLines(module_data, 'temp.txt') read.csv('temp.txt', sep = '\t', header = header, comment.char = comment.char) } ``` ### Per base sequence quality ```{r 'per base sequence quality', fig.width=10} ## reads 1 pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality') pbsq_1$id = 1:length(pbsq_1$X.Base) pbsq_1$trim = 'before' ## reads 2 pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality') pbsq_2$id = 1:length(pbsq_2$X.Base) pbsq_2$trim = 'after' comb_pbsq = rbind(pbsq_1, pbsq_2) comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim) p = ggplot(data = comb_pbsq) + geom_boxplot(mapping = aes(x = id, lower = Lower.Quartile, upper = Upper.Quartile, middle = Median, ymin = X10th.Percentile, ymax = X90th.Percentile, fill = "yellow"), stat = 'identity') + geom_line(mapping = aes(x = id, y = Mean, color = "red")) + scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) + scale_fill_identity() + scale_color_identity() + ylim(0, max(comb_pbsq$Upper.Quartile) + 5) + xlab('Position in read (bp)') + facet_grid(. ~ trim) + theme(axis.text.x = element_text(angle=45)) p ``` ### Per tile sequence quality ```{r 'per tile sequence quality', fig.width=10} ## check if 'per tile sequence quality' module exits or not check_ptsq = grep('Per tile sequence quality', readLines('REPORT_DIR/reads_1_fastqc_data.txt')) if (length(check_ptsq) > 0) { ## reads 1 ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality') ptsq_1$trim = 'before' ## reads 2 ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality') ptsq_2$trim = 'after' comb_ptsq = rbind(ptsq_1, ptsq_2) comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim) comb_ptsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base) # convert integers to charaters comb_ptsq$Tile = as.character(comb_ptsq$X.Tile) p = ggplot(data = comb_ptsq, aes(x = Base, y = Tile, fill = Mean)) + geom_raster() + facet_grid(. ~ trim) + xlab('Position in read (bp)') + ylab('') + theme(axis.text.x = element_text(angle=45)) ggplotly(p) } else { print('No "per tile sequence quality" data') } ``` ### Per sequence quality score ```{r 'Per sequence quality score', fig.width=10} ## reads 1 psqs_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence quality scores') psqs_1$trim = 'before' ## reads 2 psqs_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence quality scores') psqs_2$trim = 'after' comb_psqs = rbind(psqs_1, psqs_2) comb_psqs$trim = factor(levels = c('before', 'after'), comb_psqs$trim) p = ggplot(data = comb_psqs, aes(x = X.Quality, y = Count)) + geom_line(color = 'red') + facet_grid(. ~ trim) + xlim(min(comb_psqs$X.Quality), max(comb_psqs$X.Quality)) + xlab('Mean Sequence Qaulity (Phred Score)') + ylab('') ggplotly(p) ``` ### Per base sequence content ```{r 'Per base sequence content', fig.width=10} ## reads 1 pbsc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence content') pbsc_1$id = 1:length(pbsc_1$X.Base) melt_pbsc_1 = melt(pbsc_1, id=c('X.Base', 'id')) melt_pbsc_1$trim = 'before' ## reads 2 pbsc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence content') pbsc_2$id = 1:length(pbsc_2$X.Base) melt_pbsc_2 = melt(pbsc_2, id=c('X.Base', 'id')) melt_pbsc_2$trim = 'after' comb_pbsc = rbind(melt_pbsc_1, melt_pbsc_2) comb_pbsc$trim = factor(levels = c('before', 'after'), comb_pbsc$trim) p = ggplot(data = comb_pbsc, aes(x = id, y = value, color = variable)) + geom_line() + facet_grid(. ~ trim) + xlim(min(comb_pbsc$id), max(comb_pbsc$id)) + ylim(0, 100) + xlab('Position in read (bp)') + ylab('') ggplotly(p) ``` ### Per sequence GC content ```{r 'Per sequence GC content', fig.width=10} ## reads 1 psGCc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence GC content') psGCc_1$trim = 'before' ## reads 2 psGCc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence GC content') psGCc_2$trim = 'after' comb_psGCc = rbind(psGCc_1, psGCc_2) comb_psGCc$trim = factor(levels = c('before', 'after'), comb_psGCc$trim) p = ggplot(data = comb_psGCc, aes(x = X.GC.Content, y = Count)) + geom_line(color = 'red') + facet_grid(. ~ trim) + xlab('Mean Sequence Qaulity (Phred Score)') + ylab('') ggplotly(p) ``` ### Per base N content ```{r 'Per base N content', fig.width=10} ## reads 1 pbNc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base N content') pbNc_1$id = 1:length(pbNc_1$X.Base) pbNc_1$trim = 'before' ## reads 2 pbNc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base N content') pbNc_2$id = 1:length(pbNc_2$X.Base) pbNc_2$trim = 'after' comb_pbNc = rbind(pbNc_1, pbNc_2) comb_pbNc$trim = factor(levels = c('before', 'after'), comb_pbNc$trim) p = ggplot(data = comb_pbNc, aes(x = id, y = N.Count)) + geom_line(color = 'red') + scale_x_continuous(breaks = pbNc_2$id, labels = pbNc_2$X.Base) + facet_grid(. ~ trim) + ylim(0, 1) + xlab('N-Count') + ylab('') + theme(axis.text.x = element_text(angle=45)) ggplotly(p) ``` ### Sequence Length Distribution ```{r 'Sequence Length Distribution', fig.width=10} ## reads 1 sld_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Length Distribution') sld_1$id = 1:length(sld_1$X.Length) sld_1$trim = 'before' ## reads 2 sld_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Length Distribution') sld_2$id = 1:length(sld_2$X.Length) sld_2$trim = 'after' comb_sld = rbind(sld_1, sld_2) comb_sld$trim = factor(levels = c('before', 'after'), comb_sld$trim) p = ggplot(data = comb_sld, aes(x = id, y = Count)) + geom_line(color = 'red') + scale_x_continuous(breaks = sld_2$id, labels = sld_2$X.Length) + facet_grid(. ~ trim) + xlab('Sequence Length (bp)') + ylab('') + theme(axis.text.x = element_text(angle=45)) ggplotly(p) ``` ### Sequence Duplication Levels ```{r 'Sequence Duplication Levels', fig.width=10} ## reads 1 sdl_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#') names(sdl_1) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') sdl_1$id = 1:length(sdl_1$Duplication_Level) melt_sdl_1 = melt(sdl_1, id=c('Duplication_Level', 'id')) melt_sdl_1$trim = 'before' ## reads 2 sdl_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#') names(sdl_2) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') sdl_2$id = 1:length(sdl_2$Duplication_Level) melt_sdl_2 = melt(sdl_2, id=c('Duplication_Level', 'id')) melt_sdl_2$trim = 'after' comb_sdl = rbind(melt_sdl_1, melt_sdl_2) comb_sdl$trim = factor(levels = c('before', 'after'), comb_sdl$trim) p = ggplot(data = comb_sdl, aes(x = id, y = value, color = variable)) + geom_line() + scale_x_continuous(breaks = sdl_2$id, labels = sdl_2$Duplication_Level) + facet_grid(. ~ trim) + xlab('Sequence Duplication Level') + ylab('') + theme(axis.text.x = element_text(angle=45)) ggplotly(p) ``` ### Adapter Content ```{r 'Adapter Content', fig.width=10} ## reads 1 ac_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Adapter Content') ac_1$id = 1:length(ac_1$X.Position) melt_ac_1 = melt(ac_1, id=c('X.Position', 'id')) melt_ac_1$trim = 'before' ## reads 2 ac_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Adapter Content') ac_2$id = 1:length(ac_2$X.Position) melt_ac_2 = melt(ac_2, id=c('X.Position', 'id')) melt_ac_2$trim = 'after' comb_ac = rbind(melt_ac_1, melt_ac_2) comb_ac$trim = factor(levels = c('before', 'after'), comb_ac$trim) p = ggplot(data = comb_ac, aes(x = id, y = value, color = variable)) + geom_line() + facet_grid(. ~ trim) + xlim(min(comb_ac$id), max(comb_ac$id)) + ylim(0, 1) + xlab('Position in read (bp)') + ylab('') ggplotly(p) ``` ### Kmer Content {.tabset} #### Before ```{r 'Kmer Content (before)', fig.width=10} kc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Kmer Content') knitr::kable(kc_1) ``` #### After ```{r 'Kmer Content (after)', fig.width=10} kc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Kmer Content') knitr::kable(kc_2) ``` # Session Info ```{r 'session info'} sessionInfo() ``` # References * Bioinformatics, Babraham (2014). FastQC. * Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff and Wickham, Hadley and Atkins, Aron and Hyndman, Rob (2016). rmarkdown: Dynamic Documents for R, 2016. In R package version 0.9, 6. * Xie, Yihui (2015). Dynamic Documents with R and knitr, CRC Press, Vol.29. * Carson Sievert and Chris Parmer and Toby Hocking and Scott Chamberlain and Karthik Ram and Marianne Corvellec and Pedro Despouy (2017). plotly: Create Interactive Web Graphics via 'plotly.js'. R package version 4.6.0. [Link] * Wickham, H. (2016). ggplot2: elegant graphics for data analysis. Springer. Chicago