# HG changeset patch # User mingchen0919 # Date 1510005194 18000 # Node ID ac5c618e4d9730ae2c96641000b719851b21b9ab # Parent 1710b0e874f1f5e434b519d48d94533d759875e7 compare evaluation before and after trimming diff -r 1710b0e874f1 -r ac5c618e4d97 fastqc_report.Rmd --- a/fastqc_report.Rmd Sat Oct 21 09:25:49 2017 -0400 +++ b/fastqc_report.Rmd Mon Nov 06 16:53:14 2017 -0500 @@ -109,25 +109,27 @@ ```{r} reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1] -reads_2_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 1] +reads_2_summary = read.csv('REPORT_DIR/reads_2_summary.txt', header = FALSE, sep = '\t')[, 1] combined_summary = cbind(reads_1_summary, reads_2_summary) names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)')) +combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)' +combined_summary[combined_summary == 'WARN'] = 'WARN (!)' knitr::kable(combined_summary) ``` -## Visualization by module {.tabset} +## Visualization by data module {.tabset} * Define a function to extract outputs for each module from fastqc output ```{r 'function definition'} -extract_data_module = function(fastqc_data, module_name) { +extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") { f = readLines(fastqc_data) start_line = grep(module_name, f) end_module_lines = grep('END_MODULE', f) end_line = end_module_lines[which(end_module_lines > start_line)[1]] module_data = f[(start_line+1):(end_line-1)] writeLines(module_data, 'temp.txt') - read.csv('temp.txt', sep = '\t') + read.csv('temp.txt', sep = '\t', header = header, comment.char = comment.char) } ``` @@ -138,7 +140,7 @@ pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality') pbsq_1$id = 1:length(pbsq_1$X.Base) -melt_pbsq_1 = filter(melt(pbsq_1, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile') +melt_pbsq_1 = filter(melt(pbsq_1, id=c('X.Base', 'id')), variable == 'Mean') melt_pbsq_1$trim = 'before' @@ -146,15 +148,17 @@ pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality') pbsq_2$id = 1:length(pbsq_2$X.Base) -melt_pbsq_2 = filter(melt(pbsq_2, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile') +melt_pbsq_2 = filter(melt(pbsq_2, id=c('X.Base', 'id')), variable == 'Mean') melt_pbsq_2$trim = 'after' comb_pbsq = rbind(melt_pbsq_1, melt_pbsq_2) comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim) + p = ggplot(data = comb_pbsq) + geom_line(mapping = aes(x = id, y = value, group = variable, color = variable)) + scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) + facet_grid(. ~ trim) + + ylim(0, max(comb_pbsq$value) + 5) + theme(axis.text.x = element_text(angle=45)) ggplotly(p) @@ -162,27 +166,249 @@ ### Per tile sequence quality -```{r 'per tile sequence quality'} +```{r 'per tile sequence quality', fig.width=10} +## check if 'per tile sequence quality' module exits or not +check_ptsq = grep('Per tile sequence quality', readLines('REPORT_DIR/reads_1_fastqc_data.txt')) +if (length(check_ptsq) > 0) { + ## reads 1 + ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality') + ptsq_1$trim = 'before' + + ## reads 2 + ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality') + ptsq_2$trim = 'after' + + comb_ptsq = rbind(ptsq_1, ptsq_2) + comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim) + comb_ptsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base) + + # convert integers to charaters + comb_ptsq$Tile = as.character(comb_ptsq$X.Tile) + + p = ggplot(data = comb_ptsq, aes(x = Base, y = Tile, fill = Mean)) + + geom_raster() + + facet_grid(. ~ trim) + + xlab('Position in read (bp)') + + ylab('') + + theme(axis.text.x = element_text(angle=45)) + ggplotly(p) +} else { + print('No "per tile sequence quality" data') +} + + +``` + +### Per sequence quality score + +```{r 'Per sequence quality score', fig.width=10} ## reads 1 -ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality') -ptsq_1$trim = 'before' +psqs_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence quality scores') +psqs_1$trim = 'before' ## reads 2 -ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality') -ptsq_2$trim = 'after' +psqs_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence quality scores') +psqs_2$trim = 'after' + +comb_psqs = rbind(psqs_1, psqs_2) +comb_psqs$trim = factor(levels = c('before', 'after'), comb_psqs$trim) + +p = ggplot(data = comb_psqs, aes(x = X.Quality, y = Count)) + + geom_line(color = 'red') + + facet_grid(. ~ trim) + + xlim(min(comb_psqs$X.Quality), max(comb_psqs$X.Quality)) + + xlab('Mean Sequence Qaulity (Phred Score)') + + ylab('') +ggplotly(p) +``` + + +### Per base sequence content + +```{r 'Per base sequence content', fig.width=10} +## reads 1 +pbsc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence content') +pbsc_1$id = 1:length(pbsc_1$X.Base) + +melt_pbsc_1 = melt(pbsc_1, id=c('X.Base', 'id')) +melt_pbsc_1$trim = 'before' + + +## reads 2 +pbsc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence content') +pbsc_2$id = 1:length(pbsc_2$X.Base) + +melt_pbsc_2 = melt(pbsc_2, id=c('X.Base', 'id')) +melt_pbsc_2$trim = 'after' + +comb_pbsc = rbind(melt_pbsc_1, melt_pbsc_2) +comb_pbsc$trim = factor(levels = c('before', 'after'), comb_pbsc$trim) + +p = ggplot(data = comb_pbsc, aes(x = id, y = value, color = variable)) + + geom_line() + + facet_grid(. ~ trim) + + xlim(min(comb_pbsc$id), max(comb_pbsc$id)) + + ylim(0, 100) + + xlab('Position in read (bp)') + + ylab('') +ggplotly(p) +``` -comb_ptsq = rbind(ptsq_1, ptsq_2) -comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim) -comb_pbsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base) +### Per sequence GC content + +```{r 'Per sequence GC content', fig.width=10} +## reads 1 +psGCc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence GC content') +psGCc_1$trim = 'before' + +## reads 2 +psGCc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence GC content') +psGCc_2$trim = 'after' + +comb_psGCc = rbind(psGCc_1, psGCc_2) +comb_psGCc$trim = factor(levels = c('before', 'after'), comb_psGCc$trim) + +p = ggplot(data = comb_psGCc, aes(x = X.GC.Content, y = Count)) + + geom_line(color = 'red') + + facet_grid(. ~ trim) + + xlab('Mean Sequence Qaulity (Phred Score)') + + ylab('') +ggplotly(p) +``` + -p = ggplot(data = comb_ptsq, aes(x = Base, y = X.Tile, fill = Mean)) + - geom_raster() + - facet_grid(. ~ trim) + +### Per base N content + +```{r 'Per base N content', fig.width=10} +## reads 1 +pbNc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base N content') +pbNc_1$id = 1:length(pbNc_1$X.Base) +pbNc_1$trim = 'before' + +## reads 2 +pbNc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base N content') +pbNc_2$id = 1:length(pbNc_2$X.Base) +pbNc_2$trim = 'after' + +comb_pbNc = rbind(pbNc_1, pbNc_2) +comb_pbNc$trim = factor(levels = c('before', 'after'), comb_pbNc$trim) + +p = ggplot(data = comb_pbNc, aes(x = id, y = N.Count)) + + geom_line(color = 'red') + + scale_x_continuous(breaks = pbNc_2$id, labels = pbNc_2$X.Base) + + facet_grid(. ~ trim) + + ylim(0, 1) + + xlab('N-Count') + + ylab('') + theme(axis.text.x = element_text(angle=45)) ggplotly(p) ``` +### Sequence Length Distribution + +```{r 'Sequence Length Distribution', fig.width=10} +## reads 1 +sld_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Length Distribution') +sld_1$id = 1:length(sld_1$X.Length) +sld_1$trim = 'before' + +## reads 2 +sld_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Length Distribution') +sld_2$id = 1:length(sld_2$X.Length) +sld_2$trim = 'after' + +comb_sld = rbind(sld_1, sld_2) +comb_sld$trim = factor(levels = c('before', 'after'), comb_sld$trim) + +p = ggplot(data = comb_sld, aes(x = id, y = Count)) + + geom_line(color = 'red') + + scale_x_continuous(breaks = sld_2$id, labels = sld_2$X.Length) + + facet_grid(. ~ trim) + + xlab('Sequence Length (bp)') + + ylab('') + + theme(axis.text.x = element_text(angle=45)) +ggplotly(p) +``` + +### Sequence Duplication Levels + +```{r 'Sequence Duplication Levels', fig.width=10} +## reads 1 +sdl_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#') +names(sdl_1) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') +sdl_1$id = 1:length(sdl_1$Duplication_Level) + +melt_sdl_1 = melt(sdl_1, id=c('Duplication_Level', 'id')) +melt_sdl_1$trim = 'before' + + +## reads 2 +sdl_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#') +names(sdl_2) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') +sdl_2$id = 1:length(sdl_2$Duplication_Level) + +melt_sdl_2 = melt(sdl_2, id=c('Duplication_Level', 'id')) +melt_sdl_2$trim = 'after' + +comb_sdl = rbind(melt_sdl_1, melt_sdl_2) +comb_sdl$trim = factor(levels = c('before', 'after'), comb_sdl$trim) + +p = ggplot(data = comb_sdl, aes(x = id, y = value, color = variable)) + + geom_line() + + scale_x_continuous(breaks = sdl_2$id, labels = sdl_2$Duplication_Level) + + facet_grid(. ~ trim) + + xlab('Sequence Duplication Level') + + ylab('') + + theme(axis.text.x = element_text(angle=45)) +ggplotly(p) +``` + +### Adapter Content + +```{r 'Adapter Content', fig.width=10} +## reads 1 +ac_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Adapter Content') +ac_1$id = 1:length(ac_1$X.Position) + +melt_ac_1 = melt(ac_1, id=c('X.Position', 'id')) +melt_ac_1$trim = 'before' + +## reads 2 +ac_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Adapter Content') +ac_2$id = 1:length(ac_2$X.Position) + +melt_ac_2 = melt(ac_2, id=c('X.Position', 'id')) +melt_ac_2$trim = 'after' + +comb_ac = rbind(melt_ac_1, melt_ac_2) +comb_ac$trim = factor(levels = c('before', 'after'), comb_ac$trim) + +p = ggplot(data = comb_ac, aes(x = id, y = value, color = variable)) + + geom_line() + + facet_grid(. ~ trim) + + xlim(min(comb_ac$id), max(comb_ac$id)) + + ylim(0, 1) + + xlab('Position in read (bp)') + + ylab('') +ggplotly(p) +``` + +### Kmer Content {.tabset} + +#### Before + +```{r 'Kmer Content (before)', fig.width=10} +kc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Kmer Content') +knitr::kable(kc_1) +``` + +#### After +```{r 'Kmer Content (after)', fig.width=10} +kc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Kmer Content') +knitr::kable(kc_2) +``` + # Session Info