Mercurial > repos > mingchen0919 > rmarkdown_fastqc_site
changeset 11:507eec497730 draft
update fastqc site
author | mingchen0919 |
---|---|
date | Tue, 07 Nov 2017 16:52:24 -0500 |
parents | 600c39b11913 |
children | 68ea2ebbf866 |
files | 01_evaluation_overview.Rmd 02_fastqc_original_reports.Rmd 02_per_base_sequence_quality.Rmd 03_per_tile_sequence_quality.Rmd 04_per_sequence_quality_score.Rmd 05_per_base_sequence_content.Rmd 06_per_sequence_gc_content.Rmd 07_per_base_n_content.Rmd 08_sequence_length_distribution.Rmd 09_sequence_duplication_levels.Rmd 10_adapter_content.Rmd 11_kmer_content.Rmd 1_per_base_quality_scores.Rmd 2_per_base_N_content.Rmd 3_per_sequence_quality_scores.Rmd 4_per_sequence_GC_content.Rmd 5_per_base_sequence_content.Rmd _site.yml fastqc_site.xml fastqc_site_render.R index.Rmd |
diffstat | 21 files changed, 886 insertions(+), 629 deletions(-) [+] |
line wrap: on
line diff
--- a/01_evaluation_overview.Rmd Tue Aug 15 15:50:21 2017 -0400 +++ b/01_evaluation_overview.Rmd Tue Nov 07 16:52:24 2017 -0500 @@ -1,123 +1,124 @@ --- -title: "Evaluation Overview" -output: html_document +title: 'Short reads evaluation with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango --- ```{r setup, include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - -```{bash 'copy data from datasets directory to working directory', echo=FALSE} -# Copy uploaded data to the working directory -for f in $(echo READS | sed "s/,/ /g") -do - cp $f ./ -done -``` - -```{bash 'run fastqc', echo=FALSE} -# run fastqc and place outputs into the report directory -for r in $(ls *.dat) -do - fastqc -o REPORT_OUTPUT_DIR $r > /dev/null 2>&1 -done -``` - -```{bash 'parse fastqc results', echo=FALSE} -##==== copy fastqc generated zip files from report output directory to job work directory == -cp -r REPORT_OUTPUT_DIR/*zip ./ - -# create a file to store data file paths -echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail -echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score -echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score -echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content -echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content -echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content -echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level -echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution -echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content - -for i in $(ls *.zip) -do - BASE=$(echo $i | sed 's/\(.*\)\.zip/\1/g') - echo $BASE - unzip ${BASE}.zip > /dev/null 2>&1 - - ##====== pass,warning,fail (WSF) ============= - awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt - echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt - - ##====== per base quality scores (PBQS) ====== - awk '/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt - echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt - - ##====== per sequence quality scores (PSQS) - awk '/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt - echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt - - ##====== Per sequence GC content (PSGC) - awk '/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt - echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt - - ##====== Per Base Sequence Content (PBSC) - awk '/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt - echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt - - ##====== Per Base N Content (PBNC) - awk '/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt - echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt - - ##====== Sequence Duplication Level (SDL) - awk '/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt - echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt - - ##====== Sequence Length Distribution (SLD) - awk '/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SLD.txt - echo "${BASE},${BASE}-SLD.txt" >> SLD_file_paths.txt - - ##====== Kmer Content ============ - awk '/^>>Kmer Content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-KMC.txt - echo "${BASE},${BASE}-KMC.txt" >> KMC_file_paths.txt - -done +knitr::opts_chunk$set( + echo = ECHO, + error = TRUE +) ``` -## Evaluation Overview +# Fastqc Evaluation + +## Evaluation of reads before trimming -```{r 'overview'} -PWF_file_paths = read.csv('PWF_file_paths.txt', - header = TRUE, stringsAsFactors = FALSE) -rm('PWF_df') -for(i in 1:nrow(PWF_file_paths)) { - file_path = PWF_file_paths[i,2] - pwf_df = read.csv(file_path, - sep='\t', header=FALSE, stringsAsFactors = FALSE) - colnames(pwf_df) = c('item', PWF_file_paths[i,1]) - if (!exists('PWF_df')) { - PWF_df = pwf_df - } else { - PWF_df = cbind(PWF_df, pwf_df[,2,drop=FALSE]) - } +```{r} +if ('READS_1' == 'None') { + stop("No pre-trimming reads provided!") +} else { + ## run fastqc evaluation + fastqc_command = paste0('fastqc ') %>% + (function(x) { + ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x) + }) %>% + (function(x) { + ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x) + }) %>% + (function(x) { + paste0(x, '-o REPORT_DIR ') + }) + fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1') + system(fastqc_command_reads_1, intern = TRUE) + + # Original html report + reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1) + original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1)) + + unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR') + reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/') + # fastqc_data.txt + file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt') + fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1)) + # summary.txt + file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt') + summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1)) + + tags$ul( + tags$li(original_html), + tags$li(fastqc_data), + tags$li(summary_data) + ) } ``` +## Evaluation of reads after trimming + ```{r} -my_icon = c('ok', 'remove', 'star') -names(my_icon) = c('pass', 'fail', 'warn') -evaluate_list = list() -for (i in colnames(PWF_df)[-1]) { - evaluate_list[[i]] = formatter( - "span", - style = x ~ style("background-color" = ifelse(x =='pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')), - "color" = "white", - "width" = "50px", - "float" = "left", - "padding-right" = "5px") - ) +if ('READS_2' == 'None') { + stop("No pre-trimming reads provided!") +} else { + ## run fastqc evaluation + fastqc_command = paste0('fastqc ') %>% + (function(x) { + ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x) + }) %>% + (function(x) { + ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x) + }) %>% + (function(x) { + paste0(x, '-o REPORT_DIR ') + }) + fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1') + system(fastqc_command_reads_2, intern = TRUE) + + # Original html report + reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1) + original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2)) + + unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR') + reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/') + # fastqc_data.txt + file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt') + fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2)) + # summary.txt + file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt') + summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2)) + + tags$ul( + tags$li(original_html), + tags$li(fastqc_data), + tags$li(summary_data) + ) } +``` -formattable(PWF_df, evaluate_list) + + +# Fastqc output visualization + +## Overview + +```{r} +reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1] +reads_2_summary = read.csv('REPORT_DIR/reads_2_summary.txt', header = FALSE, sep = '\t')[, 1] +combined_summary = cbind(reads_1_summary, reads_2_summary) +names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)')) +combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)' +combined_summary[combined_summary == 'WARN'] = 'WARN (!)' +knitr::kable(combined_summary) +``` + +# Session Info + +```{r 'session info'} +sessionInfo() ``` \ No newline at end of file
--- a/02_fastqc_original_reports.Rmd Tue Aug 15 15:50:21 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ ---- -title: "FastQC original reports" -output: html_document ---- - -```{r 'FastQC original reports', include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - - -Below are links to ***Fastqc*** original html reports. - -```{r 'html report links'} -html_report_list = list() -html_files = list.files('REPORT_OUTPUT_DIR', pattern = '.*html') -for (i in html_files) { - html_report_list[[i]] = tags$li(tags$a(href=i, i)) -} -tags$ul(html_report_list) -``` \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/02_per_base_sequence_quality.Rmd Tue Nov 07 16:52:24 2017 -0500 @@ -0,0 +1,47 @@ +--- +title: 'Per base sequence quality' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = ECHO, + error = TRUE +) +``` + +### Per base sequence quality + +```{r 'per base sequence quality', fig.width=10} +## reads 1 +pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality') +pbsq_1$id = 1:length(pbsq_1$X.Base) + +melt_pbsq_1 = filter(melt(pbsq_1, id=c('X.Base', 'id')), variable == 'Mean') +melt_pbsq_1$trim = 'before' + + +## reads 2 +pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality') +pbsq_2$id = 1:length(pbsq_2$X.Base) + +melt_pbsq_2 = filter(melt(pbsq_2, id=c('X.Base', 'id')), variable == 'Mean') +melt_pbsq_2$trim = 'after' + +comb_pbsq = rbind(melt_pbsq_1, melt_pbsq_2) +comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim) + +p = ggplot(data = comb_pbsq) + + geom_line(mapping = aes(x = id, y = value, group = variable, color = variable)) + + scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) + + facet_grid(. ~ trim) + + ylim(0, max(comb_pbsq$value) + 5) + + theme(axis.text.x = element_text(angle=45)) +ggplotly(p) + +``` \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/03_per_tile_sequence_quality.Rmd Tue Nov 07 16:52:24 2017 -0500 @@ -0,0 +1,48 @@ +--- +title: 'Per Tile Sequence Quality' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = ECHO, + error = TRUE +) +``` + +### Per tile sequence quality + +```{r 'per tile sequence quality', fig.width=10} +## check if 'per tile sequence quality' module exits or not +check_ptsq = grep('Per tile sequence quality', readLines('REPORT_DIR/reads_1_fastqc_data.txt')) +if (length(check_ptsq) > 0) { + ## reads 1 + ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality') + ptsq_1$trim = 'before' + + ## reads 2 + ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality') + ptsq_2$trim = 'after' + + comb_ptsq = rbind(ptsq_1, ptsq_2) + comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim) + comb_ptsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base) + + # convert integers to charaters + comb_ptsq$Tile = as.character(comb_ptsq$X.Tile) + + p = ggplot(data = comb_ptsq, aes(x = Base, y = Tile, fill = Mean)) + + geom_raster() + + facet_grid(. ~ trim) + + xlab('Position in read (bp)') + + ylab('') + + theme(axis.text.x = element_text(angle=45)) + ggplotly(p) +} else { + print('No "per tile sequence quality" data') +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/04_per_sequence_quality_score.Rmd Tue Nov 07 16:52:24 2017 -0500 @@ -0,0 +1,39 @@ +--- +title: 'Per sequence quality score' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = ECHO, + error = TRUE +) +``` + +### Per sequence quality score + +```{r 'Per sequence quality score', fig.width=10} +## reads 1 +psqs_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence quality scores') +psqs_1$trim = 'before' + +## reads 2 +psqs_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence quality scores') +psqs_2$trim = 'after' + +comb_psqs = rbind(psqs_1, psqs_2) +comb_psqs$trim = factor(levels = c('before', 'after'), comb_psqs$trim) + +p = ggplot(data = comb_psqs, aes(x = X.Quality, y = Count)) + + geom_line(color = 'red') + + facet_grid(. ~ trim) + + xlim(min(comb_psqs$X.Quality), max(comb_psqs$X.Quality)) + + xlab('Mean Sequence Qaulity (Phred Score)') + + ylab('') +ggplotly(p) +``` \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/05_per_base_sequence_content.Rmd Tue Nov 07 16:52:24 2017 -0500 @@ -0,0 +1,47 @@ +--- +title: 'Per base sequence content' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = ECHO, + error = TRUE +) +``` + +### Per base sequence content + +```{r 'Per base sequence content', fig.width=10} +## reads 1 +pbsc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence content') +pbsc_1$id = 1:length(pbsc_1$X.Base) + +melt_pbsc_1 = melt(pbsc_1, id=c('X.Base', 'id')) +melt_pbsc_1$trim = 'before' + + +## reads 2 +pbsc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence content') +pbsc_2$id = 1:length(pbsc_2$X.Base) + +melt_pbsc_2 = melt(pbsc_2, id=c('X.Base', 'id')) +melt_pbsc_2$trim = 'after' + +comb_pbsc = rbind(melt_pbsc_1, melt_pbsc_2) +comb_pbsc$trim = factor(levels = c('before', 'after'), comb_pbsc$trim) + +p = ggplot(data = comb_pbsc, aes(x = id, y = value, color = variable)) + + geom_line() + + facet_grid(. ~ trim) + + xlim(min(comb_pbsc$id), max(comb_pbsc$id)) + + ylim(0, 100) + + xlab('Position in read (bp)') + + ylab('') +ggplotly(p) +``` \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/06_per_sequence_gc_content.Rmd Tue Nov 07 16:52:24 2017 -0500 @@ -0,0 +1,38 @@ +--- +title: 'Per sequence GC content' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = ECHO, + error = TRUE +) +``` + +### Per sequence GC content + +```{r 'Per sequence GC content', fig.width=10} +## reads 1 +psGCc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence GC content') +psGCc_1$trim = 'before' + +## reads 2 +psGCc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence GC content') +psGCc_2$trim = 'after' + +comb_psGCc = rbind(psGCc_1, psGCc_2) +comb_psGCc$trim = factor(levels = c('before', 'after'), comb_psGCc$trim) + +p = ggplot(data = comb_psGCc, aes(x = X.GC.Content, y = Count)) + + geom_line(color = 'red') + + facet_grid(. ~ trim) + + xlab('Mean Sequence Qaulity (Phred Score)') + + ylab('') +ggplotly(p) +``` \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/07_per_base_n_content.Rmd Tue Nov 07 16:52:24 2017 -0500 @@ -0,0 +1,43 @@ +--- +title: 'Per base N content' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = ECHO, + error = TRUE +) +``` + +### Per base N content + +```{r 'Per base N content', fig.width=10} +## reads 1 +pbNc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base N content') +pbNc_1$id = 1:length(pbNc_1$X.Base) +pbNc_1$trim = 'before' + +## reads 2 +pbNc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base N content') +pbNc_2$id = 1:length(pbNc_2$X.Base) +pbNc_2$trim = 'after' + +comb_pbNc = rbind(pbNc_1, pbNc_2) +comb_pbNc$trim = factor(levels = c('before', 'after'), comb_pbNc$trim) + +p = ggplot(data = comb_pbNc, aes(x = id, y = N.Count)) + + geom_line(color = 'red') + + scale_x_continuous(breaks = pbNc_2$id, labels = pbNc_2$X.Base) + + facet_grid(. ~ trim) + + ylim(0, 1) + + xlab('N-Count') + + ylab('') + + theme(axis.text.x = element_text(angle=45)) +ggplotly(p) +``` \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/08_sequence_length_distribution.Rmd Tue Nov 07 16:52:24 2017 -0500 @@ -0,0 +1,42 @@ +--- +title: 'Sequence Length Distribution' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = ECHO, + error = TRUE +) +``` + +### Sequence Length Distribution + +```{r 'Sequence Length Distribution', fig.width=10} +## reads 1 +sld_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Length Distribution') +sld_1$id = 1:length(sld_1$X.Length) +sld_1$trim = 'before' + +## reads 2 +sld_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Length Distribution') +sld_2$id = 1:length(sld_2$X.Length) +sld_2$trim = 'after' + +comb_sld = rbind(sld_1, sld_2) +comb_sld$trim = factor(levels = c('before', 'after'), comb_sld$trim) + +p = ggplot(data = comb_sld, aes(x = id, y = Count)) + + geom_line(color = 'red') + + scale_x_continuous(breaks = sld_2$id, labels = sld_2$X.Length) + + facet_grid(. ~ trim) + + xlab('Sequence Length (bp)') + + ylab('') + + theme(axis.text.x = element_text(angle=45)) +ggplotly(p) +``` \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/09_sequence_duplication_levels.Rmd Tue Nov 07 16:52:24 2017 -0500 @@ -0,0 +1,49 @@ +--- +title: 'Sequence Duplication Levels' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = ECHO, + error = TRUE +) +``` + +### Sequence Duplication Levels + +```{r 'Sequence Duplication Levels', fig.width=10} +## reads 1 +sdl_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#') +names(sdl_1) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') +sdl_1$id = 1:length(sdl_1$Duplication_Level) + +melt_sdl_1 = melt(sdl_1, id=c('Duplication_Level', 'id')) +melt_sdl_1$trim = 'before' + + +## reads 2 +sdl_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#') +names(sdl_2) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') +sdl_2$id = 1:length(sdl_2$Duplication_Level) + +melt_sdl_2 = melt(sdl_2, id=c('Duplication_Level', 'id')) +melt_sdl_2$trim = 'after' + +comb_sdl = rbind(melt_sdl_1, melt_sdl_2) +comb_sdl$trim = factor(levels = c('before', 'after'), comb_sdl$trim) + +p = ggplot(data = comb_sdl, aes(x = id, y = value, color = variable)) + + geom_line() + + scale_x_continuous(breaks = sdl_2$id, labels = sdl_2$Duplication_Level) + + facet_grid(. ~ trim) + + xlab('Sequence Duplication Level') + + ylab('') + + theme(axis.text.x = element_text(angle=45)) +ggplotly(p) +``` \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/10_adapter_content.Rmd Tue Nov 07 16:52:24 2017 -0500 @@ -0,0 +1,46 @@ +--- +title: 'Adapter Content' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = ECHO, + error = TRUE +) +``` + +### Adapter Content + +```{r 'Adapter Content', fig.width=10} +## reads 1 +ac_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Adapter Content') +ac_1$id = 1:length(ac_1$X.Position) + +melt_ac_1 = melt(ac_1, id=c('X.Position', 'id')) +melt_ac_1$trim = 'before' + +## reads 2 +ac_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Adapter Content') +ac_2$id = 1:length(ac_2$X.Position) + +melt_ac_2 = melt(ac_2, id=c('X.Position', 'id')) +melt_ac_2$trim = 'after' + +comb_ac = rbind(melt_ac_1, melt_ac_2) +comb_ac$trim = factor(levels = c('before', 'after'), comb_ac$trim) + +p = ggplot(data = comb_ac, aes(x = id, y = value, color = variable)) + + geom_line() + + facet_grid(. ~ trim) + + xlim(min(comb_ac$id), max(comb_ac$id)) + + ylim(0, 1) + + xlab('Position in read (bp)') + + ylab('') +ggplotly(p) +``` \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/11_kmer_content.Rmd Tue Nov 07 16:52:24 2017 -0500 @@ -0,0 +1,31 @@ +--- +title: 'Kmer Content' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = ECHO, + error = TRUE +) +``` + +### Kmer Content {.tabset} + +#### Before + +```{r 'Kmer Content (before)', fig.width=10} +kc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Kmer Content') +knitr::kable(kc_1) +``` + +#### After +```{r 'Kmer Content (after)', fig.width=10} +kc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Kmer Content') +knitr::kable(kc_2) +``` \ No newline at end of file
--- a/1_per_base_quality_scores.Rmd Tue Aug 15 15:50:21 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ ---- -title: "Per Base Quality Scores" -output: html_document ---- - -```{r setup, include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - - -## Per Base Quality Scores - -```{r} -PBQS_df = data.frame() -PBQS_file_paths = read.csv('PBQS_file_paths.txt', - header = TRUE, stringsAsFactors = FALSE) -for(i in 1:nrow(PBQS_file_paths)) { - # file_path = paste0('REPORT_OUTPUT_DIR/', PBQS_file_paths[i,2]) - file_path = PBQS_file_paths[i,2] - pbqs_df = read.csv(file_path, - sep='\t', header=TRUE, stringsAsFactors = FALSE) %>% - mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]), - Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>% - (function (df) { - df1 = select(df, -Base2) - df2 = select(df, -Base1) %>% filter(Base2 != '') - colnames(df1) = c(colnames(df1)[1:7], 'Base') - colnames(df2) = c(colnames(df2)[1:7], 'Base') - res = rbind(df1, df2) %>% arrange(Base) - return(res) - }) - pbqs_df$sample_id = rep(PBQS_file_paths[i,1], nrow(pbqs_df)) - PBQS_df = rbind(PBQS_df, pbqs_df) -} -``` - - -```{r} -# datatable(PBQS_df) -max_phred = max(PBQS_df$Mean) + 10 -hchart(PBQS_df, "line", hcaes(x = Base, y = Mean, group = sample_id)) %>% - hc_title( - text = "Per Base Quality Score" - ) %>% - hc_yAxis( - title = list(text = "Mean Base Quality Score"), - min = 0, - max = max_phred, - plotLines = list( - list(label = list(text = "Phred Score = 27"), - width = 2, - dashStyle = "dash", - color = "green", - value = 27), - list(label = list(text = "Phred Score = 20"), - width = 2, - color = "red", - value = 20) - ) - ) %>% - hc_exporting(enabled = TRUE) -```
--- a/2_per_base_N_content.Rmd Tue Aug 15 15:50:21 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ ---- -title: "Per Base N Content" -output: html_document ---- - -```{r setup, include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - -## Per Base N Content - -```{r} -PBNC_df = data.frame() -PBNC_file_paths = read.csv('PBNC_file_paths.txt', - header = TRUE, stringsAsFactors = FALSE) -for(i in 1:nrow(PBNC_file_paths)) { - # file_path = paste0('REPORT_OUTPUT_DIR/', PBNC_file_paths[i,2]) - file_path = PBNC_file_paths[i,2] - pbnc_df = read.csv(file_path, - sep='\t', header=TRUE, stringsAsFactors = FALSE) %>% - mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]), - Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>% - (function (df) { - df1 = select(df, -Base2) - df2 = select(df, -Base1) %>% filter(Base2 != '') - colnames(df1) = c(colnames(df1)[1:2], 'Base') - colnames(df2) = c(colnames(df2)[1:2], 'Base') - res = rbind(df1, df2) %>% arrange(Base) - return(res) - }) - pbnc_df$sample_id = rep(PBNC_file_paths[i,1], nrow(pbnc_df)) - PBNC_df = rbind(PBNC_df, pbnc_df) -} -``` - - -```{r} -PBNC_df$N.Count = PBNC_df$N.Count * 100 -max_phred = max(PBNC_df$N.Count) + 5 -hchart(PBNC_df, "line", hcaes(x = as.character(Base), y = N.Count, group = sample_id)) %>% - hc_title( - text = "Per Base N Content" - ) %>% - hc_xAxis( - title = list(text = "Base Position") - ) %>% - hc_yAxis( - title = list(text = "N %"), - plotLines = list( - list(label = list(text = "N = 5%"), - width = 2, - dashStyle = "dash", - color = "red", - value = 5) - ) - ) %>% - hc_exporting(enabled = TRUE) -```
--- a/3_per_sequence_quality_scores.Rmd Tue Aug 15 15:50:21 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ ---- -title: "Per Sequence Quality Scores" -output: html_document ---- - -```{r setup, include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - -## Per Sequence Quality Scores - -```{r} -PSQS_df = data.frame() -PSQS_file_paths = read.csv('PSQS_file_paths.txt', - header = TRUE, stringsAsFactors = FALSE) -for(i in 1:nrow(PSQS_file_paths)) { - # file_path = paste0('REPORT_OUTPUT_DIR/', PSQS_file_paths[i,2]) - file_path = PSQS_file_paths[i,2] - psqs_df = read.csv(file_path, - sep='\t', header=TRUE, stringsAsFactors = FALSE) - psqs_df$sample_id = rep(PSQS_file_paths[i,1], nrow(psqs_df)) - PSQS_df = rbind(PSQS_df, psqs_df) -} -``` - - -```{r} -max_phred = max(PSQS_df$X.Quality) + 5 -hchart(PSQS_df, "line", hcaes(x = X.Quality, y = Count, group = sample_id)) %>% - hc_title( - text = "Per Sequence Quality Score" - ) %>% - hc_xAxis( - title = list(text = "Mean Sequence Quality Score"), - min = 0, - max = max_phred, - plotLines = list( - list(label = list(text = "Phred Score = 27"), - width = 2, - dashStyle = "dash", - color = "green", - value = 27), - list(label = list(text = "Phred Score = 20"), - width = 2, - color = "red", - value = 20) - ) - ) %>% - hc_exporting(enabled = TRUE) -```
--- a/4_per_sequence_GC_content.Rmd Tue Aug 15 15:50:21 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ ---- -title: "Per Sequence GC Content" -output: html_document ---- - -```{r setup, include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - -## Per Sequence GC Content - - -```{r} -PSGC_df = data.frame() -PSGC_file_paths = read.csv('PSGC_file_paths.txt', - header = TRUE, stringsAsFactors = FALSE) -for(i in 1:nrow(PSGC_file_paths)) { - # file_path = paste0('REPORT_OUTPUT_DIR/', PSGC_file_paths[i,2]) - file_path = PSGC_file_paths[i,2] - psgc_df = read.csv(file_path, - sep='\t', header=TRUE, stringsAsFactors = FALSE) - psgc_df$sample_id = rep(PSGC_file_paths[i,1], nrow(psgc_df)) - PSGC_df = rbind(PSGC_df, psgc_df) -} -``` - - -```{r} -max_phred = max(PSGC_df$Count) + 5 -hchart(PSGC_df, "line", hcaes(x = X.GC.Content, y = Count, group = sample_id)) %>% - hc_title( - text = "Per Sequence GC Content" - ) %>% - hc_xAxis( - title = list(text = "% GC") - ) %>% - hc_exporting(enabled = TRUE) -```
--- a/5_per_base_sequence_content.Rmd Tue Aug 15 15:50:21 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ ---- -title: "Per Base Sequence Content" -output: html_document ---- - -```{r setup, include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - -## Per Base Sequence Content - -```{r} -PBSC_df = data.frame() -PBSC_file_paths = read.csv('PBSC_file_paths.txt', - header = TRUE, stringsAsFactors = FALSE) -for(i in 1:nrow(PBSC_file_paths)) { - # file_path = paste0('REPORT_OUTPUT_DIR/', PBSC_file_paths[i,2]) - file_path = PBSC_file_paths[i,2] - pbsc_df = read.csv(file_path, - sep='\t', header=TRUE, stringsAsFactors = FALSE) %>% - mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]), - Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>% - (function (df) { - df1 = select(df, -Base2) - df2 = select(df, -Base1) %>% filter(Base2 != '') - colnames(df1) = c(colnames(df1)[1:5], 'Base') - colnames(df2) = c(colnames(df2)[1:5], 'Base') - res = rbind(df1, df2) %>% arrange(Base) - return(res) - }) - pbsc_df$sample_id = rep(PBSC_file_paths[i,1], nrow(pbsc_df)) - PBSC_df = rbind(PBSC_df, pbsc_df) -} -``` - - -```{r out.width="100%"} -PBSC_df_2 = select(PBSC_df, -X.Base) %>% - melt(id = c('Base', 'sample_id'), value.name = 'base_percentage') -p = ggplot(data = PBSC_df_2, aes(x = Base, y = base_percentage, group = variable, color = variable)) + - geom_line() + - facet_wrap(~ sample_id) -ggplotly(p) -``` -
--- a/_site.yml Tue Aug 15 15:50:21 2017 -0400 +++ b/_site.yml Tue Nov 07 16:52:24 2017 -0500 @@ -8,21 +8,29 @@ icon: fa-home href: index.html - text: "Evaluation Overview" - href: 01_evaluation_overview.html - - text: "Evaluation Items" + href: x01_evaluation_overview.html + - text: "Evaluation by data module" menu: - - text: "Per Base Quality Scores" - href: 1_per_base_quality_scores.html + - text: "Per Base Sequence Quality" + href: x02_per_base_sequence_quality.html + - text: "Per Tile Sequence Quality" + href: x03_per_tile_sequence_quality.html + - text: "Per Sequence Quality Score" + href: x04_per_sequence_quality_score.html + - text: "Per Base Sequence Content" + href: x05_per_base_sequence_content.html + - text: "Per Sequence GC Content" + href: x06_per_sequence_gc_content.html - text: "Per Base N Content" - href: 2_per_base_N_content.html - - text: "Per Sequence Quality Scores" - href: 3_per_sequence_quality_scores.html - - text: "Per Sequence GC Content" - href: 4_per_sequence_GC_content.html - - text: "Per Base Sequence Content" - href: 5_per_base_sequence_content.html - - text: "Original FastQC Reports" - href: 02_fastqc_original_reports.html + href: x07_per_base_n_content.html + - text: "Sequence Length Distribution" + href: x08_sequence_length_distribution.html + - text: "Sequence Duplication Levels" + href: x09_sequence_duplication_levels.html + - text: "Adapter Content" + href: x10_adapter_content.html + - text: "Kmer Content" + href: x11_kmer_content.html output: html_document: theme: cosmo
--- a/fastqc_site.xml Tue Aug 15 15:50:21 2017 -0400 +++ b/fastqc_site.xml Tue Nov 07 16:52:24 2017 -0500 @@ -1,9 +1,9 @@ -<tool id="fastqc_site" name="Fastqc Site" version="1.0.0"> +<tool id="fastqc_site" name="Fastqc Site" version="2.0.0"> <requirements> <requirement type="package" version="1.15.0.6-0">pandoc</requirement> <requirement type="package" version="1.14.1">bioconductor-deseq2</requirement> <requirement type="package" version="1.20.0">r-getopt</requirement> - <requirement type="package" version="1.2">r-rmarkdown</requirement> + <requirement type="package" version="1.3">r-rmarkdown</requirement> <requirement type="package" version="1.8.4">r-plyr</requirement> <requirement type="package" version="1.1.0">r-stringr</requirement> <requirement type="package" version="0.5.0">r-highcharter</requirement> @@ -14,22 +14,12 @@ <requirement type="package" version="0.3.5">r-htmltools</requirement> <requirement type="package" version="0.11.5">fastqc</requirement> </requirements> - <description> - Implements FastQC analysis and display results in R Markdown website. - </description> <stdio> - <regex match="Execution halted" - source="both" - level="fatal" - description="Execution halted." /> - <regex match="Error in" - source="both" - level="fatal" - description="An undefined error occured, please check your intput carefully and contact your administrator." /> - <regex match="Fatal error" - source="both" - level="fatal" - description="An undefined error occured, please check your intput carefully and contact your administrator." /> + <!--redirecting stderr to a file. "XXX" is used to match with nothing so that tool running won't be interrupted during testing--> + <regex match="XXX" + source="stderr" + level="warning" + description="Check the warnings_and_errors.txt file for more details."/> </stdio> <command> <![CDATA[ @@ -37,36 +27,65 @@ Rscript '${__tool_directory__}/fastqc_site_render.R' ## 1. input data - -r $reads -e $echo + -r $reads_1 + -n '$reads_1.name' + -R $reads_2 + -N '$reads_2.name' + -c $contaminants + -l $limits ## 2. output report and report site directory - -o $fastqc_site - -d $fastqc_site.files_path + -o $report + -d $report.files_path + -s $sink_message ## 3. Rmd templates sitting in the tool directory - ## _site.yml and index.Rmd template files - -s '${__tool_directory__}/_site.yml' - -i '${__tool_directory__}/index.Rmd' + ## _site.yml and index.Rmd template files + -S '${__tool_directory__}/_site.yml' + -I '${__tool_directory__}/index.Rmd' - ## other Rmd body template files - -p '${__tool_directory__}/01_evaluation_overview.Rmd' - -a '${__tool_directory__}/02_fastqc_original_reports.Rmd' - -b '${__tool_directory__}/1_per_base_quality_scores.Rmd' - -c '${__tool_directory__}/2_per_base_N_content.Rmd' - -f '${__tool_directory__}/3_per_sequence_quality_scores.Rmd' - -g '${__tool_directory__}/4_per_sequence_GC_content.Rmd' - -h '${__tool_directory__}/5_per_base_sequence_content.Rmd' + ## other Rmd body template files + -A '${__tool_directory__}/01_evaluation_overview.Rmd' + -B '${__tool_directory__}/02_per_base_sequence_quality.Rmd' + -C '${__tool_directory__}/03_per_tile_sequence_quality.Rmd' + -D '${__tool_directory__}/04_per_sequence_quality_score.Rmd' + -E '${__tool_directory__}/05_per_base_sequence_content.Rmd' + -F '${__tool_directory__}/06_per_sequence_gc_content.Rmd' + -G '${__tool_directory__}/07_per_base_n_content.Rmd' + -H '${__tool_directory__}/08_sequence_length_distribution.Rmd' + -J '${__tool_directory__}/09_sequence_duplication_levels.Rmd' + -K '${__tool_directory__}/10_adapter_content.Rmd' + -L '${__tool_directory__}/11_kmer_content.Rmd' + ]]> </command> <inputs> - <param format="fastq,fastq.gz,fastq.bz2,bam,sam" multiple="true" name="reads" type="data" label="Short reads data from history" /> - <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false" label="Display analysis code in report?" /> + <param format="fastq,fastq.gz,fastq.bz2,bam,sam" name="reads_1" type="data" optional="false" + label="Short reads before trimming" + help="Short reads data from history. This could be reads before trimming."/> + <param format="fastq,fastq.gz,fastq.bz2,bam,sam" name="reads_2" type="data" + label="Short reads after trimming" + help="Short reads data from history. This could be reads after trimming."/> + <param name="contaminants" type="data" format="tabular" optional="true" label="Contaminant list" + help="Specifies a non-default file which contains the list of adapter sequences which will be explicitly + searched against the library. The file must contain sets of named adapters + in the form name[tab]sequence. Lines prefixed with a hash will be ignored."/> + <param name="limits" type="data" format="txt" optional="true" label="Submodule and Limit specifing file" + help="Specifies a non-default file which contains a set of criteria + which will be used to determine the warn/error limits for the + various modules. This file can also be used to selectively + remove some modules from the output all together. The format + needs to mirror the default limits.txt file found in the + Configuration folder."/> + <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false" + label="Display analysis code in report?"/> </inputs> <outputs> - <data format="html" name="fastqc_site" label="fastqc site" /> + <data format="html" name="report" label="fastqc site"/> + <data format="txt" name="sink_message" label="Warnings and Errors" from_work_dir="warnings_and_errors.txt"/> </outputs> <citations> <citation type="bibtex"> @@ -79,7 +98,8 @@ <citation type="bibtex"> @article{allaire2016rmarkdown, title={rmarkdown: Dynamic Documents for R, 2016}, - author={Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff and Wickham, Hadley and Atkins, Aron and Hyndman, Rob}, + author={Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff + and Wickham, Hadley and Atkins, Aron and Hyndman, Rob}, journal={R package version 0.9}, volume={6}, year={2016} @@ -97,31 +117,14 @@ <citation type="bibtex"> @misc{plotly2017, title = {plotly: Create Interactive Web Graphics via 'plotly.js'}, - author = {Carson Sievert and Chris Parmer and Toby Hocking and Scott Chamberlain and Karthik Ram and Marianne Corvellec and Pedro Despouy}, + author = {Carson Sievert and Chris Parmer and Toby Hocking and Scott Chamberlain and Karthik Ram and + Marianne Corvellec and Pedro Despouy}, year = {2017}, note = {R package version 4.6.0}, url = {https://CRAN.R-project.org/package=plotly}, } </citation> <citation type="bibtex"> - @misc{highcharter2017, - title = {highcharter: A Wrapper for the 'Highcharts' Library}, - author = {Joshua Kunst}, - year = {2017}, - note = {R package version 0.5.0}, - url = {https://CRAN.R-project.org/package=highcharter}, - } - </citation> - <citation type="bibtex"> - @misc{formattable2016, - title = {formattable: Create 'Formattable' Data Structures}, - author = {Kun Ren and Kenton Russell}, - year = {2016}, - note = {R package version 0.2.0.1}, - url = {https://CRAN.R-project.org/package=formattable}, - } - </citation> - <citation> @article{ewels2016multiqc, title={MultiQC: summarize analysis results for multiple tools and samples in a single report}, author={Ewels, Philip and Magnusson, M{\aa}ns and Lundin, Sverker and K{\"a}ller, Max},
--- a/fastqc_site_render.R Tue Aug 15 15:50:21 2017 -0400 +++ b/fastqc_site_render.R Tue Nov 07 16:52:24 2017 -0500 @@ -1,195 +1,283 @@ -##======= Handle arguments from command line ======== -# setup R error handline to go to stderr -options(show.error.messages=FALSE, - error=function(){ - cat(geterrmessage(), file=stderr()) - quit("no", 1, F) - }) - -# we need that to not crash galaxy with an UTF8 error on German LC settings. -loc = Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") - -# suppress warning -options(warn = -1) - -options(stringsAsFactors=FALSE, useFancyQuotes=FALSE) -args = commandArgs(trailingOnly=TRUE) - -suppressPackageStartupMessages({ - library(getopt) - library(tools) -}) - -# column 1: the long flag name -# column 2: the short flag alias. A SINGLE character string -# column 3: argument mask -# 0: no argument -# 1: argument required -# 2: argument is optional -# column 4: date type to which the flag's argument shall be cast. -# possible values: logical, integer, double, complex, character. -spec_list=list() - -##------- 1. input data --------------------- -spec_list$READS = c('reads', 'r', '1', 'character') -spec_list$ECHO = c('echo', 'e', '1', 'character') - -##--------2. output report and report site directory -------------- -spec_list$FASTQC_SITE = c('fastqc_site', 'o', '1', 'character') -spec_list$FASTQC_SITE_DIR = c('fastqc_site_dir', 'd', '1', 'character') - -##--------3. Rmd templates sitting in the tool directory ---------- - - ## _site.yml and index.Rmd files - spec_list$SITE_YML = c('site_yml', 's', 1, 'character') - spec_list$INDEX_Rmd = c('index_rmd', 'i', 1, 'character') - - ## other Rmd body template files - spec_list$x01 = c('x01_evaluation_overview', 'p', '1', 'character') - spec_list$x02 = c('x02_fastqc_original_reports', 'a', '1', 'character') - spec_list$x1 = c('x1_per_base_quality_scores', 'b', '1', 'character') - spec_list$x2 = c('x2_per_base_N_content', 'c', '1', 'character') - spec_list$x3 = c('x3_per_sequence_quality_scores', 'f', '1', 'character') - spec_list$x4 = c('x4_per_sequence_GC_content', 'g', '1', 'character') - spec_list$x5 = c('x5_per_base_sequence_content', 'h', '1', 'character') - -##------------------------------------------------------------------ - -spec = t(as.data.frame(spec_list)) -opt = getopt(spec) -# arguments are accessed by long flag name (the first column in the spec matrix) -# NOT by element name in the spec_list -# example: opt$help, opt$expression_file -##====== End of arguments handling ========== - -#------ Load libraries --------- +library(getopt) library(rmarkdown) +library(htmltools) library(plyr) +library(dplyr) library(stringr) -library(dplyr) library(highcharter) library(DT) library(reshape2) library(plotly) library(formattable) -library(htmltools) - +options(stringsAsFactors=FALSE, useFancyQuotes=FALSE) -#----- 1. create the report directory ------------------------ -paste0('mkdir -p ', opt$fastqc_site_dir) %>% - system() - -#----- 2. generate Rmd files with Rmd templates -------------- -# a. templates without placeholder variables: -# copy templates from tool directory to the working directory. -# b. templates with placeholder variables: -# substitute variables with user input values and place them in the working directory. +##============ Sink warnings and errors to a file ============== +## use the sink() function to wrap all code within it. +##============================================================== +zz = file('warnings_and_errors.txt') +sink(zz) +sink(zz, type = 'message') + ##---------below is the code for rendering .Rmd templates----- + + ##=============STEP 1: handle command line arguments========== + ## + ##============================================================ + # column 1: the long flag name + # column 2: the short flag alias. A SINGLE character string + # column 3: argument mask + # 0: no argument + # 1: argument required + # 2: argument is optional + # column 4: date type to which the flag's argument shall be cast. + # possible values: logical, integer, double, complex, character. + #------------------------------------------------------------- + #++++++++++++++++++++ Best practice ++++++++++++++++++++++++++ + # 1. short flag alias should match the flag in the command section in the XML file. + # 2. long flag name can be any legal R variable names + # 3. two names in args_list can have common string but one name should not be a part of another name. + # for example, one name is "ECHO", if another name is "ECHO_XXX", it will cause problems. + #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + args_list=list() + ##------- 1. input data --------------------- + args_list$ECHO = c('echo', 'e', '1', 'character') + args_list$READS_1 = c('reads_1', 'r', '1', 'character') + args_list$NAME_1 = c('name_1', 'n', '1', 'character') + args_list$READS_2 = c('reads_2', 'R', '1', 'character') + args_list$NAME_2 = c('name_2', 'N', '1', 'character') + args_list$CONTAMINANTS = c('contaminants', 'c', '1', 'character') + args_list$LIMITS = c('limits', 'l', '1', 'character') + ##--------2. output report and outputs -------------- + args_list$REPORT_HTML = c('report_html', 'o', '1', 'character') + args_list$REPORT_DIR = c('report_dir', 'd', '1', 'character') + args_list$SINK_MESSAGE = c('sink_message', 's', '1', 'character') + ##--------3. .Rmd templates in the tool directory ---------- + args_list$SITE_YML = c('site_yml', 'S', '1', 'character') + args_list$INDEX_RMD = c('index_rmd', 'I', '1', 'character') + args_list$X01_EVALUATION_OVERVIEW = c('x01_evaluation_overview', 'A', '1', 'character') + args_list$X02_PER_BASE_SEQUENCE_QUALITY = c('x02_per_base_sequence_quality', 'B', '1', 'character') + args_list$X03_PER_TILE_SEQUENCE_QUALITY = c('x03_per_tile_sequence_quality', 'C', '1', 'character') + args_list$X04_PER_SEQUENCE_QUALITY_SCORE = c('x04_per_sequence_quality_score', 'D', '1', 'character') + args_list$X05_PER_BASE_SEQUENCE_CONTENT = c('x05_per_base_sequence_content', 'E', '1', 'character') + args_list$X06_PER_SEQUENCE_GC_CONTENT = c('x06_per_sequence_gc_content', 'F', '1', 'character') + args_list$X07_PER_BASE_N_CONTENT = c('x07_per_base_n_content', 'G', '1', 'character') + args_list$X08_SEQUENCE_LENGTH_DISTRIBUTION = c('x08_sequence_length_distribution', 'H', '1', 'character') + args_list$X09_SEQUENCE_DUPLICATION_LEVELS = c('x09_sequence_duplication_levels', 'J', '1', 'character') + args_list$X10_ADAPTER_CONTENT = c('x10_adapter_content', 'K', '1', 'character') + args_list$X11_KMER_CONTENT = c('x11_kmer_content', 'L', '1', 'character') + ##----------------------------------------------------------- + opt = getopt(t(as.data.frame(args_list))) - #----- Copy index.Rmd and _site.yml files to job working direcotry ----- - file.copy(opt$index_rmd, 'index.Rmd', recursive=TRUE) - file.copy(opt$site_yml, '_site.yml', recursive=TRUE) - #--------------------------------------------------------- - - #----- 01_evaluation_overview.Rmd ----------------------- - readLines(opt$x01_evaluation_overview) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - gsub('READS', opt$reads, x) - }) %>% - (function(x) { - gsub('REPORT_OUTPUT_DIR', opt$fastqc_site_dir, x) - }) %>% - (function(x) { - fileConn = file('01_evaluation_overview.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - #----- 1_per_base_quality_scores.Rmd -------------------- - readLines(opt$x1_per_base_quality_scores) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('1_per_base_quality_scores.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - #----- 2_per_base_N_content.Rmd ------------------------- - readLines(opt$x2_per_base_N_content) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('2_per_base_N_content.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - #----- 3_per_sequence_quality_scores.Rmd ---------------- - readLines(opt$x3_per_sequence_quality_scores) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('3_per_sequence_quality_scores.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - - #----- 4_per_sequence_GC_content.Rmd -------------------- - readLines(opt$x4_per_sequence_GC_content) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('4_per_sequence_GC_content.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - - #----- 5_per_base_sequence_content.Rmd ------------------ - readLines(opt$x5_per_base_sequence_content) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('5_per_base_sequence_content.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) + + ##=======STEP 2: create report directory (optional)========== + ## + ##=========================================================== + dir.create(opt$report_dir) + + ##==STEP 3: copy index.Rmd and _site.yml to job working directory====== + ## + ##===================================================================== + file.copy(opt$index_rmd, 'index.Rmd') + file.copy(opt$site_yml, '_site.yml') + + ##=STEP 4: replace placeholders in .Rmd files with argument values= + ## + ##================================================================= + #++ need to replace placeholders with args values one by one+ + + # 01_evaluation_overview.Rmd + readLines(opt$x01_evaluation_overview) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('READS_1', opt$reads_1, x) + }) %>% + (function(x) { + gsub('NAME_1', opt$name_1, x) + }) %>% + (function(x) { + gsub('READS_2', opt$reads_2, x) + }) %>% + (function(x) { + gsub('NAME_2', opt$name_1, x) + }) %>% + (function(x) { + gsub('CONTAMINANTS', opt$contaminants, x) + }) %>% + (function(x) { + gsub('LIMITS', opt$limits, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x01_evaluation_overview.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 02_per_base_sequence_quality.Rmd + readLines(opt$x02_per_base_sequence_quality) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x02_per_base_sequence_quality.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 03_per_tile_sequence_quality.Rmd + readLines(opt$x03_per_tile_sequence_quality) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x03_per_tile_sequence_quality.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 04_per_sequence_quality_score.Rmd + readLines(opt$x04_per_sequence_quality_score) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x04_per_sequence_quality_score.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 05_per_base_sequence_content.Rmd + readLines(opt$x05_per_base_sequence_content) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x05_per_base_sequence_content.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 06_per_sequence_gc_content.Rmd + readLines(opt$x06_per_sequence_gc_content) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x06_per_sequence_gc_content.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 07_per_base_n_content.Rmd + readLines(opt$x07_per_base_n_content) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x07_per_base_n_content.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) - #----- 02_fastqc_original_reports.Rmd ------------------- - readLines(opt$x02_fastqc_original_reports) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - gsub('REPORT_OUTPUT_DIR', opt$fastqc_site_dir, x) - }) %>% - (function(x) { - fileConn = file('02_fastqc_original_reports.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) + # 08_sequence_length_distribution.Rmd + readLines(opt$x08_sequence_length_distribution) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x08_sequence_length_distribution.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 09_sequence_duplication_levels.Rmd + readLines(opt$x09_sequence_duplication_levels) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x09_sequence_duplication_levels.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 10_adapter_content.Rmd + readLines(opt$x10_adapter_content) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x10_adapter_content.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 11_kmer_content.Rmd + readLines(opt$x11_kmer_content) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x11_kmer_content.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + ##=============STEP 5: render all .Rmd templates================= + ## + ##=========================================================== + extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") { + f = readLines(fastqc_data) + start_line = grep(module_name, f) + end_module_lines = grep('END_MODULE', f) + end_line = end_module_lines[which(end_module_lines > start_line)[1]] + module_data = f[(start_line+1):(end_line-1)] + writeLines(module_data, 'temp.txt') + read.csv('temp.txt', sep = '\t', header = header, comment.char = comment.char) + } + render_site() + + ##=============STEP 6: manipulate outputs==================== + ## + ##=========================================================== + file.copy('my_site/index.html', opt$report_html, recursive = TRUE) + system(paste0('cp -r my_site/* ', opt$report_dir)) - -#------ 3. render all Rmd files with render_site() -------- -render_site() - - -#-------4. manipulate outputs ----------------------------- -# a. copy index.html to the report output path -# b. copy all files in 'my_site' to the report output directory -file.copy('my_site/index.html', opt$fastqc_site, recursive=TRUE) -paste0('cp -r my_site/* ', opt$fastqc_site_dir) %>% - system() - - + ##--------end of code rendering .Rmd templates---------------- +sink() +##=========== End of sinking output============================= \ No newline at end of file