Mercurial > repos > mingchen0919 > rmarkdown_fastqc_report
changeset 14:2efa46ce2c4c draft
upgrade fastqc_report
author | mingchen0919 |
---|---|
date | Wed, 18 Oct 2017 22:06:39 -0400 |
parents | 9d3586701985 |
children | d1d20f341632 |
files | fastqc_report.Rmd fastqc_report.xml fastqc_report_ori.Rmd fastqc_report_render.R fastqc_report_render_ori.R |
diffstat | 5 files changed, 606 insertions(+), 449 deletions(-) [+] |
line wrap: on
line diff
---
title: 'HTML report title'
output:
  html_document:
    number_sections: true
    toc: true
    theme: cosmo
    highlight: tango
---

<!--
  fastqc_report.Rmd: template rendered by fastqc_report_render.R.
  The upper-case tokens inside the chunks below are placeholders. The render
  script substitutes them with the tool's argument values before knitting, so
  they must not be spelled anywhere else in this file (comments included).
-->

```{r setup, include=FALSE, warning=FALSE, message=FALSE}
knitr::opts_chunk$set(
  echo = ECHO
)
```

# Fastqc Analysis

* Copy fastq files to job working directory

```{bash 'copy files'}
# the placeholder expands to a comma separated list of dataset paths
for f in $(echo READS | sed "s/,/ /g")
do
    cp "$f" ./
done
```

* Run fastqc

```{bash 'run fastqc'}
# iterate over the glob directly instead of parsing `ls` output
for r in *.dat
do
    [ -e "$r" ] || continue   # no dataset copied: nothing to analyse
    fastqc -o REPORT_DIR "$r" > /dev/null 2>&1
done
```

* Create links to original HTML reports

```{r 'html report links'}
# Links are relative: the FastQC pages sit in the same directory as this report.
html_files <- list.files('REPORT_DIR', pattern = '\\.html$')
html_report_list <- lapply(html_files, function(html_file) {
  htmltools::tags$li(htmltools::tags$a(href = html_file, html_file))
})
htmltools::tags$ul(html_report_list)
```

# Fastqc output summary

* Define a function to extract outputs for each module from fastqc output

```{r 'function definition'}
#' Extract the table of one module from a FastQC `fastqc_data.txt` file.
#'
#' A module starts at a line `>>{title}<TAB>{status}` and ends at the next
#' `>>END_MODULE` line. The last leading `#` line of the module is used as the
#' column header (some modules carry an extra `#key<TAB>value` line before it).
#'
#' @param fastqc_data Path to a `fastqc_data.txt` file.
#' @param module_name Module title, with or without the leading `>>`. An exact
#'   title match is preferred; otherwise a literal substring match is used.
#' @return A data frame as returned by `read.csv(sep = '\t')`; an empty data
#'   frame when the module contains no lines.
extract_data_module <- function(fastqc_data, module_name) {
  f <- readLines(fastqc_data)

  # Only real module header lines are candidates, never data or END_MODULE.
  is_header <- startsWith(f, '>>') & !startsWith(f, '>>END_MODULE')
  titles <- sub('\t.*$', '', sub('^>>', '', f))
  wanted <- sub('^>>', '', module_name)

  start_line <- which(is_header & titles == wanted)
  if (length(start_line) == 0L) {
    start_line <- which(is_header & grepl(wanted, titles, fixed = TRUE))
  }
  if (length(start_line) != 1L) {
    stop('expected exactly one module matching "', module_name, '" in ',
         fastqc_data, ', found ', length(start_line), call. = FALSE)
  }

  end_module_lines <- which(startsWith(f, '>>END_MODULE'))
  end_line <- end_module_lines[end_module_lines > start_line][1]
  if (is.na(end_line)) {
    stop('module "', module_name, '" has no END_MODULE line', call. = FALSE)
  }

  # seq_len() keeps an empty module empty ((s+1):(e-1) would count backwards).
  module_data <- f[start_line + seq_len(end_line - start_line - 1L)]
  if (length(module_data) == 0L) {
    return(data.frame())
  }

  # Drop extra leading '#' lines so that the header is the first line parsed.
  is_comment <- startsWith(module_data, '#')
  n_leading <- if (all(is_comment)) length(is_comment) else which(!is_comment)[1] - 1L
  if (n_leading > 1L) {
    module_data <- module_data[-seq_len(n_leading - 1L)]
  }

  # Parse in memory; no scratch file is left in the working directory.
  read.csv(text = module_data, sep = '\t')
}
```

<!-- TODO: per-module summary sections built on extract_data_module() go here. -->

# Session Info

```{r 'session info'}
sessionInfo()
```
--- a/fastqc_report.xml Mon Oct 16 21:33:31 2017 -0400 +++ b/fastqc_report.xml Wed Oct 18 22:06:39 2017 -0400 @@ -1,4 +1,4 @@ -<tool id="fastqc_report" name="Fastqc report" version="1.0.1"> +<tool id="fastqc_report" name="Fastqc report" version="2.0.0"> <description> Implements FastQC analysis and display results in R Markdown html. </description> @@ -6,7 +6,7 @@ <requirement type="package" version="1.15.0.6-0">pandoc</requirement> <requirement type="package" version="1.14.1">bioconductor-deseq2</requirement> <requirement type="package" version="1.20.0">r-getopt</requirement> - <requirement type="package" version="1.2">r-rmarkdown</requirement> + <requirement type="package" version="1.3">r-rmarkdown</requirement> <requirement type="package" version="1.8.4">r-plyr</requirement> <requirement type="package" version="1.1.0">r-stringr</requirement> <requirement type="package" version="0.5.0">r-highcharter</requirement> @@ -18,38 +18,35 @@ <requirement type="package" version="0.11.5">fastqc</requirement> </requirements> <stdio> - <regex match="Execution halted" - source="both" - level="fatal" - description="Execution halted." /> - <regex match="Error in" - source="both" - level="fatal" - description="An undefined error occured, please check your intput carefully and contact your administrator." /> - <regex match="Fatal error" - source="both" - level="fatal" - description="An undefined error occured, please check your intput carefully and contact your administrator." /> + <!--redirecting stderr to a file. 
"XXX" is used to match with nothing so that tool running won't be interrupted during testing--> + <regex match="XXX" + source="stderr" + level="warning" + description="Check the warnings_and_errors.txt file for more details."/> </stdio> <command> <![CDATA[ Rscript '${__tool_directory__}/fastqc_report_render.R' + -e $echo -r $reads - -p '${__tool_directory__}/fastqc_report.Rmd' + -o $report -d $report.files_path + -s $sink_message - -e $echo - + -p '${__tool_directory__}/fastqc_report.Rmd' ]]> </command> <inputs> - <param format="fastq,fastq.gz,fastq.bz2,bam,sam" name="reads" type="data" label="Short reads data from history" /> - <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false" label="Display analysis code in report?" /> + <param format="fastq,fastq.gz,fastq.bz2,bam,sam" name="reads" type="data" + label="Short reads data from history"/> + <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false" + label="Display analysis code in report?"/> </inputs> <outputs> - <data format="html" name="report" label="fastqc report" /> + <data format="html" name="report" label="fastqc report"/> + <data format="txt" name="sink_message" label="Warnings and Errors" from_work_dir="warnings_and_errors.txt"/> </outputs> <citations> <citation type="bibtex"> @@ -62,7 +59,8 @@ <citation type="bibtex"> @article{allaire2016rmarkdown, title={rmarkdown: Dynamic Documents for R, 2016}, - author={Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff and Wickham, Hadley and Atkins, Aron and Hyndman, Rob}, + author={Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff + and Wickham, Hadley and Atkins, Aron and Hyndman, Rob}, journal={R package version 0.9}, volume={6}, year={2016} @@ -80,7 +78,8 @@ <citation type="bibtex"> @misc{plotly2017, title = {plotly: Create Interactive Web Graphics via 'plotly.js'}, - author = {Carson Sievert and Chris 
Parmer and Toby Hocking and Scott Chamberlain and Karthik Ram and Marianne Corvellec and Pedro Despouy}, + author = {Carson Sievert and Chris Parmer and Toby Hocking and Scott Chamberlain and Karthik Ram and + Marianne Corvellec and Pedro Despouy}, year = {2017}, note = {R package version 4.6.0}, url = {https://CRAN.R-project.org/package=plotly},
---
title: "Fastqc report: short reads quality evaluation"
author: "Ming Chen"
output: html_document
---

<!--
  fastqc_report_ori.Rmd: archived copy of the pre-2.0.0 template, rendered by
  fastqc_report_render_ori.R. The upper-case tokens inside the chunks are
  placeholders which that script substitutes before knitting, so they must not
  be spelled anywhere else in this file (comments included).
-->

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo=ECHO, warning=FALSE, message=FALSE)
library(plyr)
library(stringr)
library(dplyr)
library(highcharter)
library(DT)
library(reshape2)
library(plotly)
library(formattable)
library(htmltools)

# FastQC groups positions into ranges such as "10-14". Emit one row for the
# first position of every range plus one row for the last position of every
# real range, with the position in a numeric `Base` column, sorted by `Base`.
expand_base_ranges <- function(df) {
  bounds <- str_split_fixed(df$X.Base, '-', 2)
  is_range <- bounds[, 2] != ''
  first <- df
  first$Base <- as.numeric(bounds[, 1])
  last <- df[is_range, , drop = FALSE]
  last$Base <- as.numeric(bounds[is_range, 2])
  dplyr::arrange(rbind(first, last), Base)
}

# Read every per-sample table listed in an index file (columns: sample_id,
# file_path), optionally transform it, tag its rows with the sample id and
# stack all samples into one data frame.
read_module_tables <- function(index_file, transform = identity) {
  index <- read.csv(index_file, header = TRUE, stringsAsFactors = FALSE)
  tables <- lapply(seq_len(nrow(index)), function(i) {
    tbl <- transform(read.csv(index[i, 2], sep = '\t', header = TRUE,
                              stringsAsFactors = FALSE))
    tbl$sample_id <- rep(index[i, 1], nrow(tbl))
    tbl
  })
  if (length(tables) == 0) {
    return(data.frame())
  }
  do.call(rbind, tables)
}
```


```{bash 'create output directory', echo=FALSE}
# create extra files directory. very important!
# -p: do not fail when the directory already exists
mkdir -p REPORT_OUTPUT_DIR
```

# Fastqc analysis
```{bash 'copy data to working directory', echo=FALSE}
# Copy uploaded data to the working directory
for f in $(echo READS | sed "s/,/ /g")
do
    cp "$f" ./
done
```


```{bash 'run fastqc', echo=FALSE}
# iterate over the glob directly instead of parsing `ls` output
for r in *.dat
do
    [ -e "$r" ] || continue
    fastqc -o REPORT_OUTPUT_DIR "$r" > /dev/null 2>&1
done
```

## Fastqc html reports

Below are links to ***Fastqc*** original html reports.

```{r 'html report links'}
html_files <- list.files('REPORT_OUTPUT_DIR', pattern = '\\.html$')
html_report_list <- lapply(html_files, function(html_file) {
  tags$li(tags$a(href = html_file, html_file))
})
tags$ul(html_report_list)
```


## Parsing fastqc data

```{bash echo=FALSE}
##==== copy fastqc generated zip files from the output directory to the job work directory ==
cp -r REPORT_OUTPUT_DIR/*zip ./

# One index file (sample_id,file_path) per parsed module:
#   PWF  Pass, Warning, Fail            PBQS Per Base Quality Score
#   PSQS Per Sequence Quality Score     PSGC Per Sequence GC Content
#   PBSC Per Base Sequence Content      PBNC Per Base N Content
#   SDL  Sequence Duplication Level     SLD  Sequence Length Distribution
#   KMC  Kmer Content
for tag in PWF PBQS PSQS PSGC PBSC PBNC SDL SLD KMC
do
    echo "sample_id,file_path" > "${tag}_file_paths.txt"
done

# extract_module <sample> <module title> <tag>
# Writes the lines between ">><module title>" and END_MODULE of
# <sample>/fastqc_data.txt to <sample>-<tag>.txt and registers that file in
# the tag's index file.
extract_module() {
    awk -v title=">>$2" 'index($0, title) == 1 {flag=1; next} /END_MODULE/ {flag=0} flag' \
        "$1"/fastqc_data.txt > "$1-$3.txt"
    echo "$1,$1-$3.txt" >> "${3}_file_paths.txt"
}

for i in *.zip
do
    [ -e "$i" ] || continue
    BASE="${i%.zip}"
    echo "$BASE"
    unzip -o "${BASE}.zip" > /dev/null 2>&1

    ##====== pass,warning,fail (PWF): one "<module>\t<status>" line per module
    awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt
    echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt

    extract_module "$BASE" "Per base sequence quality"    PBQS
    extract_module "$BASE" "Per sequence quality scores"  PSQS
    extract_module "$BASE" "Per sequence GC content"      PSGC
    extract_module "$BASE" "Per base sequence content"    PBSC
    extract_module "$BASE" "Per base N content"           PBNC
    extract_module "$BASE" "Sequence Duplication Levels"  SDL
    extract_module "$BASE" "Sequence Length Distribution" SLD
    extract_module "$BASE" "Kmer Content"                 KMC
done
```


## Evaluation Overview

```{r 'overview'}
PWF_file_paths <- read.csv('PWF_file_paths.txt',
                           header = TRUE, stringsAsFactors = FALSE)
# one two-column table (item, <sample id>) per sample
pwf_tables <- lapply(seq_len(nrow(PWF_file_paths)), function(i) {
  pwf_df <- read.csv(PWF_file_paths[i, 2],
                     sep = '\t', header = FALSE, stringsAsFactors = FALSE)
  colnames(pwf_df) <- c('item', PWF_file_paths[i, 1])
  pwf_df
})
# keep `item` from the first sample and append the status column of the others
PWF_df <- Reduce(function(acc, tbl) cbind(acc, tbl[, 2, drop = FALSE]),
                 pwf_tables)
```

```{r}
# colour every sample column by status: pass = green, fail = red, warn = orange
status_formatter <- formatter(
  "span",
  style = x ~ style("background-color" = ifelse(x == 'pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')),
                    "color" = "white",
                    "width" = "50px",
                    "float" = "left",
                    "padding-right" = "5px")
)
sample_columns <- colnames(PWF_df)[-1]
evaluate_list <- setNames(
  lapply(sample_columns, function(column) status_formatter),
  sample_columns
)

formattable(PWF_df, evaluate_list)
```


## Per Base Quality Scores

```{r}
PBQS_df <- read_module_tables('PBQS_file_paths.txt', expand_base_ranges)
```


```{r}
max_phred <- max(PBQS_df$Mean) + 10
hchart(PBQS_df, "line", hcaes(x = Base, y = Mean, group = sample_id)) %>%
  hc_title(
    text = "Per Base Quality Score"
  ) %>%
  hc_yAxis(
    title = list(text = "Mean Base Quality Score"),
    min = 0,
    max = max_phred,
    plotLines = list(
      list(label = list(text = "Phred Score = 27"),
           width = 2,
           dashStyle = "dash",
           color = "green",
           value = 27),
      list(label = list(text = "Phred Score = 20"),
           width = 2,
           color = "red",
           value = 20)
    )
  ) %>%
  hc_exporting(enabled = TRUE)
```


## Per Base N Content

```{r}
PBNC_df <- read_module_tables('PBNC_file_paths.txt', expand_base_ranges)
```


```{r}
# FastQC reports N content as a fraction; plot it as a percentage
PBNC_df$N.Count <- PBNC_df$N.Count * 100
hchart(PBNC_df, "line", hcaes(x = as.character(Base), y = N.Count, group = sample_id)) %>%
  hc_title(
    text = "Per Base N Content"
  ) %>%
  hc_xAxis(
    title = list(text = "Base Position")
  ) %>%
  hc_yAxis(
    title = list(text = "N %"),
    plotLines = list(
      list(label = list(text = "N = 5%"),
           width = 2,
           dashStyle = "dash",
           color = "red",
           value = 5)
    )
  ) %>%
  hc_exporting(enabled = TRUE)
```


## Per Sequence Quality Scores

```{r}
PSQS_df <- read_module_tables('PSQS_file_paths.txt')
```


```{r}
max_phred <- max(PSQS_df$X.Quality) + 5
hchart(PSQS_df, "line", hcaes(x = X.Quality, y = Count, group = sample_id)) %>%
  hc_title(
    text = "Per Sequence Quality Score"
  ) %>%
  hc_xAxis(
    title = list(text = "Mean Sequence Quality Score"),
    min = 0,
    max = max_phred,
    plotLines = list(
      list(label = list(text = "Phred Score = 27"),
           width = 2,
           dashStyle = "dash",
           color = "green",
           value = 27),
      list(label = list(text = "Phred Score = 20"),
           width = 2,
           color = "red",
           value = 20)
    )
  ) %>%
  hc_exporting(enabled = TRUE)
```


## Per Sequence GC Content


```{r}
PSGC_df <- read_module_tables('PSGC_file_paths.txt')
```


```{r}
hchart(PSGC_df, "line", hcaes(x = X.GC.Content, y = Count, group = sample_id)) %>%
  hc_title(
    text = "Per Sequence GC Content"
  ) %>%
  hc_xAxis(
    title = list(text = "% GC")
  ) %>%
  hc_exporting(enabled = TRUE)
```


## Per Base Sequence Content

```{r}
PBSC_df <- read_module_tables('PBSC_file_paths.txt', expand_base_ranges)
```


```{r out.width="100%"}
# long format: one row per (position, sample, nucleotide)
PBSC_df_2 <- select(PBSC_df, -X.Base) %>%
  melt(id = c('Base', 'sample_id'), value.name = 'base_percentage')
p <- ggplot(data = PBSC_df_2, aes(x = Base, y = base_percentage, group = variable, color = variable)) +
  geom_line() +
  facet_wrap(~ sample_id)
ggplotly(p)
```


# Session Info

```{r 'session info'}
sessionInfo()
```
# fastqc_report_render.R
#
# Galaxy wrapper script: fills the placeholders of the fastqc_report.Rmd
# template with the command line argument values and renders it to HTML.
#
# Invoked from fastqc_report.xml as
#   Rscript fastqc_report_render.R -e $echo -r $reads -o $report
#     -d $report.files_path -s $sink_message -p <tool dir>/fastqc_report.Rmd
#
# The packages below are attached here (not in the template) so that the
# chunks of the rendered document find them on the search path.
library(getopt)
library(rmarkdown)
library(htmltools)
library(plyr)
library(dplyr)
library(stringr)
library(highcharter)
library(DT)
library(reshape2)
library(plotly)
library(formattable)

##============ Sink warnings and errors to a file ==============
## Everything printed to stdout/stderr while rendering is collected in
## warnings_and_errors.txt, which the tool XML picks up from the job working
## directory as the "Warnings and Errors" output.
##==============================================================
log_con <- file("warnings_and_errors.txt", open = "wt")
sink(log_con)
sink(log_con, type = "message")

tryCatch(
  {
    ##=============STEP 1: handle command line arguments==========
    # column 1: the long flag name
    # column 2: the short flag alias. A SINGLE character string
    # column 3: argument mask
    #           0: no argument
    #           1: argument required
    #           2: argument is optional
    # column 4: data type to which the flag's argument shall be cast.
    #           possible values: logical, integer, double, complex, character.
    #-------------------------------------------------------------
    #++++++++++++++++++++ Best practice ++++++++++++++++++++++++++
    # 1. every short flag must be unique and must match the flag used in the
    #    command section of the XML file.
    # 2. long flag names can be any legal R variable name; values are read
    #    back as opt$<long flag name>.
    # 3. placeholder names may share a common string, but one name must not
    #    be part of another (e.g. "ECHO" and "ECHO_XXX" would clash).
    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    args_list <- list()
    ##------- 1. input data ---------------------
    args_list$ECHO <- c("echo", "e", "1", "character")
    args_list$READS <- c("reads", "r", "1", "character")
    ##--------2. output report and outputs --------------
    # "-o" as passed by the XML ("r" is already taken by the reads flag)
    args_list$REPORT_HTML <- c("report_html", "o", "1", "character")
    args_list$REPORT_DIR <- c("report_dir", "d", "1", "character")
    args_list$SINK_MESSAGE <- c("sink_message", "s", "1", "character")
    ##--------3. .Rmd templates in the tool directory ----------
    # "-p" as passed by the XML
    args_list$FASTQC_REPORT_RMD <- c("fastqc_report_rmd", "p", "1", "character")
    ##-----------------------------------------------------------
    opt <- getopt(do.call(rbind, unname(args_list)))

    # Fail early with a readable message instead of a NULL deep inside gsub().
    required_args <- c("echo", "reads", "report_html", "report_dir",
                       "fastqc_report_rmd")
    missing_args <- setdiff(required_args, names(opt))
    if (length(missing_args) > 0) {
      stop("missing command line argument(s): ",
           paste(missing_args, collapse = ", "), call. = FALSE)
    }

    ##=======STEP 2: create report directory======================
    # Galaxy may or may not have created it already.
    dir.create(opt$report_dir, showWarnings = FALSE, recursive = TRUE)

    ##=STEP 3: replace placeholders in .Rmd with argument values=
    # Literal (fixed) replacement: paths must not be read as regular
    # expressions or back-references.
    placeholders <- c(
      ECHO = opt$echo,
      READS = opt$reads,
      REPORT_DIR = opt$report_dir
    )
    template <- readLines(opt$fastqc_report_rmd)
    for (token in names(placeholders)) {
      template <- gsub(token, placeholders[[token]], template, fixed = TRUE)
    }
    # The filled-in copy is written to the job working directory.
    writeLines(template, "fastqc_report.Rmd")

    ##=============STEP 4: render .Rmd templates=================
    render("fastqc_report.Rmd", output_file = opt$report_html)
  },
  error = function(e) {
    # Record the failure in the log while the sinks are still active, then
    # re-raise so that Rscript exits with a non-zero status.
    message("Error: ", conditionMessage(e))
    stop(e)
  },
  finally = {
    # Always restore stdout/stderr and flush the log, even after an error.
    sink(type = "message")
    sink()
    close(log_con)
  }
)
##=========== End of sinking output=============================
# fastqc_report_render_ori.R
#
# Command-line driver for the FastQC report:
#   1. parse the command-line flags with getopt,
#   2. substitute the placeholder tokens in the .Rmd template with the
#      flag values,
#   3. render the filled-in template with rmarkdown,
#   4. copy the HTML report and the working-directory contents to the
#      requested output locations.

##======= Handle arguments from command line ========
# Route R error messages to stderr and exit with a non-zero status.
options(
  show.error.messages = FALSE,
  error = function() {
    cat(geterrmessage(), file = stderr())
    quit(save = "no", status = 1, runLast = FALSE)
  }
)

# we need that to not crash galaxy with an UTF8 error on German LC settings.
loc = Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")

# suppress warnings
options(warn = -1)

options(stringsAsFactors = FALSE, useFancyQuotes = FALSE)
args = commandArgs(trailingOnly = TRUE)

suppressPackageStartupMessages({
  library(getopt)
  library(tools)
})

# column 1: the long flag name
# column 2: the short flag alias. A SINGLE character string
# column 3: argument mask
#           0: no argument
#           1: argument required
#           2: argument is optional
# column 4: data type to which the flag's argument shall be cast.
#           possible values: logical, integer, double, complex, character.
spec_list = list()
spec_list$READS = c('reads', 'r', '1', 'character')
spec_list$ECHO = c('echo', 'e', '1', 'character')
spec_list$FASTQC_TPL = c('fastqc_tpl', 'p', '1', 'character')
spec_list$REPORT = c('report', 'o', '1', 'character')
spec_list$REPORT_OUTPUT_DIR = c('report_output_dir', 'd', '1', 'character')

# One row per flag, four character columns: the layout getopt() expects.
spec = do.call(rbind, spec_list)

opt = getopt(spec)
# arguments are accessed by long flag name (the first column in the spec matrix)
# NOT by element name in the spec_list
# example: opt$help, opt$expression_file

# Every flag is needed for the placeholder substitution below. Report a
# missing one by name instead of failing later on a length mismatch.
missing_flags = setdiff(spec[, 1], names(opt))
if (length(missing_flags) > 0) {
  stop("missing required argument(s): ", paste(missing_flags, collapse = ", "))
}
##====== End of arguments handling ==========


# Apply several gsub() substitutions in sequence.
#
# pattern:     character vector of regular expressions, applied in order.
# replacement: character vector of replacements, same length as `pattern`.
# x:           character vector to transform.
#
# Returns `x` after replacing pattern[i] with replacement[i] for each i, in
# order. Because the substitutions are sequential, a later pattern also
# matches text inserted by an earlier replacement.
mgsub = function(pattern, replacement, x) {
  if (length(pattern) != length(replacement)) {
    stop("pattern and replacement have to be the same in length")
  }

  result = x

  # seq_along() keeps the loop empty for zero-length input; errors from
  # gsub() propagate instead of silently turning `result` into a try-error.
  for (i in seq_along(pattern)) {
    result = gsub(pattern[i], replacement[i], x = result)
  }

  result
}


##====== replace variables in tpl file ======
# Order matters: 'REPORT_OUTPUT_DIR' must be substituted before 'REPORT',
# otherwise the shorter token would clobber part of the longer one.
p = c(
  'READS',
  'ECHO',
  'FASTQC_TPL',
  'REPORT_OUTPUT_DIR',
  'REPORT'
)
r = c(
  opt$reads,
  opt$echo,
  opt$fastqc_tpl,
  opt$report_output_dir,
  opt$report
)

fastqc_report_tpl = mgsub(p, r, readLines(opt$fastqc_tpl))

##====== write replaced text into Rmd file ===
fileConn = file('fastqc_report.Rmd')
writeLines(fastqc_report_tpl, con = fileConn)
close(fileConn)

##====== render Rmd files ====================
rmarkdown::render('fastqc_report.Rmd')
file.copy('fastqc_report.html', opt$report, recursive = TRUE)

# Copy everything in the working directory to the report output directory.
# system() is called directly because no package providing `%>%` is attached
# in this script; shQuote() keeps a path containing spaces or shell
# metacharacters intact.
system(paste('cp -r ./*', shQuote(opt$report_output_dir)))