changeset 14:2efa46ce2c4c draft

upgrade fastqc_report
author mingchen0919
date Wed, 18 Oct 2017 22:06:39 -0400
parents 9d3586701985
children d1d20f341632
files fastqc_report.Rmd fastqc_report.xml fastqc_report_ori.Rmd fastqc_report_render.R fastqc_report_render_ori.R
diffstat 5 files changed, 606 insertions(+), 449 deletions(-) [+]
line wrap: on
line diff
--- a/fastqc_report.Rmd	Mon Oct 16 21:33:31 2017 -0400
+++ b/fastqc_report.Rmd	Wed Oct 18 22:06:39 2017 -0400
@@ -1,383 +1,72 @@
 ---
-title: "Fastqc report: short reads quality evaluation"
-author: "Ming Chen"
-output: html_document
+title: 'HTML report title'
+output:
+    html_document:
+      number_sections: true
+      toc: true
+      theme: cosmo
+      highlight: tango
 ---
 
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo=ECHO, warning=FALSE, message=FALSE)
-library(plyr)
-library(stringr)
-library(dplyr)
-library(highcharter)
-library(DT)
-library(reshape2)
-library(plotly)
-library(formattable)
-library(htmltools)
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(
+  echo = ECHO
+)
 ```
 
 
-```{bash 'create output directory', echo=FALSE}
-# create extra files directory. very important!
-mkdir REPORT_OUTPUT_DIR
-```
+# Fastqc Analysis
 
-# Fastqc analysis
-```{bash 'copy data to working directory', echo=FALSE}
-# Copy uploaded data to the working directory
+* Copy fastq files to job working directory
+
+```{bash 'copy files'}
 for f in $(echo READS | sed "s/,/ /g")
 do
     cp $f ./
 done
 ```
 
+* Run fastqc
 
-```{bash 'run fastqc', echo=FALSE}
+```{bash 'run fastqc'}
 for r in $(ls *.dat)
 do
-    fastqc -o REPORT_OUTPUT_DIR $r > /dev/null 2>&1
+    fastqc -o REPORT_DIR $r > /dev/null 2>&1
 done
 ```
 
-## Fastqc html reports
+* Create links to original HTML reports
 
-Below are links to ***Fastqc*** original html reports.
 ```{r 'html report links'}
 html_report_list = list()
-html_files = list.files('REPORT_OUTPUT_DIR', pattern = '.*html')
+html_files = list.files('REPORT_DIR', pattern = '.*html')
 for (i in html_files) {
   html_report_list[[i]] = tags$li(tags$a(href=i, i))
 }
 tags$ul(html_report_list)
 ```
 
-
-## Parsing fastqc data
-
-```{bash echo=FALSE}
-##==== copy fastqc generated zip files from report output directory to job work directory ==
-cp -r REPORT_OUTPUT_DIR/*zip ./
-
-# create a file to store data file paths
-echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail
-echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score
-echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score
-echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content
-echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content
-echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content
-echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level
-echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution
-echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content
-
-for i in $(ls *.zip)
-do
-    BASE=$(echo $i | sed 's/\(.*\)\.zip/\1/g')
-    echo $BASE
-    unzip ${BASE}.zip > /dev/null 2>&1
-    
-    ##====== pass,warning,fail (WSF) =============
-    awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt
-    echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt
+# Fastqc output summary
 
-    ##====== per base quality scores (PBQS) ======
-    awk '/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt
-    echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt
-
-    ##====== per sequence quality scores (PSQS)
-    awk '/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt
-    echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt
-
-    ##====== Per sequence GC content (PSGC)
-    awk '/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt
-    echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt
-    
-    ##====== Per Base Sequence Content (PBSC)
-    awk '/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt
-    echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt
-    
-    ##====== Per Base N Content (PBNC)
-    awk '/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt
-    echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt
-    
-    ##====== Sequence Duplication Level (SDL)
-    awk '/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt
-    echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt
-    
-    ##====== Sequence Length Distribution (SLD)
-    awk '/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SLD.txt
-    echo "${BASE},${BASE}-SLD.txt" >> SLD_file_paths.txt
-    
-    ##====== Kmer Content ============
-    awk '/^>>Kmer Content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-KMC.txt
-    echo "${BASE},${BASE}-KMC.txt" >> KMC_file_paths.txt
-    
-done
-```
+* Define a function that extracts the data table of a single module from a FastQC data file
 
-
-## Evaluation Overview
-
-```{r 'overview'}
-PWF_file_paths = read.csv('PWF_file_paths.txt',
-                           header = TRUE, stringsAsFactors = FALSE)
-rm('PWF_df')
-for(i in 1:nrow(PWF_file_paths)) {
-  file_path = PWF_file_paths[i,2]
-  pwf_df = read.csv(file_path,
-                     sep='\t', header=FALSE, stringsAsFactors = FALSE)
-  colnames(pwf_df) = c('item', PWF_file_paths[i,1])
-  if (!exists('PWF_df')) {
-    PWF_df = pwf_df
-  } else {
-    PWF_df = cbind(PWF_df, pwf_df[,2,drop=FALSE])
-  }
-}
-```
-
-```{r}
-my_icon = c('ok', 'remove', 'star')
-names(my_icon) = c('pass', 'fail', 'warn')
-evaluate_list = list()
-for (i in colnames(PWF_df)[-1]) {
-  evaluate_list[[i]] = formatter(
-      "span", 
-      style = x ~ style("background-color" = ifelse(x =='pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')), 
-                        "color" = "white",
-                        "width" = "50px",
-                        "float" = "left",
-                        "padding-right" = "5px")
-    )
-}
-
-formattable(PWF_df, evaluate_list)
-```
-
-
-## Per Base Quality Scores
-
-```{r}
-PBQS_df = data.frame()
-PBQS_file_paths = read.csv('PBQS_file_paths.txt',
-                           header = TRUE, stringsAsFactors = FALSE)
-for(i in 1:nrow(PBQS_file_paths)) {
-  # file_path = paste0('REPORT_OUTPUT_DIR/', PBQS_file_paths[i,2])
-  file_path = PBQS_file_paths[i,2]
-  pbqs_df = read.csv(file_path,
-                     sep='\t', header=TRUE, stringsAsFactors = FALSE) %>%
-    mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]),
-           Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>%
-  (function (df) {
-    df1 = select(df, -Base2)
-    df2 = select(df, -Base1) %>% filter(Base2 != '')
-    colnames(df1) = c(colnames(df1)[1:7], 'Base')
-    colnames(df2) = c(colnames(df2)[1:7], 'Base')
-    res = rbind(df1, df2) %>% arrange(Base)
-    return(res)
-  })
-  pbqs_df$sample_id = rep(PBQS_file_paths[i,1], nrow(pbqs_df))
-  PBQS_df = rbind(PBQS_df, pbqs_df)
+```{r 'function definition'}
+# Extract the data table of one module from a FastQC data file.
+#
+# fastqc_data: path to the text file to read (read with readLines()).
+# module_name: regular expression identifying the module; it is matched
+#              against the module header lines (lines starting with '>>').
+#
+# Returns the lines between the module header and the next 'END_MODULE'
+# line parsed as a tab-separated table with a header row, or an empty
+# data.frame when the module has no data lines. Stops with an error when
+# the module, or its closing 'END_MODULE' line, cannot be found.
+extract_data_module = function(fastqc_data, module_name) {
+  f = readLines(fastqc_data)
+  end_module_lines = grep('END_MODULE', f)
+  # Only consider module header lines, and keep the first hit, so that a
+  # pattern that also occurs in data lines or in several module headers
+  # still resolves to a single start line.
+  header_lines = setdiff(grep('^>>', f), end_module_lines)
+  start_line = intersect(grep(module_name, f), header_lines)[1]
+  if (is.na(start_line)) {
+    stop('module not found in ', fastqc_data, ': ', module_name, call. = FALSE)
+  }
+  end_line = end_module_lines[which(end_module_lines > start_line)[1]]
+  if (is.na(end_line)) {
+    stop('no END_MODULE line after module: ', module_name, call. = FALSE)
+  }
+  # seq_len() yields an empty index for a module without data lines, where
+  # (start_line+1):(end_line-1) would count downwards and pick up the
+  # END_MODULE and header lines instead.
+  n_lines = end_line - start_line - 1
+  if (n_lines < 1) {
+    return(data.frame())
+  }
+  module_data = f[start_line + seq_len(n_lines)]
+  # Parse directly from memory instead of writing a 'temp.txt' file into
+  # the working directory.
+  read.csv(text = module_data, sep = '\t')
 }
 ```
 
-
-```{r}
-# datatable(PBQS_df)
-max_phred = max(PBQS_df$Mean) + 10
-hchart(PBQS_df, "line", hcaes(x = Base, y = Mean, group = sample_id)) %>%
-  hc_title(
-    text = "Per Base Quality Score"
-  ) %>%
-  hc_yAxis(
-    title = list(text = "Mean Base Quality Score"),
-    min = 0,
-    max = max_phred,
-    plotLines = list(
-      list(label = list(text = "Phred Score = 27"),
-           width = 2,
-           dashStyle = "dash",
-           color = "green",
-           value = 27),
-      list(label = list(text = "Phred Score = 20"),
-           width = 2,
-           color = "red",
-           value = 20)
-    )
-  ) %>% 
-  hc_exporting(enabled = TRUE)
-```
-
-
-## Per Base N Content
+## 
 
-```{r}
-PBNC_df = data.frame()
-PBNC_file_paths = read.csv('PBNC_file_paths.txt',
-                           header = TRUE, stringsAsFactors = FALSE)
-for(i in 1:nrow(PBNC_file_paths)) {
-  # file_path = paste0('REPORT_OUTPUT_DIR/', PBNC_file_paths[i,2])
-  file_path = PBNC_file_paths[i,2]
-  pbnc_df = read.csv(file_path,
-                     sep='\t', header=TRUE, stringsAsFactors = FALSE) %>%
-    mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]),
-           Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>%
-  (function (df) {
-    df1 = select(df, -Base2)
-    df2 = select(df, -Base1) %>% filter(Base2 != '')
-    colnames(df1) = c(colnames(df1)[1:2], 'Base')
-    colnames(df2) = c(colnames(df2)[1:2], 'Base')
-    res = rbind(df1, df2) %>% arrange(Base)
-    return(res)
-  })
-  pbnc_df$sample_id = rep(PBNC_file_paths[i,1], nrow(pbnc_df))
-  PBNC_df = rbind(PBNC_df, pbnc_df)
-}
-```
-
+# Session Info
 
-```{r}
-PBNC_df$N.Count = PBNC_df$N.Count * 100
-max_phred = max(PBNC_df$N.Count) + 5
-hchart(PBNC_df, "line", hcaes(x = as.character(Base), y = N.Count, group = sample_id)) %>%
-  hc_title(
-    text = "Per Base N Content"
-  ) %>%
-  hc_xAxis(
-    title = list(text = "Base Position")
-  ) %>%
-  hc_yAxis(
-    title = list(text = "N %"),
-    plotLines = list(
-      list(label = list(text = "N = 5%"),
-           width = 2,
-           dashStyle = "dash",
-           color = "red",
-           value = 5)
-    )
-  ) %>% 
-  hc_exporting(enabled = TRUE)
-```
-
-
-
-
-## Per Sequence Quality Scores
-
-```{r}
-PSQS_df = data.frame()
-PSQS_file_paths = read.csv('PSQS_file_paths.txt', 
-                           header = TRUE, stringsAsFactors = FALSE)
-for(i in 1:nrow(PSQS_file_paths)) {
-  # file_path = paste0('REPORT_OUTPUT_DIR/', PSQS_file_paths[i,2])
-  file_path = PSQS_file_paths[i,2]
-  psqs_df = read.csv(file_path,
-                     sep='\t', header=TRUE, stringsAsFactors = FALSE) 
-  psqs_df$sample_id = rep(PSQS_file_paths[i,1], nrow(psqs_df))
-  PSQS_df = rbind(PSQS_df, psqs_df)
-}
+```{r 'session info'}
+sessionInfo()
 ```
 
-
-```{r}
-max_phred = max(PSQS_df$X.Quality) + 5
-hchart(PSQS_df, "line", hcaes(x = X.Quality, y = Count, group = sample_id)) %>%
-  hc_title(
-    text = "Per Sequence Quality Score"
-  ) %>%
-  hc_xAxis(
-    title = list(text = "Mean Sequence Quality Score"),
-    min = 0,
-    max = max_phred,
-    plotLines = list(
-      list(label = list(text = "Phred Score = 27"),
-           width = 2,
-           dashStyle = "dash",
-           color = "green",
-           value = 27),
-      list(label = list(text = "Phred Score = 20"),
-           width = 2,
-           color = "red",
-           value = 20)
-    )
-  ) %>% 
-  hc_exporting(enabled = TRUE)
-```
-
-
-## Per Sequence GC Content
-
-
-```{r}
-PSGC_df = data.frame()
-PSGC_file_paths = read.csv('PSGC_file_paths.txt', 
-                           header = TRUE, stringsAsFactors = FALSE)
-for(i in 1:nrow(PSGC_file_paths)) {
-  # file_path = paste0('REPORT_OUTPUT_DIR/', PSGC_file_paths[i,2])
-  file_path = PSGC_file_paths[i,2]
-  psgc_df = read.csv(file_path,
-                     sep='\t', header=TRUE, stringsAsFactors = FALSE) 
-  psgc_df$sample_id = rep(PSGC_file_paths[i,1], nrow(psgc_df))
-  PSGC_df = rbind(PSGC_df, psgc_df)
-}
-```
-
-
-```{r}
-max_phred = max(PSGC_df$Count) + 5
-hchart(PSGC_df, "line", hcaes(x = X.GC.Content, y = Count, group = sample_id)) %>%
-  hc_title(
-    text = "Per Sequence GC Content"
-  ) %>%
-  hc_xAxis(
-    title = list(text = "% GC")
-  ) %>%
-  hc_exporting(enabled = TRUE)
-```
-
-
-## Per Base Sequence Content
-
-```{r}
-PBSC_df = data.frame()
-PBSC_file_paths = read.csv('PBSC_file_paths.txt',
-                           header = TRUE, stringsAsFactors = FALSE)
-for(i in 1:nrow(PBSC_file_paths)) {
-  # file_path = paste0('REPORT_OUTPUT_DIR/', PBSC_file_paths[i,2])
-  file_path = PBSC_file_paths[i,2]
-  pbsc_df = read.csv(file_path,
-                     sep='\t', header=TRUE, stringsAsFactors = FALSE) %>%
-    mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]),
-           Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>%
-  (function (df) {
-    df1 = select(df, -Base2)
-    df2 = select(df, -Base1) %>% filter(Base2 != '')
-    colnames(df1) = c(colnames(df1)[1:5], 'Base')
-    colnames(df2) = c(colnames(df2)[1:5], 'Base')
-    res = rbind(df1, df2) %>% arrange(Base)
-    return(res)
-  })
-  pbsc_df$sample_id = rep(PBSC_file_paths[i,1], nrow(pbsc_df))
-  PBSC_df = rbind(PBSC_df, pbsc_df)
-}
-```
-
-
-```{r out.width="100%"}
-PBSC_df_2 = select(PBSC_df, -X.Base) %>%
-  melt(id = c('Base', 'sample_id'), value.name = 'base_percentage')
-p = ggplot(data = PBSC_df_2, aes(x = Base, y = base_percentage, group = variable, color = variable)) +
-  geom_line() +
-  facet_wrap(~ sample_id)
-ggplotly(p)
-```
-
-
-## References
-
-* Andrews, Simon. "FastQC: a quality control tool for high throughput sequence data." (2010): 175-176.
-* Goecks, Jeremy, Anton Nekrutenko, and James Taylor. "Galaxy: a comprehensive approach for supporting accessible, reproducible, and transparent computational research in the life sciences." Genome biology 11.8 (2010): R86.
-* Afgan, Enis, et al. "The Galaxy platform for accessible, reproducible and collaborative biomedical analyses: 2016 update." Nucleic acids research (2016): gkw343.
-* Highcharts. https://www.highcharts.com/. (access by May 26, 2017).
-* R Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.
-* Joshua Kunst (2017). highcharter: A Wrapper for the 'Highcharts' Library. R package version 0.5.0. https://CRAN.R-project.org/package=highcharter
-* Carson Sievert, Chris Parmer, Toby Hocking, Scott Chamberlain, Karthik Ram, Marianne Corvellec and Pedro Despouy (2017). plotly: Create Interactive Web Graphics via 'plotly.js'. R package version 4.6.0. https://CRAN.R-project.org/package=plotly
--- a/fastqc_report.xml	Mon Oct 16 21:33:31 2017 -0400
+++ b/fastqc_report.xml	Wed Oct 18 22:06:39 2017 -0400
@@ -1,4 +1,4 @@
-<tool id="fastqc_report" name="Fastqc report" version="1.0.1">
+<tool id="fastqc_report" name="Fastqc report" version="2.0.0">
     <description>
         Implements FastQC analysis and display results in R Markdown html.
     </description>
@@ -6,7 +6,7 @@
         <requirement type="package" version="1.15.0.6-0">pandoc</requirement>
         <requirement type="package" version="1.14.1">bioconductor-deseq2</requirement>
         <requirement type="package" version="1.20.0">r-getopt</requirement>
-        <requirement type="package" version="1.2">r-rmarkdown</requirement>
+        <requirement type="package" version="1.3">r-rmarkdown</requirement>
         <requirement type="package" version="1.8.4">r-plyr</requirement>
         <requirement type="package" version="1.1.0">r-stringr</requirement>
         <requirement type="package" version="0.5.0">r-highcharter</requirement>
@@ -18,38 +18,35 @@
         <requirement type="package" version="0.11.5">fastqc</requirement>
     </requirements>
     <stdio>
-        <regex match="Execution halted"
-               source="both"
-               level="fatal"
-               description="Execution halted." />
-        <regex match="Error in"
-               source="both"
-               level="fatal"
-               description="An undefined error occured, please check your intput carefully and contact your administrator." />
-        <regex match="Fatal error"
-               source="both"
-               level="fatal"
-               description="An undefined error occured, please check your intput carefully and contact your administrator." />
+        <!-- stderr is redirected to the warnings_and_errors.txt file. The "XXX" pattern is a placeholder that is not expected to match anything, so that the tool run is not interrupted during testing. -->
+        <regex match="XXX"
+               source="stderr"
+               level="warning"
+               description="Check the warnings_and_errors.txt file for more details."/>
     </stdio>
     <command>
         <![CDATA[
 
         Rscript '${__tool_directory__}/fastqc_report_render.R'
+            -e $echo
             -r $reads
-            -p '${__tool_directory__}/fastqc_report.Rmd'
+
 		    -o $report
 		    -d $report.files_path
+		    -s $sink_message
 
-		    -e $echo
-
+		    -p '${__tool_directory__}/fastqc_report.Rmd'
         ]]>
     </command>
     <inputs>
-        <param format="fastq,fastq.gz,fastq.bz2,bam,sam" name="reads" type="data" label="Short reads data from history" />
-        <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false" label="Display analysis code in report?" />
+        <param format="fastq,fastq.gz,fastq.bz2,bam,sam" name="reads" type="data"
+               label="Short reads data from history"/>
+        <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false"
+               label="Display analysis code in report?"/>
     </inputs>
     <outputs>
-        <data format="html" name="report" label="fastqc report" />
+        <data format="html" name="report" label="fastqc report"/>
+        <data format="txt" name="sink_message" label="Warnings and Errors" from_work_dir="warnings_and_errors.txt"/>
     </outputs>
     <citations>
         <citation type="bibtex">
@@ -62,7 +59,8 @@
         <citation type="bibtex">
             @article{allaire2016rmarkdown,
             title={rmarkdown: Dynamic Documents for R, 2016},
-            author={Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff and Wickham, Hadley and Atkins, Aron and Hyndman, Rob},
+            author={Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff
+            and Wickham, Hadley and Atkins, Aron and Hyndman, Rob},
             journal={R package version 0.9},
             volume={6},
             year={2016}
@@ -80,7 +78,8 @@
         <citation type="bibtex">
             @misc{plotly2017,
             title = {plotly: Create Interactive Web Graphics via 'plotly.js'},
-            author = {Carson Sievert and Chris Parmer and Toby Hocking and Scott Chamberlain and Karthik Ram and Marianne Corvellec and Pedro Despouy},
+            author = {Carson Sievert and Chris Parmer and Toby Hocking and Scott Chamberlain and Karthik Ram and
+            Marianne Corvellec and Pedro Despouy},
             year = {2017},
             note = {R package version 4.6.0},
             url = {https://CRAN.R-project.org/package=plotly},
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastqc_report_ori.Rmd	Wed Oct 18 22:06:39 2017 -0400
@@ -0,0 +1,381 @@
+---
+title: "Fastqc report: short reads quality evaluation"
+author: "Ming Chen"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo=ECHO, warning=FALSE, message=FALSE)
+library(plyr)
+library(stringr)
+library(dplyr)
+library(highcharter)
+library(DT)
+library(reshape2)
+library(plotly)
+library(formattable)
+library(htmltools)
+```
+
+
+```{bash 'create output directory', echo=FALSE}
+# create extra files directory. very important!
+mkdir REPORT_OUTPUT_DIR
+```
+
+# Fastqc analysis
+```{bash 'copy data to working directory', echo=FALSE}
+# Copy uploaded data to the working directory
+for f in $(echo READS | sed "s/,/ /g")
+do
+    cp $f ./
+done
+```
+
+
+```{bash 'run fastqc', echo=FALSE}
+for r in $(ls *.dat)
+do
+    fastqc -o REPORT_OUTPUT_DIR $r > /dev/null 2>&1
+done
+```
+
+## Fastqc html reports
+
+Below are links to ***Fastqc*** original html reports.
+```{r 'html report links'}
+html_report_list = list()
+html_files = list.files('REPORT_OUTPUT_DIR', pattern = '.*html')
+for (i in html_files) {
+  html_report_list[[i]] = tags$li(tags$a(href=i, i))
+}
+tags$ul(html_report_list)
+```
+
+
+## Parsing fastqc data
+
+```{bash echo=FALSE}
+##==== copy fastqc generated zip files from report output directory to job work directory ==
+cp -r REPORT_OUTPUT_DIR/*zip ./
+
+# create a file to store data file paths
+echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail
+echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score
+echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score
+echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content
+echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content
+echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content
+echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level
+echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution
+echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content
+
+for i in $(ls *.zip)
+do
+    BASE=$(echo $i | sed 's/\(.*\)\.zip/\1/g')
+    echo $BASE
+    unzip ${BASE}.zip > /dev/null 2>&1
+    
+    ##====== pass,warning,fail (WSF) =============
+    awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt
+    echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt
+
+    ##====== per base quality scores (PBQS) ======
+    awk '/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt
+    echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt
+
+    ##====== per sequence quality scores (PSQS)
+    awk '/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt
+    echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt
+
+    ##====== Per sequence GC content (PSGC)
+    awk '/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt
+    echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt
+    
+    ##====== Per Base Sequence Content (PBSC)
+    awk '/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt
+    echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt
+    
+    ##====== Per Base N Content (PBNC)
+    awk '/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt
+    echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt
+    
+    ##====== Sequence Duplication Level (SDL)
+    awk '/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt
+    echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt
+    
+    ##====== Sequence Length Distribution (SLD)
+    awk '/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SLD.txt
+    echo "${BASE},${BASE}-SLD.txt" >> SLD_file_paths.txt
+    
+    ##====== Kmer Content ============
+    awk '/^>>Kmer Content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-KMC.txt
+    echo "${BASE},${BASE}-KMC.txt" >> KMC_file_paths.txt
+    
+done
+```
+
+
+## Evaluation Overview
+
+```{r 'overview'}
+PWF_file_paths = read.csv('PWF_file_paths.txt',
+                           header = TRUE, stringsAsFactors = FALSE)
+rm('PWF_df')
+for(i in 1:nrow(PWF_file_paths)) {
+  file_path = PWF_file_paths[i,2]
+  pwf_df = read.csv(file_path,
+                     sep='\t', header=FALSE, stringsAsFactors = FALSE)
+  colnames(pwf_df) = c('item', PWF_file_paths[i,1])
+  if (!exists('PWF_df')) {
+    PWF_df = pwf_df
+  } else {
+    PWF_df = cbind(PWF_df, pwf_df[,2,drop=FALSE])
+  }
+}
+```
+
+```{r}
+my_icon = c('ok', 'remove', 'star')
+names(my_icon) = c('pass', 'fail', 'warn')
+evaluate_list = list()
+for (i in colnames(PWF_df)[-1]) {
+  evaluate_list[[i]] = formatter(
+      "span", 
+      style = x ~ style("background-color" = ifelse(x =='pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')), 
+                        "color" = "white",
+                        "width" = "50px",
+                        "float" = "left",
+                        "padding-right" = "5px")
+    )
+}
+
+formattable(PWF_df, evaluate_list)
+```
+
+
+## Per Base Quality Scores
+
+```{r}
+PBQS_df = data.frame()
+PBQS_file_paths = read.csv('PBQS_file_paths.txt',
+                           header = TRUE, stringsAsFactors = FALSE)
+for(i in 1:nrow(PBQS_file_paths)) {
+  # file_path = paste0('REPORT_OUTPUT_DIR/', PBQS_file_paths[i,2])
+  file_path = PBQS_file_paths[i,2]
+  pbqs_df = read.csv(file_path,
+                     sep='\t', header=TRUE, stringsAsFactors = FALSE) %>%
+    mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]),
+           Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>%
+  (function (df) {
+    df1 = select(df, -Base2)
+    df2 = select(df, -Base1) %>% filter(Base2 != '')
+    colnames(df1) = c(colnames(df1)[1:7], 'Base')
+    colnames(df2) = c(colnames(df2)[1:7], 'Base')
+    res = rbind(df1, df2) %>% arrange(Base)
+    return(res)
+  })
+  pbqs_df$sample_id = rep(PBQS_file_paths[i,1], nrow(pbqs_df))
+  PBQS_df = rbind(PBQS_df, pbqs_df)
+}
+```
+
+
+```{r}
+# datatable(PBQS_df)
+max_phred = max(PBQS_df$Mean) + 10
+hchart(PBQS_df, "line", hcaes(x = Base, y = Mean, group = sample_id)) %>%
+  hc_title(
+    text = "Per Base Quality Score"
+  ) %>%
+  hc_yAxis(
+    title = list(text = "Mean Base Quality Score"),
+    min = 0,
+    max = max_phred,
+    plotLines = list(
+      list(label = list(text = "Phred Score = 27"),
+           width = 2,
+           dashStyle = "dash",
+           color = "green",
+           value = 27),
+      list(label = list(text = "Phred Score = 20"),
+           width = 2,
+           color = "red",
+           value = 20)
+    )
+  ) %>% 
+  hc_exporting(enabled = TRUE)
+```
+
+
+## Per Base N Content
+
+```{r}
+PBNC_df = data.frame()
+PBNC_file_paths = read.csv('PBNC_file_paths.txt',
+                           header = TRUE, stringsAsFactors = FALSE)
+for(i in 1:nrow(PBNC_file_paths)) {
+  # file_path = paste0('REPORT_OUTPUT_DIR/', PBNC_file_paths[i,2])
+  file_path = PBNC_file_paths[i,2]
+  pbnc_df = read.csv(file_path,
+                     sep='\t', header=TRUE, stringsAsFactors = FALSE) %>%
+    mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]),
+           Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>%
+  (function (df) {
+    df1 = select(df, -Base2)
+    df2 = select(df, -Base1) %>% filter(Base2 != '')
+    colnames(df1) = c(colnames(df1)[1:2], 'Base')
+    colnames(df2) = c(colnames(df2)[1:2], 'Base')
+    res = rbind(df1, df2) %>% arrange(Base)
+    return(res)
+  })
+  pbnc_df$sample_id = rep(PBNC_file_paths[i,1], nrow(pbnc_df))
+  PBNC_df = rbind(PBNC_df, pbnc_df)
+}
+```
+
+
+```{r}
+PBNC_df$N.Count = PBNC_df$N.Count * 100
+max_phred = max(PBNC_df$N.Count) + 5
+hchart(PBNC_df, "line", hcaes(x = as.character(Base), y = N.Count, group = sample_id)) %>%
+  hc_title(
+    text = "Per Base N Content"
+  ) %>%
+  hc_xAxis(
+    title = list(text = "Base Position")
+  ) %>%
+  hc_yAxis(
+    title = list(text = "N %"),
+    plotLines = list(
+      list(label = list(text = "N = 5%"),
+           width = 2,
+           dashStyle = "dash",
+           color = "red",
+           value = 5)
+    )
+  ) %>% 
+  hc_exporting(enabled = TRUE)
+```
+
+
+
+
+## Per Sequence Quality Scores
+
+```{r}
+PSQS_df = data.frame()
+PSQS_file_paths = read.csv('PSQS_file_paths.txt', 
+                           header = TRUE, stringsAsFactors = FALSE)
+for(i in 1:nrow(PSQS_file_paths)) {
+  # file_path = paste0('REPORT_OUTPUT_DIR/', PSQS_file_paths[i,2])
+  file_path = PSQS_file_paths[i,2]
+  psqs_df = read.csv(file_path,
+                     sep='\t', header=TRUE, stringsAsFactors = FALSE) 
+  psqs_df$sample_id = rep(PSQS_file_paths[i,1], nrow(psqs_df))
+  PSQS_df = rbind(PSQS_df, psqs_df)
+}
+```
+
+
+```{r}
+max_phred = max(PSQS_df$X.Quality) + 5
+hchart(PSQS_df, "line", hcaes(x = X.Quality, y = Count, group = sample_id)) %>%
+  hc_title(
+    text = "Per Sequence Quality Score"
+  ) %>%
+  hc_xAxis(
+    title = list(text = "Mean Sequence Quality Score"),
+    min = 0,
+    max = max_phred,
+    plotLines = list(
+      list(label = list(text = "Phred Score = 27"),
+           width = 2,
+           dashStyle = "dash",
+           color = "green",
+           value = 27),
+      list(label = list(text = "Phred Score = 20"),
+           width = 2,
+           color = "red",
+           value = 20)
+    )
+  ) %>% 
+  hc_exporting(enabled = TRUE)
+```
+
+
+## Per Sequence GC Content
+
+
+```{r}
+PSGC_df = data.frame()
+PSGC_file_paths = read.csv('PSGC_file_paths.txt', 
+                           header = TRUE, stringsAsFactors = FALSE)
+for(i in 1:nrow(PSGC_file_paths)) {
+  # file_path = paste0('REPORT_OUTPUT_DIR/', PSGC_file_paths[i,2])
+  file_path = PSGC_file_paths[i,2]
+  psgc_df = read.csv(file_path,
+                     sep='\t', header=TRUE, stringsAsFactors = FALSE) 
+  psgc_df$sample_id = rep(PSGC_file_paths[i,1], nrow(psgc_df))
+  PSGC_df = rbind(PSGC_df, psgc_df)
+}
+```
+
+
+```{r}
+max_phred = max(PSGC_df$Count) + 5
+hchart(PSGC_df, "line", hcaes(x = X.GC.Content, y = Count, group = sample_id)) %>%
+  hc_title(
+    text = "Per Sequence GC Content"
+  ) %>%
+  hc_xAxis(
+    title = list(text = "% GC")
+  ) %>%
+  hc_exporting(enabled = TRUE)
+```
+
+
+## Per Base Sequence Content
+
+```{r}
+PBSC_df = data.frame()
+PBSC_file_paths = read.csv('PBSC_file_paths.txt',
+                           header = TRUE, stringsAsFactors = FALSE)
+for(i in 1:nrow(PBSC_file_paths)) {
+  # file_path = paste0('REPORT_OUTPUT_DIR/', PBSC_file_paths[i,2])
+  file_path = PBSC_file_paths[i,2]
+  pbsc_df = read.csv(file_path,
+                     sep='\t', header=TRUE, stringsAsFactors = FALSE) %>%
+    mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]),
+           Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>%
+  (function (df) {
+    df1 = select(df, -Base2)
+    df2 = select(df, -Base1) %>% filter(Base2 != '')
+    colnames(df1) = c(colnames(df1)[1:5], 'Base')
+    colnames(df2) = c(colnames(df2)[1:5], 'Base')
+    res = rbind(df1, df2) %>% arrange(Base)
+    return(res)
+  })
+  pbsc_df$sample_id = rep(PBSC_file_paths[i,1], nrow(pbsc_df))
+  PBSC_df = rbind(PBSC_df, pbsc_df)
+}
+```
+
+
+```{r out.width="100%"}
+PBSC_df_2 = select(PBSC_df, -X.Base) %>%
+  melt(id = c('Base', 'sample_id'), value.name = 'base_percentage')
+p = ggplot(data = PBSC_df_2, aes(x = Base, y = base_percentage, group = variable, color = variable)) +
+  geom_line() +
+  facet_wrap(~ sample_id)
+ggplotly(p)
+```
+
+
+# Session Info
+
+```{r 'session info'}
+sessionInfo()
+```
+
+
--- a/fastqc_report_render.R	Mon Oct 16 21:33:31 2017 -0400
+++ b/fastqc_report_render.R	Wed Oct 18 22:06:39 2017 -0400
@@ -1,87 +1,88 @@
-##======= Handle arguments from command line ========
-# setup R error handline to go to stderr
-options(show.error.messages=FALSE,
-        error=function(){
-          cat(geterrmessage(), file=stderr())
-          quit("no", 1, F)
-        })
-
-# we need that to not crash galaxy with an UTF8 error on German LC settings.
-loc = Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
-
-# suppress warning
-options(warn = -1)
-
-options(stringsAsFactors=FALSE, useFancyQuotes=FALSE)
-args = commandArgs(trailingOnly=TRUE)
+library(getopt)
+library(rmarkdown)
+library(htmltools)
+library(plyr)
+library(dplyr)
+library(stringr)
+library(highcharter)
+library(DT)
+library(reshape2)
+library(plotly)
+library(formattable)
 
-suppressPackageStartupMessages({
-  library(getopt)
-  library(tools)
-})
-
-# column 1: the long flag name
-# column 2: the short flag alias. A SINGLE character string
-# column 3: argument mask
-#           0: no argument
-#           1: argument required
-#           2: argument is optional
-# column 4: date type to which the flag's argument shall be cast.
-#           possible values: logical, integer, double, complex, character.
-spec_list=list()
-spec_list$READS = c('reads', 'r', '1', 'character')
-spec_list$ECHO = c('echo', 'e', '1', 'character')
-spec_list$FASTQC_TPL = c('fastqc_tpl', 'p', 1, 'character')
-spec_list$REPORT = c('report', 'o', '1', 'character')
-spec_list$REPORT_OUTPUT_DIR = c('report_output_dir', 'd', '1', 'character')
+##============ Sink warnings and errors to a file ==============
+## From here on, regular output and messages/warnings/errors are written
+## to warnings_and_errors.txt instead of stdout/stderr.
+zz = file('warnings_and_errors.txt')
+sink(zz)  # divert regular output
+sink(zz, type = 'message')  # divert messages, warnings and errors
+  ##---------below is the code for rendering .Rmd templates-----
+  
+  ##=============STEP 1: handle command line arguments==========
+  ##
+  ##============================================================
+  # column 1: the long flag name
+  # column 2: the short flag alias. A SINGLE character string
+  # column 3: argument mask
+  #           0: no argument
+  #           1: argument required
+  #           2: argument is optional
+  # column 4: data type to which the flag's argument shall be cast.
+  #           possible values: logical, integer, double, complex, character.
+  #-------------------------------------------------------------
+  #++++++++++++++++++++ Best practice ++++++++++++++++++++++++++
+  # 1. short flag alias should match the flag in the command section in the XML file.
+  # 2. long flag name can be any legal R variable names
+  # 3. two names in args_list can have common string but one name should not be a part of another name.
+  #    for example, one name is "ECHO", if another name is "ECHO_XXX", it will cause problems.
+  #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+  args_list=list()
+  ##------- 1. input data ---------------------
+  args_list$ECHO = c('echo', 'e', '1', 'character')
+  args_list$READS = c('reads', 'r', '1', 'character')
+  ##--------2. output report and outputs --------------
+  ## short flag 'o': 'r' already belongs to reads, and getopt() stops on redundant short flags
+  args_list$REPORT_HTML = c('report_html', 'o', '1', 'character')
+  args_list$REPORT_DIR = c('report_dir', 'd', '1', 'character')
+  args_list$SINK_MESSAGE = c('sink_message', 's', '1', 'character')
+  ##--------3. .Rmd templates in the tool directory ----------
+  args_list$FASTQC_REPORT_RMD = c('fastqc_report_rmd', 't', '1', 'character')
+  opt = getopt(t(as.data.frame(args_list)))  # NOTE(review): the tool XML must pass -o for the HTML report — confirm
 
 
-spec = t(as.data.frame(spec_list))
-
-opt = getopt(spec)
-# arguments are accessed by long flag name (the first column in the spec matrix)
-#                        NOT by element name in the spec_list
-# example: opt$help, opt$expression_file
-##====== End of arguments handling ==========
-
+  
+  ##=======STEP 2: create report directory (optional)==========
+  ## Creates the directory passed as report_dir (-d). dir.create() only
+  ## warns, and does not stop, if the directory already exists.
+  dir.create(opt$report_dir)
+  
+  ##=STEP 3: replace placeholders in .Rmd with argument values=
+  ## Each placeholder is replaced in turn in the template text; the result
+  ## is written to fastqc_report.Rmd in the working directory.
+  ## REPORT_DIR is filled from report_dir: no output_dir option is defined,
+  ## so opt$output_dir was NULL and gsub() rejected it as a replacement.
+  ##===========================================================
+  placeholder_values = list(
+    ECHO = opt$echo,
+    READS = opt$reads,
+    REPORT_DIR = opt$report_dir
+  )
+  rmd_lines = readLines(opt$fastqc_report_rmd)
+  # substitution order is fixed: ECHO, then READS, then REPORT_DIR
+  for (placeholder in names(placeholder_values)) {
+    rmd_lines = gsub(placeholder, placeholder_values[[placeholder]], rmd_lines)
+  }
+  fileConn = file('fastqc_report.Rmd')
+  writeLines(rmd_lines, con=fileConn)
+  close(fileConn)
+  
 
-mgsub = function(pattern, replacement, x) {
-  if(length(pattern) != length(replacement) ) {
-    stop("pattern and replacement have to be the same in length")
-  }
-  
-  result = x
-  
-  for(i in 1:length(pattern)) {
-    result = try( gsub(pattern[i], replacement[i], x = result) )
-  }
-  
-  result
-}
+  ##=============STEP 4: render .Rmd templates=================
+  ## Renders the filled-in fastqc_report.Rmd; output_file sends the result
+  ## to the path passed as report_html.
+  render('fastqc_report.Rmd', output_file = opt$report_html)
 
 
-##====== replace variables in tpl file ======
-p = c('READS', 
-      'ECHO',
-      'FASTQC_TPL',
-      'REPORT_OUTPUT_DIR',
-      'REPORT')
-r = c(opt$reads,
-      opt$echo,
-      opt$fastqc_tpl,
-      opt$report_output_dir,
-      opt$report)
-
-fastqc_report_tpl = mgsub(p, r, readLines(opt$fastqc_tpl))
-
-##====== write replaced text into Rmd file ===
-fileConn = file('fastqc_report.Rmd')
-writeLines(fastqc_report_tpl, con=fileConn)
-close(fileConn)
-
-##====== render Rmd files ====================
-rmarkdown::render('fastqc_report.Rmd')
-file.copy('fastqc_report.html', opt$report, recursive=TRUE)
-paste0('cp -r ./* ', opt$report_output_dir) %>%
-  system()
-
+  ##--------end of code rendering .Rmd templates----------------
+sink(type = 'message')  # messages back to stderr first: both sinks share zz
+sink()                  # then regular output back to stdout
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastqc_report_render_ori.R	Wed Oct 18 22:06:39 2017 -0400
@@ -0,0 +1,87 @@
+##======= Handle arguments from command line ========
+# On error, print the message to stderr and exit with status 1.
+options(show.error.messages = FALSE)
+options(error = function() {
+    cat(geterrmessage(), file = stderr())
+    quit(save = "no", status = 1, runLast = FALSE)
+})
+
+# English messages: avoids crashing Galaxy with a UTF-8 error under German LC settings.
+loc = Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+
+# suppress all warnings (a negative warn value makes R ignore them)
+options(warn = - 1)
+
+options(stringsAsFactors = FALSE, useFancyQuotes = FALSE)
+args = commandArgs(trailingOnly = TRUE)  # NOTE(review): never read again in this script
+
+suppressPackageStartupMessages({  # attach the packages without their startup messages
+    library(getopt)
+    library(tools)
+})
+
+# column 1: the long flag name
+# column 2: the short flag alias. A SINGLE character string
+# column 3: argument mask
+#           0: no argument
+#           1: argument required
+#           2: argument is optional
+# column 4: data type to which the flag's argument shall be cast.
+#           possible values: logical, integer, double, complex, character.
+# One entry per option: long flag, short flag, argument mask, value type.
+spec_list = list(
+    READS = c('reads', 'r', '1', 'character'),
+    ECHO = c('echo', 'e', '1', 'character'),
+    FASTQC_TPL = c('fastqc_tpl', 'p', '1', 'character'),
+    REPORT = c('report', 'o', '1', 'character'),
+    REPORT_OUTPUT_DIR = c('report_output_dir', 'd', '1', 'character')
+)
+# getopt() takes the spec as a matrix with one row per option.
+spec = t(as.data.frame(spec_list))
+opt = getopt(spec)
+# arguments are accessed by long flag name (the first column in the spec matrix)
+#                        NOT by element name in the spec_list
+# example: opt$help, opt$expression_file
+##====== End of arguments handling ==========
+
+
+# Apply gsub(pattern[i], replacement[i]) to x for each i in order; returns the
+# edited x. Lengths must match. gsub() errors propagate (no try() masking).
+mgsub = function(pattern, replacement, x) {
+    if (length(pattern) != length(replacement)) {
+        stop("pattern and replacement have to be the same in length")
+    }
+    result = x
+    # seq_along(): no iterations on empty input (1:0 would index NA).
+    for (i in seq_along(pattern)) {
+        result = gsub(pattern[i], replacement[i], x = result)
+    }
+    result
+}
+
+
+##====== replace variables in tpl file ======
+# Placeholders and their values, in matching order. REPORT_OUTPUT_DIR must
+# come before REPORT: mgsub() substitutes in sequence, and replacing REPORT
+# first would corrupt the longer placeholder.
+p = c('READS', 'ECHO', 'FASTQC_TPL', 'REPORT_OUTPUT_DIR', 'REPORT')
+# c() drops NULL, so a missing argument shortens r and mgsub() stops.
+r = c(opt$reads, opt$echo, opt$fastqc_tpl, opt$report_output_dir, opt$report)
+
+fastqc_report_tpl = mgsub(p, r, readLines(opt$fastqc_tpl))
+
+##====== write replaced text into Rmd file ===
+fileConn = file('fastqc_report.Rmd')
+writeLines(fastqc_report_tpl, con = fileConn)
+close(fileConn)
+
+##====== render Rmd files ====================
+# Render in the working directory, copy the HTML to the report path, then
+# copy the working directory's contents into the report output directory.
+rmarkdown::render('fastqc_report.Rmd')
+file.copy('fastqc_report.html', opt$report, recursive = TRUE)
+# Called directly rather than through %>%: this script attaches only getopt
+# and tools, so the pipe exists only if rendering happens to attach a package
+# that provides it.
+system(paste0('cp -r ./* ', opt$report_output_dir))
+