Mercurial > repos > mingchen0919 > rmarkdown_fastqc_site

--- a/01_evaluation_overview.Rmd	Tue Aug 15 15:50:21 2017 -0400
+++ b/01_evaluation_overview.Rmd	Tue Nov 07 16:52:24 2017 -0500
@@ -1,123 +1,124 @@
 ---
-title: "Evaluation Overview"
-output: html_document
+title: 'Short reads evaluation with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)'
+output:
+    html_document:
+      number_sections: true
+      toc: true
+      theme: cosmo
+      highlight: tango
 ---

 ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
-knitr::opts_chunk$set(echo = ECHO)
-```
-
-```{bash 'copy data from datasets directory to working directory', echo=FALSE}
-# Copy uploaded data to the working directory
-for f in $(echo READS | sed "s/,/ /g")
-do
-    cp $f ./
-done
-```
-
-```{bash 'run fastqc', echo=FALSE}
-# run fastqc and place outputs into the report directory
-for r in $(ls *.dat)
-do
-    fastqc -o REPORT_OUTPUT_DIR $r > /dev/null 2>&1
-done
-```
-
-```{bash 'parse fastqc results', echo=FALSE}
-##==== copy fastqc generated zip files from report output directory to job work directory ==
-cp -r REPORT_OUTPUT_DIR/*zip ./
-
-# create a file to store data file paths
-echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail
-echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score
-echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score
-echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content
-echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content
-echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content
-echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level
-echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution
-echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content
-
-for i in $(ls *.zip)
-do
-    BASE=$(echo $i | sed 's/\(.*\)\.zip/\1/g')
-    echo $BASE
-    unzip ${BASE}.zip > /dev/null 2>&1
-
-    ##====== pass,warning,fail (WSF) =============
-    awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt
-    echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt
-
-    ##====== per base quality scores (PBQS) ======
-    awk '/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt
-    echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt
-
-    ##====== per sequence quality scores (PSQS)
-    awk '/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt
-    echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt
-
-    ##====== Per sequence GC content (PSGC)
-    awk '/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt
-    echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt
-
-    ##====== Per Base Sequence Content (PBSC)
-    awk '/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt
-    echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt
-
-    ##====== Per Base N Content (PBNC)
-    awk '/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt
-    echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt
-
-    ##====== Sequence Duplication Level (SDL)
-    awk '/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt
-    echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt
-
-    ##====== Sequence Length Distribution (SLD)
-    awk '/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SLD.txt
-    echo "${BASE},${BASE}-SLD.txt" >> SLD_file_paths.txt
-
-    ##====== Kmer Content ============
-    awk '/^>>Kmer Content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-KMC.txt
-    echo "${BASE},${BASE}-KMC.txt" >> KMC_file_paths.txt
-
-done
+knitr::opts_chunk$set(
+  echo = ECHO,
+  error = TRUE
+)
 ```


-## Evaluation Overview
+# Fastqc Evaluation
+
+## Evaluation of reads before trimming

-```{r 'overview'}
-PWF_file_paths = read.csv('PWF_file_paths.txt',
-                           header = TRUE, stringsAsFactors = FALSE)
-rm('PWF_df')
-for(i in 1:nrow(PWF_file_paths)) {
-  file_path = PWF_file_paths[i,2]
-  pwf_df = read.csv(file_path,
-                     sep='\t', header=FALSE, stringsAsFactors = FALSE)
-  colnames(pwf_df) = c('item', PWF_file_paths[i,1])
-  if (!exists('PWF_df')) {
-    PWF_df = pwf_df
-  } else {
-    PWF_df = cbind(PWF_df, pwf_df[,2,drop=FALSE])
-  }
+```{r}
+if ('READS_1' == 'None') {
+  stop("No pre-trimming reads provided!")
+} else {
+  ## run fastqc evaluation
+  fastqc_command = paste0('fastqc ') %>%
+  (function(x) {
+    ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
+  }) %>%
+  (function(x) {
+    ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
+  }) %>%
+  (function(x) {
+    paste0(x, '-o REPORT_DIR ')
+  })
+  fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1')
+  system(fastqc_command_reads_1, intern = TRUE)
+
+  # Original html report
+  reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1)
+  original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1))
+
+  unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
+  reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/')
+  # fastqc_data.txt
+  file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt')
+  fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1))
+  # summary.txt
+  file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt')
+  summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1))
+
+  tags$ul(
+    tags$li(original_html),
+    tags$li(fastqc_data),
+    tags$li(summary_data)
+  )
 }
 ```


+## Evaluation of reads after trimming
+
 ```{r}
-my_icon = c('ok', 'remove', 'star')
-names(my_icon) = c('pass', 'fail', 'warn')
-evaluate_list = list()
-for (i in colnames(PWF_df)[-1]) {
-  evaluate_list[[i]] = formatter(
-      "span",
-      style = x ~ style("background-color" = ifelse(x =='pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')),
-                        "color" = "white",
-                        "width" = "50px",
-                        "float" = "left",
-                        "padding-right" = "5px")
-    )
+if ('READS_2' == 'None') {
+  stop("No pre-trimming reads provided!")
+} else {
+  ## run fastqc evaluation
+  fastqc_command = paste0('fastqc ') %>%
+  (function(x) {
+    ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
+  }) %>%
+  (function(x) {
+    ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
+  }) %>%
+  (function(x) {
+    paste0(x, '-o REPORT_DIR ')
+  })
+  fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1')
+  system(fastqc_command_reads_2, intern = TRUE)
+
+  # Original html report
+  reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1)
+  original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2))
+
+  unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
+  reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/')
+  # fastqc_data.txt
+  file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt')
+  fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2))
+  # summary.txt
+  file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt')
+  summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2))
+
+  tags$ul(
+    tags$li(original_html),
+    tags$li(fastqc_data),
+    tags$li(summary_data)
+  )
 }
+```

-formattable(PWF_df, evaluate_list)
+
+
+# Fastqc output visualization
+
+## Overview
+
+```{r}
+reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1]
+reads_2_summary = read.csv('REPORT_DIR/reads_2_summary.txt', header = FALSE, sep = '\t')[, 1]
+combined_summary = cbind(reads_1_summary, reads_2_summary)
+names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)'))
+combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)'
+combined_summary[combined_summary == 'WARN'] = 'WARN (!)'
+knitr::kable(combined_summary)
+```
+
+# Session Info
+
+```{r 'session info'}
+sessionInfo()
 ```
\ No newline at end of file
--- a/02_fastqc_original_reports.Rmd	Tue Aug 15 15:50:21 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,20 +0,0 @@
----
-title: "FastQC original reports"
-output: html_document
----
-
-```{r 'FastQC original reports', include=FALSE, warning=FALSE, message=FALSE}
-knitr::opts_chunk$set(echo = ECHO)
-```
-
-
-Below are links to ***Fastqc*** original html reports.
-
-```{r 'html report links'}
-html_report_list = list()
-html_files = list.files('REPORT_OUTPUT_DIR', pattern = '.*html')
-for (i in html_files) {
-  html_report_list[[i]] = tags$li(tags$a(href=i, i))
-}
-tags$ul(html_report_list)
-```
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/02_per_base_sequence_quality.Rmd	Tue Nov 07 16:52:24 2017 -0500
@@ -0,0 +1,47 @@
+---
+title: 'Per base sequence quality'
+output:
+    html_document:
+      number_sections: true
+      toc: true
+      theme: cosmo
+      highlight: tango
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(
+  echo = ECHO,
+  error = TRUE
+)
+```
+
+### Per base sequence quality
+
+```{r 'per base sequence quality', fig.width=10}
+## reads 1
+pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality')
+pbsq_1$id = 1:length(pbsq_1$X.Base)
+
+melt_pbsq_1 = filter(melt(pbsq_1, id=c('X.Base', 'id')), variable == 'Mean')
+melt_pbsq_1$trim = 'before'
+
+
+## reads 2
+pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality')
+pbsq_2$id = 1:length(pbsq_2$X.Base)
+
+melt_pbsq_2 = filter(melt(pbsq_2, id=c('X.Base', 'id')), variable == 'Mean')
+melt_pbsq_2$trim = 'after'
+
+comb_pbsq = rbind(melt_pbsq_1, melt_pbsq_2)
+comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim)
+
+p = ggplot(data = comb_pbsq) +
+  geom_line(mapping = aes(x = id, y = value, group = variable, color = variable)) +
+  scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) +
+  facet_grid(. ~ trim) +
+  ylim(0, max(comb_pbsq$value) + 5) +
+  theme(axis.text.x = element_text(angle=45))
+ggplotly(p)
+
+```
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/03_per_tile_sequence_quality.Rmd	Tue Nov 07 16:52:24 2017 -0500
@@ -0,0 +1,48 @@
+---
+title: 'Per Tile Sequence Quality'
+output:
+    html_document:
+      number_sections: true
+      toc: true
+      theme: cosmo
+      highlight: tango
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(
+  echo = ECHO,
+  error = TRUE
+)
+```
+
+### Per tile sequence quality
+
+```{r 'per tile sequence quality', fig.width=10}
+## check if 'per tile sequence quality' module exits or not
+check_ptsq = grep('Per tile sequence quality', readLines('REPORT_DIR/reads_1_fastqc_data.txt'))
+if (length(check_ptsq) > 0) {
+    ## reads 1
+  ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality')
+  ptsq_1$trim = 'before'
+
+  ## reads 2
+  ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality')
+  ptsq_2$trim = 'after'
+
+  comb_ptsq = rbind(ptsq_1, ptsq_2)
+  comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim)
+  comb_ptsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base)
+
+  # convert integers to charaters
+  comb_ptsq$Tile = as.character(comb_ptsq$X.Tile)
+
+  p = ggplot(data = comb_ptsq, aes(x = Base, y = Tile, fill = Mean)) +
+    geom_raster() +
+    facet_grid(. ~ trim) +
+    xlab('Position in read (bp)') +
+    ylab('') +
+    theme(axis.text.x = element_text(angle=45))
+  ggplotly(p)
+} else {
+  print('No "per tile sequence quality" data')
+}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/04_per_sequence_quality_score.Rmd	Tue Nov 07 16:52:24 2017 -0500
@@ -0,0 +1,39 @@
+---
+title: 'Per sequence quality score'
+output:
+    html_document:
+      number_sections: true
+      toc: true
+      theme: cosmo
+      highlight: tango
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(
+  echo = ECHO,
+  error = TRUE
+)
+```
+
+### Per sequence quality score
+
+```{r 'Per sequence quality score', fig.width=10}
+## reads 1
+psqs_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence quality scores')
+psqs_1$trim = 'before'
+
+## reads 2
+psqs_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence quality scores')
+psqs_2$trim = 'after'
+
+comb_psqs = rbind(psqs_1, psqs_2)
+comb_psqs$trim = factor(levels = c('before', 'after'), comb_psqs$trim)
+
+p = ggplot(data = comb_psqs, aes(x = X.Quality, y = Count)) +
+  geom_line(color = 'red') +
+  facet_grid(. ~ trim) +
+  xlim(min(comb_psqs$X.Quality), max(comb_psqs$X.Quality)) +
+  xlab('Mean Sequence Qaulity (Phred Score)') +
+  ylab('')
+ggplotly(p)
+```
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/05_per_base_sequence_content.Rmd	Tue Nov 07 16:52:24 2017 -0500
@@ -0,0 +1,47 @@
+---
+title: 'Per base sequence content'
+output:
+    html_document:
+      number_sections: true
+      toc: true
+      theme: cosmo
+      highlight: tango
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(
+  echo = ECHO,
+  error = TRUE
+)
+```
+
+### Per base sequence content
+
+```{r 'Per base sequence content', fig.width=10}
+## reads 1
+pbsc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence content')
+pbsc_1$id = 1:length(pbsc_1$X.Base)
+
+melt_pbsc_1 = melt(pbsc_1, id=c('X.Base', 'id'))
+melt_pbsc_1$trim = 'before'
+
+
+## reads 2
+pbsc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence content')
+pbsc_2$id = 1:length(pbsc_2$X.Base)
+
+melt_pbsc_2 = melt(pbsc_2, id=c('X.Base', 'id'))
+melt_pbsc_2$trim = 'after'
+
+comb_pbsc = rbind(melt_pbsc_1, melt_pbsc_2)
+comb_pbsc$trim = factor(levels = c('before', 'after'), comb_pbsc$trim)
+
+p = ggplot(data = comb_pbsc, aes(x = id, y = value, color = variable)) +
+  geom_line() +
+  facet_grid(. ~ trim) +
+  xlim(min(comb_pbsc$id), max(comb_pbsc$id)) +
+  ylim(0, 100) +
+  xlab('Position in read (bp)') +
+  ylab('')
+ggplotly(p)
+```
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/06_per_sequence_gc_content.Rmd	Tue Nov 07 16:52:24 2017 -0500
@@ -0,0 +1,38 @@
+---
+title: 'Per sequence GC content'
+output:
+    html_document:
+      number_sections: true
+      toc: true
+      theme: cosmo
+      highlight: tango
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(
+  echo = ECHO,
+  error = TRUE
+)
+```
+
+### Per sequence GC content
+
+```{r 'Per sequence GC content', fig.width=10}
+## reads 1
+psGCc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence GC content')
+psGCc_1$trim = 'before'
+
+## reads 2
+psGCc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence GC content')
+psGCc_2$trim = 'after'
+
+comb_psGCc = rbind(psGCc_1, psGCc_2)
+comb_psGCc$trim = factor(levels = c('before', 'after'), comb_psGCc$trim)
+
+p = ggplot(data = comb_psGCc, aes(x = X.GC.Content, y = Count)) +
+  geom_line(color = 'red') +
+  facet_grid(. ~ trim) +
+  xlab('Mean Sequence Qaulity (Phred Score)') +
+  ylab('')
+ggplotly(p)
+```
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/07_per_base_n_content.Rmd	Tue Nov 07 16:52:24 2017 -0500
@@ -0,0 +1,43 @@
+---
+title: 'Per base N content'
+output:
+    html_document:
+      number_sections: true
+      toc: true
+      theme: cosmo
+      highlight: tango
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(
+  echo = ECHO,
+  error = TRUE
+)
+```
+
+### Per base N content
+
+```{r 'Per base N content', fig.width=10}
+## reads 1
+pbNc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base N content')
+pbNc_1$id = 1:length(pbNc_1$X.Base)
+pbNc_1$trim = 'before'
+
+## reads 2
+pbNc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base N content')
+pbNc_2$id = 1:length(pbNc_2$X.Base)
+pbNc_2$trim = 'after'
+
+comb_pbNc = rbind(pbNc_1, pbNc_2)
+comb_pbNc$trim = factor(levels = c('before', 'after'), comb_pbNc$trim)
+
+p = ggplot(data = comb_pbNc, aes(x = id, y = N.Count)) +
+  geom_line(color = 'red') +
+  scale_x_continuous(breaks = pbNc_2$id, labels = pbNc_2$X.Base) +
+  facet_grid(. ~ trim) +
+  ylim(0, 1) +
+  xlab('N-Count') +
+  ylab('') +
+  theme(axis.text.x = element_text(angle=45))
+ggplotly(p)
+```
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/08_sequence_length_distribution.Rmd	Tue Nov 07 16:52:24 2017 -0500
@@ -0,0 +1,42 @@
+---
+title: 'Sequence Length Distribution'
+output:
+    html_document:
+      number_sections: true
+      toc: true
+      theme: cosmo
+      highlight: tango
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(
+  echo = ECHO,
+  error = TRUE
+)
+```
+
+### Sequence Length Distribution
+
+```{r 'Sequence Length Distribution', fig.width=10}
+## reads 1
+sld_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Length Distribution')
+sld_1$id = 1:length(sld_1$X.Length)
+sld_1$trim = 'before'
+
+## reads 2
+sld_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Length Distribution')
+sld_2$id = 1:length(sld_2$X.Length)
+sld_2$trim = 'after'
+
+comb_sld = rbind(sld_1, sld_2)
+comb_sld$trim = factor(levels = c('before', 'after'), comb_sld$trim)
+
+p = ggplot(data = comb_sld, aes(x = id, y = Count)) +
+  geom_line(color = 'red') +
+  scale_x_continuous(breaks = sld_2$id, labels = sld_2$X.Length) +
+  facet_grid(. ~ trim) +
+  xlab('Sequence Length (bp)') +
+  ylab('') +
+  theme(axis.text.x = element_text(angle=45))
+ggplotly(p)
+```
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/09_sequence_duplication_levels.Rmd	Tue Nov 07 16:52:24 2017 -0500
@@ -0,0 +1,49 @@
+---
+title: 'Sequence Duplication Levels'
+output:
+    html_document:
+      number_sections: true
+      toc: true
+      theme: cosmo
+      highlight: tango
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(
+  echo = ECHO,
+  error = TRUE
+)
+```
+
+### Sequence Duplication Levels
+
+```{r 'Sequence Duplication Levels', fig.width=10}
+## reads 1
+sdl_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#')
+names(sdl_1) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total')
+sdl_1$id = 1:length(sdl_1$Duplication_Level)
+
+melt_sdl_1 = melt(sdl_1, id=c('Duplication_Level', 'id'))
+melt_sdl_1$trim = 'before'
+
+
+## reads 2
+sdl_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#')
+names(sdl_2) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total')
+sdl_2$id = 1:length(sdl_2$Duplication_Level)
+
+melt_sdl_2 = melt(sdl_2, id=c('Duplication_Level', 'id'))
+melt_sdl_2$trim = 'after'
+
+comb_sdl = rbind(melt_sdl_1, melt_sdl_2)
+comb_sdl$trim = factor(levels = c('before', 'after'), comb_sdl$trim)
+
+p = ggplot(data = comb_sdl, aes(x = id, y = value, color = variable)) +
+  geom_line() +
+  scale_x_continuous(breaks = sdl_2$id, labels = sdl_2$Duplication_Level) +
+  facet_grid(. ~ trim) +
+  xlab('Sequence Duplication Level') +
+  ylab('') +
+  theme(axis.text.x = element_text(angle=45))
+ggplotly(p)
+```
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/10_adapter_content.Rmd	Tue Nov 07 16:52:24 2017 -0500
@@ -0,0 +1,46 @@
+---
+title: 'Adapter Content'
+output:
+    html_document:
+      number_sections: true
+      toc: true
+      theme: cosmo
+      highlight: tango
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(
+  echo = ECHO,
+  error = TRUE
+)
+```
+
+### Adapter Content
+
+```{r 'Adapter Content', fig.width=10}
+## reads 1
+ac_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Adapter Content')
+ac_1$id = 1:length(ac_1$X.Position)
+
+melt_ac_1 = melt(ac_1, id=c('X.Position', 'id'))
+melt_ac_1$trim = 'before'
+
+## reads 2
+ac_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Adapter Content')
+ac_2$id = 1:length(ac_2$X.Position)
+
+melt_ac_2 = melt(ac_2, id=c('X.Position', 'id'))
+melt_ac_2$trim = 'after'
+
+comb_ac = rbind(melt_ac_1, melt_ac_2)
+comb_ac$trim = factor(levels = c('before', 'after'), comb_ac$trim)
+
+p = ggplot(data = comb_ac, aes(x = id, y = value, color = variable)) +
+  geom_line() +
+  facet_grid(. ~ trim) +
+  xlim(min(comb_ac$id), max(comb_ac$id)) +
+  ylim(0, 1) +
+  xlab('Position in read (bp)') +
+  ylab('')
+ggplotly(p)
+```
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/11_kmer_content.Rmd	Tue Nov 07 16:52:24 2017 -0500
@@ -0,0 +1,31 @@
+---
+title: 'Kmer Content'
+output:
+    html_document:
+      number_sections: true
+      toc: true
+      theme: cosmo
+      highlight: tango
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(
+  echo = ECHO,
+  error = TRUE
+)
+```
+
+### Kmer Content {.tabset}
+
+#### Before
+
+```{r 'Kmer Content (before)', fig.width=10}
+kc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Kmer Content')
+knitr::kable(kc_1)
+```
+
+#### After
+```{r 'Kmer Content (after)', fig.width=10}
+kc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Kmer Content')
+knitr::kable(kc_2)
+```
\ No newline at end of file
--- a/1_per_base_quality_scores.Rmd	Tue Aug 15 15:50:21 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,62 +0,0 @@
----
-title: "Per Base Quality Scores"
-output: html_document
----
-
-```{r setup, include=FALSE, warning=FALSE, message=FALSE}
-knitr::opts_chunk$set(echo = ECHO)
-```
-
-
-## Per Base Quality Scores
-
-```{r}
-PBQS_df = data.frame()
-PBQS_file_paths = read.csv('PBQS_file_paths.txt',
-                           header = TRUE, stringsAsFactors = FALSE)
-for(i in 1:nrow(PBQS_file_paths)) {
-  # file_path = paste0('REPORT_OUTPUT_DIR/', PBQS_file_paths[i,2])
-  file_path = PBQS_file_paths[i,2]
-  pbqs_df = read.csv(file_path,
-                     sep='\t', header=TRUE, stringsAsFactors = FALSE) %>%
-    mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]),
-           Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>%
-  (function (df) {
-    df1 = select(df, -Base2)
-    df2 = select(df, -Base1) %>% filter(Base2 != '')
-    colnames(df1) = c(colnames(df1)[1:7], 'Base')
-    colnames(df2) = c(colnames(df2)[1:7], 'Base')
-    res = rbind(df1, df2) %>% arrange(Base)
-    return(res)
-  })
-  pbqs_df$sample_id = rep(PBQS_file_paths[i,1], nrow(pbqs_df))
-  PBQS_df = rbind(PBQS_df, pbqs_df)
-}
-```
-
-
-```{r}
-# datatable(PBQS_df)
-max_phred = max(PBQS_df$Mean) + 10
-hchart(PBQS_df, "line", hcaes(x = Base, y = Mean, group = sample_id)) %>%
-  hc_title(
-    text = "Per Base Quality Score"
-  ) %>%
-  hc_yAxis(
-    title = list(text = "Mean Base Quality Score"),
-    min = 0,
-    max = max_phred,
-    plotLines = list(
-      list(label = list(text = "Phred Score = 27"),
-           width = 2,
-           dashStyle = "dash",
-           color = "green",
-           value = 27),
-      list(label = list(text = "Phred Score = 20"),
-           width = 2,
-           color = "red",
-           value = 20)
-    )
-  ) %>%
-  hc_exporting(enabled = TRUE)
-```
--- a/2_per_base_N_content.Rmd	Tue Aug 15 15:50:21 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,58 +0,0 @@
----
-title: "Per Base N Content"
-output: html_document
----
-
-```{r setup, include=FALSE, warning=FALSE, message=FALSE}
-knitr::opts_chunk$set(echo = ECHO)
-```
-
-## Per Base N Content
-
-```{r}
-PBNC_df = data.frame()
-PBNC_file_paths = read.csv('PBNC_file_paths.txt',
-                           header = TRUE, stringsAsFactors = FALSE)
-for(i in 1:nrow(PBNC_file_paths)) {
-  # file_path = paste0('REPORT_OUTPUT_DIR/', PBNC_file_paths[i,2])
-  file_path = PBNC_file_paths[i,2]
-  pbnc_df = read.csv(file_path,
-                     sep='\t', header=TRUE, stringsAsFactors = FALSE) %>%
-    mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]),
-           Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>%
-  (function (df) {
-    df1 = select(df, -Base2)
-    df2 = select(df, -Base1) %>% filter(Base2 != '')
-    colnames(df1) = c(colnames(df1)[1:2], 'Base')
-    colnames(df2) = c(colnames(df2)[1:2], 'Base')
-    res = rbind(df1, df2) %>% arrange(Base)
-    return(res)
-  })
-  pbnc_df$sample_id = rep(PBNC_file_paths[i,1], nrow(pbnc_df))
-  PBNC_df = rbind(PBNC_df, pbnc_df)
-}
-```
-
-
-```{r}
-PBNC_df$N.Count = PBNC_df$N.Count * 100
-max_phred = max(PBNC_df$N.Count) + 5
-hchart(PBNC_df, "line", hcaes(x = as.character(Base), y = N.Count, group = sample_id)) %>%
-  hc_title(
-    text = "Per Base N Content"
-  ) %>%
-  hc_xAxis(
-    title = list(text = "Base Position")
-  ) %>%
-  hc_yAxis(
-    title = list(text = "N %"),
-    plotLines = list(
-      list(label = list(text = "N = 5%"),
-           width = 2,
-           dashStyle = "dash",
-           color = "red",
-           value = 5)
-    )
-  ) %>%
-  hc_exporting(enabled = TRUE)
-```
--- a/3_per_sequence_quality_scores.Rmd	Tue Aug 15 15:50:21 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,50 +0,0 @@
----
-title: "Per Sequence Quality Scores"
-output: html_document
----
-
-```{r setup, include=FALSE, warning=FALSE, message=FALSE}
-knitr::opts_chunk$set(echo = ECHO)
-```
-
-## Per Sequence Quality Scores
-
-```{r}
-PSQS_df = data.frame()
-PSQS_file_paths = read.csv('PSQS_file_paths.txt',
-                           header = TRUE, stringsAsFactors = FALSE)
-for(i in 1:nrow(PSQS_file_paths)) {
-  # file_path = paste0('REPORT_OUTPUT_DIR/', PSQS_file_paths[i,2])
-  file_path = PSQS_file_paths[i,2]
-  psqs_df = read.csv(file_path,
-                     sep='\t', header=TRUE, stringsAsFactors = FALSE)
-  psqs_df$sample_id = rep(PSQS_file_paths[i,1], nrow(psqs_df))
-  PSQS_df = rbind(PSQS_df, psqs_df)
-}
-```
-
-
-```{r}
-max_phred = max(PSQS_df$X.Quality) + 5
-hchart(PSQS_df, "line", hcaes(x = X.Quality, y = Count, group = sample_id)) %>%
-  hc_title(
-    text = "Per Sequence Quality Score"
-  ) %>%
-  hc_xAxis(
-    title = list(text = "Mean Sequence Quality Score"),
-    min = 0,
-    max = max_phred,
-    plotLines = list(
-      list(label = list(text = "Phred Score = 27"),
-           width = 2,
-           dashStyle = "dash",
-           color = "green",
-           value = 27),
-      list(label = list(text = "Phred Score = 20"),
-           width = 2,
-           color = "red",
-           value = 20)
-    )
-  ) %>%
-  hc_exporting(enabled = TRUE)
-```
--- a/4_per_sequence_GC_content.Rmd	Tue Aug 15 15:50:21 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,38 +0,0 @@
----
-title: "Per Sequence GC Content"
-output: html_document
----
-
-```{r setup, include=FALSE, warning=FALSE, message=FALSE}
-knitr::opts_chunk$set(echo = ECHO)
-```
-
-## Per Sequence GC Content
-
-
-```{r}
-PSGC_df = data.frame()
-PSGC_file_paths = read.csv('PSGC_file_paths.txt',
-                           header = TRUE, stringsAsFactors = FALSE)
-for(i in 1:nrow(PSGC_file_paths)) {
-  # file_path = paste0('REPORT_OUTPUT_DIR/', PSGC_file_paths[i,2])
-  file_path = PSGC_file_paths[i,2]
-  psgc_df = read.csv(file_path,
-                     sep='\t', header=TRUE, stringsAsFactors = FALSE)
-  psgc_df$sample_id = rep(PSGC_file_paths[i,1], nrow(psgc_df))
-  PSGC_df = rbind(PSGC_df, psgc_df)
-}
-```
-
-
-```{r}
-max_phred = max(PSGC_df$Count) + 5
-hchart(PSGC_df, "line", hcaes(x = X.GC.Content, y = Count, group = sample_id)) %>%
-  hc_title(
-    text = "Per Sequence GC Content"
-  ) %>%
-  hc_xAxis(
-    title = list(text = "% GC")
-  ) %>%
-  hc_exporting(enabled = TRUE)
-```
--- a/5_per_base_sequence_content.Rmd	Tue Aug 15 15:50:21 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,45 +0,0 @@
----
-title: "Per Base Sequence Content"
-output: html_document
----
-
-```{r setup, include=FALSE, warning=FALSE, message=FALSE}
-knitr::opts_chunk$set(echo = ECHO)
-```
-
-## Per Base Sequence Content
-
-```{r}
-PBSC_df = data.frame()
-PBSC_file_paths = read.csv('PBSC_file_paths.txt',
-                           header = TRUE, stringsAsFactors = FALSE)
-for(i in 1:nrow(PBSC_file_paths)) {
-  # file_path = paste0('REPORT_OUTPUT_DIR/', PBSC_file_paths[i,2])
-  file_path = PBSC_file_paths[i,2]
-  pbsc_df = read.csv(file_path,
-                     sep='\t', header=TRUE, stringsAsFactors = FALSE) %>%
-    mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]),
-           Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>%
-  (function (df) {
-    df1 = select(df, -Base2)
-    df2 = select(df, -Base1) %>% filter(Base2 != '')
-    colnames(df1) = c(colnames(df1)[1:5], 'Base')
-    colnames(df2) = c(colnames(df2)[1:5], 'Base')
-    res = rbind(df1, df2) %>% arrange(Base)
-    return(res)
-  })
-  pbsc_df$sample_id = rep(PBSC_file_paths[i,1], nrow(pbsc_df))
-  PBSC_df = rbind(PBSC_df, pbsc_df)
-}
-```
-
-
-```{r out.width="100%"}
-PBSC_df_2 = select(PBSC_df, -X.Base) %>%
-  melt(id = c('Base', 'sample_id'), value.name = 'base_percentage')
-p = ggplot(data = PBSC_df_2, aes(x = Base, y = base_percentage, group = variable, color = variable)) +
-  geom_line() +
-  facet_wrap(~ sample_id)
-ggplotly(p)
-```
-
--- a/_site.yml	Tue Aug 15 15:50:21 2017 -0400
+++ b/_site.yml	Tue Nov 07 16:52:24 2017 -0500
@@ -8,21 +8,29 @@
           icon: fa-home
           href: index.html
         - text: "Evaluation Overview"
-          href: 01_evaluation_overview.html
-        - text: "Evaluation Items"
+          href: x01_evaluation_overview.html
+        - text: "Evaluation by data module"
           menu:
-            - text: "Per Base Quality Scores"
-              href: 1_per_base_quality_scores.html
+            - text: "Per Base Sequence Quality"
+              href: x02_per_base_sequence_quality.html
+            - text: "Per Tile Sequence Quality"
+              href: x03_per_tile_sequence_quality.html
+            - text: "Per Sequence Quality Score"
+              href: x04_per_sequence_quality_score.html
+            - text: "Per Base Sequence Content"
+              href: x05_per_base_sequence_content.html
+            - text: "Per Sequence GC Content"
+              href: x06_per_sequence_gc_content.html
             - text: "Per Base N Content"
-              href: 2_per_base_N_content.html
-            - text: "Per Sequence Quality Scores"
-              href: 3_per_sequence_quality_scores.html
-            - text: "Per Sequence GC Content"
-              href: 4_per_sequence_GC_content.html
-            - text: "Per Base Sequence Content"
-              href: 5_per_base_sequence_content.html
-        - text: "Original FastQC Reports"
-          href: 02_fastqc_original_reports.html
+              href: x07_per_base_n_content.html
+            - text: "Sequence Length Distribution"
+              href: x08_sequence_length_distribution.html
+            - text: "Sequence Duplication Levels"
+              href: x09_sequence_duplication_levels.html
+            - text: "Adapter Content"
+              href: x10_adapter_content.html
+            - text: "Kmer Content"
+              href: x11_kmer_content.html
 output:
   html_document:
     theme: cosmo
--- a/fastqc_site.xml	Tue Aug 15 15:50:21 2017 -0400
+++ b/fastqc_site.xml	Tue Nov 07 16:52:24 2017 -0500
@@ -1,9 +1,9 @@
-<tool id="fastqc_site" name="Fastqc Site" version="1.0.0">
+<tool id="fastqc_site" name="Fastqc Site" version="2.0.0">
     <requirements>
         <requirement type="package" version="1.15.0.6-0">pandoc</requirement>
         <requirement type="package" version="1.14.1">bioconductor-deseq2</requirement>
         <requirement type="package" version="1.20.0">r-getopt</requirement>
-        <requirement type="package" version="1.2">r-rmarkdown</requirement>
+        <requirement type="package" version="1.3">r-rmarkdown</requirement>
         <requirement type="package" version="1.8.4">r-plyr</requirement>
         <requirement type="package" version="1.1.0">r-stringr</requirement>
         <requirement type="package" version="0.5.0">r-highcharter</requirement>
@@ -14,22 +14,12 @@
         <requirement type="package" version="0.3.5">r-htmltools</requirement>
         <requirement type="package" version="0.11.5">fastqc</requirement>
     </requirements>
-    <description>
-        Implements FastQC analysis and display results in R Markdown website.
-    </description>
     <stdio>
-        <regex match="Execution halted"
-               source="both"
-               level="fatal"
-               description="Execution halted." />
-        <regex match="Error in"
-               source="both"
-               level="fatal"
-               description="An undefined error occured, please check your intput carefully and contact your administrator." />
-        <regex match="Fatal error"
-               source="both"
-               level="fatal"
-               description="An undefined error occured, please check your intput carefully and contact your administrator." />
+        <!--redirecting stderr to a file. "XXX" is used to match with nothing so that tool running won't be interrupted during testing-->
+        <regex match="XXX"
+               source="stderr"
+               level="warning"
+               description="Check the warnings_and_errors.txt file for more details."/>
     </stdio>
     <command>
         <![CDATA[
@@ -37,36 +27,65 @@
         Rscript '${__tool_directory__}/fastqc_site_render.R'

             ## 1. input data
-            -r $reads
             -e $echo
+            -r $reads_1
+            -n '$reads_1.name'
+            -R $reads_2
+            -N '$reads_2.name'
+            -c $contaminants
+            -l $limits

             ## 2. output report and report site directory
-		    -o $fastqc_site
-		    -d $fastqc_site.files_path
+		    -o $report
+		    -d $report.files_path
+		    -s $sink_message

 		    ## 3. Rmd templates sitting in the tool directory

-		        ## _site.yml and index.Rmd template files
-                -s '${__tool_directory__}/_site.yml'
-                -i '${__tool_directory__}/index.Rmd'
+		    ## _site.yml and index.Rmd template files
+            -S '${__tool_directory__}/_site.yml'
+            -I '${__tool_directory__}/index.Rmd'

-                ## other Rmd body template files
-		        -p  '${__tool_directory__}/01_evaluation_overview.Rmd'
-		        -a  '${__tool_directory__}/02_fastqc_original_reports.Rmd'
-		        -b  '${__tool_directory__}/1_per_base_quality_scores.Rmd'
-		        -c  '${__tool_directory__}/2_per_base_N_content.Rmd'
-		        -f  '${__tool_directory__}/3_per_sequence_quality_scores.Rmd'
-		        -g  '${__tool_directory__}/4_per_sequence_GC_content.Rmd'
-		        -h  '${__tool_directory__}/5_per_base_sequence_content.Rmd'
+            ## other Rmd body template files
+		    -A '${__tool_directory__}/01_evaluation_overview.Rmd'
+		    -B '${__tool_directory__}/02_per_base_sequence_quality.Rmd'
+		    -C '${__tool_directory__}/03_per_tile_sequence_quality.Rmd'
+		    -D '${__tool_directory__}/04_per_sequence_quality_score.Rmd'
+		    -E '${__tool_directory__}/05_per_base_sequence_content.Rmd'
+		    -F '${__tool_directory__}/06_per_sequence_gc_content.Rmd'
+		    -G '${__tool_directory__}/07_per_base_n_content.Rmd'
+		    -H '${__tool_directory__}/08_sequence_length_distribution.Rmd'
+		    -J '${__tool_directory__}/09_sequence_duplication_levels.Rmd'
+		    -K '${__tool_directory__}/10_adapter_content.Rmd'
+		    -L '${__tool_directory__}/11_kmer_content.Rmd'
+

         ]]>
     </command>
     <inputs>
-        <param format="fastq,fastq.gz,fastq.bz2,bam,sam" multiple="true" name="reads" type="data" label="Short reads data from history" />
-        <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false" label="Display analysis code in report?" />
+        <param format="fastq,fastq.gz,fastq.bz2,bam,sam" name="reads_1" type="data" optional="false"
+               label="Short reads before trimming"
+               help="Short reads data from history. This could be reads before trimming."/>
+        <param format="fastq,fastq.gz,fastq.bz2,bam,sam" name="reads_2" type="data"
+               label="Short reads after trimming"
+               help="Short reads data from history. This could be reads after trimming."/>
+        <param name="contaminants" type="data" format="tabular" optional="true" label="Contaminant list"
+               help="Specifies a non-default file which contains the list of adapter sequences which will be explicitly
+                     searched against the library. The file must contain sets of named adapters
+                     in the form name[tab]sequence.  Lines prefixed with a hash will be ignored."/>
+        <param name="limits" type="data" format="txt" optional="true" label="Submodule and Limit specifing file"
+               help="Specifies a non-default file which contains a set of criteria
+                    which will be used to determine the warn/error limits for the
+                    various modules.  This file can also be used to selectively
+                    remove some modules from the output all together.  The format
+                    needs to mirror the default limits.txt file found in the
+                    Configuration folder."/>
+        <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false"
+               label="Display analysis code in report?"/>
     </inputs>
     <outputs>
-        <data format="html" name="fastqc_site" label="fastqc site" />
+        <data format="html" name="report" label="fastqc site"/>
+        <data format="txt" name="sink_message" label="Warnings and Errors" from_work_dir="warnings_and_errors.txt"/>
     </outputs>
     <citations>
         <citation type="bibtex">
@@ -79,7 +98,8 @@
         <citation type="bibtex">
             @article{allaire2016rmarkdown,
             title={rmarkdown: Dynamic Documents for R, 2016},
-            author={Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff and Wickham, Hadley and Atkins, Aron and Hyndman, Rob},
+            author={Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff
+            and Wickham, Hadley and Atkins, Aron and Hyndman, Rob},
             journal={R package version 0.9},
             volume={6},
             year={2016}
@@ -97,31 +117,14 @@
         <citation type="bibtex">
             @misc{plotly2017,
             title = {plotly: Create Interactive Web Graphics via 'plotly.js'},
-            author = {Carson Sievert and Chris Parmer and Toby Hocking and Scott Chamberlain and Karthik Ram and Marianne Corvellec and Pedro Despouy},
+            author = {Carson Sievert and Chris Parmer and Toby Hocking and Scott Chamberlain and Karthik Ram and
+            Marianne Corvellec and Pedro Despouy},
             year = {2017},
             note = {R package version 4.6.0},
             url = {https://CRAN.R-project.org/package=plotly},
             }
         </citation>
         <citation type="bibtex">
-            @misc{highcharter2017,
-            title = {highcharter: A Wrapper for the 'Highcharts' Library},
-            author = {Joshua Kunst},
-            year = {2017},
-            note = {R package version 0.5.0},
-            url = {https://CRAN.R-project.org/package=highcharter},
-            }
-        </citation>
-        <citation type="bibtex">
-            @misc{formattable2016,
-            title = {formattable: Create 'Formattable' Data Structures},
-            author = {Kun Ren and Kenton Russell},
-            year = {2016},
-            note = {R package version 0.2.0.1},
-            url = {https://CRAN.R-project.org/package=formattable},
-            }
-        </citation>
-        <citation>
             @article{ewels2016multiqc,
             title={MultiQC: summarize analysis results for multiple tools and samples in a single report},
             author={Ewels, Philip and Magnusson, M{\aa}ns and Lundin, Sverker and K{\"a}ller, Max},
--- a/fastqc_site_render.R	Tue Aug 15 15:50:21 2017 -0400
+++ b/fastqc_site_render.R	Tue Nov 07 16:52:24 2017 -0500
@@ -1,195 +1,283 @@
-##======= Handle arguments from command line ========
-# setup R error handline to go to stderr
-options(show.error.messages=FALSE,
-        error=function(){
-          cat(geterrmessage(), file=stderr())
-          quit("no", 1, F)
-        })
-
-# we need that to not crash galaxy with an UTF8 error on German LC settings.
-loc = Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
-
-# suppress warning
-options(warn = -1)
-
-options(stringsAsFactors=FALSE, useFancyQuotes=FALSE)
-args = commandArgs(trailingOnly=TRUE)
-
-suppressPackageStartupMessages({
-  library(getopt)
-  library(tools)
-})
-
-# column 1: the long flag name
-# column 2: the short flag alias. A SINGLE character string
-# column 3: argument mask
-#           0: no argument
-#           1: argument required
-#           2: argument is optional
-# column 4: date type to which the flag's argument shall be cast.
-#           possible values: logical, integer, double, complex, character.
-spec_list=list()
-
-##------- 1. input data ---------------------
-spec_list$READS = c('reads', 'r', '1', 'character')
-spec_list$ECHO = c('echo', 'e', '1', 'character')
-
-##--------2. output report and report site directory --------------
-spec_list$FASTQC_SITE = c('fastqc_site', 'o', '1', 'character')
-spec_list$FASTQC_SITE_DIR = c('fastqc_site_dir', 'd', '1', 'character')
-
-##--------3. Rmd templates sitting in the tool directory ----------
-
-    ## _site.yml and index.Rmd files
-    spec_list$SITE_YML = c('site_yml', 's', 1, 'character')
-    spec_list$INDEX_Rmd = c('index_rmd', 'i', 1, 'character')
-
-    ## other Rmd body template files
-    spec_list$x01 = c('x01_evaluation_overview', 'p', '1', 'character')
-    spec_list$x02 = c('x02_fastqc_original_reports', 'a', '1', 'character')
-    spec_list$x1 = c('x1_per_base_quality_scores', 'b', '1', 'character')
-    spec_list$x2 = c('x2_per_base_N_content', 'c', '1', 'character')
-    spec_list$x3 = c('x3_per_sequence_quality_scores', 'f', '1', 'character')
-    spec_list$x4 = c('x4_per_sequence_GC_content', 'g', '1', 'character')
-    spec_list$x5 = c('x5_per_base_sequence_content', 'h', '1', 'character')
-
-##------------------------------------------------------------------
-
-spec = t(as.data.frame(spec_list))
-opt = getopt(spec)
-# arguments are accessed by long flag name (the first column in the spec matrix)
-#                        NOT by element name in the spec_list
-# example: opt$help, opt$expression_file
-##====== End of arguments handling ==========
-
-#------ Load libraries ---------
+library(getopt)
 library(rmarkdown)
+library(htmltools)
 library(plyr)
+library(dplyr)
 library(stringr)
-library(dplyr)
 library(highcharter)
 library(DT)
 library(reshape2)
 library(plotly)
 library(formattable)
-library(htmltools)
-
+options(stringsAsFactors=FALSE, useFancyQuotes=FALSE)

-#----- 1. create the report directory ------------------------
-paste0('mkdir -p ', opt$fastqc_site_dir) %>%
-  system()
-
-#----- 2. generate Rmd files with Rmd templates --------------
-#   a. templates without placeholder variables:
-#         copy templates from tool directory to the working directory.
-#   b. templates with placeholder variables:
-#         substitute variables with user input values and place them in the working directory.
+##============ Sink warnings and errors to a file ==============
+## use the sink() function to wrap all code within it.
+##==============================================================
+zz = file('warnings_and_errors.txt')
+sink(zz)
+sink(zz, type = 'message')
+  ##---------below is the code for rendering .Rmd templates-----
+
+  ##=============STEP 1: handle command line arguments==========
+  ##
+  ##============================================================
+  # column 1: the long flag name
+  # column 2: the short flag alias. A SINGLE character string
+  # column 3: argument mask
+  #           0: no argument
+  #           1: argument required
+  #           2: argument is optional
+  # column 4: date type to which the flag's argument shall be cast.
+  #           possible values: logical, integer, double, complex, character.
+  #-------------------------------------------------------------
+  #++++++++++++++++++++ Best practice ++++++++++++++++++++++++++
+  # 1. short flag alias should match the flag in the command section in the XML file.
+  # 2. long flag name can be any legal R variable names
+  # 3. two names in args_list can have common string but one name should not be a part of another name.
+  #    for example, one name is "ECHO", if another name is "ECHO_XXX", it will cause problems.
+  #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+  args_list=list()
+  ##------- 1. input data ---------------------
+  args_list$ECHO = c('echo', 'e', '1', 'character')
+  args_list$READS_1 = c('reads_1', 'r', '1', 'character')
+  args_list$NAME_1 = c('name_1', 'n', '1', 'character')
+  args_list$READS_2 = c('reads_2', 'R', '1', 'character')
+  args_list$NAME_2 = c('name_2', 'N', '1', 'character')
+  args_list$CONTAMINANTS = c('contaminants', 'c', '1', 'character')
+  args_list$LIMITS = c('limits', 'l', '1', 'character')
+  ##--------2. output report and outputs --------------
+  args_list$REPORT_HTML = c('report_html', 'o', '1', 'character')
+  args_list$REPORT_DIR = c('report_dir', 'd', '1', 'character')
+  args_list$SINK_MESSAGE = c('sink_message', 's', '1', 'character')
+  ##--------3. .Rmd templates in the tool directory ----------
+  args_list$SITE_YML = c('site_yml', 'S', '1', 'character')
+  args_list$INDEX_RMD = c('index_rmd', 'I', '1', 'character')
+  args_list$X01_EVALUATION_OVERVIEW = c('x01_evaluation_overview', 'A', '1', 'character')
+  args_list$X02_PER_BASE_SEQUENCE_QUALITY = c('x02_per_base_sequence_quality', 'B', '1', 'character')
+  args_list$X03_PER_TILE_SEQUENCE_QUALITY = c('x03_per_tile_sequence_quality', 'C', '1', 'character')
+  args_list$X04_PER_SEQUENCE_QUALITY_SCORE = c('x04_per_sequence_quality_score', 'D', '1', 'character')
+  args_list$X05_PER_BASE_SEQUENCE_CONTENT = c('x05_per_base_sequence_content', 'E', '1', 'character')
+  args_list$X06_PER_SEQUENCE_GC_CONTENT = c('x06_per_sequence_gc_content', 'F', '1', 'character')
+  args_list$X07_PER_BASE_N_CONTENT = c('x07_per_base_n_content', 'G', '1', 'character')
+  args_list$X08_SEQUENCE_LENGTH_DISTRIBUTION = c('x08_sequence_length_distribution', 'H', '1', 'character')
+  args_list$X09_SEQUENCE_DUPLICATION_LEVELS = c('x09_sequence_duplication_levels', 'J', '1', 'character')
+  args_list$X10_ADAPTER_CONTENT = c('x10_adapter_content', 'K', '1', 'character')
+  args_list$X11_KMER_CONTENT = c('x11_kmer_content', 'L', '1', 'character')
+  ##-----------------------------------------------------------
+  opt = getopt(t(as.data.frame(args_list)))


-    #----- Copy index.Rmd and _site.yml files to job working direcotry -----
-    file.copy(opt$index_rmd, 'index.Rmd', recursive=TRUE)
-    file.copy(opt$site_yml, '_site.yml', recursive=TRUE)
-    #---------------------------------------------------------
-
-    #----- 01_evaluation_overview.Rmd -----------------------
-    readLines(opt$x01_evaluation_overview) %>%
-      (function(x) {
-        gsub('ECHO', opt$echo, x)
-      }) %>%
-      (function(x) {
-        gsub('READS', opt$reads, x)
-      }) %>%
-      (function(x) {
-        gsub('REPORT_OUTPUT_DIR', opt$fastqc_site_dir, x)
-      }) %>%
-      (function(x) {
-        fileConn = file('01_evaluation_overview.Rmd')
-        writeLines(x, con=fileConn)
-        close(fileConn)
-      })
-
-    #----- 1_per_base_quality_scores.Rmd --------------------
-    readLines(opt$x1_per_base_quality_scores) %>%
-      (function(x) {
-        gsub('ECHO', opt$echo, x)
-      }) %>%
-      (function(x) {
-        fileConn = file('1_per_base_quality_scores.Rmd')
-        writeLines(x, con=fileConn)
-        close(fileConn)
-      })
-
-    #----- 2_per_base_N_content.Rmd -------------------------
-    readLines(opt$x2_per_base_N_content) %>%
-      (function(x) {
-        gsub('ECHO', opt$echo, x)
-      }) %>%
-      (function(x) {
-        fileConn = file('2_per_base_N_content.Rmd')
-        writeLines(x, con=fileConn)
-        close(fileConn)
-      })
-
-    #----- 3_per_sequence_quality_scores.Rmd ----------------
-    readLines(opt$x3_per_sequence_quality_scores) %>%
-      (function(x) {
-        gsub('ECHO', opt$echo, x)
-      }) %>%
-      (function(x) {
-        fileConn = file('3_per_sequence_quality_scores.Rmd')
-        writeLines(x, con=fileConn)
-        close(fileConn)
-      })
-
-
-    #----- 4_per_sequence_GC_content.Rmd --------------------
-    readLines(opt$x4_per_sequence_GC_content) %>%
-      (function(x) {
-        gsub('ECHO', opt$echo, x)
-      }) %>%
-      (function(x) {
-        fileConn = file('4_per_sequence_GC_content.Rmd')
-        writeLines(x, con=fileConn)
-        close(fileConn)
-      })
-
-
-    #----- 5_per_base_sequence_content.Rmd ------------------
-    readLines(opt$x5_per_base_sequence_content) %>%
-      (function(x) {
-        gsub('ECHO', opt$echo, x)
-      }) %>%
-      (function(x) {
-        fileConn = file('5_per_base_sequence_content.Rmd')
-        writeLines(x, con=fileConn)
-        close(fileConn)
-      })
+
+  ##=======STEP 2: create report directory (optional)==========
+  ##
+  ##===========================================================
+  dir.create(opt$report_dir)
+
+  ##==STEP 3: copy index.Rmd and _site.yml to job working directory======
+  ##
+  ##=====================================================================
+  file.copy(opt$index_rmd, 'index.Rmd')
+  file.copy(opt$site_yml, '_site.yml')
+
+  ##=STEP 4: replace placeholders in .Rmd files with argument values=
+  ##
+  ##=================================================================
+  #++ need to replace placeholders with args values one by one+
+
+  # 01_evaluation_overview.Rmd
+  readLines(opt$x01_evaluation_overview) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x)
+    }) %>%
+    (function(x) {
+      gsub('READS_1', opt$reads_1, x)
+    }) %>%
+    (function(x) {
+      gsub('NAME_1', opt$name_1, x)
+    }) %>%
+    (function(x) {
+      gsub('READS_2', opt$reads_2, x)
+    }) %>%
+    (function(x) {
+      gsub('NAME_2', opt$name_1, x)
+    }) %>%
+    (function(x) {
+      gsub('CONTAMINANTS', opt$contaminants, x)
+    }) %>%
+    (function(x) {
+      gsub('LIMITS', opt$limits, x)
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x)
+    }) %>%
+    (function(x) {
+      fileConn = file('x01_evaluation_overview.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+
+  # 02_per_base_sequence_quality.Rmd
+  readLines(opt$x02_per_base_sequence_quality) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x)
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x)
+    }) %>%
+    (function(x) {
+      fileConn = file('x02_per_base_sequence_quality.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+
+  # 03_per_tile_sequence_quality.Rmd
+  readLines(opt$x03_per_tile_sequence_quality) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x)
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x)
+    }) %>%
+    (function(x) {
+      fileConn = file('x03_per_tile_sequence_quality.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+
+  # 04_per_sequence_quality_score.Rmd
+  readLines(opt$x04_per_sequence_quality_score) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x)
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x)
+    }) %>%
+    (function(x) {
+      fileConn = file('x04_per_sequence_quality_score.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+
+  # 05_per_base_sequence_content.Rmd
+  readLines(opt$x05_per_base_sequence_content) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x)
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x)
+    }) %>%
+    (function(x) {
+      fileConn = file('x05_per_base_sequence_content.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+
+  # 06_per_sequence_gc_content.Rmd
+  readLines(opt$x06_per_sequence_gc_content) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x)
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x)
+    }) %>%
+    (function(x) {
+      fileConn = file('x06_per_sequence_gc_content.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+
+  # 07_per_base_n_content.Rmd
+  readLines(opt$x07_per_base_n_content) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x)
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x)
+    }) %>%
+    (function(x) {
+      fileConn = file('x07_per_base_n_content.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })

-    #----- 02_fastqc_original_reports.Rmd -------------------
-    readLines(opt$x02_fastqc_original_reports) %>%
-      (function(x) {
-        gsub('ECHO', opt$echo, x)
-      }) %>%
-      (function(x) {
-        gsub('REPORT_OUTPUT_DIR', opt$fastqc_site_dir, x)
-      }) %>%
-      (function(x) {
-        fileConn = file('02_fastqc_original_reports.Rmd')
-        writeLines(x, con=fileConn)
-        close(fileConn)
-      })
+  # 08_sequence_length_distribution.Rmd
+  readLines(opt$x08_sequence_length_distribution) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x)
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x)
+    }) %>%
+    (function(x) {
+      fileConn = file('x08_sequence_length_distribution.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+
+  # 09_sequence_duplication_levels.Rmd
+  readLines(opt$x09_sequence_duplication_levels) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x)
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x)
+    }) %>%
+    (function(x) {
+      fileConn = file('x09_sequence_duplication_levels.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+
+  # 10_adapter_content.Rmd
+  readLines(opt$x10_adapter_content) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x)
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x)
+    }) %>%
+    (function(x) {
+      fileConn = file('x10_adapter_content.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+
+  # 11_kmer_content.Rmd
+  readLines(opt$x11_kmer_content) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x)
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x)
+    }) %>%
+    (function(x) {
+      fileConn = file('x11_kmer_content.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+
+  ##=============STEP 5: render all .Rmd templates=================
+  ##
+  ##===========================================================
+  extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") {
+    f = readLines(fastqc_data)
+    start_line = grep(module_name, f)
+    end_module_lines = grep('END_MODULE', f)
+    end_line = end_module_lines[which(end_module_lines > start_line)[1]]
+    module_data = f[(start_line+1):(end_line-1)]
+    writeLines(module_data, 'temp.txt')
+    read.csv('temp.txt', sep = '\t', header = header, comment.char = comment.char)
+  }
+  render_site()
+
+  ##=============STEP 6: manipulate outputs====================
+  ##
+  ##===========================================================
+  file.copy('my_site/index.html', opt$report_html, recursive = TRUE)
+  system(paste0('cp -r my_site/* ', opt$report_dir))


-
-#------ 3. render all Rmd files with render_site() --------
-render_site()
-
-
-#-------4. manipulate outputs -----------------------------
-#   a. copy index.html to the report output path
-#   b. copy all files in 'my_site' to the report output directory
-file.copy('my_site/index.html', opt$fastqc_site, recursive=TRUE)
-paste0('cp -r my_site/* ', opt$fastqc_site_dir) %>%
-  system()
-
-
+  ##--------end of code rendering .Rmd templates----------------
+sink()
+##=========== End of sinking output=============================
\ No newline at end of file
--- a/index.Rmd	Tue Aug 15 15:50:21 2017 -0400
+++ b/index.Rmd	Tue Nov 07 16:52:24 2017 -0500
@@ -4,7 +4,7 @@
 ---

 ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
-knitr::opts_chunk$set(echo = TRUE)
+knitr::opts_chunk$set(echo = TRUE, error = TRUE)
 ```