Mercurial > repos > mingchen0919 > rmarkdown_fastqc_report

--- a/fastqc_report.Rmd	Thu Oct 19 00:11:14 2017 -0400
+++ b/fastqc_report.Rmd	Sat Oct 21 09:25:49 2017 -0400
@@ -10,72 +10,112 @@

 ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
 knitr::opts_chunk$set(
-  echo = ECHO
+  echo = ECHO,
+  error = TRUE
 )
 ```


-# Fastqc Analysis
-
-* Copy fastq files to job working directory
+# Fastqc Evaluation

-```{bash 'copy files'}
-for f in $(echo READS | sed "s/,/ /g")
-do
-    cp $f ./
-done
-```
-
-* Run fastqc
+## Evaluation of reads before trimming

-```{bash 'run fastqc'}
-for r in $(ls *.dat)
-do
-    fastqc -o REPORT_DIR $r > /dev/null 2>&1
-done
-```
-
-## Evaluation results
-
-```{r 'html report links'}
-html_file = list.files('REPORT_DIR', pattern = '.*html')
-tags$ul(tags$a(href=html_file, paste0('HTML report', opt$name)))
+```{r}
+if ('READS_1' == 'None') {
+  stop("No pre-trimming reads provided!")
+} else {
+  ## run fastqc evaluation
+  fastqc_command = paste0('fastqc ') %>%
+  (function(x) {
+    ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
+  }) %>%
+  (function(x) {
+    ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
+  }) %>%
+  (function(x) {
+    paste0(x, '-o REPORT_DIR ')
+  })
+  fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1')
+  system(fastqc_command_reads_1, intern = TRUE)
+
+  # Original html report
+  reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1)
+  original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1))
+
+  unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
+  reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/')
+  # fastqc_data.txt
+  file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt')
+  fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1))
+  # summary.txt
+  file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt')
+  summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1))
+
+  tags$ul(
+    tags$li(original_html),
+    tags$li(fastqc_data),
+    tags$li(summary_data)
+  )
+}
 ```


-```{r 'extract fastqc_data.txt and summary.txt'}
-# list all zip files
-zip_file = list.files(path = 'REPORT_DIR', pattern = '.zip')
-unzip(paste0('REPORT_DIR/', zip_file), exdir = 'REPORT_DIR')
+## Evaluation of reads after trimming

-unzip_directory = paste0(tail(strsplit(opt$reads, '/')[[1]], 1), '_fastqc/')
-fastqc_data_txt_path = paste0('REPORT_DIR/', unzip_directory, 'fastqc_data.txt')
-summary_txt_path = paste0('REPORT_DIR/', unzip_directory, 'summary.txt')
+```{r}
+if ('READS_2' == 'None') {
+  stop("No pre-trimming reads provided!")
+} else {
+  ## run fastqc evaluation
+  fastqc_command = paste0('fastqc ') %>%
+  (function(x) {
+    ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
+  }) %>%
+  (function(x) {
+    ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
+  }) %>%
+  (function(x) {
+    paste0(x, '-o REPORT_DIR ')
+  })
+  fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1')
+  system(fastqc_command_reads_2, intern = TRUE)
+
+  # Original html report
+  reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1)
+  original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2))
+
+  unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
+  reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/')
+  # fastqc_data.txt
+  file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt')
+  fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2))
+  # summary.txt
+  file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt')
+  summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2))
+
+  tags$ul(
+    tags$li(original_html),
+    tags$li(fastqc_data),
+    tags$li(summary_data)
+  )
+}
 ```


-```{r 'summary.txt'}
-tags$ul(tags$a(href=paste0(unzip_directory, 'summary.txt'), 'summary.txt'))
-```
-
-
-```{r 'fastqc_data.txt'}
-tags$ul(tags$a(href=paste0(unzip_directory, 'fastqc_data.txt'), 'fastqc_data.txt'))
-```
-

 # Fastqc output visualization

 ## Overview

 ```{r}
-# read.table(fastqc_data_txt_path)
-summary_txt = read.csv(summary_txt_path, header = FALSE, sep = '\t')[, 2:1]
-names(summary_txt) = c('MODULE', 'PASS/FAIL')
-knitr::kable(summary_txt)
+reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1]
+reads_2_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 1]
+combined_summary = cbind(reads_1_summary, reads_2_summary)
+names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)'))
+knitr::kable(combined_summary)
 ```

-## Summary by module {.tabset}
+## Visualization by module {.tabset}

 * Define a function to extract outputs for each module from fastqc output

@@ -93,16 +133,53 @@

 ### Per base sequence quality

-```{r}
-pbsq = extract_data_module(fastqc_data_txt_path, 'Per base sequence quality')
-knitr::kable(pbsq)
+```{r 'per base sequence quality', fig.width=10}
+## reads 1
+pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality')
+pbsq_1$id = 1:length(pbsq_1$X.Base)
+
+melt_pbsq_1 = filter(melt(pbsq_1, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile')
+melt_pbsq_1$trim = 'before'
+
+
+## reads 2
+pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality')
+pbsq_2$id = 1:length(pbsq_2$X.Base)
+
+melt_pbsq_2 = filter(melt(pbsq_2, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile')
+melt_pbsq_2$trim = 'after'
+
+comb_pbsq = rbind(melt_pbsq_1, melt_pbsq_2)
+comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim)
+p = ggplot(data = comb_pbsq) +
+  geom_line(mapping = aes(x = id, y = value, group = variable, color = variable)) +
+  scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) +
+  facet_grid(. ~ trim) +
+  theme(axis.text.x = element_text(angle=45))
+ggplotly(p)
+
 ```

 ### Per tile sequence quality

-```{r}
-ptsq = extract_data_module(fastqc_data_txt_path, 'Per tile sequence quality')
-knitr::kable(ptsq)
+```{r 'per tile sequence quality'}
+## reads 1
+ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality')
+ptsq_1$trim = 'before'
+
+## reads 2
+ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality')
+ptsq_2$trim = 'after'
+
+comb_ptsq = rbind(ptsq_1, ptsq_2)
+comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim)
+comb_pbsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base)
+
+p = ggplot(data = comb_ptsq, aes(x = Base, y = X.Tile, fill = Mean)) +
+  geom_raster() +
+  facet_grid(. ~ trim) +
+  theme(axis.text.x = element_text(angle=45))
+ggplotly(p)
 ```
--- a/fastqc_report.xml	Thu Oct 19 00:11:14 2017 -0400
+++ b/fastqc_report.xml	Sat Oct 21 09:25:49 2017 -0400
@@ -1,6 +1,7 @@
 <tool id="fastqc_report" name="Fastqc report" version="2.0.0">
     <description>
-        Implements FastQC analysis and display results in R Markdown html.
+        Evaluate short reads with FastQC software on a single reads file or a paired of untrimmed and trimmed reads
+        files.
     </description>
     <requirements>
         <requirement type="package" version="1.15.0.6-0">pandoc</requirement>
@@ -29,8 +30,12 @@

         Rscript '${__tool_directory__}/fastqc_report_render.R'
             -e $echo
-            -r $reads
-            -n $reads.name
+            -r $reads_1
+            -n '$reads_1.name'
+            -R $reads_2
+            -N '$reads_2.name'
+            -c $contaminants
+            -l $limits

 		    -o $report
 		    -d $report.files_path
@@ -40,8 +45,23 @@
         ]]>
     </command>
     <inputs>
-        <param format="fastq,fastq.gz,fastq.bz2,bam,sam" name="reads" type="data"
-               label="Short reads data from history"/>
+        <param format="fastq,fastq.gz,fastq.bz2,bam,sam" name="reads_1" type="data" optional="false"
+               label="Short reads before trimming"
+               help="Short reads data from history. This could be reads before trimming."/>
+        <param format="fastq,fastq.gz,fastq.bz2,bam,sam" name="reads_2" type="data"
+               label="Short reads after trimming"
+               help="Short reads data from history. This could be reads after trimming."/>
+        <param name="contaminants" type="data" format="tabular" optional="true" label="Contaminant list"
+               help="Specifies a non-default file which contains the list of adapter sequences which will be explicitly
+                     searched against the library. The file must contain sets of named adapters
+                     in the form name[tab]sequence.  Lines prefixed with a hash will be ignored."/>
+        <param name="limits" type="data" format="txt" optional="true" label="Submodule and Limit specifing file"
+               help="Specifies a non-default file which contains a set of criteria
+                    which will be used to determine the warn/error limits for the
+                    various modules.  This file can also be used to selectively
+                    remove some modules from the output all together.  The format
+                    needs to mirror the default limits.txt file found in the
+                    Configuration folder."/>
         <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false"
                label="Display analysis code in report?"/>
     </inputs>
--- a/fastqc_report_render.R	Thu Oct 19 00:11:14 2017 -0400
+++ b/fastqc_report_render.R	Sat Oct 21 09:25:49 2017 -0400
@@ -9,6 +9,7 @@
 library(reshape2)
 library(plotly)
 library(formattable)
+options(stringsAsFactors=FALSE, useFancyQuotes=FALSE)

 ##============ Sink warnings and errors to a file ==============
 ## use the sink() function to wrap all code within it.
@@ -39,8 +40,12 @@
   args_list=list()
   ##------- 1. input data ---------------------
   args_list$ECHO = c('echo', 'e', '1', 'character')
-  args_list$READS = c('reads', 'r', '1', 'character')
-  args_list$NAMES = c('names', 'n', '1', 'character')
+  args_list$READS_1 = c('reads_1', 'r', '1', 'character')
+  args_list$NAME_1 = c('name_1', 'n', '1', 'character')
+  args_list$READS_2 = c('reads_2', 'R', '1', 'character')
+  args_list$NAME_2 = c('name_2', 'N', '1', 'character')
+  args_list$CONTAMINANTS = c('contaminants', 'c', '1', 'character')
+  args_list$LIMITS = c('limits', 'l', '1', 'character')
   ##--------2. output report and outputs --------------
   args_list$REPORT_HTML = c('report_html', 'o', '1', 'character')
   args_list$REPORT_DIR = c('report_dir', 'd', '1', 'character')
@@ -66,7 +71,22 @@
       gsub('ECHO', opt$echo, x)
     }) %>%
     (function(x) {
-      gsub('READS', opt$reads, x)
+      gsub('READS_1', opt$reads_1, x)
+    }) %>%
+    (function(x) {
+      gsub('NAME_1', opt$name_1, x)
+    }) %>%
+    (function(x) {
+      gsub('READS_2', opt$reads_2, x)
+    }) %>%
+    (function(x) {
+      gsub('NAME_2', opt$name_1, x)
+    }) %>%
+    (function(x) {
+      gsub('CONTAMINANTS', opt$contaminants, x)
+    }) %>%
+    (function(x) {
+      gsub('LIMITS', opt$limits, x)
     }) %>%
     (function(x) {
       gsub('REPORT_DIR', opt$report_dir, x)