diff 01_evaluation_overview.Rmd @ 0:d732d4526c6d draft

planemo upload for repository https://github.com/statonlab/docker-GRReport/tree/master/my_tools/rmarkdown_fastqc_site commit ddb1f6aca7619aea2e660b1729367841b56ba4c9-dirty
author mingchen0919
date Tue, 08 Aug 2017 10:14:46 -0400
parents
children 507eec497730
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/01_evaluation_overview.Rmd	Tue Aug 08 10:14:46 2017 -0400
@@ -0,0 +1,123 @@
+---
+title: "Evaluation Overview"
+output: html_document
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+# Set the default `echo` chunk option for all chunks that follow.
+# NOTE(review): the upper-case symbol passed below is not defined anywhere in
+# this file; presumably it is a template token replaced with TRUE/FALSE before
+# knitting -- confirm against the rendering wrapper.
+knitr::opts_chunk$set(echo = ECHO)
+```
+
+```{bash 'copy data from datasets directory to working directory', echo=FALSE}
+# Bring every uploaded input file into the current working directory.
+# The comma-separated list is turned into space-separated words so the
+# loop visits one path per iteration.
+for src in $(echo READS | tr ',' ' ')
+do
+    cp $src ./
+done
+```
+
+```{bash 'run fastqc', echo=FALSE}
+# Run fastqc on every *.dat file in the working directory and place outputs
+# into the report directory. stdout and stderr of the tool are discarded.
+for r in *.dat
+do
+    # Iterate the glob directly instead of parsing `ls` output, so file names
+    # are never word-split. An unmatched glob stays literal; skip it.
+    [ -e "$r" ] || continue
+    fastqc -o REPORT_OUTPUT_DIR "$r" > /dev/null 2>&1
+done
+```
+
+```{bash 'parse fastqc results', echo=FALSE}
+##==== copy fastqc generated zip files from report output directory to job work directory ==
+cp -r REPORT_OUTPUT_DIR/*zip ./
+
+# Short tags used in the generated file names:
+#   PWF  - Pass, Warning, Fail
+#   PBQS - Per Base Quality Score
+#   PSQS - Per Sequence Quality Score
+#   PSGC - Per Sequence GC Content
+#   PBSC - Per Base Sequence Content
+#   PBNC - Per Base N Content
+#   SDL  - Sequence Duplication Level
+#   SLD  - Sequence Length Distribution
+#   KMC  - Kmer Content
+
+# create one index file per tag; each later row is "sample_id,file_path"
+for tag in PWF PBQS PSQS PSGC PBSC PBNC SDL SLD KMC
+do
+    echo "sample_id,file_path" > "${tag}_file_paths.txt"
+done
+
+# extract_module <sample> <title> <tag>
+#   <sample>  zip name without ".zip"; also the directory the zip unpacks to
+#   <title>   module title as written after ">>" in fastqc_data.txt
+#             (used as a regex prefix, so keep it free of regex metacharacters)
+#   <tag>     short tag used in the output file names
+# Writes the lines between the ">><title>" header and the next END_MODULE
+# marker to "<sample>-<tag>.txt" and appends that file to
+# "<tag>_file_paths.txt".
+extract_module() {
+    local sample="$1" title="$2" tag="$3"
+    awk -v title="$title" \
+        '$0 ~ ("^>>" title) {flag=1; next} /END_MODULE/ {flag=0} flag' \
+        "$sample"/fastqc_data.txt > "${sample}-${tag}.txt"
+    echo "${sample},${sample}-${tag}.txt" >> "${tag}_file_paths.txt"
+}
+
+# Iterate the glob directly instead of parsing `ls` output, so archive names
+# are never word-split.
+for archive in *.zip
+do
+    # An unmatched glob stays literal; skip it rather than failing on it.
+    [ -e "$archive" ] || continue
+
+    # Strip the extension with parameter expansion (no sed subprocess needed).
+    BASE="${archive%.zip}"
+    echo "$BASE"
+    unzip "$archive" > /dev/null 2>&1
+
+    ##====== pass,warning,fail (PWF) =============
+    # Keep the ">>module<TAB>status" header lines, drop the END_MODULE
+    # markers, and strip the leading ">>".
+    awk '/^>>/ && !/END_MODULE/ {sub(/>>/, ""); print}' "$BASE"/fastqc_data.txt > "${BASE}-PWF.txt"
+    echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt
+
+    ##====== per-module data tables ==============
+    extract_module "$BASE" "Per base sequence quality"    PBQS
+    extract_module "$BASE" "Per sequence quality scores"  PSQS
+    extract_module "$BASE" "Per sequence GC content"      PSGC
+    extract_module "$BASE" "Per base sequence content"    PBSC
+    extract_module "$BASE" "Per base N content"           PBNC
+    extract_module "$BASE" "Sequence Duplication Levels"  SDL
+    extract_module "$BASE" "Sequence Length Distribution" SLD
+    extract_module "$BASE" "Kmer Content"                 KMC
+
+done
+```
+
+
+## Evaluation Overview
+
+```{r 'overview'}
+# Index of the per-sample pass/warn/fail summaries: one row per sample, with
+# the sample id in column 1 and the summary file path in column 2.
+PWF_file_paths <- read.csv('PWF_file_paths.txt',
+                           header = TRUE, stringsAsFactors = FALSE)
+
+# Read every sample's summary (tab-separated, no header) and label its two
+# columns 'item' and the sample id. seq_len() keeps this a no-op when the
+# index has no rows, where 1:nrow() would iterate over c(1, 0).
+PWF_tables <- lapply(seq_len(nrow(PWF_file_paths)), function(i) {
+  pwf_df <- read.csv(PWF_file_paths[i, 2],
+                     sep = '\t', header = FALSE, stringsAsFactors = FALSE)
+  colnames(pwf_df) <- c('item', PWF_file_paths[i, 1])
+  pwf_df
+})
+
+# Assemble the overview table: the 'item' column plus the first sample's
+# status column, followed by the status column of every further sample.
+# The table is built in one cbind call instead of growing inside a loop, and
+# it is always assigned, so no rm()/exists() bookkeeping (and no "object not
+# found" warning in the report) is needed.
+if (length(PWF_tables) == 0) {
+  # No samples: still define an empty table so later chunks find PWF_df.
+  PWF_df <- data.frame(item = character(0), stringsAsFactors = FALSE)
+} else {
+  status_columns <- lapply(PWF_tables[-1], function(tbl) tbl[, 2, drop = FALSE])
+  PWF_df <- do.call(cbind, c(PWF_tables[1], status_columns))
+}
+```
+
+
+```{r}
+# Glyph names keyed by status; defined here but not referenced by the
+# formatter below.
+my_icon <- c(pass = 'ok', fail = 'remove', warn = 'star')
+
+# Badge formatter for a status cell: green background for 'pass', red for
+# 'fail', orange for anything else, with white text in a fixed-width span.
+status_badge <- formatter(
+  "span",
+  style = x ~ style(
+    "background-color" = ifelse(
+      x == 'pass', '#9CD027',
+      ifelse(x == 'fail', '#CC0000', '#FF4E00')
+    ),
+    "color" = "white",
+    "width" = "50px",
+    "float" = "left",
+    "padding-right" = "5px"
+  )
+)
+
+# Attach the same formatter to every column except the first.
+evaluate_list <- list()
+for (column_name in colnames(PWF_df)[-1]) {
+  evaluate_list[[column_name]] <- status_badge
+}
+
+formattable(PWF_df, evaluate_list)
+```
\ No newline at end of file