Mercurial > repos > mingchen0919 > rmarkdown_fastqc_site
diff 01_evaluation_overview.Rmd @ 0:d732d4526c6d draft
planemo upload for repository https://github.com/statonlab/docker-GRReport/tree/master/my_tools/rmarkdown_fastqc_site commit ddb1f6aca7619aea2e660b1729367841b56ba4c9-dirty
author | mingchen0919 |
---|---|
date | Tue, 08 Aug 2017 10:14:46 -0400 |
parents | |
children | 507eec497730 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/01_evaluation_overview.Rmd Tue Aug 08 10:14:46 2017 -0400
@@ -0,0 +1,144 @@
+---
+title: "Evaluation Overview"
+output: html_document
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+# ECHO is presumably a placeholder that the tool wrapper replaces with TRUE or
+# FALSE before this document is rendered -- TODO confirm.
+knitr::opts_chunk$set(echo = ECHO)
+
+# formatter(), style() and formattable() used in the last chunk come from the
+# formattable package; attach it here so rendering does not rely on the
+# calling script having loaded it already.
+library(formattable)
+```
+
+```{bash 'copy data from datasets directory to working directory', echo=FALSE}
+# Copy uploaded data to the working directory.
+# READS is presumably replaced by the tool wrapper with a comma-separated list
+# of file paths -- TODO confirm. The list is split on commas.
+for f in $(echo READS | sed "s/,/ /g")
+do
+  cp "$f" ./
+done
+```
+
+```{bash 'run fastqc', echo=FALSE}
+# Run fastqc on every *.dat file and place outputs into the report directory.
+for r in *.dat
+do
+  # An unmatched glob stays literal; skip it instead of passing it to fastqc.
+  [ -e "$r" ] || continue
+  fastqc -o REPORT_OUTPUT_DIR "$r" > /dev/null 2>&1
+done
+```
+
+```{bash 'parse fastqc results', echo=FALSE}
+##==== copy fastqc generated zip files from report output directory to job work directory ==
+cp -r REPORT_OUTPUT_DIR/*zip ./
+
+# Create one index file per result type. Each index maps a sample id to the
+# file holding that sample's extracted table:
+#   PWF  - Pass, Warning, Fail
+#   PBQS - Per Base Quality Score
+#   PSQS - Per Sequence Quality Score
+#   PSGC - Per Sequence GC Content
+#   PBSC - Per Base Sequence Content
+#   PBNC - Per Base N Content
+#   SDL  - Sequence Duplication Level
+#   SLD  - Sequence Length Distribution
+#   KMC  - Kmer Content
+for tag in PWF PBQS PSQS PSGC PBSC PBNC SDL SLD KMC
+do
+  echo "sample_id,file_path" > "${tag}_file_paths.txt"
+done
+
+# extract_module SAMPLE TAG TITLE
+#   Copy the lines of SAMPLE/fastqc_data.txt that lie between the line starting
+#   with ">>TITLE" and the next line containing END_MODULE into SAMPLE-TAG.txt,
+#   then register that file in TAG_file_paths.txt.
+extract_module() {
+  awk -v header=">>$3" \
+    'index($0, header) == 1 {flag=1; next} /END_MODULE/ {flag=0} flag' \
+    "$1"/fastqc_data.txt > "${1}-${2}.txt"
+  echo "${1},${1}-${2}.txt" >> "${2}_file_paths.txt"
+}
+
+for i in *.zip
+do
+  # An unmatched glob stays literal; skip it rather than unzip a missing file.
+  [ -e "$i" ] || continue
+  BASE="${i%.zip}"
+  echo "$BASE"
+  unzip "$i" > /dev/null 2>&1
+
+  ##====== pass,warning,fail (PWF) =============
+  # Keep the ">>" module header lines, drop the END_MODULE markers and strip
+  # the ">>" prefix.
+  awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt
+  echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt
+
+  ##====== per-module tables ===================
+  extract_module "$BASE" PBQS "Per base sequence quality"
+  extract_module "$BASE" PSQS "Per sequence quality scores"
+  extract_module "$BASE" PSGC "Per sequence GC content"
+  extract_module "$BASE" PBSC "Per base sequence content"
+  extract_module "$BASE" PBNC "Per base N content"
+  extract_module "$BASE" SDL "Sequence Duplication Levels"
+  extract_module "$BASE" SLD "Sequence Length Distribution"
+  extract_module "$BASE" KMC "Kmer Content"
+done
+```
+
+
+## Evaluation Overview
+
+```{r 'overview'}
+# Index of per-sample PWF tables: column 1 is the sample id, column 2 the path.
+PWF_file_paths <- read.csv("PWF_file_paths.txt",
+                           header = TRUE, stringsAsFactors = FALSE)
+if (nrow(PWF_file_paths) == 0) {
+  stop("No FastQC results are listed in PWF_file_paths.txt.", call. = FALSE)
+}
+
+# Read every sample's table. The first sample contributes the module names
+# ("item") plus its status column; later samples contribute only their status
+# column. Collecting the pieces in a list removes the rm()/exists()
+# bookkeeping, and with it the warning rm() raises when PWF_df does not exist.
+# NOTE(review): cbind() pairs rows by position, so this assumes every sample
+# lists the same modules in the same order -- confirm.
+pwf_columns <- lapply(seq_len(nrow(PWF_file_paths)), function(i) {
+  pwf_df <- read.csv(PWF_file_paths[i, 2],
+                     sep = "\t", header = FALSE, stringsAsFactors = FALSE)
+  colnames(pwf_df) <- c("item", PWF_file_paths[i, 1])
+  if (i == 1) pwf_df else pwf_df[, 2, drop = FALSE]
+})
+PWF_df <- do.call(cbind, pwf_columns)
+```
+
+
+```{r}
+# Icon name per status. Not referenced by the formatter below; kept so the
+# object is still defined as before.
+my_icon <- c(pass = "ok", fail = "remove", warn = "star")
+
+# Every sample column gets the same formatter: a span whose background colour
+# depends on the status ("pass", "fail", or anything else).
+status_formatter <- formatter(
+  "span",
+  style = x ~ style(
+    "background-color" = ifelse(x == "pass", "#9CD027",
+                                ifelse(x == "fail", "#CC0000", "#FF4E00")),
+    "color" = "white",
+    "width" = "50px",
+    "float" = "left",
+    "padding-right" = "5px"
+  )
+)
+sample_columns <- colnames(PWF_df)[-1]
+evaluate_list <- setNames(rep(list(status_formatter), length(sample_columns)),
+                          sample_columns)
+
+formattable(PWF_df, evaluate_list)
+```
\ No newline at end of file