Mercurial > repos > mingchen0919 > rmarkdown_fastqc_site
view 01_evaluation_overview.Rmd @ 7:d820be692d74 draft
planemo upload for repository https://github.com/statonlab/docker-GRReport/tree/master/my_tools/rmarkdown_fastqc_site commit d91f269e8bc09a488ed2e005122bbb4a521f44a0-dirty
author | mingchen0919 |
---|---|
date | Tue, 08 Aug 2017 12:36:13 -0400 |
parents | d732d4526c6d |
children | 507eec497730 |
line wrap: on
line source
---
title: "Evaluation Overview"
output: html_document
---

<!--
Runs FastQC on every input read file, splits each fastqc_data.txt into one
text file per FastQC module, and renders a pass/warn/fail overview table.

NOTE(review): ECHO, READS and REPORT_OUTPUT_DIR look like template
placeholders substituted into this file before knitting; confirm against the
script that renders this document.
-->

```{r setup, include=FALSE, warning=FALSE, message=FALSE}
knitr::opts_chunk$set(echo = ECHO)
```

```{bash 'copy data from datasets directory to working directory', echo=FALSE}
# Copy uploaded data to the working directory.
# READS is a comma-separated list of paths; commas are turned into spaces so
# the shell splits the list into one word per file.
for f in $(echo READS | sed "s/,/ /g")
do
    cp "$f" ./
done
```

```{bash 'run fastqc', echo=FALSE}
# Run fastqc on every *.dat file and place outputs into the report directory.
for r in *.dat
do
    # An unmatched glob stays literal; skip it instead of calling fastqc on it.
    [ -e "$r" ] || continue
    fastqc -o REPORT_OUTPUT_DIR "$r" > /dev/null 2>&1
done
```

```{bash 'parse fastqc results', echo=FALSE}
##==== copy fastqc generated zip files from report output directory to job work directory ==
cp -r REPORT_OUTPUT_DIR/*zip ./

# Create the index files; each one maps a sample id to that sample's extracted
# module file and is read back by the R chunks of the report.
echo "sample_id,file_path" > PWF_file_paths.txt   # Pass, Warning, Fail
echo "sample_id,file_path" > PBQS_file_paths.txt  # Per Base Quality Score
echo "sample_id,file_path" > PSQS_file_paths.txt  # Per Sequence Quality Score
echo "sample_id,file_path" > PSGC_file_paths.txt  # Per Sequence GC Content
echo "sample_id,file_path" > PBSC_file_paths.txt  # Per Base Sequence Content
echo "sample_id,file_path" > PBNC_file_paths.txt  # Per Base N Content
echo "sample_id,file_path" > SDL_file_paths.txt   # Sequence Duplication Level
echo "sample_id,file_path" > SLD_file_paths.txt   # Sequence Length Distribution
echo "sample_id,file_path" > KMC_file_paths.txt   # Kmer Content

# extract_module BASE TITLE ABBR
#   Writes the lines of BASE/fastqc_data.txt that lie between the line starting
#   with ">>TITLE" and the next line containing END_MODULE (both excluded) to
#   BASE-ABBR.txt, then appends "BASE,BASE-ABBR.txt" to ABBR_file_paths.txt.
extract_module() {
    local base=$1 title=$2 abbr=$3
    awk -v start="^>>${title}" \
        '$0 ~ start {flag=1; next} /END_MODULE/ {flag=0} flag' \
        "$base"/fastqc_data.txt > "$base"-"$abbr".txt
    echo "${base},${base}-${abbr}.txt" >> "${abbr}_file_paths.txt"
}

for i in *.zip
do
    # An unmatched glob stays literal; skip it instead of unzipping "*.zip".
    [ -e "$i" ] || continue

    # Archive name without the trailing ".zip"; also the extracted directory name.
    BASE=${i%.zip}
    echo "$BASE"
    unzip "${BASE}.zip" > /dev/null 2>&1

    ##====== pass,warning,fail (PWF) =============
    # Module header lines look like ">>Title<TAB>status"; keep those, drop the
    # ">>END_MODULE" markers and strip the leading ">>".
    awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt
    echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt

    ##====== per-module tables ===================
    extract_module "$BASE" "Per base sequence quality"    PBQS
    extract_module "$BASE" "Per sequence quality scores"  PSQS
    extract_module "$BASE" "Per sequence GC content"      PSGC
    extract_module "$BASE" "Per base sequence content"    PBSC
    extract_module "$BASE" "Per base N content"           PBNC
    extract_module "$BASE" "Sequence Duplication Levels"  SDL
    extract_module "$BASE" "Sequence Length Distribution" SLD
    extract_module "$BASE" "Kmer Content"                 KMC
done
```

## Evaluation Overview

```{r 'overview'}
# Index of per-sample pass/warn/fail files written by the bash chunk above.
PWF_file_paths <- read.csv("PWF_file_paths.txt", header = TRUE,
                           stringsAsFactors = FALSE)
if (nrow(PWF_file_paths) == 0) {
  stop("PWF_file_paths.txt lists no samples; no FastQC results were parsed.",
       call. = FALSE)
}

# One two-column table per sample: module name ("item") and its status,
# with the status column named after the sample id.
pwf_tables <- lapply(seq_len(nrow(PWF_file_paths)), function(i) {
  pwf_df <- read.csv(PWF_file_paths[i, 2], sep = "\t", header = FALSE,
                     stringsAsFactors = FALSE)
  colnames(pwf_df) <- c("item", PWF_file_paths[i, 1])
  pwf_df
})

# Keep "item" from the first sample only and bind every sample's status column
# next to it. This relies on every sample listing the same modules in the same
# order, exactly as the row-wise cbind always did.
status_columns <- lapply(pwf_tables[-1], function(tbl) tbl[, 2, drop = FALSE])
PWF_df <- do.call(cbind, c(pwf_tables[1], status_columns))
```

```{r}
# NOTE(review): formatter(), style() and formattable() come from the
# formattable package, which is not attached anywhere in this document;
# presumably the rendering script loads it. Confirm.

# Status-to-icon lookup; not referenced by the visible chunks.
my_icon <- c("ok", "remove", "star")
names(my_icon) <- c("pass", "fail", "warn")

# Colour each sample's status cell: green for pass, red for fail, and orange
# for anything else (i.e. warn).
sample_columns <- colnames(PWF_df)[-1]
evaluate_list <- lapply(sample_columns, function(sample_column) {
  formatter(
    "span",
    style = x ~ style(
      "background-color" = ifelse(x == "pass", "#9CD027",
                                  ifelse(x == "fail", "#CC0000", "#FF4E00")),
      "color" = "white",
      "width" = "50px",
      "float" = "left",
      "padding-right" = "5px"
    )
  )
})
names(evaluate_list) <- sample_columns

formattable(PWF_df, evaluate_list)
```