Mercurial > repos > mingchen0919 > rmarkdown_fastqc_site
comparison 01_evaluation_overview.Rmd @ 7:d820be692d74 draft
planemo upload for repository https://github.com/statonlab/docker-GRReport/tree/master/my_tools/rmarkdown_fastqc_site commit d91f269e8bc09a488ed2e005122bbb4a521f44a0-dirty
| author | mingchen0919 |
|---|---|
| date | Tue, 08 Aug 2017 12:36:13 -0400 |
| parents | d732d4526c6d |
| children | 507eec497730 |
comparison
equal
deleted
inserted
replaced
| 6:2f4df2be0572 | 7:d820be692d74 |
|---|---|
| 1 --- | |
| 2 title: "Evaluation Overview" | |
| 3 output: html_document | |
| 4 --- | |
| 5 | |
| 6 ```{r setup, include=FALSE, warning=FALSE, message=FALSE} | |
| 7 knitr::opts_chunk$set(echo = ECHO) # sets the default `echo` option for the chunks that follow; ECHO is presumably a placeholder substituted before rendering - TODO confirm | |
| 8 ``` | |
| 9 | |
| 10 ```{bash 'copy data from datasets directory to working directory', echo=FALSE} | |
| 11 # Copy uploaded data to the working directory; READS is split on commas only, so paths containing spaces stay intact | |
| 12 IFS=',' read -r -a read_paths <<< "READS" | |
| 13 for f in "${read_paths[@]}"; do | |
| 14 if [ -n "$f" ]; then cp -- "$f" ./; fi # skip empty entries produced by doubled or leading commas | |
| 15 done | |
| 16 ``` | |
| 17 | |
| 18 ```{bash 'run fastqc', echo=FALSE} | |
| 19 # run fastqc on every .dat file in the working directory and place outputs into the report directory | |
| 20 for r in *.dat; do | |
| 21 if [ ! -e "$r" ]; then continue; fi # the glob stays literal when nothing matches; skip it | |
| 22 fastqc -o REPORT_OUTPUT_DIR "$r" > /dev/null 2>&1 | |
| 23 done | |
| 24 ``` | |
| 25 | |
| 26 ```{bash 'parse fastqc results', echo=FALSE} | |
| 27 ##==== copy the fastqc-generated zip files from the report output directory to the job work directory == | |
| 28 cp -r REPORT_OUTPUT_DIR/*zip ./ | |
| 29 | |
| 30 # create one index file per fastqc module; each later row maps a sample_id to that sample's extracted file | |
| 31 echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail | |
| 32 echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score | |
| 33 echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score | |
| 34 echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content | |
| 35 echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content | |
| 36 echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content | |
| 37 echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level | |
| 38 echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution | |
| 39 echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content | |
| 40 | |
| 41 for i in *.zip; do | |
| 42 if [ ! -e "$i" ]; then continue; fi # the glob stays literal when nothing matches; skip it | |
| 43 BASE="${i%.zip}" # archive name without the .zip suffix; used as the sample id and as the unpacked directory name | |
| 44 echo "$BASE" | |
| 45 unzip "${BASE}.zip" > /dev/null 2>&1 | |
| 46 | |
| 47 ##====== pass,warning,fail (PWF): module header lines, END_MODULE markers dropped, leading >> removed ===== | |
| 48 awk '/^>>/ && !/END_MODULE/ {sub(/^>>/, ""); print}' "$BASE"/fastqc_data.txt > "$BASE"-PWF.txt | |
| 49 echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt | |
| 50 | |
| 51 ##====== per base quality scores (PBQS): lines between the module header and its END_MODULE marker ====== | |
| 52 awk '/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt | |
| 53 echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt | |
| 54 | |
| 55 ##====== per sequence quality scores (PSQS) | |
| 56 awk '/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt | |
| 57 echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt | |
| 58 | |
| 59 ##====== Per sequence GC content (PSGC) | |
| 60 awk '/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt | |
| 61 echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt | |
| 62 | |
| 63 ##====== Per Base Sequence Content (PBSC) | |
| 64 awk '/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt | |
| 65 echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt | |
| 66 | |
| 67 ##====== Per Base N Content (PBNC) | |
| 68 awk '/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt | |
| 69 echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt | |
| 70 | |
| 71 ##====== Sequence Duplication Level (SDL) | |
| 72 awk '/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt | |
| 73 echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt | |
| 74 | |
| 75 ##====== Sequence Length Distribution (SLD) | |
| 76 awk '/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SLD.txt | |
| 77 echo "${BASE},${BASE}-SLD.txt" >> SLD_file_paths.txt | |
| 78 | |
| 79 ##====== Kmer Content (KMC) ============ | |
| 80 awk '/^>>Kmer Content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-KMC.txt | |
| 81 echo "${BASE},${BASE}-KMC.txt" >> KMC_file_paths.txt | |
| 82 | |
| 83 done | |
| 84 ``` | |
| 85 | |
| 86 | |
| 87 ## Evaluation Overview | |
| 88 | |
| 89 ```{r 'overview'} | |
| 90 PWF_file_paths = read.csv('PWF_file_paths.txt', # index of per-sample PWF files with columns sample_id,file_path | |
| 91 header = TRUE, stringsAsFactors = FALSE) | |
| 92 PWF_df = NULL # combined table; NULL until the first sample is read (replaces rm(), which warns when the object does not exist) | |
| 93 for (i in seq_len(nrow(PWF_file_paths))) { # seq_len() gives zero iterations for an empty index, unlike 1:nrow() | |
| 94 file_path = PWF_file_paths[i,2] | |
| 95 pwf_df = read.csv(file_path, | |
| 96 sep='\t', header=FALSE, stringsAsFactors = FALSE) | |
| 97 colnames(pwf_df) = c('item', PWF_file_paths[i,1]) # second column is named after the sample id | |
| 98 if (is.null(PWF_df)) { | |
| 99 PWF_df = pwf_df # first sample contributes both the item column and its own column | |
| 100 } else { | |
| 101 PWF_df = cbind(PWF_df, pwf_df[,2,drop=FALSE]) # later samples add only their own column; assumes every file lists items in the same order - TODO confirm | |
| 102 } | |
| 103 } | |
| 104 ``` | |
| 105 | |
| 106 | |
| 107 ```{r} | |
| 108 my_icon = setNames(c('ok', 'remove', 'star'), c('pass', 'fail', 'warn')) # not referenced again in this chunk | |
| 109 # One shared cell formatter: green background for 'pass', red for 'fail', orange for any other value. | |
| 110 status_formatter = formatter( | |
| 111 "span", | |
| 112 style = x ~ style( | |
| 113 "background-color" = ifelse(x == 'pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')), | |
| 114 "color" = "white", | |
| 115 "width" = "50px", | |
| 116 "float" = "left", | |
| 117 "padding-right" = "5px" | |
| 118 ) | |
| 119 ) | |
| 120 evaluate_list = list() | |
| 121 evaluate_list[colnames(PWF_df)[-1]] = list(status_formatter) # same formatter for every column except the first | |
| 122 formattable(PWF_df, evaluate_list) | |
| 123 ``` |
