Mercurial > repos > mingchen0919 > rmarkdown_fastqc_report
comparison fastqc_report.Rmd @ 2:0374e090e38e draft
Uploaded
author | mingchen0919 |
---|---|
date | Mon, 07 Aug 2017 21:40:56 -0400 |
parents | |
children | e629c2288316 |
comparison
equal
deleted
inserted
replaced
1:22cd2369354b | 2:0374e090e38e |
---|---|
1 --- | |
2 title: "Fastqc report: short reads quality evaluation" | |
3 author: "Ming Chen" | |
4 output: html_document | |
5 --- | |
6 | |
7 ```{r setup, include=FALSE} | |
8 knitr::opts_chunk$set(echo=ECHO, warning=FALSE, message=FALSE) | |
9 library(plyr) | |
10 library(stringr) | |
11 library(dplyr) | |
12 library(highcharter) | |
13 library(DT) | |
14 library(reshape2) | |
15 # library(Kmisc) | |
16 library(plotly) | |
17 library(formattable) | |
18 library(htmltools) | |
19 ``` | |
20 | |
21 | |
22 ```{bash 'create output directory', echo=FALSE} | |
23 # Create the extra-files directory that later chunks write into; -p keeps a re-run from failing when it already exists. | |
24 mkdir -p REPORT_OUTPUT_DIR | |
25 ``` | |
26 | |
27 # Fastqc analysis | |
28 ```{bash 'copy data to working directory', echo=FALSE} | |
29 # Stage the uploaded read files in the working directory. | |
30 # The comma-separated list is turned into whitespace-separated words for the loop. | |
31 for f in $(echo READS | tr ',' ' '); do | |
32 cp $f ./ | |
33 done | |
34 ``` | |
35 | |
36 | |
37 ```{bash 'run fastqc', echo=FALSE} | |
38 for r in *.dat; do  # run FastQC on every staged .dat file; a glob avoids re-splitting the output of ls | |
39 [ -e "$r" ] || continue  # an unmatched glob stays literal, so skip it when no .dat file exists | |
40 fastqc -o REPORT_OUTPUT_DIR "$r" > /dev/null 2>&1 | |
41 done | |
42 ``` | |
43 | |
44 ## Fastqc html reports | |
45 | |
46 Below are links to the original ***FastQC*** HTML reports. | |
47 ```{r 'html report links'} | |
48 # Build a bullet list linking every FastQC html report found in the output directory. | |
49 # The pattern is anchored on the extension: '.*html' also matched any file name merely containing "html". | |
50 html_files = list.files('REPORT_OUTPUT_DIR', pattern = '\\.html$') | |
51 html_report_list = lapply(html_files, function(f) tags$li(tags$a(href = f, f))) | |
52 names(html_report_list) = html_files  # keep the list named by file, as the original loop did | |
53 tags$ul(html_report_list) | |
54 ``` | |
55 | |
56 | |
57 ## Parsing fastqc data | |
58 | |
59 ```{bash echo=FALSE} | |
60 ##==== copy fastqc generated zip files from report output directory to job work directory == | |
61 cp -r REPORT_OUTPUT_DIR/*zip ./ | |
62 | |
63 # create one index file per FastQC module; each row maps a sample_id to the table extracted for it | |
64 echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail | |
65 echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score | |
66 echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score | |
67 echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content | |
68 echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content | |
69 echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content | |
70 echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level | |
71 echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution | |
72 echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content | |
73 | |
74 for i in *.zip; do  # glob instead of parsing ls, so unusual file names are not re-split | |
75 [ -e "$i" ] || continue  # an unmatched glob stays literal; skip it when no zip exists | |
76 BASE="${i%.zip}"  # archive name without the .zip suffix; used as sample id and as the unpacked directory name | |
77 echo "$BASE" | |
78 unzip "$i" > /dev/null 2>&1 | |
79 | |
80 ##====== pass,warning,fail (PWF) ============= | |
81 awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt | |
82 echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt | |
83 | |
84 ##====== per base quality scores (PBQS) ====== | |
85 awk '/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt | |
86 echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt | |
87 | |
88 ##====== per sequence quality scores (PSQS) | |
89 awk '/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt | |
90 echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt | |
91 | |
92 ##====== Per sequence GC content (PSGC) | |
93 awk '/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt | |
94 echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt | |
95 | |
96 ##====== Per Base Sequence Content (PBSC) | |
97 awk '/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt | |
98 echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt | |
99 | |
100 ##====== Per Base N Content (PBNC) | |
101 awk '/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt | |
102 echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt | |
103 | |
104 ##====== Sequence Duplication Level (SDL) | |
105 awk '/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt | |
106 echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt | |
107 | |
108 ##====== Sequence Length Distribution (SLD) | |
109 awk '/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SLD.txt | |
110 echo "${BASE},${BASE}-SLD.txt" >> SLD_file_paths.txt | |
111 | |
112 ##====== Kmer Content ============ | |
113 awk '/^>>Kmer Content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-KMC.txt | |
114 echo "${BASE},${BASE}-KMC.txt" >> KMC_file_paths.txt | |
115 | |
116 done | |
117 ``` | |
118 | |
119 | |
120 ## Evaluation Overview | |
121 | |
122 ```{r 'overview'} | |
123 PWF_file_paths = read.csv('PWF_file_paths.txt', | |
124 header = TRUE, stringsAsFactors = FALSE) | |
125 PWF_df = NULL  # explicit empty start; rm() on a missing object only raised a warning | |
126 for (i in seq_len(nrow(PWF_file_paths))) {  # seq_len() yields no iterations for an empty index, unlike 1:0 | |
127 file_path = PWF_file_paths[i, 2] | |
128 pwf_df = read.csv(file_path, | |
129 sep = '\t', header = FALSE, stringsAsFactors = FALSE) | |
130 colnames(pwf_df) = c('item', PWF_file_paths[i, 1])  # status column is named after the sample id | |
131 if (is.null(PWF_df)) { | |
132 PWF_df = pwf_df  # the first sample also supplies the 'item' column | |
133 } else { | |
134 PWF_df = cbind(PWF_df, pwf_df[, 2, drop = FALSE])  # later samples contribute only their status column | |
135 } | |
136 } | |
137 ``` | |
138 | |
139 ```{r} | |
140 # Colour each status cell: green for 'pass', red for 'fail', orange-red for anything else. | |
141 # The unused my_icon lookup was dropped; the formatter is built once because it is identical for every column. | |
142 status_formatter = formatter( | |
143 "span", | |
144 style = x ~ style("background-color" = ifelse(x =='pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')), | |
145 "color" = "white", | |
146 "width" = "50px", | |
147 "float" = "left", | |
148 "padding-right" = "5px") | |
149 ) | |
150 evaluate_list = list() | |
151 for (i in colnames(PWF_df)[-1]) {  # every column after the first gets the status formatter | |
152 evaluate_list[[i]] = status_formatter | |
153 } | |
154 formattable(PWF_df, evaluate_list) | |
155 ``` | |
156 | |
157 | |
158 ## Per Base Quality Scores | |
159 | |
160 ```{r} | |
161 # Collect the per-base quality table of every sample into one long data frame, PBQS_df. | |
162 PBQS_df = data.frame() | |
163 PBQS_file_paths = read.csv('PBQS_file_paths.txt', | |
164 header = TRUE, stringsAsFactors = FALSE) | |
165 for (i in seq_len(nrow(PBQS_file_paths))) {  # seq_len() yields no iterations for an empty index, unlike 1:0 | |
166 file_path = PBQS_file_paths[i, 2] | |
167 pbqs_df = read.csv(file_path, | |
168 sep = '\t', header = TRUE, stringsAsFactors = FALSE) %>% | |
169 mutate(Base1 = as.numeric(str_split_fixed(X.Base, '-', 2)[, 1]), | |
170 Base2 = as.numeric(str_split_fixed(X.Base, '-', 2)[, 2])) %>% | |
171 (function (df) { | |
172 # X.Base may be a range such as "10-14": emit one row per end of the range, keyed by Base. | |
173 # Columns are renamed by name (no hard-coded column count); Base2 is numeric, so test it with is.na(). | |
174 df1 = select(df, -Base2) %>% dplyr::rename(Base = Base1) | |
175 df2 = select(df, -Base1) %>% filter(!is.na(Base2)) %>% dplyr::rename(Base = Base2) | |
176 res = rbind(df1, df2) %>% arrange(Base) | |
177 return(res) | |
178 }) | |
179 pbqs_df$sample_id = rep(PBQS_file_paths[i, 1], nrow(pbqs_df)) | |
180 PBQS_df = rbind(PBQS_df, pbqs_df) | |
181 } | |
182 ``` | |
183 | |
184 | |
185 ```{r} | |
186 # datatable(PBQS_df) | |
187 # Mean quality score per base position, one line per sample. | |
188 # Horizontal guide lines mark Phred 27 (dashed green) and Phred 20 (red). | |
189 phred_guides = list( | |
190 list(label = list(text = "Phred Score = 27"), | |
191 width = 2, | |
192 dashStyle = "dash", | |
193 color = "green", | |
194 value = 27), | |
195 list(label = list(text = "Phred Score = 20"), | |
196 width = 2, | |
197 color = "red", | |
198 value = 20) | |
199 ) | |
200 max_phred = max(PBQS_df$Mean) + 10  # y-axis ceiling: highest mean score plus headroom | |
201 pbqs_chart = hchart(PBQS_df, "line", hcaes(x = Base, y = Mean, group = sample_id)) | |
202 pbqs_chart = hc_title(pbqs_chart, text = "Per Base Quality Score") | |
203 pbqs_chart = hc_yAxis(pbqs_chart, | |
204 title = list(text = "Mean Base Quality Score"), | |
205 min = 0, | |
206 max = max_phred, | |
207 plotLines = phred_guides) | |
208 hc_exporting(pbqs_chart, enabled = TRUE) | |
209 ``` | |
210 | |
211 | |
212 ## Per Base N Content | |
213 | |
214 ```{r} | |
215 # Collect the per-base N content table of every sample into one long data frame, PBNC_df. | |
216 PBNC_df = data.frame() | |
217 PBNC_file_paths = read.csv('PBNC_file_paths.txt', | |
218 header = TRUE, stringsAsFactors = FALSE) | |
219 for (i in seq_len(nrow(PBNC_file_paths))) {  # seq_len() yields no iterations for an empty index, unlike 1:0 | |
220 file_path = PBNC_file_paths[i, 2] | |
221 pbnc_df = read.csv(file_path, | |
222 sep = '\t', header = TRUE, stringsAsFactors = FALSE) %>% | |
223 mutate(Base1 = as.numeric(str_split_fixed(X.Base, '-', 2)[, 1]), | |
224 Base2 = as.numeric(str_split_fixed(X.Base, '-', 2)[, 2])) %>% | |
225 (function (df) { | |
226 # X.Base may be a range such as "10-14": emit one row per end of the range, keyed by Base. | |
227 # Columns are renamed by name (no hard-coded column count); Base2 is numeric, so test it with is.na(). | |
228 df1 = select(df, -Base2) %>% dplyr::rename(Base = Base1) | |
229 df2 = select(df, -Base1) %>% filter(!is.na(Base2)) %>% dplyr::rename(Base = Base2) | |
230 res = rbind(df1, df2) %>% arrange(Base) | |
231 return(res) | |
232 }) | |
233 pbnc_df$sample_id = rep(PBNC_file_paths[i, 1], nrow(pbnc_df)) | |
234 PBNC_df = rbind(PBNC_df, pbnc_df) | |
235 } | |
236 ``` | |
237 | |
238 | |
239 ```{r} | |
240 # NOTE(review): N.Count is multiplied by 100 and plotted on an axis labelled "N %"; confirm the input | |
241 # is a fraction and not already a percentage. The unused max_phred computation was removed. | |
242 PBNC_df$N.Count = PBNC_df$N.Count * 100 | |
243 hchart(PBNC_df, "line", hcaes(x = as.character(Base), y = N.Count, group = sample_id)) %>% | |
244 hc_title( | |
245 text = "Per Base N Content" | |
246 ) %>% | |
247 hc_xAxis(title = list(text = "Base Position")) %>% | |
248 hc_yAxis( | |
249 title = list(text = "N %"), | |
250 plotLines = list( | |
251 # dashed red guide line at the 5 % mark | |
252 list(label = list(text = "N = 5%"), | |
253 width = 2, | |
254 dashStyle = "dash", | |
255 color = "red", | |
256 value = 5) | |
257 ) | |
258 ) %>% | |
259 hc_exporting(enabled = TRUE) | |
260 ``` | |
261 | |
262 | |
263 | |
264 | |
265 ## Per Sequence Quality Scores | |
266 | |
267 ```{r} | |
268 PSQS_df = data.frame() | |
269 PSQS_file_paths = read.csv('PSQS_file_paths.txt', | |
270 header = TRUE, stringsAsFactors = FALSE) | |
271 for (i in seq_len(nrow(PSQS_file_paths))) {  # seq_len() yields no iterations for an empty index, unlike 1:0 | |
272 # read this sample's per-sequence quality table, tag its rows with the sample id, then append | |
273 file_path = PSQS_file_paths[i, 2] | |
274 psqs_df = read.csv(file_path, | |
275 sep = '\t', header = TRUE, stringsAsFactors = FALSE) | |
276 psqs_df$sample_id = rep(PSQS_file_paths[i, 1], nrow(psqs_df)) | |
277 PSQS_df = rbind(PSQS_df, psqs_df) | |
278 } | |
279 ``` | |
280 | |
281 | |
282 ```{r} | |
283 # Count of sequences at each mean quality score, one line per sample. | |
284 # Vertical guide lines mark Phred 27 (dashed green) and Phred 20 (red). | |
285 phred_guides = list( | |
286 list(label = list(text = "Phred Score = 27"), | |
287 width = 2, | |
288 dashStyle = "dash", | |
289 color = "green", | |
290 value = 27), | |
291 list(label = list(text = "Phred Score = 20"), | |
292 width = 2, | |
293 color = "red", | |
294 value = 20) | |
295 ) | |
296 max_phred = max(PSQS_df$X.Quality) + 5  # x-axis ceiling: highest quality value plus headroom | |
297 psqs_chart = hchart(PSQS_df, "line", hcaes(x = X.Quality, y = Count, group = sample_id)) | |
298 psqs_chart = hc_title(psqs_chart, text = "Per Sequence Quality Score") | |
299 psqs_chart = hc_xAxis(psqs_chart, | |
300 title = list(text = "Mean Sequence Quality Score"), | |
301 min = 0, | |
302 max = max_phred, | |
303 plotLines = phred_guides) | |
304 hc_exporting(psqs_chart, enabled = TRUE) | |
305 ``` | |
306 | |
307 | |
308 ## Per Sequence GC Content | |
309 | |
310 | |
311 ```{r} | |
312 PSGC_df = data.frame() | |
313 PSGC_file_paths = read.csv('PSGC_file_paths.txt', | |
314 header = TRUE, stringsAsFactors = FALSE) | |
315 for (i in seq_len(nrow(PSGC_file_paths))) {  # seq_len() yields no iterations for an empty index, unlike 1:0 | |
316 # read this sample's per-sequence GC table, tag its rows with the sample id, then append | |
317 file_path = PSGC_file_paths[i, 2] | |
318 psgc_df = read.csv(file_path, | |
319 sep = '\t', header = TRUE, stringsAsFactors = FALSE) | |
320 psgc_df$sample_id = rep(PSGC_file_paths[i, 1], nrow(psgc_df)) | |
321 PSGC_df = rbind(PSGC_df, psgc_df) | |
322 } | |
323 ``` | |
324 | |
325 | |
326 ```{r} | |
327 # Sequence count against GC content, one line per sample (the unused max_phred computation was removed). | |
328 hchart(PSGC_df, "line", hcaes(x = X.GC.Content, y = Count, group = sample_id)) %>% | |
329 hc_title( | |
330 text = "Per Sequence GC Content" | |
331 ) %>% | |
332 hc_xAxis( | |
333 title = list(text = "% GC") | |
334 ) %>% | |
335 hc_exporting(enabled = TRUE) | |
336 ``` | |
337 | |
338 | |
339 ## Per Base Sequence Content | |
340 | |
341 ```{r} | |
342 # Collect the per-base sequence content table of every sample into one long data frame, PBSC_df. | |
343 PBSC_df = data.frame() | |
344 PBSC_file_paths = read.csv('PBSC_file_paths.txt', | |
345 header = TRUE, stringsAsFactors = FALSE) | |
346 for (i in seq_len(nrow(PBSC_file_paths))) {  # seq_len() yields no iterations for an empty index, unlike 1:0 | |
347 file_path = PBSC_file_paths[i, 2] | |
348 pbsc_df = read.csv(file_path, | |
349 sep = '\t', header = TRUE, stringsAsFactors = FALSE) %>% | |
350 mutate(Base1 = as.numeric(str_split_fixed(X.Base, '-', 2)[, 1]), | |
351 Base2 = as.numeric(str_split_fixed(X.Base, '-', 2)[, 2])) %>% | |
352 (function (df) { | |
353 # X.Base may be a range such as "10-14": emit one row per end of the range, keyed by Base. | |
354 # Columns are renamed by name (no hard-coded column count); Base2 is numeric, so test it with is.na(). | |
355 df1 = select(df, -Base2) %>% dplyr::rename(Base = Base1) | |
356 df2 = select(df, -Base1) %>% filter(!is.na(Base2)) %>% dplyr::rename(Base = Base2) | |
357 res = rbind(df1, df2) %>% arrange(Base) | |
358 return(res) | |
359 }) | |
360 pbsc_df$sample_id = rep(PBSC_file_paths[i, 1], nrow(pbsc_df)) | |
361 PBSC_df = rbind(PBSC_df, pbsc_df) | |
362 } | |
363 ``` | |
364 | |
365 | |
366 ```{r out.width="100%"} | |
367 # Reshape to long format (one row per position, sample and measured column), then draw one panel per sample. | |
368 PBSC_df_2 = select(PBSC_df, -X.Base) %>% | |
369 melt(id.vars = c('Base', 'sample_id'), value.name = 'base_percentage')  # id.vars spelled out; `id` relied on partial matching | |
370 p = ggplot(data = PBSC_df_2, aes(x = Base, y = base_percentage, group = variable, color = variable)) + | |
371 geom_line() + facet_wrap(~ sample_id) | |
372 ggplotly(p) | |
373 ``` | |
374 | |
375 | |
376 ## References | |
377 | |
378 * Andrews, Simon. "FastQC: a quality control tool for high throughput sequence data." (2010): 175-176. | |
379 * Goecks, Jeremy, Anton Nekrutenko, and James Taylor. "Galaxy: a comprehensive approach for supporting accessible, reproducible, and transparent computational research in the life sciences." Genome biology 11.8 (2010): R86. | |
380 * Afgan, Enis, et al. "The Galaxy platform for accessible, reproducible and collaborative biomedical analyses: 2016 update." Nucleic acids research (2016): gkw343. | |
381 * Highcharts. https://www.highcharts.com/. (accessed May 26, 2017). | |
382 * R Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/. | |
383 * Joshua Kunst (2017). highcharter: A Wrapper for the 'Highcharts' Library. R package version 0.5.0. https://CRAN.R-project.org/package=highcharter | |
384 * Carson Sievert, Chris Parmer, Toby Hocking, Scott Chamberlain, Karthik Ram, Marianne Corvellec and Pedro Despouy (2017). plotly: Create Interactive Web Graphics via 'plotly.js'. R package version 4.6.0. https://CRAN.R-project.org/package=plotly |