Mercurial > repos > mingchen0919 > aurora_fastqc
comparison rmarkdown_report.Rmd @ 0:0aeed70b3bc5 draft default tip
planemo upload commit 841d8b22bf9f1aaed6bfe8344b60617f45b275b2-dirty
author | mingchen0919 |
---|---|
date | Fri, 14 Dec 2018 00:38:44 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0aeed70b3bc5 |
---|---|
1 --- | |
2 title: '[FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) report' | |
3 output: | |
4 html_document: | |
5 highlight: pygments | |
6 --- | |
7 | |
8 | |
9 ```{r setup, include=FALSE, warning=FALSE, message=FALSE} | |
10 knitr::opts_chunk$set(error = TRUE, echo = FALSE) | |
11 ``` | |
12 | |
13 ```{css, echo=FALSE} | |
14 pre code, pre, code { | |
15 white-space: pre !important; | |
16 overflow-x: scroll !important; | |
17 word-break: keep-all !important; | |
18 word-wrap: initial !important; | |
19 } | |
20 ``` | |
21 | |
22 ```{r, echo=FALSE} | |
23 # to make the css theme to work, <link></link> tags cannot be added directly | |
24 # as <script></script> tags as below. | |
25 # it has to be added using a code chunk with the htmltool functions!!! | |
26 css_link = tags$link() | |
27 css_link$attribs = list(rel="stylesheet", href="vakata-jstree-3.3.5/dist/themes/default/style.min.css") | |
28 css_link | |
29 ``` | |
30 | |
31 ```{r, eval=FALSE, echo=FALSE} | |
32 # this code chunk is purely for adding comments | |
33 # below is to add jQuery and jstree javascripts | |
34 ``` | |
35 | |
36 <script src="vakata-jstree-3.3.5/dist/jstree.min.js"></script> | |
37 | |
38 | |
39 ```{r, eval=FALSE, echo=FALSE} | |
40 # this code chunk is purely for adding comments | |
41 # javascript code below is to build the file tree interface | |
42 # see this for how to implement opening hyperlink: https://stackoverflow.com/questions/18611317/how-to-get-i-get-leaf-nodes-in-jstree-to-open-their-hyperlink-when-clicked-when | |
43 ``` | |
44 <script> | |
45 jQuery(function () { | |
46 // create an instance when the DOM is ready | |
47 jQuery('#jstree').jstree().bind("select_node.jstree", function (e, data) { | |
48 window.open( data.node.a_attr.href, data.node.a_attr.target ) | |
49 }); | |
50 }); | |
51 </script> | |
52 | |
53 | |
54 ```{r, eval=FALSE, echo=FALSE} | |
55 --- | |
56 # ADD YOUR DATA ANALYSIS CODE AND MARKUP TEXT BELOW TO EXTEND THIS R MARKDOWN FILE | |
57 --- | |
58 ``` | |
59 | |
60 | |
61 # Run FastQC | |
62 | |
63 ```{bash} | |
64 sh ${TOOL_INSTALL_DIR}/build-and-run-job-scripts.sh | |
65 ``` | |
66 | |
67 ```{r echo=FALSE,results='asis'} | |
68 # display fastqc job script | |
69 cat('```bash\n') | |
70 cat(readLines(paste0(Sys.getenv('REPORT_FILES_PATH'), '/job-1-script.sh')), sep = '\n') | |
71 cat('\n```') | |
72 ``` | |
73 | |
74 # Fastqc Output Visualization | |
75 | |
76 ## Overview | |
77 | |
78 ```{r eval=TRUE} | |
79 read_1_summary = read.csv(paste0(opt$X_d, '/read_1_fastqc/summary.txt'), | |
80 stringsAsFactors = FALSE, | |
81 header = FALSE, sep = '\t')[, 2:1] | |
82 read_2_summary = read.csv(paste0(opt$X_d, '/read_2_fastqc/summary.txt'), | |
83 stringsAsFactors = FALSE, | |
84 header = FALSE, sep = '\t')[, 1] | |
85 combined_summary = data.frame(read_1_summary, read_2_summary, stringsAsFactors = FALSE) | |
86 names(combined_summary) = c('MODULE', 'Pre-trimming', 'Post-trimming') | |
87 combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)' | |
88 combined_summary[combined_summary == 'WARN'] = 'WARN (!)' | |
89 DT::datatable(combined_summary) | |
90 ``` | |
91 | |
92 ```{r 'function definition', echo=FALSE} | |
93 extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") { | |
94 f = readLines(fastqc_data) | |
95 start_line = grep(module_name, f) | |
96 end_module_lines = grep('END_MODULE', f) | |
97 end_line = end_module_lines[which(end_module_lines > start_line)[1]] | |
98 module_data = f[(start_line+1):(end_line-1)] | |
99 writeLines(module_data, '/tmp/temp.txt') | |
100 read.csv('/tmp/temp.txt', sep = '\t', header = header, comment.char = comment.char) | |
101 } | |
102 ``` | |
103 | |
104 | |
105 ### Basic Statistics {.tabset} | |
106 | |
107 #### Before | |
108 | |
109 ```{r} | |
110 fastqc_data_1 = paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt') | |
111 module_name = 'Basic Statistics pass' | |
112 | |
113 basic_statistics = extract_data_module(fastqc_data_1, module_name) | |
114 colnames(basic_statistics) = c('Measure', 'Value') | |
115 DT::datatable(basic_statistics) | |
116 ``` | |
117 | |
118 #### After | |
119 | |
120 ```{r} | |
121 fastqc_data_2 = paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt') | |
122 module_name = 'Basic Statistics pass' | |
123 | |
124 basic_statistics = extract_data_module(fastqc_data_2, module_name) | |
125 colnames(basic_statistics) = c('Measure', 'Value') | |
126 DT::datatable(basic_statistics) | |
127 ``` | |
128 | |
129 | |
130 ### Per base sequence quality | |
131 | |
132 ```{r 'per base sequence quality'} | |
133 ## reads 1 | |
134 pbsq_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base sequence quality') | |
135 pbsq_1$id = 1:length(pbsq_1$X.Base) | |
136 pbsq_1$trim = 'before' | |
137 | |
138 ## reads 2 | |
139 pbsq_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base sequence quality') | |
140 pbsq_2$id = 1:length(pbsq_2$X.Base) | |
141 pbsq_2$trim = 'after' | |
142 | |
143 comb_pbsq = rbind(pbsq_1, pbsq_2) | |
144 comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim) | |
145 | |
146 p = ggplot(data = comb_pbsq) + | |
147 geom_boxplot(mapping = aes(x = id, | |
148 lower = Lower.Quartile, | |
149 upper = Upper.Quartile, | |
150 middle = Median, | |
151 ymin = X10th.Percentile, | |
152 ymax = X90th.Percentile, | |
153 fill = "yellow"), | |
154 stat = 'identity') + | |
155 geom_line(mapping = aes(x = id, y = Mean, color = "red")) + | |
156 scale_x_continuous(name = '\nPosition in read (bp)', breaks = pbsq_2$id, labels = pbsq_2$X.Base) + | |
157 scale_y_continuous(limits = c(0, max(comb_pbsq$Upper.Quartile) + 5)) + | |
158 scale_fill_identity() + | |
159 scale_color_identity() + | |
160 facet_grid(. ~ trim) + | |
161 theme(axis.text.x = element_text(size = 5), | |
162 panel.background = element_rect(fill = NA), | |
163 panel.grid.major.y = element_line(color = 'blue', size = 0.1)) | |
164 p | |
165 ``` | |
166 | |
167 | |
168 ### Per tile sequence quality | |
169 | |
170 ```{r 'per tile sequence quality'} | |
171 ## check if 'per tile sequence quality' module exits or not | |
172 check_ptsq = grep('Per tile sequence quality', readLines(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'))) | |
173 if (length(check_ptsq) > 0) { | |
174 ## reads 1 | |
175 ptsq_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per tile sequence quality') | |
176 ptsq_1$trim = 'before' | |
177 | |
178 ## reads 2 | |
179 ptsq_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per tile sequence quality') | |
180 ptsq_2$trim = 'after' | |
181 | |
182 comb_ptsq = rbind(ptsq_1, ptsq_2) | |
183 comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim) | |
184 comb_ptsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base) | |
185 | |
186 # convert integers to charaters | |
187 # comb_ptsq$Tile = as.character(comb_ptsq$X.Tile) | |
188 | |
189 p = ggplot(data = comb_ptsq) + | |
190 geom_raster(mapping = aes(x = Base, y = X.Tile, fill = Mean)) + | |
191 facet_grid(. ~ trim) + | |
192 scale_x_discrete(name = "\nPosition in read (bp)") + | |
193 scale_y_continuous(name = "") + | |
194 scale_fill_gradient(low = "blue", high = "red") + | |
195 theme(axis.text.x = element_text(size = 5, angle = 90), | |
196 axis.text.y = element_text(size = 5), | |
197 panel.background = element_rect(fill = NA)) | |
198 ggplotly(p) | |
199 } else { | |
200 print('No "per tile sequence quality" data') | |
201 } | |
202 ``` | |
203 | |
204 ### Per sequence quality score | |
205 | |
206 ```{r 'Per sequence quality score'} | |
207 ## reads 1 | |
208 psqs_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per sequence quality scores') | |
209 psqs_1$trim = 'before' | |
210 | |
211 ## reads 2 | |
212 psqs_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per sequence quality scores') | |
213 psqs_2$trim = 'after' | |
214 | |
215 comb_psqs = rbind(psqs_1, psqs_2) | |
216 comb_psqs$trim = factor(levels = c('before', 'after'), comb_psqs$trim) | |
217 | |
218 p = ggplot(data = comb_psqs) + | |
219 geom_line(mapping = aes(x = X.Quality, y = Count), color = 'red') + | |
220 facet_grid(. ~ trim) + | |
221 scale_x_continuous(name = '\nMean Sequence Qaulity (Phred Score)', | |
222 limits = c(min(comb_psqs$X.Quality), max(comb_psqs$X.Quality))) + | |
223 scale_y_continuous(name = '') + | |
224 theme(panel.background = element_rect(fill = NA), | |
225 axis.line = element_line(), | |
226 panel.grid.major.y = element_line(color = 'blue', size = 0.1)) | |
227 p | |
228 ``` | |
229 | |
230 ### Per base sequence content | |
231 | |
232 ```{r 'Per base sequence content'} | |
233 ## reads 1 | |
234 pbsc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base sequence content') | |
235 pbsc_1$id = 1:length(pbsc_1$X.Base) | |
236 | |
237 melt_pbsc_1 = melt(pbsc_1, id=c('X.Base', 'id')) | |
238 melt_pbsc_1$trim = 'before' | |
239 | |
240 | |
241 ## reads 2 | |
242 pbsc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base sequence content') | |
243 pbsc_2$id = 1:length(pbsc_2$X.Base) | |
244 | |
245 melt_pbsc_2 = melt(pbsc_2, id=c('X.Base', 'id')) | |
246 melt_pbsc_2$trim = 'after' | |
247 | |
248 comb_pbsc = rbind(melt_pbsc_1, melt_pbsc_2) | |
249 comb_pbsc$trim = factor(levels = c('before', 'after'), comb_pbsc$trim) | |
250 | |
251 p = ggplot(data = comb_pbsc) + | |
252 geom_line(mapping = aes(x = id, y = value, color = variable)) + | |
253 facet_grid(. ~ trim) + | |
254 xlim(min(comb_pbsc$id), max(comb_pbsc$id)) + | |
255 ylim(0, 100) + | |
256 xlab('\nPosition in read (bp)') + | |
257 ylab('') + | |
258 scale_color_discrete(name = '') + | |
259 theme_classic() | |
260 p | |
261 ``` | |
262 | |
263 ### Per sequence GC content | |
264 | |
265 ```{r 'Per sequence GC content'} | |
266 ## reads 1 | |
267 psGCc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per sequence GC content') | |
268 psGCc_1$trim = 'before' | |
269 | |
270 ## reads 2 | |
271 psGCc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per sequence GC content') | |
272 psGCc_2$trim = 'after' | |
273 | |
274 comb_psGCc = rbind(psGCc_1, psGCc_2) | |
275 comb_psGCc$trim = factor(levels = c('before', 'after'), comb_psGCc$trim) | |
276 | |
277 p = ggplot(data = comb_psGCc, aes(x = X.GC.Content, y = Count)) + | |
278 geom_line(color = 'red') + | |
279 facet_grid(. ~ trim) + | |
280 xlab('\nMean Sequence Qaulity (Phred Score)') + | |
281 ylab('') + | |
282 scale_color_discrete(name = '') + | |
283 theme_classic() | |
284 p | |
285 ``` | |
286 | |
287 | |
288 ### Per base N content | |
289 | |
290 ```{r 'Per base N content'} | |
291 ## reads 1 | |
292 pbNc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base N content') | |
293 pbNc_1$id = 1:length(pbNc_1$X.Base) | |
294 pbNc_1$trim = 'before' | |
295 | |
296 ## reads 2 | |
297 pbNc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base N content') | |
298 pbNc_2$id = 1:length(pbNc_2$X.Base) | |
299 pbNc_2$trim = 'after' | |
300 | |
301 comb_pbNc = rbind(pbNc_1, pbNc_2) | |
302 comb_pbNc$trim = factor(levels = c('before', 'after'), comb_pbNc$trim) | |
303 | |
304 p = ggplot(data = comb_pbNc, aes(x = id, y = N.Count)) + | |
305 geom_line(color = 'red') + | |
306 scale_x_continuous(breaks = pbNc_2$id, labels = pbNc_2$X.Base) + | |
307 facet_grid(. ~ trim) + | |
308 ylim(0, 1) + | |
309 xlab('\nN-Count') + | |
310 ylab('') + | |
311 theme(axis.text.x = element_text(size = 5), | |
312 axis.line = element_line(), | |
313 panel.background = element_rect(fill = NA)) | |
314 p | |
315 ``` | |
316 | |
317 | |
318 ### Sequence Length Distribution | |
319 | |
320 ```{r 'Sequence Length Distribution'} | |
321 ## reads 1 | |
322 sld_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Sequence Length Distribution') | |
323 sld_1$id = 1:length(sld_1$X.Length) | |
324 sld_1$trim = 'before' | |
325 | |
326 ## reads 2 | |
327 sld_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Sequence Length Distribution') | |
328 sld_2$id = 1:length(sld_2$X.Length) | |
329 sld_2$trim = 'after' | |
330 | |
331 comb_sld = rbind(sld_1, sld_2) | |
332 comb_sld$trim = factor(levels = c('before', 'after'), comb_sld$trim) | |
333 | |
334 p = ggplot(data = comb_sld, aes(x = id, y = Count)) + | |
335 geom_line(color = 'red') + | |
336 scale_x_continuous(breaks = sld_2$id, labels = sld_2$X.Length) + | |
337 facet_grid(. ~ trim) + | |
338 xlab('\nSequence Length (bp)') + | |
339 ylab('') + | |
340 theme(axis.text.x = element_text(size = 5), | |
341 panel.background = element_rect(fill = NA), | |
342 axis.line = element_line(), | |
343 plot.margin = margin(2,2,2,10) ) | |
344 p | |
345 ``` | |
346 | |
347 ### Sequence Duplication Levels | |
348 | |
349 ```{r 'Sequence Duplication Levels'} | |
350 ## reads 1 | |
351 sdl_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Sequence Duplication Levels', header = FALSE, comment.char = '#') | |
352 names(sdl_1) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') | |
353 sdl_1$id = 1:length(sdl_1$Duplication_Level) | |
354 | |
355 melt_sdl_1 = melt(sdl_1, id=c('Duplication_Level', 'id')) | |
356 melt_sdl_1$trim = 'before' | |
357 | |
358 | |
359 ## reads 2 | |
360 sdl_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Sequence Duplication Levels', header = FALSE, comment.char = '#') | |
361 names(sdl_2) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') | |
362 sdl_2$id = 1:length(sdl_2$Duplication_Level) | |
363 | |
364 melt_sdl_2 = melt(sdl_2, id=c('Duplication_Level', 'id')) | |
365 melt_sdl_2$trim = 'after' | |
366 | |
367 comb_sdl = rbind(melt_sdl_1, melt_sdl_2) | |
368 comb_sdl$trim = factor(levels = c('before', 'after'), comb_sdl$trim) | |
369 | |
370 p = ggplot(data = comb_sdl) + | |
371 geom_line(mapping = aes(x = id, y = value, color = variable)) + | |
372 scale_x_continuous(breaks = sdl_2$id, labels = sdl_2$Duplication_Level) + | |
373 facet_grid(. ~ trim) + | |
374 xlab('\nSequence Duplication Level') + | |
375 ylab('') + | |
376 scale_color_discrete(name = '') + | |
377 theme(axis.text.x = element_text(size = 5), | |
378 panel.background = element_rect(fill = NA), | |
379 axis.line = element_line(), | |
380 legend.position="top") | |
381 p | |
382 ``` | |
383 | |
384 | |
385 ### Overrepresented sequences {.tabset} | |
386 | |
387 #### Before | |
388 | |
389 ```{r} | |
390 fastqc_data_1 = paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt') | |
391 module_name = 'Overrepresented sequences' | |
392 | |
393 overrepresented_seq = extract_data_module(fastqc_data_1, module_name) | |
394 colnames(overrepresented_seq) = c('Sequence', 'Count', 'Percentage', 'Possible Source') | |
395 DT::datatable(overrepresented_seq) | |
396 ``` | |
397 | |
398 #### After | |
399 | |
400 ```{r} | |
401 fastqc_data_2 = paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt') | |
402 module_name = 'Overrepresented sequences' | |
403 | |
404 overrepresented_seq = extract_data_module(fastqc_data_2, module_name) | |
405 colnames(overrepresented_seq) = c('Sequence', 'Count', 'Percentage', 'Possible Source') | |
406 DT::datatable(overrepresented_seq) | |
407 ``` | |
408 | |
409 | |
410 ### Adapter Content | |
411 | |
412 ```{r 'Adapter Content'} | |
413 ## reads 1 | |
414 ac_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Adapter Content') | |
415 ac_1$id = 1:length(ac_1$X.Position) | |
416 | |
417 melt_ac_1 = melt(ac_1, id=c('X.Position', 'id')) | |
418 melt_ac_1$trim = 'before' | |
419 | |
420 ## reads 2 | |
421 ac_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Adapter Content') | |
422 ac_2$id = 1:length(ac_2$X.Position) | |
423 | |
424 melt_ac_2 = melt(ac_2, id=c('X.Position', 'id')) | |
425 melt_ac_2$trim = 'after' | |
426 | |
427 comb_ac = rbind(melt_ac_1, melt_ac_2) | |
428 comb_ac$trim = factor(levels = c('before', 'after'), comb_ac$trim) | |
429 | |
430 p = ggplot(data = comb_ac, aes(x = id, y = value, color = variable)) + | |
431 geom_line() + | |
432 facet_grid(. ~ trim) + | |
433 xlim(min(comb_ac$id), max(comb_ac$id)) + | |
434 ylim(0, 1) + | |
435 xlab('\nPosition in read (bp)') + | |
436 ylab('') + | |
437 scale_color_discrete(name = '') + | |
438 theme(axis.text.x = element_text(size = 5), | |
439 panel.background = element_rect(fill = NA), | |
440 axis.line = element_line()) | |
441 ggplotly(p) | |
442 ``` | |
443 | |
444 ### Kmer Content {.tabset} | |
445 | |
446 #### Before | |
447 | |
448 ```{r 'Kmer Content (before)'} | |
449 kc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Kmer Content') | |
450 DT::datatable(kc_1) | |
451 ``` | |
452 | |
453 #### After | |
454 ```{r 'Kmer Content (after)'} | |
455 kc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Kmer Content') | |
456 DT::datatable(kc_2) | |
457 ``` | |
458 |