2
|
1 ---
|
15
|
2 title: 'Short reads evaluation with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)'
|
14
|
3 output:
|
|
4 html_document:
|
|
5 number_sections: true
|
|
6 toc: true
|
|
7 theme: cosmo
|
|
8 highlight: tango
|
2
|
9 ---
|
|
10
|
14
|
11 ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
|
|
12 knitr::opts_chunk$set(
|
16
|
13 echo = ECHO,
|
|
14 error = TRUE
|
14
|
15 )
|
2
|
16 ```
|
|
17
|
|
18
|
16
|
19 # Fastqc Evaluation
|
14
|
20
|
16
|
21 ## Evaluation of reads before trimming
|
2
|
22
|
16
|
23 ```{r}
|
|
24 if ('READS_1' == 'None') {
|
|
25 stop("No pre-trimming reads provided!")
|
|
26 } else {
|
|
27 ## run fastqc evaluation
|
|
28 fastqc_command = paste0('fastqc ') %>%
|
|
29 (function(x) {
|
|
30 ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
|
|
31 }) %>%
|
|
32 (function(x) {
|
|
33 ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
|
|
34 }) %>%
|
|
35 (function(x) {
|
|
36 paste0(x, '-o REPORT_DIR ')
|
|
37 })
|
|
38 fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1')
|
|
39 system(fastqc_command_reads_1, intern = TRUE)
|
|
40
|
|
41 # Original html report
|
|
42 reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1)
|
|
43 original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1))
|
|
44
|
|
45 unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
|
|
46 reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/')
|
|
47 # fastqc_data.txt
|
|
48 file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt')
|
|
49 fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1))
|
|
50 # summary.txt
|
|
51 file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt')
|
|
52 summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1))
|
|
53
|
|
54 tags$ul(
|
|
55 tags$li(original_html),
|
|
56 tags$li(fastqc_data),
|
|
57 tags$li(summary_data)
|
|
58 )
|
|
59 }
|
15
|
60 ```
|
|
61
|
|
62
|
16
|
63 ## Evaluation of reads after trimming
|
15
|
64
|
16
|
65 ```{r}
|
|
66 if ('READS_2' == 'None') {
|
|
67 stop("No pre-trimming reads provided!")
|
|
68 } else {
|
|
69 ## run fastqc evaluation
|
|
70 fastqc_command = paste0('fastqc ') %>%
|
|
71 (function(x) {
|
|
72 ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
|
|
73 }) %>%
|
|
74 (function(x) {
|
|
75 ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
|
|
76 }) %>%
|
|
77 (function(x) {
|
|
78 paste0(x, '-o REPORT_DIR ')
|
|
79 })
|
|
80 fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1')
|
|
81 system(fastqc_command_reads_2, intern = TRUE)
|
|
82
|
|
83 # Original html report
|
|
84 reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1)
|
|
85 original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2))
|
|
86
|
|
87 unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
|
|
88 reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/')
|
|
89 # fastqc_data.txt
|
|
90 file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt')
|
|
91 fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2))
|
|
92 # summary.txt
|
|
93 file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt')
|
|
94 summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2))
|
|
95
|
|
96 tags$ul(
|
|
97 tags$li(original_html),
|
|
98 tags$li(fastqc_data),
|
|
99 tags$li(summary_data)
|
|
100 )
|
|
101 }
|
2
|
102 ```
|
|
103
|
15
|
104
|
|
105
|
|
106 # Fastqc output visualization
|
|
107
|
|
108 ## Overview
|
|
109
|
|
110 ```{r}
|
16
|
111 reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1]
|
17
|
112 reads_2_summary = read.csv('REPORT_DIR/reads_2_summary.txt', header = FALSE, sep = '\t')[, 1]
|
16
|
113 combined_summary = cbind(reads_1_summary, reads_2_summary)
|
|
114 names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)'))
|
17
|
115 combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)'
|
|
116 combined_summary[combined_summary == 'WARN'] = 'WARN (!)'
|
16
|
117 knitr::kable(combined_summary)
|
15
|
118 ```
|
|
119
|
17
|
120 ## Visualization by data module {.tabset}
|
2
|
121
|
14
|
122 * Define a function to extract outputs for each module from fastqc output
|
2
|
123
|
14
|
124 ```{r 'function definition'}
|
17
|
125 extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") {
|
14
|
126 f = readLines(fastqc_data)
|
|
127 start_line = grep(module_name, f)
|
|
128 end_module_lines = grep('END_MODULE', f)
|
|
129 end_line = end_module_lines[which(end_module_lines > start_line)[1]]
|
|
130 module_data = f[(start_line+1):(end_line-1)]
|
|
131 writeLines(module_data, 'temp.txt')
|
17
|
132 read.csv('temp.txt', sep = '\t', header = header, comment.char = comment.char)
|
2
|
133 }
|
|
134 ```
|
|
135
|
15
|
136 ### Per base sequence quality
|
|
137
|
16
|
138 ```{r 'per base sequence quality', fig.width=10}
|
|
139 ## reads 1
|
|
140 pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality')
|
|
141 pbsq_1$id = 1:length(pbsq_1$X.Base)
|
18
|
142 pbsq_1$trim = 'before'
|
16
|
143
|
|
144 ## reads 2
|
|
145 pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality')
|
|
146 pbsq_2$id = 1:length(pbsq_2$X.Base)
|
18
|
147 pbsq_2$trim = 'after'
|
16
|
148
|
18
|
149 comb_pbsq = rbind(pbsq_1, pbsq_2)
|
16
|
150 comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim)
|
17
|
151
|
16
|
152 p = ggplot(data = comb_pbsq) +
|
18
|
153 geom_boxplot(mapping = aes(x = id,
|
|
154 lower = Lower.Quartile,
|
|
155 upper = Upper.Quartile,
|
|
156 middle = Median,
|
|
157 ymin = X10th.Percentile,
|
|
158 ymax = X90th.Percentile,
|
|
159 fill = "yellow"),
|
|
160 stat = 'identity') +
|
|
161 geom_line(mapping = aes(x = id, y = Mean, color = "red")) +
|
|
162 scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) +
|
|
163 scale_fill_identity() +
|
|
164 scale_color_identity() +
|
|
165 ylim(0, max(comb_pbsq$Upper.Quartile) + 5) +
|
19
|
166 xlab('Position in read (bp)') +
|
18
|
167 facet_grid(. ~ trim) +
|
16
|
168 theme(axis.text.x = element_text(angle=45))
|
18
|
169 p
|
16
|
170
|
15
|
171 ```
|
|
172
|
|
173 ### Per tile sequence quality
|
|
174
|
17
|
175 ```{r 'per tile sequence quality', fig.width=10}
|
|
176 ## check if 'per tile sequence quality' module exits or not
|
|
177 check_ptsq = grep('Per tile sequence quality', readLines('REPORT_DIR/reads_1_fastqc_data.txt'))
|
|
178 if (length(check_ptsq) > 0) {
|
|
179 ## reads 1
|
|
180 ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality')
|
|
181 ptsq_1$trim = 'before'
|
|
182
|
|
183 ## reads 2
|
|
184 ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality')
|
|
185 ptsq_2$trim = 'after'
|
|
186
|
|
187 comb_ptsq = rbind(ptsq_1, ptsq_2)
|
|
188 comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim)
|
|
189 comb_ptsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base)
|
|
190
|
|
191 # convert integers to charaters
|
|
192 comb_ptsq$Tile = as.character(comb_ptsq$X.Tile)
|
|
193
|
|
194 p = ggplot(data = comb_ptsq, aes(x = Base, y = Tile, fill = Mean)) +
|
|
195 geom_raster() +
|
|
196 facet_grid(. ~ trim) +
|
|
197 xlab('Position in read (bp)') +
|
|
198 ylab('') +
|
|
199 theme(axis.text.x = element_text(angle=45))
|
|
200 ggplotly(p)
|
|
201 } else {
|
|
202 print('No "per tile sequence quality" data')
|
|
203 }
|
|
204
|
|
205
|
|
206 ```
|
|
207
|
|
208 ### Per sequence quality score
|
|
209
|
|
210 ```{r 'Per sequence quality score', fig.width=10}
|
16
|
211 ## reads 1
|
17
|
212 psqs_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence quality scores')
|
|
213 psqs_1$trim = 'before'
|
16
|
214
|
|
215 ## reads 2
|
17
|
216 psqs_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence quality scores')
|
|
217 psqs_2$trim = 'after'
|
|
218
|
|
219 comb_psqs = rbind(psqs_1, psqs_2)
|
|
220 comb_psqs$trim = factor(levels = c('before', 'after'), comb_psqs$trim)
|
|
221
|
|
222 p = ggplot(data = comb_psqs, aes(x = X.Quality, y = Count)) +
|
|
223 geom_line(color = 'red') +
|
|
224 facet_grid(. ~ trim) +
|
|
225 xlim(min(comb_psqs$X.Quality), max(comb_psqs$X.Quality)) +
|
|
226 xlab('Mean Sequence Qaulity (Phred Score)') +
|
|
227 ylab('')
|
|
228 ggplotly(p)
|
|
229 ```
|
|
230
|
|
231
|
|
232 ### Per base sequence content
|
|
233
|
|
234 ```{r 'Per base sequence content', fig.width=10}
|
|
235 ## reads 1
|
|
236 pbsc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence content')
|
|
237 pbsc_1$id = 1:length(pbsc_1$X.Base)
|
|
238
|
|
239 melt_pbsc_1 = melt(pbsc_1, id=c('X.Base', 'id'))
|
|
240 melt_pbsc_1$trim = 'before'
|
|
241
|
|
242
|
|
243 ## reads 2
|
|
244 pbsc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence content')
|
|
245 pbsc_2$id = 1:length(pbsc_2$X.Base)
|
|
246
|
|
247 melt_pbsc_2 = melt(pbsc_2, id=c('X.Base', 'id'))
|
|
248 melt_pbsc_2$trim = 'after'
|
|
249
|
|
250 comb_pbsc = rbind(melt_pbsc_1, melt_pbsc_2)
|
|
251 comb_pbsc$trim = factor(levels = c('before', 'after'), comb_pbsc$trim)
|
|
252
|
|
253 p = ggplot(data = comb_pbsc, aes(x = id, y = value, color = variable)) +
|
|
254 geom_line() +
|
|
255 facet_grid(. ~ trim) +
|
|
256 xlim(min(comb_pbsc$id), max(comb_pbsc$id)) +
|
|
257 ylim(0, 100) +
|
|
258 xlab('Position in read (bp)') +
|
|
259 ylab('')
|
|
260 ggplotly(p)
|
|
261 ```
|
16
|
262
|
17
|
263 ### Per sequence GC content
|
|
264
|
|
265 ```{r 'Per sequence GC content', fig.width=10}
|
|
266 ## reads 1
|
|
267 psGCc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence GC content')
|
|
268 psGCc_1$trim = 'before'
|
|
269
|
|
270 ## reads 2
|
|
271 psGCc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence GC content')
|
|
272 psGCc_2$trim = 'after'
|
|
273
|
|
274 comb_psGCc = rbind(psGCc_1, psGCc_2)
|
|
275 comb_psGCc$trim = factor(levels = c('before', 'after'), comb_psGCc$trim)
|
|
276
|
|
277 p = ggplot(data = comb_psGCc, aes(x = X.GC.Content, y = Count)) +
|
|
278 geom_line(color = 'red') +
|
|
279 facet_grid(. ~ trim) +
|
|
280 xlab('Mean Sequence Qaulity (Phred Score)') +
|
|
281 ylab('')
|
|
282 ggplotly(p)
|
|
283 ```
|
|
284
|
16
|
285
|
17
|
286 ### Per base N content
|
|
287
|
|
288 ```{r 'Per base N content', fig.width=10}
|
|
289 ## reads 1
|
|
290 pbNc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base N content')
|
|
291 pbNc_1$id = 1:length(pbNc_1$X.Base)
|
|
292 pbNc_1$trim = 'before'
|
|
293
|
|
294 ## reads 2
|
|
295 pbNc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base N content')
|
|
296 pbNc_2$id = 1:length(pbNc_2$X.Base)
|
|
297 pbNc_2$trim = 'after'
|
|
298
|
|
299 comb_pbNc = rbind(pbNc_1, pbNc_2)
|
|
300 comb_pbNc$trim = factor(levels = c('before', 'after'), comb_pbNc$trim)
|
|
301
|
|
302 p = ggplot(data = comb_pbNc, aes(x = id, y = N.Count)) +
|
|
303 geom_line(color = 'red') +
|
|
304 scale_x_continuous(breaks = pbNc_2$id, labels = pbNc_2$X.Base) +
|
|
305 facet_grid(. ~ trim) +
|
|
306 ylim(0, 1) +
|
|
307 xlab('N-Count') +
|
|
308 ylab('') +
|
16
|
309 theme(axis.text.x = element_text(angle=45))
|
|
310 ggplotly(p)
|
15
|
311 ```
|
|
312
|
|
313
|
17
|
314 ### Sequence Length Distribution
|
|
315
|
|
316 ```{r 'Sequence Length Distribution', fig.width=10}
|
|
317 ## reads 1
|
|
318 sld_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Length Distribution')
|
|
319 sld_1$id = 1:length(sld_1$X.Length)
|
|
320 sld_1$trim = 'before'
|
|
321
|
|
322 ## reads 2
|
|
323 sld_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Length Distribution')
|
|
324 sld_2$id = 1:length(sld_2$X.Length)
|
|
325 sld_2$trim = 'after'
|
|
326
|
|
327 comb_sld = rbind(sld_1, sld_2)
|
|
328 comb_sld$trim = factor(levels = c('before', 'after'), comb_sld$trim)
|
|
329
|
|
330 p = ggplot(data = comb_sld, aes(x = id, y = Count)) +
|
|
331 geom_line(color = 'red') +
|
|
332 scale_x_continuous(breaks = sld_2$id, labels = sld_2$X.Length) +
|
|
333 facet_grid(. ~ trim) +
|
|
334 xlab('Sequence Length (bp)') +
|
|
335 ylab('') +
|
|
336 theme(axis.text.x = element_text(angle=45))
|
|
337 ggplotly(p)
|
|
338 ```
|
|
339
|
|
340 ### Sequence Duplication Levels
|
|
341
|
|
342 ```{r 'Sequence Duplication Levels', fig.width=10}
|
|
343 ## reads 1
|
|
344 sdl_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#')
|
|
345 names(sdl_1) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total')
|
|
346 sdl_1$id = 1:length(sdl_1$Duplication_Level)
|
|
347
|
|
348 melt_sdl_1 = melt(sdl_1, id=c('Duplication_Level', 'id'))
|
|
349 melt_sdl_1$trim = 'before'
|
|
350
|
|
351
|
|
352 ## reads 2
|
|
353 sdl_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#')
|
|
354 names(sdl_2) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total')
|
|
355 sdl_2$id = 1:length(sdl_2$Duplication_Level)
|
|
356
|
|
357 melt_sdl_2 = melt(sdl_2, id=c('Duplication_Level', 'id'))
|
|
358 melt_sdl_2$trim = 'after'
|
|
359
|
|
360 comb_sdl = rbind(melt_sdl_1, melt_sdl_2)
|
|
361 comb_sdl$trim = factor(levels = c('before', 'after'), comb_sdl$trim)
|
|
362
|
|
363 p = ggplot(data = comb_sdl, aes(x = id, y = value, color = variable)) +
|
|
364 geom_line() +
|
|
365 scale_x_continuous(breaks = sdl_2$id, labels = sdl_2$Duplication_Level) +
|
|
366 facet_grid(. ~ trim) +
|
|
367 xlab('Sequence Duplication Level') +
|
|
368 ylab('') +
|
|
369 theme(axis.text.x = element_text(angle=45))
|
|
370 ggplotly(p)
|
|
371 ```
|
|
372
|
|
373 ### Adapter Content
|
|
374
|
|
375 ```{r 'Adapter Content', fig.width=10}
|
|
376 ## reads 1
|
|
377 ac_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Adapter Content')
|
|
378 ac_1$id = 1:length(ac_1$X.Position)
|
|
379
|
|
380 melt_ac_1 = melt(ac_1, id=c('X.Position', 'id'))
|
|
381 melt_ac_1$trim = 'before'
|
|
382
|
|
383 ## reads 2
|
|
384 ac_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Adapter Content')
|
|
385 ac_2$id = 1:length(ac_2$X.Position)
|
|
386
|
|
387 melt_ac_2 = melt(ac_2, id=c('X.Position', 'id'))
|
|
388 melt_ac_2$trim = 'after'
|
|
389
|
|
390 comb_ac = rbind(melt_ac_1, melt_ac_2)
|
|
391 comb_ac$trim = factor(levels = c('before', 'after'), comb_ac$trim)
|
|
392
|
|
393 p = ggplot(data = comb_ac, aes(x = id, y = value, color = variable)) +
|
|
394 geom_line() +
|
|
395 facet_grid(. ~ trim) +
|
|
396 xlim(min(comb_ac$id), max(comb_ac$id)) +
|
|
397 ylim(0, 1) +
|
|
398 xlab('Position in read (bp)') +
|
|
399 ylab('')
|
|
400 ggplotly(p)
|
|
401 ```
|
|
402
|
|
403 ### Kmer Content {.tabset}
|
|
404
|
|
405 #### Before
|
|
406
|
|
407 ```{r 'Kmer Content (before)', fig.width=10}
|
|
408 kc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Kmer Content')
|
|
409 knitr::kable(kc_1)
|
|
410 ```
|
|
411
|
|
412 #### After
|
|
413 ```{r 'Kmer Content (after)', fig.width=10}
|
|
414 kc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Kmer Content')
|
|
415 knitr::kable(kc_2)
|
|
416 ```
|
|
417
|
2
|
418
|
14
|
419 # Session Info
|
2
|
420
|
14
|
421 ```{r 'session info'}
|
|
422 sessionInfo()
|
2
|
423 ```
|
|
424
|
19
|
425 # References
|
|
426
|
|
427 * Bioinformatics, Babraham (2014). FastQC.
|
|
428
|
|
429 * Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff and Wickham, Hadley and Atkins, Aron and Hyndman, Rob (2016). rmarkdown: Dynamic Documents for R, 2016. In R package version 0.9, 6.
|
|
430
|
|
431 * Xie, Yihui (2015). Dynamic Documents with R and knitr, CRC Press, Vol.29.
|
|
432
|
|
433 * Carson Sievert and Chris Parmer and Toby Hocking and Scott Chamberlain and Karthik Ram and Marianne Corvellec and Pedro Despouy (2017). plotly: Create Interactive Web Graphics via 'plotly.js'. R package version 4.6.0. [Link]
|
|
434
|
|
435 * Wickham, H. (2016). ggplot2: elegant graphics for data analysis. Springer. Chicago
|