comparison fastqc_report.Rmd @ 2:0374e090e38e draft

Uploaded
author mingchen0919
date Mon, 07 Aug 2017 21:40:56 -0400
parents
children e629c2288316
comparison
equal deleted inserted replaced
1:22cd2369354b 2:0374e090e38e
1 ---
2 title: "Fastqc report: short reads quality evaluation"
3 author: "Ming Chen"
4 output: html_document
5 ---
6
7 ```{r setup, include=FALSE}
8 knitr::opts_chunk$set(echo=ECHO, warning=FALSE, message=FALSE)
9 library(plyr)
10 library(stringr)
11 library(dplyr)
12 library(highcharter)
13 library(DT)
14 library(reshape2)
15 # library(Kmisc)
16 library(plotly)
17 library(formattable)
18 library(htmltools)
19 ```
20
21
22 ```{bash 'create output directory', echo=FALSE}
# Create the extra-files directory for the report. Very important: the
# FastQC chunk further down writes its output into this directory.
# -p keeps the chunk from exiting non-zero when the directory already exists.
mkdir -p REPORT_OUTPUT_DIR
25 ```
26
27 # Fastqc analysis
28 ```{bash 'copy data to working directory', echo=FALSE}
# Copy the uploaded data files into the working directory.
# READS is treated as a comma-separated list of paths. Splitting it with
# IFS/read into an array and quoting each element keeps paths that contain
# spaces or glob characters intact, which the unquoted $(echo ... | sed ...)
# expansion did not.
IFS=',' read -r -a read_files <<< "READS"
for f in "${read_files[@]}"
do
  # "--" stops option parsing so a path starting with "-" is still a file.
  cp -- "$f" ./
done
34 ```
35
36
37 ```{bash 'run fastqc', echo=FALSE}
# Run FastQC on every *.dat file in the working directory, writing results
# to REPORT_OUTPUT_DIR and discarding FastQC's console output.
# The glob is iterated directly instead of parsing `ls`, and "$r" is quoted,
# so file names containing whitespace are passed through unchanged.
for r in *.dat
do
  # When nothing matches, the pattern stays literal; skip it rather than
  # invoking FastQC on a file that does not exist.
  [ -e "$r" ] || continue
  fastqc -o REPORT_OUTPUT_DIR "$r" > /dev/null 2>&1
done
42 ```
43
44 ## Fastqc html reports
45
46 Below are links to the original ***FastQC*** HTML reports.
47 ```{r 'html report links'}
# Render an unordered list with one link per HTML report found in the
# report output directory.
# The pattern is anchored on the file extension; the previous '.*html'
# matched any file whose name merely contained "html".
html_files <- list.files("REPORT_OUTPUT_DIR", pattern = "\\.html$")

# The link target and the link text are both the bare file name.
html_report_list <- lapply(html_files, function(html_file) {
  tags$li(tags$a(href = html_file, html_file))
})

tags$ul(html_report_list)
54 ```
55
56
57 ## Parsing fastqc data
58
59 ```{bash echo=FALSE}
##==== copy fastqc generated zip files from report output directory to job work directory ==
cp -r REPORT_OUTPUT_DIR/*zip ./

# Module codes used in the generated file names:
#   PWF  Pass, Warning, Fail            PBQS Per Base Quality Score
#   PSQS Per Sequence Quality Score     PSGC Per Sequence GC Content
#   PBSC Per Base Sequence Content      PBNC Per Base N Content
#   SDL  Sequence Duplication Level     SLD  Sequence Length Distribution
#   KMC  Kmer Content
# Start one "<code>_file_paths.txt" index per module with its CSV header.
for code in PWF PBQS PSQS PSGC PBSC PBNC SDL SLD KMC
do
  echo "sample_id,file_path" > "${code}_file_paths.txt"
done

# extract_module SAMPLE TITLE CODE
#   Copies the lines that follow the ">>TITLE" line of
#   SAMPLE/fastqc_data.txt, up to the next line containing END_MODULE, into
#   SAMPLE-CODE.txt, then appends "SAMPLE,SAMPLE-CODE.txt" to
#   CODE_file_paths.txt.
extract_module() {
  local sample=$1 title=$2 code=$3
  awk -v start="^>>${title}" \
    '$0 ~ start {flag=1; next} /END_MODULE/ {flag=0} flag' \
    "${sample}/fastqc_data.txt" > "${sample}-${code}.txt"
  echo "${sample},${sample}-${code}.txt" >> "${code}_file_paths.txt"
}

# Iterate over the glob directly (not over `ls` output) and quote every
# expansion so archive names containing whitespace are handled.
for i in *.zip
do
  # When nothing matches, the pattern stays literal; skip it.
  [ -e "$i" ] || continue

  # Archive name without its ".zip" suffix; used below as the sample id.
  BASE=${i%.zip}
  echo "$BASE"
  unzip "${BASE}.zip" > /dev/null 2>&1

  ##====== pass,warning,fail (PWF) =============
  # Every ">>" line except the END_MODULE markers, with the ">>" stripped.
  awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt
  echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt

  extract_module "$BASE" "Per base sequence quality"    PBQS
  extract_module "$BASE" "Per sequence quality scores"  PSQS
  extract_module "$BASE" "Per sequence GC content"      PSGC
  extract_module "$BASE" "Per base sequence content"    PBSC
  extract_module "$BASE" "Per base N content"           PBNC
  extract_module "$BASE" "Sequence Duplication Levels"  SDL
  extract_module "$BASE" "Sequence Length Distribution" SLD
  extract_module "$BASE" "Kmer Content"                 KMC

done
117 ```
118
119
120 ## Evaluation Overview
121
122 ```{r 'overview'}
# Build PWF_df from the per-sample status files listed in PWF_file_paths.txt:
# an `item` column taken from the first sample, followed by one status
# column per sample, named after its sample_id.
PWF_file_paths <- read.csv("PWF_file_paths.txt",
                           header = TRUE, stringsAsFactors = FALSE)

# seq_len() (rather than 1:nrow()) yields no iterations when the index file
# lists no samples, instead of iterating over c(1, 0).
pwf_tables <- lapply(seq_len(nrow(PWF_file_paths)), function(i) {
  pwf_df <- read.csv(PWF_file_paths[i, 2],
                     sep = "\t", header = FALSE, stringsAsFactors = FALSE)
  colnames(pwf_df) <- c("item", PWF_file_paths[i, 1])
  pwf_df
})

# PWF_df is always assigned here, so no rm() of a possibly missing object
# is needed to force a rebuild.
if (length(pwf_tables) == 0) {
  PWF_df <- data.frame(item = character(0), stringsAsFactors = FALSE)
} else {
  # Columns are bound by position, so every sample file is assumed to list
  # the same items in the same order as the first one.
  status_columns <- lapply(pwf_tables[-1], function(tbl) {
    tbl[, 2, drop = FALSE]
  })
  PWF_df <- do.call(cbind, c(pwf_tables[1], status_columns))
}
137 ```
138
139 ```{r}
# Icon names keyed by status. Defined as in the original chunk, although
# nothing in the visible chunks reads it.
my_icon <- c(pass = "ok", fail = "remove", warn = "star")

# Cell formatter shared by every sample column: a white-on-colour <span>
# whose background is chosen from the cell value ('pass', 'fail', or
# anything else).
status_formatter <- formatter(
  "span",
  style = x ~ style(
    "background-color" = ifelse(
      x == "pass", "#9CD027",
      ifelse(x == "fail", "#CC0000", "#FF4E00")
    ),
    "color" = "white",
    "width" = "50px",
    "float" = "left",
    "padding-right" = "5px"
  )
)

# Attach the formatter to every column except the first one.
evaluate_list <- list()
evaluate_list[colnames(PWF_df)[-1]] <- list(status_formatter)

formattable(PWF_df, evaluate_list)
155 ```
156
157
158 ## Per Base Quality Scores
159
160 ```{r}
# Read every table listed in PBQS_file_paths.txt and stack them into PBQS_df,
# adding a numeric `Base` column and a `sample_id` column.
PBQS_file_paths <- read.csv("PBQS_file_paths.txt",
                            header = TRUE, stringsAsFactors = FALSE)

# Read one table. X.Base is split on "-" into a start and an optional end:
# every row is kept once with Base = start, and rows that have an end are
# repeated with Base = end. Rows are then ordered by Base.
read_pbqs_table <- function(file_path, sample_id) {
  pbqs_df <- read.csv(file_path,
                      sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  bounds <- str_split_fixed(pbqs_df$X.Base, "-", 2)

  range_start <- pbqs_df
  range_start$Base <- as.numeric(bounds[, 1])

  range_end <- pbqs_df
  range_end$Base <- as.numeric(bounds[, 2])
  # A missing end part is "" and converts to NA; test for NA explicitly
  # instead of comparing the numeric column with ''.
  range_end <- range_end[!is.na(range_end$Base), , drop = FALSE]

  # Appending Base by name avoids hard-coding the number of data columns.
  res <- rbind(range_start, range_end) %>% arrange(Base)
  res$sample_id <- rep(sample_id, nrow(res))
  res
}

# seq_len() gives no iterations for an empty index (1:nrow() would give 1, 0).
pbqs_tables <- lapply(seq_len(nrow(PBQS_file_paths)), function(i) {
  read_pbqs_table(PBQS_file_paths[i, 2], PBQS_file_paths[i, 1])
})

# Bind once instead of growing the data frame inside a loop. The leading
# empty data frame keeps the result a data frame when there are no samples.
PBQS_df <- do.call(rbind, c(list(data.frame()), pbqs_tables))
182 ```
183
184
185 ```{r}
# datatable(PBQS_df)
# Upper limit of the y axis: the largest Mean value plus 10.
max_phred <- max(PBQS_df$Mean) + 10

# Horizontal reference lines at Phred 27 (dashed, green) and Phred 20 (red).
phred_guides <- list(
  list(
    label = list(text = "Phred Score = 27"),
    width = 2,
    dashStyle = "dash",
    color = "green",
    value = 27
  ),
  list(
    label = list(text = "Phred Score = 20"),
    width = 2,
    color = "red",
    value = 20
  )
)

# One line per sample: Mean against Base.
pbqs_chart <- hchart(
  PBQS_df, "line",
  hcaes(x = Base, y = Mean, group = sample_id)
)
pbqs_chart <- hc_title(pbqs_chart, text = "Per Base Quality Score")
pbqs_chart <- hc_yAxis(
  pbqs_chart,
  title = list(text = "Mean Base Quality Score"),
  min = 0,
  max = max_phred,
  plotLines = phred_guides
)
pbqs_chart <- hc_exporting(pbqs_chart, enabled = TRUE)
pbqs_chart
209 ```
210
211
212 ## Per Base N Content
213
214 ```{r}
# Read every table listed in PBNC_file_paths.txt and stack them into PBNC_df,
# adding a numeric `Base` column and a `sample_id` column.
PBNC_file_paths <- read.csv("PBNC_file_paths.txt",
                            header = TRUE, stringsAsFactors = FALSE)

# Read one table. X.Base is split on "-" into a start and an optional end:
# every row is kept once with Base = start, and rows that have an end are
# repeated with Base = end. Rows are then ordered by Base.
read_pbnc_table <- function(file_path, sample_id) {
  pbnc_df <- read.csv(file_path,
                      sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  bounds <- str_split_fixed(pbnc_df$X.Base, "-", 2)

  range_start <- pbnc_df
  range_start$Base <- as.numeric(bounds[, 1])

  range_end <- pbnc_df
  range_end$Base <- as.numeric(bounds[, 2])
  # A missing end part is "" and converts to NA; test for NA explicitly
  # instead of comparing the numeric column with ''.
  range_end <- range_end[!is.na(range_end$Base), , drop = FALSE]

  # Appending Base by name avoids hard-coding the number of data columns.
  res <- rbind(range_start, range_end) %>% arrange(Base)
  res$sample_id <- rep(sample_id, nrow(res))
  res
}

# seq_len() gives no iterations for an empty index (1:nrow() would give 1, 0).
pbnc_tables <- lapply(seq_len(nrow(PBNC_file_paths)), function(i) {
  read_pbnc_table(PBNC_file_paths[i, 2], PBNC_file_paths[i, 1])
})

# Bind once instead of growing the data frame inside a loop. The leading
# empty data frame keeps the result a data frame when there are no samples.
PBNC_df <- do.call(rbind, c(list(data.frame()), pbnc_tables))
236 ```
237
238
239 ```{r}
# Scale N.Count by 100 for the "N %" axis.
# NOTE(review): this overwrites PBNC_df in place, so re-running the chunk
# scales the column again. It also presumes N.Count is stored as a
# fraction; confirm against the FastQC output.
PBNC_df$N.Count <- PBNC_df$N.Count * 100

# Assigned as in the original chunk; the chart below does not use it.
max_phred <- max(PBNC_df$N.Count) + 5

# Dashed red reference line at 5.
n_guides <- list(
  list(
    label = list(text = "N = 5%"),
    width = 2,
    dashStyle = "dash",
    color = "red",
    value = 5
  )
)

# One line per sample; Base is converted to character for the x mapping.
pbnc_chart <- hchart(
  PBNC_df, "line",
  hcaes(x = as.character(Base), y = N.Count, group = sample_id)
)
pbnc_chart <- hc_title(pbnc_chart, text = "Per Base N Content")
pbnc_chart <- hc_xAxis(pbnc_chart, title = list(text = "Base Position"))
pbnc_chart <- hc_yAxis(
  pbnc_chart,
  title = list(text = "N %"),
  plotLines = n_guides
)
pbnc_chart <- hc_exporting(pbnc_chart, enabled = TRUE)
pbnc_chart
260 ```
261
262
263
264
265 ## Per Sequence Quality Scores
266
267 ```{r}
# Read every table listed in PSQS_file_paths.txt and stack them into PSQS_df,
# tagging each row with the sample_id it came from.
PSQS_file_paths <- read.csv("PSQS_file_paths.txt",
                            header = TRUE, stringsAsFactors = FALSE)

# seq_len() gives no iterations for an empty index (1:nrow() would give 1, 0).
psqs_tables <- lapply(seq_len(nrow(PSQS_file_paths)), function(i) {
  psqs_df <- read.csv(PSQS_file_paths[i, 2],
                      sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  psqs_df$sample_id <- rep(PSQS_file_paths[i, 1], nrow(psqs_df))
  psqs_df
})

# Bind once instead of growing the data frame inside a loop. The leading
# empty data frame keeps the result a data frame when there are no samples.
PSQS_df <- do.call(rbind, c(list(data.frame()), psqs_tables))
279 ```
280
281
282 ```{r}
# Upper limit of the x axis: the largest X.Quality value plus 5.
max_phred <- max(PSQS_df$X.Quality) + 5

# Vertical reference lines at Phred 27 (dashed, green) and Phred 20 (red).
phred_guides <- list(
  list(
    label = list(text = "Phred Score = 27"),
    width = 2,
    dashStyle = "dash",
    color = "green",
    value = 27
  ),
  list(
    label = list(text = "Phred Score = 20"),
    width = 2,
    color = "red",
    value = 20
  )
)

# One line per sample: Count against X.Quality.
psqs_chart <- hchart(
  PSQS_df, "line",
  hcaes(x = X.Quality, y = Count, group = sample_id)
)
psqs_chart <- hc_title(psqs_chart, text = "Per Sequence Quality Score")
psqs_chart <- hc_xAxis(
  psqs_chart,
  title = list(text = "Mean Sequence Quality Score"),
  min = 0,
  max = max_phred,
  plotLines = phred_guides
)
psqs_chart <- hc_exporting(psqs_chart, enabled = TRUE)
psqs_chart
305 ```
306
307
308 ## Per Sequence GC Content
309
310
311 ```{r}
# Read every table listed in PSGC_file_paths.txt and stack them into PSGC_df,
# tagging each row with the sample_id it came from.
PSGC_file_paths <- read.csv("PSGC_file_paths.txt",
                            header = TRUE, stringsAsFactors = FALSE)

# seq_len() gives no iterations for an empty index (1:nrow() would give 1, 0).
psgc_tables <- lapply(seq_len(nrow(PSGC_file_paths)), function(i) {
  psgc_df <- read.csv(PSGC_file_paths[i, 2],
                      sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  psgc_df$sample_id <- rep(PSGC_file_paths[i, 1], nrow(psgc_df))
  psgc_df
})

# Bind once instead of growing the data frame inside a loop. The leading
# empty data frame keeps the result a data frame when there are no samples.
PSGC_df <- do.call(rbind, c(list(data.frame()), psgc_tables))
323 ```
324
325
326 ```{r}
# Assigned as in the original chunk; the chart below does not use it.
max_phred <- max(PSGC_df$Count) + 5

# One line per sample: Count against X.GC.Content.
psgc_chart <- hchart(
  PSGC_df, "line",
  hcaes(x = X.GC.Content, y = Count, group = sample_id)
)
psgc_chart <- hc_title(psgc_chart, text = "Per Sequence GC Content")
psgc_chart <- hc_xAxis(psgc_chart, title = list(text = "% GC"))
psgc_chart <- hc_exporting(psgc_chart, enabled = TRUE)
psgc_chart
336 ```
337
338
339 ## Per Base Sequence Content
340
341 ```{r}
# Read every table listed in PBSC_file_paths.txt and stack them into PBSC_df,
# adding a numeric `Base` column and a `sample_id` column.
PBSC_file_paths <- read.csv("PBSC_file_paths.txt",
                            header = TRUE, stringsAsFactors = FALSE)

# Read one table. X.Base is split on "-" into a start and an optional end:
# every row is kept once with Base = start, and rows that have an end are
# repeated with Base = end. Rows are then ordered by Base.
read_pbsc_table <- function(file_path, sample_id) {
  pbsc_df <- read.csv(file_path,
                      sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  bounds <- str_split_fixed(pbsc_df$X.Base, "-", 2)

  range_start <- pbsc_df
  range_start$Base <- as.numeric(bounds[, 1])

  range_end <- pbsc_df
  range_end$Base <- as.numeric(bounds[, 2])
  # A missing end part is "" and converts to NA; test for NA explicitly
  # instead of comparing the numeric column with ''.
  range_end <- range_end[!is.na(range_end$Base), , drop = FALSE]

  # Appending Base by name avoids hard-coding the number of data columns.
  res <- rbind(range_start, range_end) %>% arrange(Base)
  res$sample_id <- rep(sample_id, nrow(res))
  res
}

# seq_len() gives no iterations for an empty index (1:nrow() would give 1, 0).
pbsc_tables <- lapply(seq_len(nrow(PBSC_file_paths)), function(i) {
  read_pbsc_table(PBSC_file_paths[i, 2], PBSC_file_paths[i, 1])
})

# Bind once instead of growing the data frame inside a loop. The leading
# empty data frame keeps the result a data frame when there are no samples.
PBSC_df <- do.call(rbind, c(list(data.frame()), pbsc_tables))
363 ```
364
365
366 ```{r out.width="100%"}
# Reshape to long format: one row per (Base, sample_id, variable), with the
# value stored in `base_percentage`. X.Base is dropped first so only Base
# and sample_id act as identifiers.
PBSC_df_2 <- melt(
  select(PBSC_df, -X.Base),
  id.vars = c("Base", "sample_id"),
  value.name = "base_percentage"
)

# One panel per sample and one coloured line per melted variable.
p <- ggplot(
  data = PBSC_df_2,
  aes(x = Base, y = base_percentage, group = variable, color = variable)
)
p <- p + geom_line()
p <- p + facet_wrap(~ sample_id)

# Convert the ggplot object into an interactive plotly widget.
ggplotly(p)
373 ```
374
375
376 ## References
377
378 * Andrews, Simon. "FastQC: a quality control tool for high throughput sequence data." (2010): 175-176.
379 * Goecks, Jeremy, Anton Nekrutenko, and James Taylor. "Galaxy: a comprehensive approach for supporting accessible, reproducible, and transparent computational research in the life sciences." Genome biology 11.8 (2010): R86.
380 * Afgan, Enis, et al. "The Galaxy platform for accessible, reproducible and collaborative biomedical analyses: 2016 update." Nucleic acids research (2016): gkw343.
381 * Highcharts. https://www.highcharts.com/. (accessed May 26, 2017).
382 * R Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.
383 * Joshua Kunst (2017). highcharter: A Wrapper for the 'Highcharts' Library. R package version 0.5.0. https://CRAN.R-project.org/package=highcharter
384 * Carson Sievert, Chris Parmer, Toby Hocking, Scott Chamberlain, Karthik Ram, Marianne Corvellec and Pedro Despouy (2017). plotly: Create Interactive Web Graphics via 'plotly.js'. R package version 4.6.0. https://CRAN.R-project.org/package=plotly