14
|
1 ---
|
|
2 title: "Fastqc report: short reads quality evaluation"
|
|
3 author: "Ming Chen"
|
|
4 output: html_document
|
|
5 ---
|
|
6
|
|
7 ```{r setup, include=FALSE}
|
|
8 knitr::opts_chunk$set(echo=ECHO, warning=FALSE, message=FALSE)
|
|
9 library(plyr)
|
|
10 library(stringr)
|
|
11 library(dplyr)
|
|
12 library(highcharter)
|
|
13 library(DT)
|
|
14 library(reshape2)
|
|
15 library(plotly)
|
|
16 library(formattable)
|
|
17 library(htmltools)
|
|
18 ```
|
|
19
|
|
20
|
|
21 ```{bash 'create output directory', echo=FALSE}
|
|
# Create the extra-files directory that the later chunks write FastQC output to.
# -p makes this a no-op when the directory already exists (for example when the
# report is rendered a second time) instead of failing the chunk.
# NOTE(review): REPORT_OUTPUT_DIR looks like a placeholder substituted before
# knitting -- confirm against the wrapper that renders this template.
mkdir -p REPORT_OUTPUT_DIR
|
|
24 ```
|
|
25
|
|
26 # Fastqc analysis
|
|
27 ```{bash 'copy data to working directory', echo=FALSE}
|
|
# Copy uploaded data to the working directory.
# READS is a comma-separated list of input paths. Split it on commas only, so a
# path that contains whitespace is kept as a single entry, and quote each path
# when it is handed to cp.
# NOTE(review): READS is presumably substituted textually before knitting --
# confirm the list never carries spaces around the commas.
IFS=',' read -r -a read_files <<< "READS"
for f in "${read_files[@]}"
do
  cp -- "$f" ./
done
|
|
33 ```
|
|
34
|
|
35
|
|
36 ```{bash 'run fastqc', echo=FALSE}
|
|
# Run FastQC on every .dat file in the working directory and write the reports
# to the output directory. Iterate over the glob directly rather than over the
# output of ls, so file names with whitespace are not split.
for r in *.dat
do
  # When nothing matches, the pattern is left as the literal string "*.dat";
  # skip it instead of passing a non-existent file to fastqc.
  [ -e "$r" ] || continue
  fastqc -o REPORT_OUTPUT_DIR "$r" > /dev/null 2>&1
done
|
|
41 ```
|
|
42
|
|
43 ## Fastqc html reports
|
|
44
|
|
Below are links to the original ***FastQC*** HTML reports.
|
|
46 ```{r 'html report links'}
|
|
# Build an unordered list of links, one per FastQC HTML report found in the
# output directory.
# The pattern is anchored so only names ending in ".html" are listed; the
# previous unanchored pattern matched any file name containing "html".
html_files <- list.files("REPORT_OUTPUT_DIR", pattern = "\\.html$")
html_report_list <- lapply(html_files, function(html_file) {
  # The link target is the bare file name, so it resolves relative to the
  # location the report is served from.
  tags$li(tags$a(href = html_file, html_file))
})
tags$ul(html_report_list)
|
|
53 ```
|
|
54
|
|
55
|
|
56 ## Parsing fastqc data
|
|
57
|
|
58 ```{bash echo=FALSE}
|
|
##==== copy fastqc generated zip files from report output directory to job work directory ==
cp -r REPORT_OUTPUT_DIR/*zip ./

# Create one index file per extracted table. Each index is a CSV with the
# header "sample_id,file_path" and gets one row per FastQC archive.
#   PWF  - Pass, Warning, Fail
#   PBQS - Per Base Quality Score
#   PSQS - Per Sequence Quality Score
#   PSGC - Per Sequence GC Content
#   PBSC - Per Base Sequence Content
#   PBNC - Per Base N Content
#   SDL  - Sequence Duplication Level
#   SLD  - Sequence Length Distribution
#   KMC  - Kmer Content
for tag in PWF PBQS PSQS PSGC PBSC PBNC SDL SLD KMC
do
  echo "sample_id,file_path" > "${tag}_file_paths.txt"
done

# extract_module BASE TAG TITLE
# Copy the lines that follow the ">>TITLE" header of BASE/fastqc_data.txt, up to
# the next line containing END_MODULE, into BASE-TAG.txt, then append that file
# to TAG_file_paths.txt.
extract_module() {
  local base=$1 tag=$2 title=$3
  # index(...) == 1 is a literal "line starts with" test, so the module title
  # never has to be escaped as a regular expression.
  awk -v header=">>${title}" \
    'index($0, header) == 1 {flag=1; next} /END_MODULE/ {flag=0} flag' \
    "${base}/fastqc_data.txt" > "${base}-${tag}.txt"
  echo "${base},${base}-${tag}.txt" >> "${tag}_file_paths.txt"
}

# Iterate over the glob directly (not over the output of ls) so archive names
# containing whitespace stay intact.
for i in *.zip
do
  # An unmatched glob is left as the literal "*.zip"; skip it.
  [ -e "$i" ] || continue

  # Archive name without the .zip suffix; used as the sample id and as the name
  # of the directory the archive is expected to unpack into.
  BASE=${i%.zip}
  echo "$BASE"
  # -o overwrites files left by an earlier run instead of prompting.
  unzip -o "$i" > /dev/null 2>&1

  ##====== pass,warning,fail (PWF) =============
  # Keep every ">>" module header except the END_MODULE markers and strip the
  # leading ">>".
  awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt
  echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt

  ##====== per-module tables ===================
  extract_module "$BASE" PBQS "Per base sequence quality"
  extract_module "$BASE" PSQS "Per sequence quality scores"
  extract_module "$BASE" PSGC "Per sequence GC content"
  extract_module "$BASE" PBSC "Per base sequence content"
  extract_module "$BASE" PBNC "Per base N content"
  extract_module "$BASE" SDL "Sequence Duplication Levels"
  extract_module "$BASE" SLD "Sequence Length Distribution"
  extract_module "$BASE" KMC "Kmer Content"
done
|
|
116 ```
|
|
117
|
|
118
|
|
119 ## Evaluation Overview
|
|
120
|
|
121 ```{r 'overview'}
|
|
# Assemble the pass/warn/fail overview: one row per FastQC module, one status
# column per sample.
PWF_file_paths <- read.csv("PWF_file_paths.txt",
                           header = TRUE, stringsAsFactors = FALSE)

# Read each sample's two-column status table (tab separated, no header) and
# name the status column after the sample id. seq_len() yields no iterations
# for an index file that has only its header row.
pwf_tables <- lapply(seq_len(nrow(PWF_file_paths)), function(i) {
  pwf_df <- read.csv(PWF_file_paths[i, 2],
                     sep = "\t", header = FALSE, stringsAsFactors = FALSE)
  colnames(pwf_df) <- c("item", PWF_file_paths[i, 1])
  pwf_df
})

if (length(pwf_tables) == 0) {
  # No samples: keep an empty table with the 'item' column so PWF_df is always
  # defined after this chunk.
  PWF_df <- data.frame(item = character(0), stringsAsFactors = FALSE)
} else {
  # Keep the first table whole (item + status) and append only the status
  # column of every further sample. Rows are matched by position, so cbind()
  # stops with an error if the tables differ in row count.
  status_columns <- lapply(pwf_tables[-1], function(tbl) tbl[, 2, drop = FALSE])
  PWF_df <- do.call(cbind, c(pwf_tables[1], status_columns))
}
|
|
136 ```
|
|
137
|
|
138 ```{r}
|
|
# Render the overview table with a coloured background per status cell:
# "pass" -> green (#9CD027), "fail" -> red (#CC0000), any other value -> orange
# (#FF4E00).
status_formatter <- formatter(
  "span",
  style = x ~ style(
    "background-color" = ifelse(x == "pass", "#9CD027",
                                ifelse(x == "fail", "#CC0000", "#FF4E00")),
    "color" = "white",
    "width" = "50px",
    "float" = "left",
    "padding-right" = "5px"
  )
)

# Every column except the first ('item') holds one sample's statuses and gets
# the same formatter.
sample_columns <- colnames(PWF_df)[-1]
evaluate_list <- setNames(
  rep(list(status_formatter), length(sample_columns)),
  sample_columns
)

formattable(PWF_df, evaluate_list)
|
|
154 ```
|
|
155
|
|
156
|
|
157 ## Per Base Quality Scores
|
|
158
|
|
159 ```{r}
|
|
# Load the "Per base sequence quality" table of every sample into PBQS_df.
PBQS_file_paths <- read.csv("PBQS_file_paths.txt",
                            header = TRUE, stringsAsFactors = FALSE)

# Read one sample's table and add a numeric 'Base' column.
# X.Base holds either a single position ("7") or a range ("10-14"). Each row is
# emitted once at the start of its range and, for real ranges only, once more
# at the end of the range.
read_pbqs <- function(sample_id, file_path) {
  pbqs_df <- read.csv(file_path,
                      sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  bounds <- str_split_fixed(pbqs_df$X.Base, "-", 2)
  range_start <- mutate(pbqs_df, Base = as.numeric(bounds[, 1]))
  # Single positions have an empty second field, which as.numeric() turns into
  # NA; those rows have no range end and are dropped here.
  range_end <- mutate(pbqs_df, Base = as.numeric(bounds[, 2])) %>%
    filter(!is.na(Base))
  res <- rbind(range_start, range_end) %>% arrange(Base)
  res$sample_id <- rep(sample_id, nrow(res))
  res
}

pbqs_tables <- lapply(seq_len(nrow(PBQS_file_paths)), function(i) {
  read_pbqs(PBQS_file_paths[i, 1], PBQS_file_paths[i, 2])
})
# Bind all samples in one call; the leading empty data frame keeps PBQS_df a
# data frame even when there are no samples.
PBQS_df <- do.call(rbind, c(list(data.frame()), pbqs_tables))
|
|
181 ```
|
|
182
|
|
183
|
|
184 ```{r}
|
|
# datatable(PBQS_df)

# Upper limit of the y axis: the largest mean score plus 10.
max_phred <- max(PBQS_df$Mean) + 10

# Horizontal reference lines drawn at Phred 27 (dashed, green) and Phred 20
# (red).
pbqs_reference_lines <- list(
  list(
    label = list(text = "Phred Score = 27"),
    width = 2,
    dashStyle = "dash",
    color = "green",
    value = 27
  ),
  list(
    label = list(text = "Phred Score = 20"),
    width = 2,
    color = "red",
    value = 20
  )
)

# One line per sample: mean quality score against base position.
pbqs_chart <- hchart(
  PBQS_df, "line",
  hcaes(x = Base, y = Mean, group = sample_id)
)
pbqs_chart <- hc_title(pbqs_chart, text = "Per Base Quality Score")
pbqs_chart <- hc_yAxis(
  pbqs_chart,
  title = list(text = "Mean Base Quality Score"),
  min = 0,
  max = max_phred,
  plotLines = pbqs_reference_lines
)
hc_exporting(pbqs_chart, enabled = TRUE)
|
|
208 ```
|
|
209
|
|
210
|
|
211 ## Per Base N Content
|
|
212
|
|
213 ```{r}
|
|
# Load the "Per base N content" table of every sample into PBNC_df.
PBNC_file_paths <- read.csv("PBNC_file_paths.txt",
                            header = TRUE, stringsAsFactors = FALSE)

# Read one sample's table and add a numeric 'Base' column.
# X.Base holds either a single position ("7") or a range ("10-14"). Each row is
# emitted once at the start of its range and, for real ranges only, once more
# at the end of the range.
read_pbnc <- function(sample_id, file_path) {
  pbnc_df <- read.csv(file_path,
                      sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  bounds <- str_split_fixed(pbnc_df$X.Base, "-", 2)
  range_start <- mutate(pbnc_df, Base = as.numeric(bounds[, 1]))
  # Single positions have an empty second field, which as.numeric() turns into
  # NA; those rows have no range end and are dropped here.
  range_end <- mutate(pbnc_df, Base = as.numeric(bounds[, 2])) %>%
    filter(!is.na(Base))
  res <- rbind(range_start, range_end) %>% arrange(Base)
  res$sample_id <- rep(sample_id, nrow(res))
  res
}

pbnc_tables <- lapply(seq_len(nrow(PBNC_file_paths)), function(i) {
  read_pbnc(PBNC_file_paths[i, 1], PBNC_file_paths[i, 2])
})
# Bind all samples in one call; the leading empty data frame keeps PBNC_df a
# data frame even when there are no samples.
PBNC_df <- do.call(rbind, c(list(data.frame()), pbnc_tables))
|
|
235 ```
|
|
236
|
|
237
|
|
238 ```{r}
|
|
# Plot the N content per base position, one line per sample.
# N.Count is plotted as read from fastqc_data.txt: FastQC reports this column
# as a percentage already, so it must not be multiplied by 100 again. Leaving
# PBNC_df unmodified also makes this chunk safe to run more than once.
# NOTE(review): confirm the unit against a fastqc_data.txt produced by the
# FastQC version used here.
# Base is converted to character so the x axis is categorical.
hchart(
  PBNC_df, "line",
  hcaes(x = as.character(Base), y = N.Count, group = sample_id)
) %>%
  hc_title(
    text = "Per Base N Content"
  ) %>%
  hc_xAxis(
    title = list(text = "Base Position")
  ) %>%
  hc_yAxis(
    title = list(text = "N %"),
    # Dashed red reference line at 5 %.
    plotLines = list(
      list(
        label = list(text = "N = 5%"),
        width = 2,
        dashStyle = "dash",
        color = "red",
        value = 5
      )
    )
  ) %>%
  hc_exporting(enabled = TRUE)
|
|
259 ```
|
|
260
|
|
261
|
|
262
|
|
263
|
|
264 ## Per Sequence Quality Scores
|
|
265
|
|
266 ```{r}
|
|
# Load the "Per sequence quality scores" table of every sample into PSQS_df,
# tagging each row with its sample id.
PSQS_file_paths <- read.csv("PSQS_file_paths.txt",
                            header = TRUE, stringsAsFactors = FALSE)

# seq_len() yields no iterations when the index file has only its header row.
psqs_tables <- lapply(seq_len(nrow(PSQS_file_paths)), function(i) {
  psqs_df <- read.csv(PSQS_file_paths[i, 2],
                      sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  psqs_df$sample_id <- rep(PSQS_file_paths[i, 1], nrow(psqs_df))
  psqs_df
})
# Bind all samples in one call instead of growing the data frame inside a
# loop; the leading empty data frame keeps PSQS_df a data frame even when
# there are no samples.
PSQS_df <- do.call(rbind, c(list(data.frame()), psqs_tables))
|
|
278 ```
|
|
279
|
|
280
|
|
281 ```{r}
|
|
# Upper limit of the x axis: the largest quality value plus 5.
max_phred <- max(PSQS_df$X.Quality) + 5

# Vertical reference lines drawn at Phred 27 (dashed, green) and Phred 20
# (red).
psqs_reference_lines <- list(
  list(
    label = list(text = "Phred Score = 27"),
    width = 2,
    dashStyle = "dash",
    color = "green",
    value = 27
  ),
  list(
    label = list(text = "Phred Score = 20"),
    width = 2,
    color = "red",
    value = 20
  )
)

# One line per sample: sequence count against quality value.
psqs_chart <- hchart(
  PSQS_df, "line",
  hcaes(x = X.Quality, y = Count, group = sample_id)
)
psqs_chart <- hc_title(psqs_chart, text = "Per Sequence Quality Score")
psqs_chart <- hc_xAxis(
  psqs_chart,
  title = list(text = "Mean Sequence Quality Score"),
  min = 0,
  max = max_phred,
  plotLines = psqs_reference_lines
)
hc_exporting(psqs_chart, enabled = TRUE)
|
|
304 ```
|
|
305
|
|
306
|
|
307 ## Per Sequence GC Content
|
|
308
|
|
309
|
|
310 ```{r}
|
|
# Load the "Per sequence GC content" table of every sample into PSGC_df,
# tagging each row with its sample id.
PSGC_file_paths <- read.csv("PSGC_file_paths.txt",
                            header = TRUE, stringsAsFactors = FALSE)

# seq_len() yields no iterations when the index file has only its header row.
psgc_tables <- lapply(seq_len(nrow(PSGC_file_paths)), function(i) {
  psgc_df <- read.csv(PSGC_file_paths[i, 2],
                      sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  psgc_df$sample_id <- rep(PSGC_file_paths[i, 1], nrow(psgc_df))
  psgc_df
})
# Bind all samples in one call instead of growing the data frame inside a
# loop; the leading empty data frame keeps PSGC_df a data frame even when
# there are no samples.
PSGC_df <- do.call(rbind, c(list(data.frame()), psgc_tables))
|
|
322 ```
|
|
323
|
|
324
|
|
325 ```{r}
|
|
# Plot the GC content distribution, one line per sample: sequence count
# against GC content. The former 'max_phred' assignment was removed because
# its value was never passed to the chart.
hchart(
  PSGC_df, "line",
  hcaes(x = X.GC.Content, y = Count, group = sample_id)
) %>%
  hc_title(
    text = "Per Sequence GC Content"
  ) %>%
  hc_xAxis(
    title = list(text = "% GC")
  ) %>%
  hc_exporting(enabled = TRUE)
|
|
335 ```
|
|
336
|
|
337
|
|
338 ## Per Base Sequence Content
|
|
339
|
|
340 ```{r}
|
|
# Load the "Per base sequence content" table of every sample into PBSC_df.
PBSC_file_paths <- read.csv("PBSC_file_paths.txt",
                            header = TRUE, stringsAsFactors = FALSE)

# Read one sample's table and add a numeric 'Base' column.
# X.Base holds either a single position ("7") or a range ("10-14"). Each row is
# emitted once at the start of its range and, for real ranges only, once more
# at the end of the range.
read_pbsc <- function(sample_id, file_path) {
  pbsc_df <- read.csv(file_path,
                      sep = "\t", header = TRUE, stringsAsFactors = FALSE)
  bounds <- str_split_fixed(pbsc_df$X.Base, "-", 2)
  range_start <- mutate(pbsc_df, Base = as.numeric(bounds[, 1]))
  # Single positions have an empty second field, which as.numeric() turns into
  # NA; those rows have no range end and are dropped here.
  range_end <- mutate(pbsc_df, Base = as.numeric(bounds[, 2])) %>%
    filter(!is.na(Base))
  res <- rbind(range_start, range_end) %>% arrange(Base)
  res$sample_id <- rep(sample_id, nrow(res))
  res
}

pbsc_tables <- lapply(seq_len(nrow(PBSC_file_paths)), function(i) {
  read_pbsc(PBSC_file_paths[i, 1], PBSC_file_paths[i, 2])
})
# Bind all samples in one call; the leading empty data frame keeps PBSC_df a
# data frame even when there are no samples.
PBSC_df <- do.call(rbind, c(list(data.frame()), pbsc_tables))
|
|
362 ```
|
|
363
|
|
364
|
|
365 ```{r out.width="100%"}
|
|
# Reshape to long format: one row per (Base, sample_id, variable), with the
# value stored in 'base_percentage'. The raw X.Base label is dropped first so
# it is not melted along with the measured columns.
PBSC_without_label <- select(PBSC_df, -X.Base)
PBSC_df_2 <- melt(
  PBSC_without_label,
  id.vars = c("Base", "sample_id"),
  value.name = "base_percentage"
)

# One panel per sample, one coloured line per melted variable.
line_mapping <- aes(
  x = Base, y = base_percentage,
  group = variable, color = variable
)
p <- ggplot(data = PBSC_df_2, line_mapping)
p <- p + geom_line()
p <- p + facet_wrap(~ sample_id)

# Convert the static ggplot into an interactive plotly widget.
ggplotly(p)
|
|
372 ```
|
|
373
|
|
374
|
|
375 # Session Info
|
|
376
|
|
377 ```{r 'session info'}
|
|
# Print the R version, platform and loaded packages for this run.
print(sessionInfo())
|
|
379 ```
|
|
380
|
|
381
|