view fastqc_report.Rmd @ 16:1710b0e874f1 draft

fix file name issue
author mingchen0919
date Sat, 21 Oct 2017 09:25:49 -0400
parents d1d20f341632
children ac5c618e4d97
line wrap: on
line source

---
title: 'Short reads evaluation with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)'
output:
    html_document:
      number_sections: true
      toc: true
      theme: cosmo
      highlight: tango
---

```{r setup, include=FALSE, warning=FALSE, message=FALSE}
knitr::opts_chunk$set(
  echo = ECHO,
  error = TRUE
)
```


# Fastqc Evaluation

## Evaluation of reads before trimming

```{r}
if ('READS_1' == 'None') {
  stop("No pre-trimming reads provided!")
} else {
  ## run fastqc evaluation
  fastqc_command = paste0('fastqc ') %>% 
  (function(x) {
    ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
  }) %>% 
  (function(x) {
    ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
  }) %>% 
  (function(x) {
    paste0(x, '-o REPORT_DIR ')
  })
  fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1')
  system(fastqc_command_reads_1, intern = TRUE)
  
  # Original html report
  reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1)
  original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1))
  
  unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
  reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/')
  # fastqc_data.txt
  file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt')
  fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1))
  # summary.txt
  file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt')
  summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1))
  
  tags$ul(
    tags$li(original_html),
    tags$li(fastqc_data),
    tags$li(summary_data)
  )
}
```


## Evaluation of reads after trimming

```{r}
if ('READS_2' == 'None') {
  stop("No pre-trimming reads provided!")
} else {
  ## run fastqc evaluation
  fastqc_command = paste0('fastqc ') %>% 
  (function(x) {
    ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
  }) %>% 
  (function(x) {
    ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
  }) %>% 
  (function(x) {
    paste0(x, '-o REPORT_DIR ')
  })
  fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1')
  system(fastqc_command_reads_2, intern = TRUE)
  
  # Original html report
  reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1)
  original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2))
  
  unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
  reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/')
  # fastqc_data.txt
  file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt')
  fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2))
  # summary.txt
  file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt')
  summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2))
  
  tags$ul(
    tags$li(original_html),
    tags$li(fastqc_data),
    tags$li(summary_data)
  )
}
```



# Fastqc output visualization

## Overview

```{r}
reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1]
reads_2_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 1]
combined_summary = cbind(reads_1_summary, reads_2_summary)
names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)'))
knitr::kable(combined_summary)
```

## Visualization by module {.tabset}

* Define a function to extract outputs for each module from fastqc output

```{r 'function definition'}
extract_data_module = function(fastqc_data, module_name) {
  f = readLines(fastqc_data)
  start_line = grep(module_name, f)
  end_module_lines = grep('END_MODULE', f)
  end_line = end_module_lines[which(end_module_lines > start_line)[1]]
  module_data = f[(start_line+1):(end_line-1)]
  writeLines(module_data, 'temp.txt')
  read.csv('temp.txt', sep = '\t')
}
```

### Per base sequence quality

```{r 'per base sequence quality', fig.width=10}
## reads 1
pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality')
pbsq_1$id = 1:length(pbsq_1$X.Base)

melt_pbsq_1 = filter(melt(pbsq_1, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile')
melt_pbsq_1$trim = 'before'


## reads 2
pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality')
pbsq_2$id = 1:length(pbsq_2$X.Base)

melt_pbsq_2 = filter(melt(pbsq_2, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile')
melt_pbsq_2$trim = 'after'

comb_pbsq = rbind(melt_pbsq_1, melt_pbsq_2)
comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim)
p = ggplot(data = comb_pbsq) +
  geom_line(mapping = aes(x = id, y = value, group = variable, color = variable)) +
  scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) + 
  facet_grid(. ~ trim) + 
  theme(axis.text.x = element_text(angle=45))
ggplotly(p)

```

### Per tile sequence quality

```{r 'per tile sequence quality'}
## reads 1
ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality')
ptsq_1$trim = 'before'

## reads 2
ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality')
ptsq_2$trim = 'after'

comb_ptsq = rbind(ptsq_1, ptsq_2)
comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim)
comb_pbsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base)

p = ggplot(data = comb_ptsq, aes(x = Base, y = X.Tile, fill = Mean)) +
  geom_raster() + 
  facet_grid(. ~ trim) + 
  theme(axis.text.x = element_text(angle=45))
ggplotly(p)
```



# Session Info

```{r 'session info'}
sessionInfo()
```