Repository 'rmarkdown_fastqc_report'
hg clone https://toolshed.g2.bx.psu.edu/repos/mingchen0919/rmarkdown_fastqc_report

Changeset 2:0374e090e38e (2017-08-07)
Previous changeset 1:22cd2369354b (2017-08-07) Next changeset 3:1ed094d8871c (2017-08-07)
Commit message:
Uploaded
added:
fastqc_report.Rmd
b
diff -r 22cd2369354b -r 0374e090e38e fastqc_report.Rmd
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastqc_report.Rmd Mon Aug 07 21:40:56 2017 -0400
[
b'@@ -0,0 +1,384 @@\n+---\n+title: "Fastqc report: short reads quality evaluation"\n+author: "Ming Chen"\n+output: html_document\n+---\n+\n+```{r setup, include=FALSE}\n+knitr::opts_chunk$set(echo=ECHO, warning=FALSE, message=FALSE)\n+library(plyr)\n+library(stringr)\n+library(dplyr)\n+library(highcharter)\n+library(DT)\n+library(reshape2)\n+# library(Kmisc)\n+library(plotly)\n+library(formattable)\n+library(htmltools)\n+```\n+\n+\n+```{bash \'create output directory\', echo=FALSE}\n+# create extra files directory. very important!\n+mkdir REPORT_OUTPUT_DIR\n+```\n+\n+# Fastqc analysis\n+```{bash \'copy data to working directory\', echo=FALSE}\n+# Copy uploaded data to the working directory\n+for f in $(echo READS | sed "s/,/ /g")\n+do\n+    cp $f ./\n+done\n+```\n+\n+\n+```{bash \'run fastqc\', echo=FALSE}\n+for r in $(ls *.dat)\n+do\n+    fastqc -o REPORT_OUTPUT_DIR $r > /dev/null 2>&1\n+done\n+```\n+\n+## Fastqc html reports\n+\n+Below are links to ***Fastqc*** original html reports.\n+```{r \'html report links\'}\n+html_report_list = list()\n+html_files = list.files(\'REPORT_OUTPUT_DIR\', pattern = \'.*html\')\n+for (i in html_files) {\n+  html_report_list[[i]] = tags$li(tags$a(href=i, i))\n+}\n+tags$ul(html_report_list)\n+```\n+\n+\n+## Parsing fastqc data\n+\n+```{bash echo=FALSE}\n+##==== copy fastqc generated zip files from report output directory to job work directory ==\n+cp -r REPORT_OUTPUT_DIR/*zip ./\n+\n+# create a file to store data file paths\n+echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail\n+echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score\n+echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score\n+echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content\n+echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content\n+echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content\n+echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level\n+echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution\n+echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content\n+\n+for i in $(ls *.zip)\n+do\n+    BASE=$(echo $i | sed \'s/\\(.*\\)\\.zip/\\1/g\')\n+    echo $BASE\n+    unzip ${BASE}.zip > /dev/null 2>&1\n+    \n+    ##====== pass,warning,fail (WSF) =============\n+    awk \'/^>>/ {print}\' "$BASE"/fastqc_data.txt | grep -v \'END_MODULE\' | sed \'s/>>//\' > "$BASE"-PWF.txt\n+    echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt\n+\n+    ##====== per base quality scores (PBQS) ======\n+    awk \'/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag\' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt\n+    echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt\n+\n+    ##====== per sequence quality scores (PSQS)\n+    awk \'/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag\' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt\n+    echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt\n+\n+    ##====== Per sequence GC content (PSGC)\n+    awk \'/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag\' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt\n+    echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt\n+    \n+    ##====== Per Base Sequence Content (PBSC)\n+    awk \'/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag\' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt\n+    echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt\n+    \n+    ##====== Per Base N Content (PBNC)\n+    awk \'/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag\' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt\n+    echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt\n+    \n+    ##====== Sequence Duplication Level (SDL)\n+    awk \'/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag\' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt\n+    echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt\n+    \n+    ##====== Sequence Length Distribution (SLD)\n+    awk \'/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag\' "$BASE"/fastqc_'..b'= rep(PSQS_file_paths[i,1], nrow(psqs_df))\n+  PSQS_df = rbind(PSQS_df, psqs_df)\n+}\n+```\n+\n+\n+```{r}\n+max_phred = max(PSQS_df$X.Quality) + 5\n+hchart(PSQS_df, "line", hcaes(x = X.Quality, y = Count, group = sample_id)) %>%\n+  hc_title(\n+    text = "Per Sequence Quality Score"\n+  ) %>%\n+  hc_xAxis(\n+    title = list(text = "Mean Sequence Quality Score"),\n+    min = 0,\n+    max = max_phred,\n+    plotLines = list(\n+      list(label = list(text = "Phred Score = 27"),\n+           width = 2,\n+           dashStyle = "dash",\n+           color = "green",\n+           value = 27),\n+      list(label = list(text = "Phred Score = 20"),\n+           width = 2,\n+           color = "red",\n+           value = 20)\n+    )\n+  ) %>% \n+  hc_exporting(enabled = TRUE)\n+```\n+\n+\n+## Per Sequence GC Content\n+\n+\n+```{r}\n+PSGC_df = data.frame()\n+PSGC_file_paths = read.csv(\'PSGC_file_paths.txt\', \n+                           header = TRUE, stringsAsFactors = FALSE)\n+for(i in 1:nrow(PSGC_file_paths)) {\n+  # file_path = paste0(\'REPORT_OUTPUT_DIR/\', PSGC_file_paths[i,2])\n+  file_path = PSGC_file_paths[i,2]\n+  psgc_df = read.csv(file_path,\n+                     sep=\'\\t\', header=TRUE, stringsAsFactors = FALSE) \n+  psgc_df$sample_id = rep(PSGC_file_paths[i,1], nrow(psgc_df))\n+  PSGC_df = rbind(PSGC_df, psgc_df)\n+}\n+```\n+\n+\n+```{r}\n+max_phred = max(PSGC_df$Count) + 5\n+hchart(PSGC_df, "line", hcaes(x = X.GC.Content, y = Count, group = sample_id)) %>%\n+  hc_title(\n+    text = "Per Sequence GC Content"\n+  ) %>%\n+  hc_xAxis(\n+    title = list(text = "% GC")\n+  ) %>%\n+  hc_exporting(enabled = TRUE)\n+```\n+\n+\n+## Per Base Sequence Content\n+\n+```{r}\n+PBSC_df = data.frame()\n+PBSC_file_paths = read.csv(\'PBSC_file_paths.txt\',\n+                           header = TRUE, stringsAsFactors = FALSE)\n+for(i in 1:nrow(PBSC_file_paths)) {\n+  # file_path = paste0(\'REPORT_OUTPUT_DIR/\', PBSC_file_paths[i,2])\n+  file_path = PBSC_file_paths[i,2]\n+  pbsc_df = read.csv(file_path,\n+                     sep=\'\\t\', header=TRUE, stringsAsFactors = FALSE) %>%\n+    mutate(Base1=as.numeric(str_split_fixed(X.Base, \'-\', 2)[,1]),\n+           Base2=as.numeric(str_split_fixed(X.Base, \'-\', 2)[,2])) %>%\n+  (function (df) {\n+    df1 = select(df, -Base2)\n+    df2 = select(df, -Base1) %>% filter(Base2 != \'\')\n+    colnames(df1) = c(colnames(df1)[1:5], \'Base\')\n+    colnames(df2) = c(colnames(df2)[1:5], \'Base\')\n+    res = rbind(df1, df2) %>% arrange(Base)\n+    return(res)\n+  })\n+  pbsc_df$sample_id = rep(PBSC_file_paths[i,1], nrow(pbsc_df))\n+  PBSC_df = rbind(PBSC_df, pbsc_df)\n+}\n+```\n+\n+\n+```{r out.width="100%"}\n+PBSC_df_2 = select(PBSC_df, -X.Base) %>%\n+  melt(id = c(\'Base\', \'sample_id\'), value.name = \'base_percentage\')\n+p = ggplot(data = PBSC_df_2, aes(x = Base, y = base_percentage, group = variable, color = variable)) +\n+  geom_line() +\n+  facet_wrap(~ sample_id)\n+ggplotly(p)\n+```\n+\n+\n+## References\n+\n+* Andrews, Simon. "FastQC: a quality control tool for high throughput sequence data." (2010): 175-176.\n+* Goecks, Jeremy, Anton Nekrutenko, and James Taylor. "Galaxy: a comprehensive approach for supporting accessible, reproducible, and transparent computational research in the life sciences." Genome biology 11.8 (2010): R86.\n+* Afgan, Enis, et al. "The Galaxy platform for accessible, reproducible and collaborative biomedical analyses: 2016 update." Nucleic acids research (2016): gkw343.\n+* Highcharts. https://www.highcharts.com/. (access by May 26, 2017).\n+* R Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.\n+* Joshua Kunst (2017). highcharter: A Wrapper for the \'Highcharts\' Library. R package version 0.5.0. https://CRAN.R-project.org/package=highcharter\n+* Carson Sievert, Chris Parmer, Toby Hocking, Scott Chamberlain, Karthik Ram, Marianne Corvellec and Pedro Despouy (2017). plotly: Create Interactive Web Graphics via \'plotly.js\'. R package version 4.6.0. https://CRAN.R-project.org/package=plotly\n'