# HG changeset patch # User mingchen0919 # Date 1502210111 14400 # Node ID 2f4df2be05724ecd22c32a02d3e430eec10dfe46 # Parent 0ac073bef19d2a32d490940065fc02c34260e690 planemo upload for repository https://github.com/statonlab/docker-GRReport/tree/master/my_tools/rmarkdown_wgcna commit d91f269e8bc09a488ed2e005122bbb4a521f44a0-dirty diff -r 0ac073bef19d -r 2f4df2be0572 01_evaluation_overview.Rmd --- a/01_evaluation_overview.Rmd Tue Aug 08 11:45:41 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ ---- -title: "Evaluation Overview" -output: html_document ---- - -```{r setup, include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - -```{bash 'copy data from datasets directory to working directory', echo=FALSE} -# Copy uploaded data to the working directory -for f in $(echo READS | sed "s/,/ /g") -do - cp $f ./ -done -``` - -```{bash 'run fastqc', echo=FALSE} -# run fastqc and place outputs into the report directory -for r in $(ls *.dat) -do - fastqc -o REPORT_OUTPUT_DIR $r > /dev/null 2>&1 -done -``` - -```{bash 'parse fastqc results', echo=FALSE} -##==== copy fastqc generated zip files from report output directory to job work directory == -cp -r REPORT_OUTPUT_DIR/*zip ./ - -# create a file to store data file paths -echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail -echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score -echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score -echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content -echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content -echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content -echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level -echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution -echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content - -for i in $(ls *.zip) -do - BASE=$(echo 
$i | sed 's/\(.*\)\.zip/\1/g') - echo $BASE - unzip ${BASE}.zip > /dev/null 2>&1 - - ##====== pass,warning,fail (WSF) ============= - awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt - echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt - - ##====== per base quality scores (PBQS) ====== - awk '/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt - echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt - - ##====== per sequence quality scores (PSQS) - awk '/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt - echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt - - ##====== Per sequence GC content (PSGC) - awk '/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt - echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt - - ##====== Per Base Sequence Content (PBSC) - awk '/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt - echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt - - ##====== Per Base N Content (PBNC) - awk '/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt - echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt - - ##====== Sequence Duplication Level (SDL) - awk '/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt - echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt - - ##====== Sequence Length Distribution (SLD) - awk '/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SLD.txt - echo "${BASE},${BASE}-SLD.txt" >> SLD_file_paths.txt - - ##====== Kmer Content ============ - awk '/^>>Kmer Content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt 
>"$BASE"-KMC.txt - echo "${BASE},${BASE}-KMC.txt" >> KMC_file_paths.txt - -done -``` - - -## Evaluation Overview - -```{r 'overview'} -PWF_file_paths = read.csv('PWF_file_paths.txt', - header = TRUE, stringsAsFactors = FALSE) -rm('PWF_df') -for(i in 1:nrow(PWF_file_paths)) { - file_path = PWF_file_paths[i,2] - pwf_df = read.csv(file_path, - sep='\t', header=FALSE, stringsAsFactors = FALSE) - colnames(pwf_df) = c('item', PWF_file_paths[i,1]) - if (!exists('PWF_df')) { - PWF_df = pwf_df - } else { - PWF_df = cbind(PWF_df, pwf_df[,2,drop=FALSE]) - } -} -``` - - -```{r} -my_icon = c('ok', 'remove', 'star') -names(my_icon) = c('pass', 'fail', 'warn') -evaluate_list = list() -for (i in colnames(PWF_df)[-1]) { - evaluate_list[[i]] = formatter( - "span", - style = x ~ style("background-color" = ifelse(x =='pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')), - "color" = "white", - "width" = "50px", - "float" = "left", - "padding-right" = "5px") - ) -} - -formattable(PWF_df, evaluate_list) -``` \ No newline at end of file diff -r 0ac073bef19d -r 2f4df2be0572 02_fastqc_original_reports.Rmd --- a/02_fastqc_original_reports.Rmd Tue Aug 08 11:45:41 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ ---- -title: "FastQC original reports" -output: html_document ---- - -```{r 'FastQC original reports', include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - - -Below are links to ***Fastqc*** original html reports. 
- -```{r 'html report links'} -html_report_list = list() -html_files = list.files('REPORT_OUTPUT_DIR', pattern = '.*html') -for (i in html_files) { - html_report_list[[i]] = tags$li(tags$a(href=i, i)) -} -tags$ul(html_report_list) -``` \ No newline at end of file diff -r 0ac073bef19d -r 2f4df2be0572 1_per_base_quality_scores.Rmd --- a/1_per_base_quality_scores.Rmd Tue Aug 08 11:45:41 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ ---- -title: "Per Base Quality Scores" -output: html_document ---- - -```{r setup, include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - - -## Per Base Quality Scores - -```{r} -PBQS_df = data.frame() -PBQS_file_paths = read.csv('PBQS_file_paths.txt', - header = TRUE, stringsAsFactors = FALSE) -for(i in 1:nrow(PBQS_file_paths)) { - # file_path = paste0('REPORT_OUTPUT_DIR/', PBQS_file_paths[i,2]) - file_path = PBQS_file_paths[i,2] - pbqs_df = read.csv(file_path, - sep='\t', header=TRUE, stringsAsFactors = FALSE) %>% - mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]), - Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>% - (function (df) { - df1 = select(df, -Base2) - df2 = select(df, -Base1) %>% filter(Base2 != '') - colnames(df1) = c(colnames(df1)[1:7], 'Base') - colnames(df2) = c(colnames(df2)[1:7], 'Base') - res = rbind(df1, df2) %>% arrange(Base) - return(res) - }) - pbqs_df$sample_id = rep(PBQS_file_paths[i,1], nrow(pbqs_df)) - PBQS_df = rbind(PBQS_df, pbqs_df) -} -``` - - -```{r} -# datatable(PBQS_df) -max_phred = max(PBQS_df$Mean) + 10 -hchart(PBQS_df, "line", hcaes(x = Base, y = Mean, group = sample_id)) %>% - hc_title( - text = "Per Base Quality Score" - ) %>% - hc_yAxis( - title = list(text = "Mean Base Quality Score"), - min = 0, - max = max_phred, - plotLines = list( - list(label = list(text = "Phred Score = 27"), - width = 2, - dashStyle = "dash", - color = "green", - value = 27), - list(label = list(text = "Phred Score = 20"), - width = 2, - color = 
"red", - value = 20) - ) - ) %>% - hc_exporting(enabled = TRUE) -``` diff -r 0ac073bef19d -r 2f4df2be0572 2_per_base_N_content.Rmd --- a/2_per_base_N_content.Rmd Tue Aug 08 11:45:41 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ ---- -title: "Per Base N Content" -output: html_document ---- - -```{r setup, include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - -## Per Base N Content - -```{r} -PBNC_df = data.frame() -PBNC_file_paths = read.csv('PBNC_file_paths.txt', - header = TRUE, stringsAsFactors = FALSE) -for(i in 1:nrow(PBNC_file_paths)) { - # file_path = paste0('REPORT_OUTPUT_DIR/', PBNC_file_paths[i,2]) - file_path = PBNC_file_paths[i,2] - pbnc_df = read.csv(file_path, - sep='\t', header=TRUE, stringsAsFactors = FALSE) %>% - mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]), - Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>% - (function (df) { - df1 = select(df, -Base2) - df2 = select(df, -Base1) %>% filter(Base2 != '') - colnames(df1) = c(colnames(df1)[1:2], 'Base') - colnames(df2) = c(colnames(df2)[1:2], 'Base') - res = rbind(df1, df2) %>% arrange(Base) - return(res) - }) - pbnc_df$sample_id = rep(PBNC_file_paths[i,1], nrow(pbnc_df)) - PBNC_df = rbind(PBNC_df, pbnc_df) -} -``` - - -```{r} -PBNC_df$N.Count = PBNC_df$N.Count * 100 -max_phred = max(PBNC_df$N.Count) + 5 -hchart(PBNC_df, "line", hcaes(x = as.character(Base), y = N.Count, group = sample_id)) %>% - hc_title( - text = "Per Base N Content" - ) %>% - hc_xAxis( - title = list(text = "Base Position") - ) %>% - hc_yAxis( - title = list(text = "N %"), - plotLines = list( - list(label = list(text = "N = 5%"), - width = 2, - dashStyle = "dash", - color = "red", - value = 5) - ) - ) %>% - hc_exporting(enabled = TRUE) -``` diff -r 0ac073bef19d -r 2f4df2be0572 3_per_sequence_quality_scores.Rmd --- a/3_per_sequence_quality_scores.Rmd Tue Aug 08 11:45:41 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ ---- 
-title: "Per Sequence Quality Scores" -output: html_document ---- - -```{r setup, include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - -## Per Sequence Quality Scores - -```{r} -PSQS_df = data.frame() -PSQS_file_paths = read.csv('PSQS_file_paths.txt', - header = TRUE, stringsAsFactors = FALSE) -for(i in 1:nrow(PSQS_file_paths)) { - # file_path = paste0('REPORT_OUTPUT_DIR/', PSQS_file_paths[i,2]) - file_path = PSQS_file_paths[i,2] - psqs_df = read.csv(file_path, - sep='\t', header=TRUE, stringsAsFactors = FALSE) - psqs_df$sample_id = rep(PSQS_file_paths[i,1], nrow(psqs_df)) - PSQS_df = rbind(PSQS_df, psqs_df) -} -``` - - -```{r} -max_phred = max(PSQS_df$X.Quality) + 5 -hchart(PSQS_df, "line", hcaes(x = X.Quality, y = Count, group = sample_id)) %>% - hc_title( - text = "Per Sequence Quality Score" - ) %>% - hc_xAxis( - title = list(text = "Mean Sequence Quality Score"), - min = 0, - max = max_phred, - plotLines = list( - list(label = list(text = "Phred Score = 27"), - width = 2, - dashStyle = "dash", - color = "green", - value = 27), - list(label = list(text = "Phred Score = 20"), - width = 2, - color = "red", - value = 20) - ) - ) %>% - hc_exporting(enabled = TRUE) -``` diff -r 0ac073bef19d -r 2f4df2be0572 4_per_sequence_GC_content.Rmd --- a/4_per_sequence_GC_content.Rmd Tue Aug 08 11:45:41 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ ---- -title: "Per Sequence GC Content" -output: html_document ---- - -```{r setup, include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - -## Per Sequence GC Content - - -```{r} -PSGC_df = data.frame() -PSGC_file_paths = read.csv('PSGC_file_paths.txt', - header = TRUE, stringsAsFactors = FALSE) -for(i in 1:nrow(PSGC_file_paths)) { - # file_path = paste0('REPORT_OUTPUT_DIR/', PSGC_file_paths[i,2]) - file_path = PSGC_file_paths[i,2] - psgc_df = read.csv(file_path, - sep='\t', header=TRUE, stringsAsFactors = FALSE) - psgc_df$sample_id = 
rep(PSGC_file_paths[i,1], nrow(psgc_df)) - PSGC_df = rbind(PSGC_df, psgc_df) -} -``` - - -```{r} -max_phred = max(PSGC_df$Count) + 5 -hchart(PSGC_df, "line", hcaes(x = X.GC.Content, y = Count, group = sample_id)) %>% - hc_title( - text = "Per Sequence GC Content" - ) %>% - hc_xAxis( - title = list(text = "% GC") - ) %>% - hc_exporting(enabled = TRUE) -``` diff -r 0ac073bef19d -r 2f4df2be0572 5_per_base_sequence_content.Rmd --- a/5_per_base_sequence_content.Rmd Tue Aug 08 11:45:41 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ ---- -title: "Per Base Sequence Content" -output: html_document ---- - -```{r setup, include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - -## Per Base Sequence Content - -```{r} -PBSC_df = data.frame() -PBSC_file_paths = read.csv('PBSC_file_paths.txt', - header = TRUE, stringsAsFactors = FALSE) -for(i in 1:nrow(PBSC_file_paths)) { - # file_path = paste0('REPORT_OUTPUT_DIR/', PBSC_file_paths[i,2]) - file_path = PBSC_file_paths[i,2] - pbsc_df = read.csv(file_path, - sep='\t', header=TRUE, stringsAsFactors = FALSE) %>% - mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]), - Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>% - (function (df) { - df1 = select(df, -Base2) - df2 = select(df, -Base1) %>% filter(Base2 != '') - colnames(df1) = c(colnames(df1)[1:5], 'Base') - colnames(df2) = c(colnames(df2)[1:5], 'Base') - res = rbind(df1, df2) %>% arrange(Base) - return(res) - }) - pbsc_df$sample_id = rep(PBSC_file_paths[i,1], nrow(pbsc_df)) - PBSC_df = rbind(PBSC_df, pbsc_df) -} -``` - - -```{r out.width="100%"} -PBSC_df_2 = select(PBSC_df, -X.Base) %>% - melt(id = c('Base', 'sample_id'), value.name = 'base_percentage') -p = ggplot(data = PBSC_df_2, aes(x = Base, y = base_percentage, group = variable, color = variable)) + - geom_line() + - facet_wrap(~ sample_id) -ggplotly(p) -``` - diff -r 0ac073bef19d -r 2f4df2be0572 _site.yml --- a/_site.yml Tue Aug 08 11:45:41 2017 -0400 
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ -name: "FastQC Website" -output_dir: "my_site" -navbar: - title: "FastQC" - type: inverse - left: - - text: "Home" - icon: fa-home - href: index.html - - text: "Evaluation Overview" - href: 01_evaluation_overview.html - - text: "Evaluation Items" - menu: - - text: "Per Base Quality Scores" - href: 1_per_base_quality_scores.html - - text: "Per Base N Content" - href: 2_per_base_N_content.html - - text: "Per Sequence Quality Scores" - href: 3_per_sequence_quality_scores.html - - text: "Per Sequence GC Content" - href: 4_per_sequence_GC_content.html - - text: "Per Base Sequence Content" - href: 5_per_base_sequence_content.html - - text: "Original FastQC Reports" - href: 02_fastqc_original_reports.html -output: - html_document: - theme: cosmo - highlight: textmate \ No newline at end of file diff -r 0ac073bef19d -r 2f4df2be0572 fastqc_site.xml --- a/fastqc_site.xml Tue Aug 08 11:45:41 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,124 +0,0 @@ - - - bioconductor-deseq2 - r-getopt - r-rmarkdown - r-plyr - r-stringr - r-highcharter - r-dt - r-reshape2 - r-plotly - r-formattable - r-htmltools - fastqc - - - Implements FastQC analysis and display results in R Markdown website. 
- - - - - - - - - - - - - - - - - - - @misc{bioinformatics2014fastqc, - title={FastQC}, - author={Bioinformatics, Babraham}, - year={2014} - } - - - @article{allaire2016rmarkdown, - title={rmarkdown: Dynamic Documents for R, 2016}, - author={Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff and Wickham, Hadley and Atkins, Aron and Hyndman, Rob}, - journal={R package version 0.9}, - volume={6}, - year={2016} - } - - - @book{xie2015dynamic, - title={Dynamic Documents with R and knitr}, - author={Xie, Yihui}, - volume={29}, - year={2015}, - publisher={CRC Press} - } - - - @misc{plotly2017, - title = {plotly: Create Interactive Web Graphics via 'plotly.js'}, - author = {Carson Sievert and Chris Parmer and Toby Hocking and Scott Chamberlain and Karthik Ram and Marianne Corvellec and Pedro Despouy}, - year = {2017}, - note = {R package version 4.6.0}, - url = {https://CRAN.R-project.org/package=plotly}, - } - - - @misc{highcharter2017, - title = {highcharter: A Wrapper for the 'Highcharts' Library}, - author = {Joshua Kunst}, - year = {2017}, - note = {R package version 0.5.0}, - url = {https://CRAN.R-project.org/package=highcharter}, - } - - - @misc{formattable2016, - title = {formattable: Create 'Formattable' Data Structures}, - author = {Kun Ren and Kenton Russell}, - year = {2016}, - note = {R package version 0.2.0.1}, - url = {https://CRAN.R-project.org/package=formattable}, - } - - - \ No newline at end of file diff -r 0ac073bef19d -r 2f4df2be0572 fastqc_site_render.R --- a/fastqc_site_render.R Tue Aug 08 11:45:41 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,195 +0,0 @@ -##======= Handle arguments from command line ======== -# setup R error handline to go to stderr -options(show.error.messages=FALSE, - error=function(){ - cat(geterrmessage(), file=stderr()) - quit("no", 1, F) - }) - -# we need that to not crash galaxy with an UTF8 error on German LC settings. 
-loc = Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") - -# suppress warning -options(warn = -1) - -options(stringsAsFactors=FALSE, useFancyQuotes=FALSE) -args = commandArgs(trailingOnly=TRUE) - -suppressPackageStartupMessages({ - library(getopt) - library(tools) -}) - -# column 1: the long flag name -# column 2: the short flag alias. A SINGLE character string -# column 3: argument mask -# 0: no argument -# 1: argument required -# 2: argument is optional -# column 4: date type to which the flag's argument shall be cast. -# possible values: logical, integer, double, complex, character. -spec_list=list() - -##------- 1. input data --------------------- -spec_list$READS = c('reads', 'r', '1', 'character') -spec_list$ECHO = c('echo', 'e', '1', 'character') - -##--------2. output report and report site directory -------------- -spec_list$FASTQC_SITE = c('fastqc_site', 'o', '1', 'character') -spec_list$FASTQC_SITE_DIR = c('fastqc_site_dir', 'd', '1', 'character') - -##--------3. Rmd templates sitting in the tool directory ---------- - - ## _site.yml and index.Rmd files - spec_list$SITE_YML = c('site_yml', 's', 1, 'character') - spec_list$INDEX_Rmd = c('index_rmd', 'i', 1, 'character') - - ## other Rmd body template files - spec_list$x01 = c('x01_evaluation_overview', 'p', '1', 'character') - spec_list$x02 = c('x02_fastqc_original_reports', 'a', '1', 'character') - spec_list$x1 = c('x1_per_base_quality_scores', 'b', '1', 'character') - spec_list$x2 = c('x2_per_base_N_content', 'c', '1', 'character') - spec_list$x3 = c('x3_per_sequence_quality_scores', 'f', '1', 'character') - spec_list$x4 = c('x4_per_sequence_GC_content', 'g', '1', 'character') - spec_list$x5 = c('x5_per_base_sequence_content', 'h', '1', 'character') - -##------------------------------------------------------------------ - -spec = t(as.data.frame(spec_list)) -opt = getopt(spec) -# arguments are accessed by long flag name (the first column in the spec matrix) -# NOT by element name in the spec_list -# example: 
opt$help, opt$expression_file -##====== End of arguments handling ========== - -#------ Load libraries --------- -library(rmarkdown) -library(plyr) -library(stringr) -library(dplyr) -library(highcharter) -library(DT) -library(reshape2) -library(plotly) -library(formattable) -library(htmltools) - - -#----- 1. create the report directory ------------------------ -paste0('mkdir -p ', opt$fastqc_site_dir) %>% - system() - -#----- 2. generate Rmd files with Rmd templates -------------- -# a. templates without placeholder variables: -# copy templates from tool directory to the working directory. -# b. templates with placeholder variables: -# substitute variables with user input values and place them in the working directory. - - - #----- Copy index.Rmd and _site.yml files to job working direcotry ----- - file.copy(opt$index_rmd, 'index.Rmd', recursive=TRUE) - file.copy(opt$site_yml, '_site.yml', recursive=TRUE) - #--------------------------------------------------------- - - #----- 01_evaluation_overview.Rmd ----------------------- - readLines(opt$x01_evaluation_overview) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - gsub('READS', opt$reads, x) - }) %>% - (function(x) { - gsub('REPORT_OUTPUT_DIR', opt$fastqc_site_dir, x) - }) %>% - (function(x) { - fileConn = file('01_evaluation_overview.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - #----- 1_per_base_quality_scores.Rmd -------------------- - readLines(opt$x1_per_base_quality_scores) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('1_per_base_quality_scores.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - #----- 2_per_base_N_content.Rmd ------------------------- - readLines(opt$x2_per_base_N_content) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('2_per_base_N_content.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - #----- 3_per_sequence_quality_scores.Rmd 
---------------- - readLines(opt$x3_per_sequence_quality_scores) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('3_per_sequence_quality_scores.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - - #----- 4_per_sequence_GC_content.Rmd -------------------- - readLines(opt$x4_per_sequence_GC_content) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('4_per_sequence_GC_content.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - - #----- 5_per_base_sequence_content.Rmd ------------------ - readLines(opt$x5_per_base_sequence_content) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('5_per_base_sequence_content.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - #----- 02_fastqc_original_reports.Rmd ------------------- - readLines(opt$x02_fastqc_original_reports) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - gsub('REPORT_OUTPUT_DIR', opt$fastqc_site_dir, x) - }) %>% - (function(x) { - fileConn = file('02_fastqc_original_reports.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - - -#------ 3. render all Rmd files with render_site() -------- -render_site() - - -#-------4. manipulate outputs ----------------------------- -# a. copy index.html to the report output path -# b. copy all files in 'my_site' to the report output directory -file.copy('my_site/index.html', opt$fastqc_site, recursive=TRUE) -paste0('cp -r my_site/* ', opt$fastqc_site_dir) %>% - system() - - diff -r 0ac073bef19d -r 2f4df2be0572 index.Rmd --- a/index.Rmd Tue Aug 08 11:45:41 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ ---- -title: "FastQC Report" -output: html_document ---- - -```{r setup, include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - - - -## References - -* Andrews, Simon. 
"FastQC: a quality control tool for high throughput sequence data." (2010): 175-176. -* Goecks, Jeremy, Anton Nekrutenko, and James Taylor. "Galaxy: a comprehensive approach for supporting accessible, reproducible, and transparent computational research in the life sciences." Genome biology 11.8 (2010): R86. -* Afgan, Enis, et al. "The Galaxy platform for accessible, reproducible and collaborative biomedical analyses: 2016 update." Nucleic acids research (2016): gkw343. -* Highcharts. https://www.highcharts.com/. (access by May 26, 2017). -* R Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/. -* Joshua Kunst (2017). highcharter: A Wrapper for the 'Highcharts' Library. R package version 0.5.0. https://CRAN.R-project.org/package=highcharter -* Carson Sievert, Chris Parmer, Toby Hocking, Scott Chamberlain, Karthik Ram, Marianne Corvellec and Pedro Despouy (2017). plotly: Create Interactive Web Graphics via 'plotly.js'. R package version 4.6.0. https://CRAN.R-project.org/package=plotly diff -r 0ac073bef19d -r 2f4df2be0572 wgcna_construct_network.Rmd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wgcna_construct_network.Rmd Tue Aug 08 12:35:11 2017 -0400 @@ -0,0 +1,178 @@ +--- +title: 'WGCNA: construct network' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = ECHO +) +``` + +# Import workspace + +This step imports workspace from the **WGCNA: preprocessing** step. 
+ +```{r} +fcp = file.copy("PREPROCESSING_WORKSPACE", "deseq.RData") +load("deseq.RData") +``` + + +# Processing outliers {.tabset} + +## Before removing outliers + +```{r} +plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, + cex.axis = 1, cex.main = 1, cex = 0.5) +if(!is.na(HEIGHT_CUT)) { + # plot a line to show the cut + abline(h = HEIGHT_CUT, col = "red") + # determine cluster under the line + clust = cutreeStatic(sampleTree, cutHeight = HEIGHT_CUT, minSize = 10) + keepSamples = (clust==1) + expression_data = expression_data[keepSamples, ] +} +``` + +## After removing outliers + +```{r} +sampleTree = hclust(dist(expression_data), method = "average"); +plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", + cex.axis = 1, cex.main = 1, cex = 0.5) +``` + + +# Trait data {.tabset} + +If trait data is provided, the first 100 rows from the data will be displayed here. A plot consisting of sample cluster dendrogram and trait heatmap will also be generated. + +## Trait data table + +```{r} +trait_data = data.frame() +if ("TRAIT_DATA" != 'None') { + trait_data = read.csv("TRAIT_DATA", header = TRUE, row.names = 1) + # form a data frame analogous to expression data that will hold the traits. 
+ sample_names = rownames(expression_data) + trait_rows = match(sample_names, rownames(trait_data)) + trait_data = trait_data[trait_rows, ] + datatable(head(trait_data, 100), style="bootstrap", filter = 'top', + class="table-condensed", options = list(dom = 'tp', scrollX = TRUE)) +} +``` + +## Dendrogram and heatmap + +```{r fig.align='center', fig.width=8, fig.height=9} +if (nrow(trait_data) != 0) { + traitColors = numbers2colors(trait_data, signed = FALSE) + plotDendroAndColors(sampleTree, traitColors, + groupLabels = names(trait_data), + main = "Sample dendrogram and trait heatmap", + cex.dendroLabels = 0.5) +} +``` + + +# The thresholding power + +```{r} +powers = c(1:10, seq(12, 20, 2)) +soft_threshold = pickSoftThreshold(expression_data, powerVector = powers, verbose = 5) +``` + +```{r fig.align='center'} +par(mfrow=c(1,2)) +plot(soft_threshold$fitIndices[,1], -sign(soft_threshold$fitIndices[,3])*soft_threshold$fitIndices[,2], + xlab="Soft Threshold (power)", + ylab="Scale Free Topology Model Fit,signed R^2",type="n", + main = paste("Scale independence"), + cex.lab = 0.5); +text(soft_threshold$fitIndices[,1], -sign(soft_threshold$fitIndices[,3])*soft_threshold$fitIndices[,2], + labels=powers,cex=0.5,col="red"); + +# calculate soft threshold power +y = -sign(soft_threshold$fitIndices[,3])*soft_threshold$fitIndices[,2] +r2_cutoff = 0.9 +for(i in 1:length(powers)) { + if(y[i] > r2_cutoff) { + soft_threshold_power = soft_threshold$fitIndices[,1][i] + r2_cutoff_new = y[i] + break + } + soft_threshold_power = soft_threshold$fitIndices[,1][length(powers)] +} +abline(h=r2_cutoff, col="red") +abline(v=soft_threshold_power, col="blue") +text(soft_threshold_power+1, r2_cutoff-0.1, + paste0('R^2 cutoff = ', round(r2_cutoff_new,2)), + cex = 0.5, col = "red") + +plot(soft_threshold$fitIndices[,1], soft_threshold$fitIndices[,5], + xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n", + main = paste("Mean connectivity"), + cex.lab = 0.5) 
+text(soft_threshold$fitIndices[,1], soft_threshold$fitIndices[,5], labels=powers, cex=0.5,col="red") +par(mfrow=c(1,1)) +``` + + +# Construct network + +The gene network is constructed based on **soft threshold power = `r soft_threshold_power`** + +```{r} +gene_network = blockwiseModules(expression_data, power = soft_threshold_power, + TOMType = "unsigned", minModuleSize = 30, + reassignThreshold = 0, mergeCutHeight = 0.25, + numericLabels = TRUE, pamRespectsDendro = FALSE, + verbose = 3) +``` + + +# Gene modules {.tabset} + +## Identify gene modules + +```{r} +modules = table(gene_network$colors) +n_modules = length(modules) - 1 +module_size_upper = modules[2] +module_size_lower = modules[length(modules)] + +module_table = data.frame(model_label = c(0, 1:n_modules), + gene_size = as.vector(modules)) +datatable(t(module_table)) +``` + +The results above indicate that there are **`r n_modules` gene modules**, labeled 1 through `r n_modules` in order of descending size. The largest module has **`r module_size_upper` genes**, and the smallest module has **`r module_size_lower` genes**. The label 0 is reserved for genes outside of all modules. + + +## Dendrogram and module plot + +```{r} +# Convert labels to colors for plotting +module_colors = labels2colors(gene_network$colors) +# Plot the dendrogram and the module colors underneath +plotDendroAndColors(gene_network$dendrograms[[1]], module_colors[gene_network$blockGenes[[1]]], + "Module colors", + dendroLabels = FALSE, hang = 0.03, + addGuide = TRUE, guideHang = 0.05) +``` + + +```{r echo=FALSE} +# save workspace +rm("opt") +save(list=ls(all.names = TRUE), file='CONSTRUCT_NETWORK_WORKSPACE') +``` + + diff -r 0ac073bef19d -r 2f4df2be0572 wgcna_construct_network.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wgcna_construct_network.xml Tue Aug 08 12:35:11 2017 -0400 @@ -0,0 +1,105 @@ + + + r-getopt + r-rmarkdown + r-plyr + r-highcharter + r-dt + r-htmltools + r-wgcna + + + Construct gene network. 
+ + + + + + + + + + + + + + + + + + + + + + + @article{langfelder2008wgcna, + title={WGCNA: an R package for weighted correlation network analysis}, + author={Langfelder, Peter and Horvath, Steve}, + journal={BMC bioinformatics}, + volume={9}, + number={1}, + pages={559}, + year={2008}, + publisher={BioMed Central} + } + + + @article{allaire2016rmarkdown, + title={rmarkdown: Dynamic Documents for R, 2016}, + author={Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff and Wickham, Hadley and Atkins, Aron and Hyndman, Rob}, + journal={R package version 0.9}, + volume={6}, + year={2016} + } + + + @book{xie2015dynamic, + title={Dynamic Documents with R and knitr}, + author={Xie, Yihui}, + volume={29}, + year={2015}, + publisher={CRC Press} + } + + + \ No newline at end of file diff -r 0ac073bef19d -r 2f4df2be0572 wgcna_construct_network_render.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wgcna_construct_network_render.R Tue Aug 08 12:35:11 2017 -0400 @@ -0,0 +1,112 @@ +##======= Handle arguments from command line ======== +# setup R error handline to go to stderr +options(show.error.messages=FALSE, + error=function(){ + cat(geterrmessage(), file=stderr()) + quit("no", 1, F) + }) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc = Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +# suppress warning +options(warn = -1) + +options(stringsAsFactors=FALSE, useFancyQuotes=FALSE) +args = commandArgs(trailingOnly=TRUE) + +suppressPackageStartupMessages({ + library(getopt) + library(tools) +}) + +# column 1: the long flag name +# column 2: the short flag alias. A SINGLE character string +# column 3: argument mask +# 0: no argument +# 1: argument required +# 2: argument is optional +# column 4: date type to which the flag's argument shall be cast. +# possible values: logical, integer, double, complex, character. +spec_list=list() + +##------- 1. 
input data --------------------- +spec_list$ECHO = c('echo', 'e', '1', 'character') +spec_list$PREPROCESSING_WORKSPACE = c('preprocessing_workspace', 'w', '1', 'character') +spec_list$HEIGHT_CUT = c('height_cut', 'h', '2', 'double') +spec_list$TRAIT_DATA = c('trait_data', 't', '2', 'character') + + +##--------2. output report and report site directory -------------- +spec_list$OUTPUT_HTML = c('wgcna_construct_network_html', 'o', '1', 'character') +spec_list$OUTPUT_DIR = c('wgcna_construct_network_dir', 'd', '1', 'character') +spec_list$CONSTRUCT_NETWORK_WORKSPACE = c('construct_network_workspace', 'W', '1', 'character') + + +##--------3. Rmd templates in the tool directory ---------- + +spec_list$WGCNA_PREPROCESSING_RMD = c('wgcna_construct_network_rmd', 'M', '1', 'character') + + + +##------------------------------------------------------------------ + +spec = t(as.data.frame(spec_list)) +opt = getopt(spec) +# arguments are accessed by long flag name (the first column in the spec matrix) +# NOT by element name in the spec_list +# example: opt$help, opt$expression_file +##====== End of arguments handling ========== + +#------ Load libraries --------- +library(rmarkdown) +library(WGCNA) +library(DT) +library(htmltools) +library(ggplot2) + + +#----- 1. create the report directory ------------------------ +system(paste0('mkdir -p ', opt$wgcna_construct_network_dir)) + + +#----- 2. generate Rmd files with Rmd templates -------------- +# a. templates without placeholder variables: +# copy templates from tool directory to the working directory. +# b. templates with placeholder variables: +# substitute variables with user input values and place them in the working directory. 
+ + +#----- 01 wgcna_construct_network.Rmd ----------------------- +readLines(opt$wgcna_construct_network_rmd) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('PREPROCESSING_WORKSPACE', opt$preprocessing_workspace, x) + }) %>% + (function(x) { + gsub('HEIGHT_CUT', opt$height_cut, x) + }) %>% + (function(x) { + gsub('TRAIT_DATA', opt$trait_data, x) + }) %>% + (function(x) { + gsub('OUTPUT_DIR', opt$wgcna_construct_network_dir, x) + }) %>% + (function(x) { + gsub('CONSTRUCT_NETWORK_WORKSPACE', opt$construct_network_workspace, x) + }) %>% + (function(x) { + fileConn = file('wgcna_construct_network.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + +#------ 3. render all Rmd files -------- +render('wgcna_construct_network.Rmd', output_file = opt$wgcna_construct_network_html) + +#-------4. manipulate outputs ----------------------------- + + diff -r 0ac073bef19d -r 2f4df2be0572 wgcna_eigengene_visualization.Rmd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wgcna_eigengene_visualization.Rmd Tue Aug 08 12:35:11 2017 -0400 @@ -0,0 +1,121 @@ +--- +title: 'WGCNA: eigengene visualization' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = ECHO +) +``` + +# Import workspace + +This step imports workspace from the **WGCNA: construct network** step. 
+ +```{r} +fcp = file.copy("CONSTRUCT_NETWORK_WORKSPACE", "deseq.RData") +load("deseq.RData") +``` + + +# Gene modules {.tabset} + +```{r} +if(!is.na(SOFT_THRESHOLD_POWER)) soft_threshold_power = SOFT_THRESHOLD_POWER +``` + +## Identify gene modules + +The gene network is constructed based on **soft threshold power = `r soft_threshold_power`** + +```{r} +gene_network = blockwiseModules(expression_data, power = soft_threshold_power, + TOMType = "unsigned", minModuleSize = 30, + reassignThreshold = 0, mergeCutHeight = 0.25, + numericLabels = TRUE, pamRespectsDendro = FALSE, + verbose = 3) +``` + + +```{r} +modules = table(gene_network$colors) +n_modules = length(modules) - 1 +module_size_upper = modules[2] +module_size_lower = modules[length(modules)] + +module_table = data.frame(model_label = c(0, 1:n_modules), + gene_size = as.vector(modules)) +datatable(t(module_table)) +``` + +The results above indicate that there are **`r n_modules` gene modules**, labeled 1 through `r n_modules` in order of descending size. The largest module has **`r module_size_upper` genes**, and the smallest module has **`r module_size_lower` genes**. The label 0 is reserved for genes outside of all modules. + + +## Dendrogram and module plot + +```{r} +# Convert labels to colors for plotting +module_colors = labels2colors(gene_network$colors) +# Plot the dendrogram and the module colors underneath +plotDendroAndColors(gene_network$dendrograms[[1]], module_colors[gene_network$blockGenes[[1]]], + "Module colors", + dendroLabels = FALSE, hang = 0.03, + addGuide = TRUE, guideHang = 0.05) +``` + + +# Gene module correlation + +We can calculate eigengenes and use them as representative profiles to quantify the similarity between the detected gene modules.
+ +```{r} +n_genes = ncol(expression_data) +n_samples = nrow(expression_data) +``` + +```{r} +diss_tom = 1-TOMsimilarityFromExpr(expression_data, power = soft_threshold_power) +set.seed(123) +select_genes = sample(n_genes, size = PLOT_GENES) +select_diss_tom = diss_tom[select_genes, select_genes] + +# calculate gene tree on selected genes +select_gene_tree = hclust(as.dist(select_diss_tom), method = 'average') +select_module_colors = module_colors[select_genes] + +# transform diss_tom with a power to make moderately strong connections more visible in the heatmap. +plot_diss_tom = select_diss_tom^7 +# set diagonal to NA for a nicer plot +diag(plot_diss_tom) = NA +``` + + +```{r fig.align='center'} +TOMplot(plot_diss_tom, select_gene_tree, select_module_colors, main = "Network heatmap") +``` + + +# Eigengene visualization {.tabset} + +## Eigengene dendrogram + +```{r fig.align='center'} +module_eigengenes = moduleEigengenes(expression_data, module_colors)$eigengenes +plotEigengeneNetworks(module_eigengenes, "Eigengene dendrogram", + plotHeatmaps = FALSE) +``` + +## Eigengene adjacency heatmap + +```{r fig.align='center'} +plotEigengeneNetworks(module_eigengenes, "Eigengene adjacency heatmap", + marHeatmap = c(2, 3, 2, 2), + plotDendrograms = FALSE, xLabelsAngle = 90) +``` + diff -r 0ac073bef19d -r 2f4df2be0572 wgcna_eigengene_visualization.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wgcna_eigengene_visualization.xml Tue Aug 08 12:35:11 2017 -0400 @@ -0,0 +1,100 @@ + + + r-getopt + r-rmarkdown + r-plyr + r-highcharter + r-dt + r-htmltools + r-wgcna + + + Eigengene visualization.
+ + + + + + + + + + + + + + + + + + + + + @article{langfelder2008wgcna, + title={WGCNA: an R package for weighted correlation network analysis}, + author={Langfelder, Peter and Horvath, Steve}, + journal={BMC bioinformatics}, + volume={9}, + number={1}, + pages={559}, + year={2008}, + publisher={BioMed Central} + } + + + @article{allaire2016rmarkdown, + title={rmarkdown: Dynamic Documents for R, 2016}, + author={Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff and Wickham, Hadley and Atkins, Aron and Hyndman, Rob}, + journal={R package version 0.9}, + volume={6}, + year={2016} + } + + + @book{xie2015dynamic, + title={Dynamic Documents with R and knitr}, + author={Xie, Yihui}, + volume={29}, + year={2015}, + publisher={CRC Press} + } + + + \ No newline at end of file diff -r 0ac073bef19d -r 2f4df2be0572 wgcna_eigengene_visualization_render.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wgcna_eigengene_visualization_render.R Tue Aug 08 12:35:11 2017 -0400 @@ -0,0 +1,109 @@ +##======= Handle arguments from command line ======== +# setup R error handling to go to stderr +options(show.error.messages=FALSE, + error=function(){ + cat(geterrmessage(), file=stderr()) + quit("no", 1, F) + }) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc = Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +# suppress warning +options(warn = -1) + +options(stringsAsFactors=FALSE, useFancyQuotes=FALSE) +args = commandArgs(trailingOnly=TRUE) + +suppressPackageStartupMessages({ + library(getopt) + library(tools) +}) + +# column 1: the long flag name +# column 2: the short flag alias. A SINGLE character string +# column 3: argument mask +# 0: no argument +# 1: argument required +# 2: argument is optional +# column 4: data type to which the flag's argument shall be cast. +# possible values: logical, integer, double, complex, character. +spec_list=list() + +##------- 1.
input data --------------------- +spec_list$ECHO = c('echo', 'e', '1', 'character') +spec_list$CONSTRUCT_NETWORK_WORKSPACE = c('construct_network_workspace', 'w', '1', 'character') +spec_list$SOFT_THRESHOLD_POWER = c('soft_threshold_power', 'p', '2', 'double') +spec_list$PLOT_GENES = c('plot_genes', 'n', '1', 'integer') + + +##--------2. output report and report site directory -------------- +spec_list$OUTPUT_HTML = c('wgcna_eigengene_visualization_html', 'o', '1', 'character') +spec_list$OUTPUT_DIR = c('wgcna_eigengene_visualization_dir', 'd', '1', 'character') + + + +##--------3. Rmd templates in the tool directory ---------- + +spec_list$WGCNA_EIGENGENE_VISUALIZATION_RMD = c('wgcna_eigengene_visualization_rmd', 'M', '1', 'character') + + + +##------------------------------------------------------------------ + +spec = t(as.data.frame(spec_list)) +opt = getopt(spec) +# arguments are accessed by long flag name (the first column in the spec matrix) +# NOT by element name in the spec_list +# example: opt$help, opt$expression_file +##====== End of arguments handling ========== + +#------ Load libraries --------- +library(rmarkdown) +library(WGCNA) +library(DT) +library(htmltools) +library(ggplot2) + + +#----- 1. create the report directory ------------------------ +system(paste0('mkdir -p ', opt$wgcna_eigengene_visualization_dir)) + + +#----- 2. generate Rmd files with Rmd templates -------------- +# a. templates without placeholder variables: +# copy templates from tool directory to the working directory. +# b. templates with placeholder variables: +# substitute variables with user input values and place them in the working directory. 
+ + +#----- 01 wgcna_eigengene_visualization.Rmd ----------------------- +readLines(opt$wgcna_eigengene_visualization_rmd) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('CONSTRUCT_NETWORK_WORKSPACE', opt$construct_network_workspace, x) + }) %>% + (function(x) { + gsub('SOFT_THRESHOLD_POWER', opt$soft_threshold_power, x) + }) %>% + (function(x) { + gsub('PLOT_GENES', opt$plot_genes, x) + }) %>% + (function(x) { + gsub('OUTPUT_DIR', opt$wgcna_eigengene_visualization_dir, x) + }) %>% + (function(x) { + fileConn = file('wgcna_eigengene_visualization.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + +#------ 3. render all Rmd files -------- +render('wgcna_eigengene_visualization.Rmd', output_file = opt$wgcna_eigengene_visualization_html) + +#-------4. manipulate outputs ----------------------------- + + diff -r 0ac073bef19d -r 2f4df2be0572 wgcna_preprocessing.Rmd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wgcna_preprocessing.Rmd Tue Aug 08 12:35:11 2017 -0400 @@ -0,0 +1,76 @@ +--- +title: 'WGCNA: data preprocessing' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = ECHO +) +``` + +```{r} +str(opt) +``` + +# Import data + +Each row represents a gene and each column represents a sample. + +```{r} +expression_data = read.csv('EXPRESSION_DATA', header = TRUE, row.names = 1) +``` + +Display the first 100 genes. + +```{r} +datatable(head(expression_data, 100), style="bootstrap", filter = 'top', + class="table-condensed", options = list(dom = 'tp', scrollX = TRUE)) +``` + +Transpose expression data matrix so that each row represents a sample and each column represents a gene. + +```{r} +expression_data = as.data.frame(t(expression_data)) +``` + +# Checking data + +Checking data for excessive missing values and identification of outlier microarray samples. 
+ +```{r} +gsg = goodSamplesGenes(expression_data, verbose = 3) +if (!gsg$allOK) { + # Optionally, print the gene and sample names that were removed: + if (sum(!gsg$goodGenes)>0) + printFlush(paste("Removing genes:", paste(names(expression_data)[!gsg$goodGenes], collapse = ", "))); + if (sum(!gsg$goodSamples)>0) + printFlush(paste("Removing samples:", paste(rownames(expression_data)[!gsg$goodSamples], collapse = ", "))); + # Remove the offending genes and samples from the data: + expression_data = expression_data[gsg$goodSamples, gsg$goodGenes] +} else { + print('all genes are OK!') +} +``` + +# Clustering samples + +If there are any outliers, choose a height cut that will remove the offending samples. Note this value: you will need it in the later analysis steps. + +```{r fig.align='center'} +sampleTree = hclust(dist(expression_data), method = "average"); +plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", + cex.axis = 1, cex.main = 1, cex = 0.5) +``` + + +```{r echo=FALSE} +rm("opt") +save(list=ls(all.names = TRUE), file='PREPROCESSING_WORKSPACE') +``` + diff -r 0ac073bef19d -r 2f4df2be0572 wgcna_preprocessing.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wgcna_preprocessing.xml Tue Aug 08 12:35:11 2017 -0400 @@ -0,0 +1,96 @@ + + + r-getopt + r-rmarkdown + r-plyr + r-highcharter + r-dt + r-htmltools + r-wgcna + + + Data cleaning and preprocessing.
+ + + + + + + + + + + + + + + + + + + + + @article{langfelder2008wgcna, + title={WGCNA: an R package for weighted correlation network analysis}, + author={Langfelder, Peter and Horvath, Steve}, + journal={BMC bioinformatics}, + volume={9}, + number={1}, + pages={559}, + year={2008}, + publisher={BioMed Central} + } + + + @article{allaire2016rmarkdown, + title={rmarkdown: Dynamic Documents for R, 2016}, + author={Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff and Wickham, Hadley and Atkins, Aron and Hyndman, Rob}, + journal={R package version 0.9}, + volume={6}, + year={2016} + } + + + @book{xie2015dynamic, + title={Dynamic Documents with R and knitr}, + author={Xie, Yihui}, + volume={29}, + year={2015}, + publisher={CRC Press} + } + + + \ No newline at end of file diff -r 0ac073bef19d -r 2f4df2be0572 wgcna_preprocessing_render.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wgcna_preprocessing_render.R Tue Aug 08 12:35:11 2017 -0400 @@ -0,0 +1,102 @@ +##======= Handle arguments from command line ======== +# setup R error handling to go to stderr +options(show.error.messages=FALSE, + error=function(){ + cat(geterrmessage(), file=stderr()) + quit("no", 1, F) + }) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc = Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +# suppress warning +options(warn = -1) + +options(stringsAsFactors=FALSE, useFancyQuotes=FALSE) +args = commandArgs(trailingOnly=TRUE) + +suppressPackageStartupMessages({ + library(getopt) + library(tools) +}) + +# column 1: the long flag name +# column 2: the short flag alias. A SINGLE character string +# column 3: argument mask +# 0: no argument +# 1: argument required +# 2: argument is optional +# column 4: data type to which the flag's argument shall be cast. +# possible values: logical, integer, double, complex, character. +spec_list=list() + +##------- 1.
input data --------------------- +spec_list$ECHO = c('echo', 'e', '1', 'character') +spec_list$EXPRESSION_DATA = c('expression_data', 'E', '1', 'character') + + +##--------2. output report and report site directory -------------- +spec_list$OUTPUT_HTML = c('wgcna_preprocessing_html', 'o', '1', 'character') +spec_list$OUTPUT_DIR = c('wgcna_preprocessing_dir', 'd', '1', 'character') +spec_list$PREPROCESSING_WORKSPACE = c('preprocessing_workspace', 'w', '1', 'character') + +##--------3. Rmd templates sitting in the tool directory ---------- + +spec_list$WGCNA_PREPROCESSING_RMD = c('wgcna_preprocessing_rmd', 'D', '1', 'character') + + + +##------------------------------------------------------------------ + +spec = t(as.data.frame(spec_list)) +opt = getopt(spec) +# arguments are accessed by long flag name (the first column in the spec matrix) +# NOT by element name in the spec_list +# example: opt$help, opt$expression_file +##====== End of arguments handling ========== + +#------ Load libraries --------- +library(rmarkdown) +library(WGCNA) +library(DT) +library(htmltools) + + +#----- 1. create the report directory ------------------------ +system(paste0('mkdir -p ', opt$wgcna_preprocessing_dir)) + + +#----- 2. generate Rmd files with Rmd templates -------------- +# a. templates without placeholder variables: +# copy templates from tool directory to the working directory. +# b. templates with placeholder variables: +# substitute variables with user input values and place them in the working directory. 
+ + +#----- 01 wgcna_preprocessing.Rmd ----------------------- +readLines(opt$wgcna_preprocessing_rmd) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('EXPRESSION_DATA', opt$expression_data, x) + }) %>% + (function(x) { + gsub('OUTPUT_DIR', opt$wgcna_preprocessing_dir, x) + }) %>% + (function(x) { + gsub('PREPROCESSING_WORKSPACE', opt$preprocessing_workspace, x) + }) %>% + (function(x) { + fileConn = file('wgcna_preprocessing.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + +#------ 3. render all Rmd files -------- +render('wgcna_preprocessing.Rmd', output_file = opt$wgcna_preprocessing_html) + +#-------4. manipulate outputs ----------------------------- + +