Mercurial > repos > mingchen0919 > rmarkdown_bdss_client
changeset 0:512d008295db draft default tip
planemo upload for repository https://github.com/statonlab/docker-GRReport/tree/master/my_tools/rmarkdown_bdss_client_main commit d9ab791a7ce12362dc6e28c0a518a3f23dd581fe-dirty
author | mingchen0919 |
---|---|
date | Tue, 17 Oct 2017 14:09:01 -0400 |
parents | |
children | |
files | bdss_client.Rmd bdss_client.xml bdss_client_render.R bdss_client_sra.Rmd bdss_client_sra.xml bdss_client_sra_render.R |
diffstat | 6 files changed, 440 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bdss_client.Rmd Tue Oct 17 14:09:01 2017 -0400
@@ -0,0 +1,61 @@
+---
+title: 'Download with BDSS client'
+output:
+  html_document:
+    number_sections: true
+    toc: true
+    theme: cosmo
+    highlight: tango
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+# the value of `echo` below is a placeholder that is filled in before knitting
+knitr::opts_chunk$set(
+  echo = ECHO,
+  error = TRUE
+)
+```
+
+# Command line arguments
+
+```{r 'command line arguments'}
+str(opt)
+```
+
+# BDSS configuration file
+
+First, we create a bdss configuration file `bdss.cfg` in the current directory.
+
+```{r}
+# write the file from R instead of shelling out to `echo`
+writeLines(
+  c('[metadata_repository]', 'url=http://bdss.bioinfo.wsu.edu/'),
+  con = 'bdss.cfg'
+)
+```
+
+# Download data
+
+```{r 'download and extract reads'}
+# create a directory to store the downloaded files
+dir.create('read_files_directory')
+# split the user-supplied list on commas and any run of whitespace
+urls <- strsplit('URLS', '[,[:space:]]+')[[1]]
+urls <- urls[nzchar(urls)]
+# loop through the addresses and download each one with the BDSS client
+for (url in urls) {
+  print(url)
+  # shQuote() keeps shell metacharacters in an address (e.g. `&` or `;`) from
+  # being interpreted by the shell
+  bdss_command <- paste(
+    '/tool_deps/_conda/bin/bdss transfer --destination read_files_directory -u',
+    shQuote(url)
+  )
+  print(bdss_command)
+  print(system(bdss_command, intern = TRUE))
+}
+# all files that need to be saved should be moved to REPORT_DIR directory
+# print(system('mv read_files_directory REPORT_DIR', intern = TRUE))
+```
+
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bdss_client.xml Tue Oct 17 14:09:01 2017 -0400
@@ -0,0 +1,47 @@
+<tool id="rmarkdown_bdss_client" name="BDSS client" version="1.0.3">
+    <requirements>
+        <requirement type="package" version="1.15.0.6-0">pandoc</requirement>
+        <requirement type="package" version="3.3.2">r-base</requirement>
+        <requirement type="package" version="1.20.0">r-getopt</requirement>
+        <requirement type="package" version="1.2">r-rmarkdown</requirement>
+        <requirement type="package" version="0.3.5">r-htmltools</requirement>
+        <requirement type="package" version="0.5.0">r-dplyr</requirement>
+        <requirement type="package" version="0.5.4">parallel-fastq-dump</requirement>
+        <requirement type="package" version="1.95_4.8">r-rcurl</requirement>
+    </requirements>
+    <description>
+        Download data with BDSS client.
+    </description>
+    <stdio>
+        <!--All stderr output is redirected to a file. "XXX" is a pattern chosen to match nothing.-->
+        <regex match="XXX"
+               source="stderr"
+               level="warning"
+               description="Check the warnings_and_errors.txt file for more details."/>
+    </stdio>
+    <command>
+        <![CDATA[
+        Rscript '${__tool_directory__}/bdss_client_render.R'
+            -i '$urls'
+            -e '$echo'
+
+            -r '$report'
+            -d '$report.files_path'
+            -s '$sink_message'
+
+            -t '${__tool_directory__}/bdss_client.Rmd'
+        ]]>
+    </command>
+    <inputs>
+        <param type="text" name="urls" area="true" size="5x25" label="URLs to data files" />
+        <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false"
+               label="Display analysis code in report?"/>
+    </inputs>
+    <outputs>
+        <data format="html" name="report" label="BDSS client report"/>
+        <data name="output" label="BDSS downloaded data">
+            <discover_datasets pattern="__name_and_ext__" directory="read_files_directory" visible="true"/>
+        </data>
+        <data format="txt" name="sink_message" label="Warnings and Errors" from_work_dir="warnings_and_errors.txt"/>
+    </outputs>
+</tool>
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bdss_client_render.R Tue Oct 17 14:09:01 2017 -0400
@@ -0,0 +1,92 @@
+library(getopt)
+library(rmarkdown)
+library(htmltools)
+library(dplyr)
+library(RCurl)
+
+
+##============ Sink warnings and errors to a file ==============
+## everything printed between here and the end of the script is
+## diverted to warnings_and_errors.txt.
+##==============================================================
+# open the connection ourselves so that it stays open until close(zz) below
+zz <- file('warnings_and_errors.txt', open = 'wt')
+sink(zz)
+sink(zz, type = 'message')
+##---------below is the code for rendering .Rmd templates-----
+
+##=============STEP 1: handle command line arguments==========
+##
+##============================================================
+# column 1: the long flag name
+# column 2: the short flag alias. A SINGLE character string
+# column 3: argument mask
+#           0: no argument
+#           1: argument required
+#           2: argument is optional
+# column 4: data type to which the flag's argument shall be cast.
+#           possible values: logical, integer, double, complex, character.
+#-------------------------------------------------------------
+#++++++++++++++++++++ Best practice ++++++++++++++++++++++++++
+# 1. short flag alias should match the flag in the command section in the XML file.
+# 2. long flag name can be any legal R variable names
+# 3. two names in args_list can have common string but one name should not be a part of another name.
+#    for example, one name is "ECHO", if another name is "ECHO_XXX", it will cause problems.
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+##------- 1. input data ---------------------
+args_list <- list()
+args_list$URLS <- c('urls', 'i', '1', 'character')
+args_list$ECHO <- c('echo', 'e', '1', 'character')
+##--------2. output report and outputs --------------
+args_list$REPORT_HTML <- c('report_html', 'r', '1', 'character')
+args_list$REPORT_DIR <- c('report_dir', 'd', '1', 'character')
+args_list$SINK_OUTPUT <- c('sink_message', 's', '1', 'character')
+##--------3. Rmd templates in the tool directory ----------
+args_list$BDSS_CLIENT_RMD <- c('bdss_client_rmd', 't', '1', 'character')
+
+opt <- getopt(t(as.data.frame(args_list)))
+
+
+##=======STEP 2: create report directory (optional)==========
+##
+##===========================================================
+dir.create(opt$report_dir, showWarnings = FALSE, recursive = TRUE)
+
+##=STEP 3: replace placeholders in .Rmd with argument values=
+##
+##===========================================================
+# replace every literal occurrence of `placeholder` in `lines` with `value`;
+# fixed = TRUE matches the placeholder literally and inserts the value as-is
+fill_placeholder <- function(lines, placeholder, value) {
+  gsub(placeholder, value, lines, fixed = TRUE)
+}
+
+# escape backslashes and single quotes so that `value` can sit between the
+# single quotes of an R string literal in the template
+escape_for_r_string <- function(value) {
+  value <- gsub('\\', '\\\\', value, fixed = TRUE)
+  gsub("'", "\\'", value, fixed = TRUE)
+}
+
+#----- 01 bdss_client.Rmd -----------------------
+# fixed values go in first and the free-text input last, so that text typed by
+# the user is never scanned for the remaining placeholders
+rmd_lines <- readLines(opt$bdss_client_rmd)
+rmd_lines <- fill_placeholder(rmd_lines, 'ECHO', opt$echo)
+rmd_lines <- fill_placeholder(rmd_lines, 'REPORT_DIR', opt$report_dir)
+rmd_lines <- fill_placeholder(rmd_lines, 'URLS', escape_for_r_string(opt$urls))
+writeLines(rmd_lines, con = 'bdss_client.Rmd')
+
+##=============STEP 4: render .Rmd templates=================
+##
+##===========================================================
+render('bdss_client.Rmd', output_file = opt$report_html)
+
+
+##--------end of code rendering .Rmd templates----------------
+# the message sink is removed first because it shares the connection with the
+# output sink
+sink(type = 'message')
+sink()
+close(zz)
+##=========== End of sinking output=============================
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bdss_client_sra.Rmd Tue Oct 17 14:09:01 2017 -0400
@@ -0,0 +1,111 @@
+---
+title: 'Download and extract fastq/fasta data with BDSS client from SRA accessions'
+output:
+  html_document:
+    number_sections: true
+    toc: true
+    theme: cosmo
+    highlight: tango
+---
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(
+  echo = ECHO,
+  error = TRUE
+)
+```
+
+# Command line arguments
+
+```{r 'command line arguments'}
+str(opt)
+```
+
+# BDSS configuration file
+
+First, we create a bdss configuration file `bdss.cfg` in the current directory.
+
+```{r}
+# write the file from R instead of shelling out to `echo`
+writeLines(
+  c('[metadata_repository]', 'url=http://bdss.bioinfo.wsu.edu/'),
+  con = 'bdss.cfg'
+)
+```
+
+# Download and extract reads
+
+```{r 'download and extract reads'}
+# requested output type; the quoted placeholder is filled in before knitting
+output_format <- 'FORMAT'
+
+# split a comma/whitespace separated list of accessions into a vector
+parse_ids <- function(text) {
+  ids <- strsplit(text, '[,[:space:]]+')[[1]]
+  ids[nzchar(ids)]
+}
+
+# download the .sra archive of one accession with bdss; the conversion step
+# below expects to find it as <id>.sra in the working directory
+fetch_sra <- function(id) {
+  url <- paste0('ftp://ftp.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/',
+                substr(id, 1, 3), '/',
+                substr(id, 1, 6), '/', id, '/', id, '.sra')
+  # shQuote() keeps shell metacharacters in user input from being interpreted
+  system(paste('/tool_deps/_conda/bin/bdss transfer -u', shQuote(url)),
+         intern = TRUE)
+}
+
+# convert <id>.sra into fastq/fasta files inside out_dir and return whatever
+# fastq-dump printed
+dump_reads <- function(id, out_dir, split_files = FALSE) {
+  flags <- c(if (output_format == 'fasta') '--fasta',
+             if (split_files) '--split-files')
+  command <- paste(c('fastq-dump', flags, '-O', out_dir,
+                     shQuote(paste0(id, '.sra'))), collapse = ' ')
+  cat('----convert SRA to fastq/fasta------\n')
+  system(command, intern = TRUE)
+}
+
+# create two directories, one for single end and the other for paired end SRA reads.
+dir.create('se_read_files_directory')
+dir.create('pe_read_files_directory')
+
+# download and extract reads (single end)
+for (id in parse_ids('SRA_IDS_SE')) {
+  fetch_sra(id)
+  print(dump_reads(id, 'se_read_files_directory'))
+}
+
+# download and extract reads (paired end)
+for (id in parse_ids('SRA_IDS_PE')) {
+  fetch_sra(id)
+  print(dump_reads(id, 'pe_read_files_directory', split_files = TRUE))
+  pe_files <- list.files('pe_read_files_directory')
+  if (!(paste0(id, '_2.', output_format) %in% pe_files)) {
+    # no second-mate file was written, so this is not a paired end SRA file.
+    # The first-mate file is deleted.
+    cat(paste0(id, ' is not paired end SRA, the corresponding fastq/fasta file will be deleted.\n'))
+    orphans <- pe_files[startsWith(pe_files, paste0(id, '_1.'))]
+    file.remove(file.path('pe_read_files_directory', orphans))
+  }
+}
+
+cat('-----single end files----\n')
+list.files('./se_read_files_directory')
+cat('-----paired end files----\n')
+list.files('./pe_read_files_directory')
+
+cat('-----Renaming files------\n')
+# rename files for paired end reads; full.names = TRUE gives an empty vector
+# (not the bare directory path) when the directory holds no files
+old_files <- list.files('./pe_read_files_directory', full.names = TRUE)
+print(old_files)
+# only rewrite the mate suffix that sits directly before the file extension
+new_files <- sub('_1(\\.[^./]+)$', '_forward\\1', old_files)
+new_files <- sub('_2(\\.[^./]+)$', '_reverse\\1', new_files)
+print(new_files)
+file.rename(old_files, new_files)
+```
+
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bdss_client_sra.xml Tue Oct 17 14:09:01 2017 -0400
@@ -0,0 +1,64 @@
+<tool id="rmarkdown_bdss_client_sra" name="BDSS client SRA" version="1.0.3">
+    <requirements>
+        <requirement type="package" version="1.15.0.6-0">pandoc</requirement>
+        <requirement type="package" version="3.3.2">r-base</requirement>
+        <requirement type="package" version="1.20.0">r-getopt</requirement>
+        <requirement type="package" version="1.2">r-rmarkdown</requirement>
+        <requirement type="package" version="0.3.5">r-htmltools</requirement>
+        <requirement type="package" version="0.5.0">r-dplyr</requirement>
+        <requirement type="package" version="0.5.4">parallel-fastq-dump</requirement>
+        <requirement type="package" version="1.95_4.8">r-rcurl</requirement>
+    </requirements>
+    <description>
+        Download data with BDSS client and generate list (single end SRA data) and list:paired dataset collection
+        (paired end SRA data).
+    </description>
+    <stdio>
+        <!--All stderr output is redirected to a file. "XXX" is a pattern chosen to match nothing.-->
+        <regex match="XXX"
+               source="stderr"
+               level="warning"
+               description="Check the warnings_and_errors.txt file for more details."/>
+    </stdio>
+    <command>
+        <![CDATA[
+        Rscript '${__tool_directory__}/bdss_client_sra_render.R'
+            -i '$sra_ids_se'
+            -p '$sra_ids_pe'
+            -f '$format'
+            -e '$echo'
+
+            -r '$report'
+            -d '$report.files_path'
+            -s '$sink_message'
+
+            -t '${__tool_directory__}/bdss_client_sra.Rmd'
+        ]]>
+    </command>
+    <inputs>
+        <param type="text" name="sra_ids_se" area="true" size="5x25" label="SRR/DRR/ERR accessions of single end SRA"
+               help="A list of SRR/DRR/ERR accessions separated by comma or space. e.g. SRR039885"/>
+        <param type="text" name="sra_ids_pe" area="true" size="5x25" label="SRR/DRR/ERR accessions of paired end SRA"
+               help="A list of SRR/DRR/ERR accessions separated by comma or space. e.g. ERR1748507"/>
+        <param type="select" name="format" label="Output format">
+            <option value="fastq" selected="true">fastq</option>
+            <option value="fasta">fasta</option>
+        </param>
+        <param type="boolean" name="echo" truevalue="TRUE" falsevalue="FALSE" checked="false"
+               label="Display analysis code in report?"/>
+    </inputs>
+    <outputs>
+        <data format="html" name="report" label="BDSS client report"/>
+        <!--list dataset collection for single end SRA data-->
+        <collection type="list" name="list_collection" label="BDSS download data (single end reads)">
+            <discover_datasets pattern="__name_and_ext__" directory="se_read_files_directory"/>
+        </collection>
+        <!--list:paired dataset collection for paired end SRA data-->
+        <collection type="list:paired" name="list:paired_collection" label="BDSS download data (paired end reads)">
+            <discover_datasets
+                    pattern="(?P&lt;identifier_0&gt;[^_]+)_(?P&lt;identifier_1&gt;[^_]+)\.(?P&lt;ext&gt;[^\._]+)?"
+                    directory="pe_read_files_directory"/>
+        </collection>
+        <data format="txt" name="sink_message" label="Warnings and Errors" from_work_dir="warnings_and_errors.txt"/>
+    </outputs>
+</tool>
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bdss_client_sra_render.R Tue Oct 17 14:09:01 2017 -0400
@@ -0,0 +1,96 @@
+library(getopt)
+library(rmarkdown)
+library(htmltools)
+library(dplyr)
+library(RCurl)
+
+
+##============ Sink warnings and errors to a file ==============
+## everything printed between here and the end of the script is
+## diverted to warnings_and_errors.txt.
+##==============================================================
+# open the connection ourselves so that it stays open until close(zz) below
+zz <- file('warnings_and_errors.txt', open = 'wt')
+sink(zz)
+sink(zz, type = 'message')
+##---------below is the code for rendering .Rmd templates-----
+
+##=============STEP 1: handle command line arguments==========
+##
+##============================================================
+# column 1: the long flag name
+# column 2: the short flag alias. A SINGLE character string
+# column 3: argument mask
+#           0: no argument
+#           1: argument required
+#           2: argument is optional
+# column 4: data type to which the flag's argument shall be cast.
+#           possible values: logical, integer, double, complex, character.
+#-------------------------------------------------------------
+#++++++++++++++++++++ Best practice ++++++++++++++++++++++++++
+# 1. short flag alias should match the flag in the command section in the XML file.
+# 2. long flag name can be any legal R variable names
+# 3. two names in args_list can have common string but one name should not be a part of another name.
+#    for example, one name is "ECHO", if another name is "ECHO_XXX", it will cause problems.
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+##------- 1. input data ---------------------
+args_list <- list()
+args_list$SRA_IDS_SE <- c('sra_ids_se', 'i', '1', 'character')
+args_list$SRA_IDS_PE <- c('sra_ids_pe', 'p', '1', 'character')
+args_list$FORMAT <- c('format', 'f', '1', 'character')
+args_list$ECHO <- c('echo', 'e', '1', 'character')
+##--------2. output report and outputs --------------
+args_list$REPORT_HTML <- c('report_html', 'r', '1', 'character')
+args_list$REPORT_DIR <- c('report_dir', 'd', '1', 'character')
+args_list$SINK_OUTPUT <- c('sink_message', 's', '1', 'character')
+##--------3. Rmd templates in the tool directory ----------
+args_list$BDSS_CLIENT_RMD <- c('bdss_client_rmd', 't', '1', 'character')
+
+opt <- getopt(t(as.data.frame(args_list)))
+
+
+##=======STEP 2: create report directory (optional)==========
+##
+##===========================================================
+dir.create(opt$report_dir, showWarnings = FALSE, recursive = TRUE)
+
+##=STEP 3: replace placeholders in .Rmd with argument values=
+##
+##===========================================================
+# replace every literal occurrence of `placeholder` in `lines` with `value`;
+# fixed = TRUE matches the placeholder literally and inserts the value as-is
+fill_placeholder <- function(lines, placeholder, value) {
+  gsub(placeholder, value, lines, fixed = TRUE)
+}
+
+# escape backslashes and single quotes so that `value` can sit between the
+# single quotes of an R string literal in the template
+escape_for_r_string <- function(value) {
+  value <- gsub('\\', '\\\\', value, fixed = TRUE)
+  gsub("'", "\\'", value, fixed = TRUE)
+}
+
+#----- 01 bdss_client_sra.Rmd (written to ./bdss_client.Rmd) -----
+# fixed values go in first and the free-text input last, so that text typed by
+# the user is never scanned for the remaining placeholders
+rmd_lines <- readLines(opt$bdss_client_rmd)
+rmd_lines <- fill_placeholder(rmd_lines, 'FORMAT', opt$format)
+rmd_lines <- fill_placeholder(rmd_lines, 'ECHO', opt$echo)
+rmd_lines <- fill_placeholder(rmd_lines, 'REPORT_DIR', opt$report_dir)
+rmd_lines <- fill_placeholder(rmd_lines, 'SRA_IDS_SE', escape_for_r_string(opt$sra_ids_se))
+rmd_lines <- fill_placeholder(rmd_lines, 'SRA_IDS_PE', escape_for_r_string(opt$sra_ids_pe))
+writeLines(rmd_lines, con = 'bdss_client.Rmd')
+
+##=============STEP 4: render .Rmd templates=================
+##
+##===========================================================
+render('bdss_client.Rmd', output_file = opt$report_html)
+
+
+##--------end of code rendering .Rmd templates----------------
+# the message sink is removed first because it shares the connection with the
+# output sink
+sink(type = 'message')
+sink()
+close(zz)
+##=========== End of sinking output=============================
\ No newline at end of file