changeset 2:eb1206832359 draft

Deleted selected files
author theo.collard
date Wed, 26 Apr 2017 08:41:12 -0400
parents fa62657e9b57
children 896cdffe06ff
files ballgown.R ballgown.xml
diffstat 2 files changed, 0 insertions(+), 308 deletions(-) [+]
line wrap: on
line diff
--- a/ballgown.R	Wed Apr 26 08:29:56 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,73 +0,0 @@
-#!/usr/bin/Rscript
-
-# Enabling commands line arguments. Using optparse which allows to use options.
-# ----------------------------------------------------------------------------------------
-
-suppressMessages(library(optparse, warn.conflicts = FALSE))
-opt_list=list(
-make_option(c("-d", "--directory"), type="character", default=NULL, help="directory containing the samples", metavar="character"),
-make_option(c("-p", "--phendat"), type="character", default=NULL, help="phenotype data(must be a .csv file)", metavar="character"),
-make_option(c("-t","--outputtranscript"), type="character", default="output_transcript.csv", help="output_transcript.csv: contains the transcripts of the expirements", metavar="character"),
-make_option(c("-g","--outputgenes"), type="character", default="output_genes.csv", help="output_genes.csv: contains the genes of the expirements", metavar="character"),
-make_option(c("-e","--texpression"), type="double", default="0.5", help="transcripts expression filter", metavar="character"),
-make_option(c("--bgout"), type="character", default="", help="save the ballgown object created in the process", metavar="character")
-)
-opt_parser=OptionParser(option_list=opt_list)
-opt=parse_args(opt_parser)
-
-# Loading required libraries. suppressMessages() remove all noisy attachement messages
-# ----------------------------------------------------------------------------------------
-
-suppressMessages(library(ballgown, warn.conflicts = FALSE))
-suppressMessages(library(genefilter, warn.conflicts = FALSE))
-suppressMessages(library(dplyr, warn.conflicts = FALSE))
-
-# Setup for the tool with some bases variables.
-# ----------------------------------------------------------------------------------------
-
-
-filtstr = opt$texpression
-pdat = 2
-phendata = read.csv(opt$phendat)
-setwd(opt$dir)
-
-# Checking if the pdata file has the right samples names.
-# ----------------------------------------------------------------------------------------
-
-if (all(phendata$ids == list.files(".")) != TRUE)
-{
-  stop("Your phenotype data table does not match the samples names. ")
-}
-
-# Creation of the ballgown object based on data
-# ----------------------------------------------------------------------------------------
-bgi = ballgown(dataDir= "." , samplePattern="", pData = phendata, verbose = FALSE)
-
-# Filter the genes with an expression superior to the input filter
-# ----------------------------------------------------------------------------------------
-bgi_filt= subset(bgi, paste("rowVars(texpr(bgi)) >",filtstr), genomesubset = TRUE)
-
-# Creating the variables containing the transcripts and the genes and sorting them through the arrange() command.
-# Checking if there's one or more adjust variables in the phenotype data file
-# ----------------------------------------------------------------------------------------
-
-if (ncol(pData(bgi))<=3) {
-  results_transcripts=stattest(bgi_filt,feature = "transcript", covariate = colnames(pData(bgi))[pdat], adjustvars = colnames(pData(bgi)[pdat+1]), getFC = TRUE, meas = "FPKM")
-  results_genes=stattest(bgi_filt,feature = "gene", covariate = colnames(pData(bgi))[pdat], adjustvars = colnames(pData(bgi)[pdat+1]), getFC = TRUE, meas = "FPKM")
-} else {
-  results_transcripts=stattest(bgi_filt,feature = "transcript", covariate = colnames(pData(bgi))[pdat], adjustvars = c(colnames(pData(bgi)[pdat+1:ncol(pData(bgi))])), getFC = TRUE, meas = "FPKM")
-  results_genes=stattest(bgi_filt,feature = "gene", covariate = colnames(pData(bgi))[pdat], adjustvars = c(colnames(pData(bgi)[pdat+1:ncol(pData(bgi))])), getFC = TRUE, meas = "FPKM")
-}
-
-results_transcripts = data.frame(geneNames=ballgown::geneNames(bgi_filt), geneIDs=ballgown::geneIDs(bgi_filt), results_transcripts)
-results_transcripts = arrange(results_transcripts,pval)
-results_genes = arrange(results_genes,pval)
-
-# Main output of the wrapper, two .csv files containing the genes and transcripts with their qvalue and pvalue
-#This part also output the data of the ballgown object created in the process and save it in a R data file
-# ----------------------------------------------------------------------------------------
-write.csv(results_transcripts, opt$outputtranscript, row.names = FALSE)
-write.csv(results_genes, opt$outputgenes, row.names = FALSE)
-if (opt$bgout != ""){
-  save(bgi, file=opt$bgout)
-}
--- a/ballgown.xml	Wed Apr 26 08:29:56 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,235 +0,0 @@
-<tool id="ballgown" name="Ballgown" version="0.5.0" workflow_compatible="true">
-  <description>Flexible, isoform-level differential expression analysis</description>
-  <requirements>
-    <requirement type="package" version="2.2.0">bioconductor-ballgown</requirement>
-    <requirement type="package" version="0.5.0">r-dplyr</requirement>
-    <requirement type="package" version="1.3.2">r-optparse</requirement>
-
-  </requirements>
-  <command interpreter="Rscript" detect_errors="aggressive">
-		##------------------------------------------------------------------------------------
-		## This function reads the input file with the mapping between samples and files
-		## E.g. of result:
-		## mapping = {
-		##     "e2t.ctab"   : "sample1",
-		##     "other.ctab" : "sample2",
-		##     "i2t.ctab"   : "sample1",
-		##     "t_data.ctab": "sample1"
-		##      ...
-		## }
-		##------------------------------------------------------------------------------------
-		#def read_sample_mapping_file(sample_mapping_file):
-			#try
-				#set mapping = {}
-				#set file = open($sample_mapping_file.dataset.dataset.get_file_name(),'r')
-				#for $line in $file:
-					#set content= $line.strip().split('\t')
-					#for $map in $content:
-						#set mapping[$map]= $content[0]
-					#end for
-				#end for
-				#return $mapping
-			#except
-				#return None
-			#end try
-		#end def
-
-		##------------------------------------------------------------------------------------
-		## This function returns the name of the sample associated to a given file
-		##------------------------------------------------------------------------------------
-		#def get_sample_name($dataset, $sample_mapping):
-			##If the file with samples mapping was provided
-			#if $sample_mapping != None:
-				#return $sample_mapping.get($dataset.name, None)
-			##Otherwise with extract the sample name from the filename
-			#else:
-				#return str($dataset.element_identifier)
-			#end if
-		#end def
-
-		##------------------------------------------------------------------------------------
-		## This function reads a dataset or list of datasets and sets the corresponding value
-		## in the $result variable
-		## e.g. of result
-		##'sample1' : {
-		##         'e_data': '/export/galaxy-central/database/files/000/dataset_13.dat'
-		##         'i_data': '/export/galaxy-central/database/files/000/dataset_10.dat',
-		##         't_data': '/export/galaxy-central/database/files/000/dataset_12.dat',
-		##         'e2t': '/export/galaxy-central/database/files/000/dataset_9.dat',
-		##         'i2t': '/export/galaxy-central/database/files/000/dataset_11.dat'
-		##      },
-		##------------------------------------------------------------------------------------
-		#def read_input_files($param_name, $param_value, $result, $sample_mapping, $create_if_empty):
-			## If input is a data collection
-			#if isinstance($param_value, list):
-				## For each dataset
-				#for $dataset in $param_value:
-					## Get the sample name
-					#set sample_name = $get_sample_name($dataset, $sample_mapping)
-					## Check if sample is already registered
-					#if not($result.has_key($sample_name)):
-						#if ($create_if_empty == True):
-							#set result[$sample_name] = {}
-						#else:
-							#raise ValueError("Error in input. Please check that input contains all the required files for sample " + $sample_name)
-						#end if
-					#end if
-					## Register the file to the sample
-					#set result[$sample_name][$param_name] = str($dataset.dataset.dataset.get_file_name())
-				#end for
-			#else:
-				#if not($result.has_key("sample_1")):
-					#set result["sample_1"] = {}
-				#end if
-				#set result["sample_1"][$param_name] = str($param_name.dataset.dataset.get_file_name())
-			#end if
-			#return $result
-		#end def
-
-		##------------------------------------------------------------------------------------
-		## Main body of the tool
-		##------------------------------------------------------------------------------------
-		## Set the params for the next R script
-		#set result={}
-		#set sample_mapping=None
-
-		## If the samples mapping file was provided, parse the content
-		#if $samples_names != None and not(isinstance($samples_names, list) and (None in $samples_names)):
-			#set sample_mapping = $read_sample_mapping_file($samples_names)
-		#end if
-
-		## READ THE CONTENT FOR e_data AND STORE THE FILES
-		## INDEXED BY THEIR SAMPLE NAME
-		## e.g. 'HBR_Rep1' : {
-		##         'e_data': '/export/galaxy-central/database/files/000/dataset_13.dat'
-		##         'i_data': '/export/galaxy-central/database/files/000/dataset_10.dat',
-		##         't_data': '/export/galaxy-central/database/files/000/dataset_12.dat',
-		##         'e2t': '/export/galaxy-central/database/files/000/dataset_9.dat',
-		##         'i2t': '/export/galaxy-central/database/files/000/dataset_11.dat'
-		##      },
-		##      'HBR_Rep2' : {...}
-		#set $result = $read_input_files("e_data.ctab", $e_data, $result, $sample_mapping, True)
-		#set $result = $read_input_files("i_data.ctab", $i_data, $result, $sample_mapping, False)
-		#set $result = $read_input_files("t_data.ctab", $t_data, $result, $sample_mapping, False)
-		#set $result = $read_input_files("e2t.ctab", $e2t, $result, $sample_mapping, False)
-		#set $result = $read_input_files("i2t.ctab", $i2t, $result, $sample_mapping, False)
-
-		## For each input sample, create a directory and link the input files for ballgown
-		#import os
-		#set n_sample = 1
-		#for $key, $value in $result.iteritems():
-			#set dir_name = str($output.files_path) + "/" + $key + "/"
-			$os.makedirs($dir_name)
-			#for $file_name, $file_path in $value.iteritems():
-				$os.symlink($file_path, $dir_name + $file_name)
-			#end for
-			#set n_sample = $n_sample + 1
-		#end for
-
-		## Run the R script with the location of the linked files and the name for outpot file
-		ballgown.R --directory $output.files_path --outputtranscript $output --outputgenes $outputgn --texpression $trexpression --phendat $phendata --bgout $bgo
-	</command>
-  <inputs>
-    <param name="e_data" type="data" multiple="true" format="tabular" label="Exon-level expression measurements" help="One row per exon. See below for more details."/>
-		<param name="i_data" type="data" multiple="true" format="tabular" label="Intron- (i.e., junction-) level expression measurements" help="One row per intron. See below for more details."/>
-		<param name="t_data" type="data" multiple="true" format="tabular" label="Transcript-level expression measurements" help="One row per transcript. See below for more details."/>
-		<param name="e2t" type="data" multiple="true" format="tabular" label="Exons-transcripts mapping" help="Table with two columns, e_id and t_id, denoting which exons belong to which transcripts. See below for more details."/>
-		<param name="i2t" type="data" multiple="true" format="tabular" label="Introns-transcripts mapping" help="Table with two columns, i_id and t_id, denoting which introns belong to which transcripts. See below for more details."/>
-		<param name="samples_names" type="data" optional="true" multiple="false" format="tabular" label="File names for samples" help="Optional. Use in case that the names for the analysed samples cannot be extracted from the filenames."/>
-    <param argument="--phendat" name="phendata" type="data" format="csv" label="phenotype data" />
-    <param argument="--texpression" name="trexpression" type="float" value="0.5" label="minimal transcript expression to appear in the results"/>
-  </inputs>
-  <outputs>
-    <data name="bgo" format="rda" file="ballgown_object.rda" label="${tool.name} on ${on_string}: ballgown object (R data file)"/>
-    <data name="output" format="csv" file="output_transcript.csv" label="${tool.name} on ${on_string}: transcripts expression (tabular)"/>
-    <data name="outputgn" format="csv" file="output_genes.csv" label="${tool.name} on ${on_string}: genes expression (tabular)"/>
-  </outputs>
-  <tests>
-  </tests>
-  <help>
-
-=======================
-Ballgown
-=======================
------------------------
-**What it does**
------------------------
-
-Ballgown is a software package designed to facilitate flexible differential expression analysis of RNA-seq data.
-The Ballgown package provides functions to organize, visualize, and analyze the expression measurements for your transcriptome assembly.
-
-----
-
------------------------
-**How to use**
------------------------
-The input for this tools consists on 5 files for each sample in your experiment:
-
-- **e_data**: exon-level expression measurements. Tab file or collection of tab files. One row per exon. Columns are e_id (numeric exon id), chr, strand, start, end (genomic location of the exon), and the following expression measurements for each sample:
-          * rcount: reads overlapping the exon
-          * ucount: uniquely mapped reads overlapping the exon
-          * mrcount: multi-map-corrected number of reads overlapping the exon
-          * cov average per-base read coverage
-          * cov_sd: standard deviation of per-base read coverage
-          * mcov: multi-map-corrected average per-base read coverage
-          * mcov_sd: standard deviation of multi-map-corrected per-base coverage
-- **i_data**: intron- (i.e., junction-) level expression measurements. Tab file or collection of tab files. One row per intron. Columns are i_id (numeric intron id), chr, strand, start, end (genomic location of the intron), and the following expression measurements for each sample:
-          * rcount: number of reads supporting the intron
-          * ucount: number of uniquely mapped reads supporting the intron
-          * mrcount: multi-map-corrected number of reads supporting the intron
-- **t_data**: transcript-level expression measurements. Tab file or collection of tab files. One row per transcript. Columns are:
-          * t_id: numeric transcript id
-          * chr, strand, start, end: genomic location of the transcript
-          * t_name: Cufflinks-generated transcript id
-          * num_exons: number of exons comprising the transcript
-          * length: transcript length, including both exons and introns
-          * gene_id: gene the transcript belongs to
-          * gene_name: HUGO gene name for the transcript, if known
-          * cov: per-base coverage for the transcript (available for each sample)
-          * FPKM: Cufflinks-estimated FPKM for the transcript (available for each sample)
-- **e2t**: Tab file or collection of tab files. Table with two columns, e_id and t_id, denoting which exons belong to which transcripts. These ids match the ids in the e_data and t_data tables.
-- **i2t**: Tab file or collection of tab files. Table with two columns, i_id and t_id, denoting which introns belong to which transcripts. These ids match the ids in the i_data and t_data tables.
-- samples_names: (optional) Tab file. Table with five columns, one row per sample. Defines which files from the input belong to each sample in the experiment.
-
-.. class:: infomark
-
-'''TIP''' *Note* Here's an example of a good phenotype data file for your expirement.
-
-+--------------+-------------------------+-------------------------+---+
-|ids           |experimental variable 1  |experimental variable 2  |...|
-+==============+=========================+=========================+===+
-|sample 1      |value 1                  |value 2                  |...|
-+--------------+-------------------------+-------------------------+---+
-|sample 2      |value 2                  |value 1                  |...|
-+--------------+-------------------------+-------------------------+---+
-|sample 3      |value 1                  |value 2                  |...|
-+--------------+-------------------------+-------------------------+---+
-|sample 4      |value 2                  |value 1                  |...|
-+--------------+-------------------------+-------------------------+---+
-|...           |value 1                  |value 2                  |...|
-+--------------+-------------------------+-------------------------+---+
-
-
-.. class:: infomark
-
-*Note* The minimal transcript expression is a number used to filter the transcripts that
-are less or not expressed in our samples when compared to the genome
-
------------------------
-**Outputs**
------------------------
-
-This tool has 3 outputs:
-
-- **transcripts expression** : this is a csv file containing all the transcripts that are expressed above the transcripts expression value
-- **genes expression** : this is a csv file containing all the genes that are expressed above the transcripts expression value
-- **Ballgown object** : this is the ballgown object created during the process. This file can be re-used later for further analysis in a R console.
-
-----
-
-**Authors**: Théo Collard [SLU Global Bioinformatics Centre], Rafael Hernández de Diego [SLU Global Bioinformatics Centre], and Tomas Klingström [SLU Global Bioinformatics Centre]
-
-Sources are available at https://github.com/CollardT/Ballgown-Wrapper
-
-  </help>
-</tool>