# HG changeset patch # User theo.collard # Date 1493209796 14400 # Node ID fa62657e9b571f4beeab519587a3717ca139b0b5 # Parent 940701e8bc59635222587c4f48fd4c39c7c7dad0 Uploaded diff -r 940701e8bc59 -r fa62657e9b57 ballgown.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ballgown.xml Wed Apr 26 08:29:56 2017 -0400 @@ -0,0 +1,235 @@ + + Flexible, isoform-level differential expression analysis + + bioconductor-ballgown + r-dplyr + r-optparse + + + + ##------------------------------------------------------------------------------------ + ## This function reads the input file with the mapping between samples and files + ## E.g. of result: + ## mapping = { + ## "e2t.ctab" : "sample1", + ## "other.ctab" : "sample2", + ## "i2t.ctab" : "sample1", + ## "t_data.ctab": "sample1" + ## ... + ## } + ##------------------------------------------------------------------------------------ + #def read_sample_mapping_file(sample_mapping_file): + #try + #set mapping = {} + #set file = open($sample_mapping_file.dataset.dataset.get_file_name(),'r') + #for $line in $file: + #set content= $line.strip().split('\t') + #for $map in $content: + #set mapping[$map]= $content[0] + #end for + #end for + #return $mapping + #except + #return None + #end try + #end def + + ##------------------------------------------------------------------------------------ + ## This function returns the name of the sample associated to a given file + ##------------------------------------------------------------------------------------ + #def get_sample_name($dataset, $sample_mapping): + ##If the file with samples mapping was provided + #if $sample_mapping != None: + #return $sample_mapping.get($dataset.name, None) + ##Otherwise with extract the sample name from the filename + #else: + #return str($dataset.element_identifier) + #end if + #end def + + ##------------------------------------------------------------------------------------ + ## This function reads a dataset or list of datasets and sets the corresponding value + ## in the $result variable + ## e.g. of result + ##'sample1' : { + ## 'e_data': '/export/galaxy-central/database/files/000/dataset_13.dat' + ## 'i_data': '/export/galaxy-central/database/files/000/dataset_10.dat', + ## 't_data': '/export/galaxy-central/database/files/000/dataset_12.dat', + ## 'e2t': '/export/galaxy-central/database/files/000/dataset_9.dat', + ## 'i2t': '/export/galaxy-central/database/files/000/dataset_11.dat' + ## }, + ##------------------------------------------------------------------------------------ + #def read_input_files($param_name, $param_value, $result, $sample_mapping, $create_if_empty): + ## If input is a data collection + #if isinstance($param_value, list): + ## For each dataset + #for $dataset in $param_value: + ## Get the sample name + #set sample_name = $get_sample_name($dataset, $sample_mapping) + ## Check if sample is already registered + #if not($result.has_key($sample_name)): + #if ($create_if_empty == True): + #set result[$sample_name] = {} + #else: + #raise ValueError("Error in input. Please check that input contains all the required files for sample " + $sample_name) + #end if + #end if + ## Register the file to the sample + #set result[$sample_name][$param_name] = str($dataset.dataset.dataset.get_file_name()) + #end for + #else: + #if not($result.has_key("sample_1")): + #set result["sample_1"] = {} + #end if + #set result["sample_1"][$param_name] = str($param_name.dataset.dataset.get_file_name()) + #end if + #return $result + #end def + + ##------------------------------------------------------------------------------------ + ## Main body of the tool + ##------------------------------------------------------------------------------------ + ## Set the params for the next R script + #set result={} + #set sample_mapping=None + + ## If the samples mapping file was provided, parse the content + #if $samples_names != None and not(isinstance($samples_names, list) and (None in $samples_names)): + #set sample_mapping = $read_sample_mapping_file($samples_names) + #end if + + ## READ THE CONTENT FOR e_data AND STORE THE FILES + ## INDEXED BY THEIR SAMPLE NAME + ## e.g. 'HBR_Rep1' : { + ## 'e_data': '/export/galaxy-central/database/files/000/dataset_13.dat' + ## 'i_data': '/export/galaxy-central/database/files/000/dataset_10.dat', + ## 't_data': '/export/galaxy-central/database/files/000/dataset_12.dat', + ## 'e2t': '/export/galaxy-central/database/files/000/dataset_9.dat', + ## 'i2t': '/export/galaxy-central/database/files/000/dataset_11.dat' + ## }, + ## 'HBR_Rep2' : {...} + #set $result = $read_input_files("e_data.ctab", $e_data, $result, $sample_mapping, True) + #set $result = $read_input_files("i_data.ctab", $i_data, $result, $sample_mapping, False) + #set $result = $read_input_files("t_data.ctab", $t_data, $result, $sample_mapping, False) + #set $result = $read_input_files("e2t.ctab", $e2t, $result, $sample_mapping, False) + #set $result = $read_input_files("i2t.ctab", $i2t, $result, $sample_mapping, False) + + ## For each input sample, create a directory and link the input files for ballgown + #import os + #set n_sample = 1 + #for $key, $value in $result.iteritems(): + #set dir_name = str($output.files_path) + "/" + $key + "/" + $os.makedirs($dir_name) + #for $file_name, $file_path in $value.iteritems(): + $os.symlink($file_path, $dir_name + $file_name) + #end for + #set n_sample = $n_sample + 1 + #end for + + ## Run the R script with the location of the linked files and the name for outpot file + ballgown.R --directory $output.files_path --outputtranscript $output --outputgenes $outputgn --texpression $trexpression --phendat $phendata --bgout $bgo + + + + + + + + + + + + + + + + + + + + +======================= +Ballgown +======================= +----------------------- +**What it does** +----------------------- + +Ballgown is a software package designed to facilitate flexible differential expression analysis of RNA-seq data. +The Ballgown package provides functions to organize, visualize, and analyze the expression measurements for your transcriptome assembly. + +---- + +----------------------- +**How to use** +----------------------- +The input for this tools consists on 5 files for each sample in your experiment: + +- **e_data**: exon-level expression measurements. Tab file or collection of tab files. One row per exon. Columns are e_id (numeric exon id), chr, strand, start, end (genomic location of the exon), and the following expression measurements for each sample: + * rcount: reads overlapping the exon + * ucount: uniquely mapped reads overlapping the exon + * mrcount: multi-map-corrected number of reads overlapping the exon + * cov average per-base read coverage + * cov_sd: standard deviation of per-base read coverage + * mcov: multi-map-corrected average per-base read coverage + * mcov_sd: standard deviation of multi-map-corrected per-base coverage +- **i_data**: intron- (i.e., junction-) level expression measurements. Tab file or collection of tab files. One row per intron. Columns are i_id (numeric intron id), chr, strand, start, end (genomic location of the intron), and the following expression measurements for each sample: + * rcount: number of reads supporting the intron + * ucount: number of uniquely mapped reads supporting the intron + * mrcount: multi-map-corrected number of reads supporting the intron +- **t_data**: transcript-level expression measurements. Tab file or collection of tab files. One row per transcript. Columns are: + * t_id: numeric transcript id + * chr, strand, start, end: genomic location of the transcript + * t_name: Cufflinks-generated transcript id + * num_exons: number of exons comprising the transcript + * length: transcript length, including both exons and introns + * gene_id: gene the transcript belongs to + * gene_name: HUGO gene name for the transcript, if known + * cov: per-base coverage for the transcript (available for each sample) + * FPKM: Cufflinks-estimated FPKM for the transcript (available for each sample) +- **e2t**: Tab file or collection of tab files. Table with two columns, e_id and t_id, denoting which exons belong to which transcripts. These ids match the ids in the e_data and t_data tables. +- **i2t**: Tab file or collection of tab files. Table with two columns, i_id and t_id, denoting which introns belong to which transcripts. These ids match the ids in the i_data and t_data tables. +- samples_names: (optional) Tab file. Table with five columns, one row per sample. Defines which files from the input belong to each sample in the experiment. + +.. class:: infomark + +'''TIP''' *Note* Here's an example of a good phenotype data file for your expirement. + ++--------------+-------------------------+-------------------------+---+ +|ids |experimental variable 1 |experimental variable 2 |...| ++==============+=========================+=========================+===+ +|sample 1 |value 1 |value 2 |...| ++--------------+-------------------------+-------------------------+---+ +|sample 2 |value 2 |value 1 |...| ++--------------+-------------------------+-------------------------+---+ +|sample 3 |value 1 |value 2 |...| ++--------------+-------------------------+-------------------------+---+ +|sample 4 |value 2 |value 1 |...| ++--------------+-------------------------+-------------------------+---+ +|... |value 1 |value 2 |...| ++--------------+-------------------------+-------------------------+---+ + + +.. class:: infomark + +*Note* The minimal transcript expression is a number used to filter the transcripts that +are less or not expressed in our samples when compared to the genome + +----------------------- +**Outputs** +----------------------- + +This tool has 3 outputs: + +- **transcripts expression** : this is a csv file containing all the transcripts that are expressed above the transcripts expression value +- **genes expression** : this is a csv file containing all the genes that are expressed above the transcripts expression value +- **Ballgown object** : this is the ballgown object created during the process. This file can be re-used later for further analysis in a R console. + +---- + +**Authors**: Théo Collard [SLU Global Bioinformatics Centre], Rafael Hernández de Diego [SLU Global Bioinformatics Centre], and Tomas Klingström [SLU Global Bioinformatics Centre] + +Sources are available at https://github.com/CollardT/Ballgown-Wrapper + + +