# HG changeset patch # User Jim Johnson # Date 1384814595 21600 # Node ID 137aab1d9ac11be1da4ee5246445c96c59675f36 # Parent 2bb88bf1c1dd334e174ebf999ef7baa08d18a856 Add metadata to datatype: CuffDataDB diff -r 2bb88bf1c1dd -r 137aab1d9ac1 README --- a/README Fri Nov 15 14:13:14 2013 -0600 +++ b/README Mon Nov 18 16:43:15 2013 -0600 @@ -1,8 +1,9 @@ CummeRbund is an R package that is designed to aid and simplify the task of analyzing Cufflinks RNA-Seq output. ( http://compbio.mit.edu/cummeRbund/ ) +( http://www.bioconductor.org/packages/release/bioc/html/cummeRbund.html ) -Prerequisites for installing cumme=Rbund: +Prerequisites for installing cummeRbund: The linux package: libxml2-dev In ubuntu: sudo apt-get install libxml2-dev @@ -14,9 +15,8 @@ source("http://bioconductor.org/biocLite.R") biocLite("cummeRbund") -This galaxy tool package includes a replacement variation of the cuffdiff wrapper that will generate an output that can be used directly in cummeRbund. +This galaxy tool package includes a replacement variation of the cuffdiff wrapper that will generate an SQLite data base output that can be used directly in cummeRbund. +This cuffdiff builds the cummeRbund database from the cuffdiff output files in the working directory which includes the run.info output, +thus it can populate the replicates information. - - - diff -r 2bb88bf1c1dd -r 137aab1d9ac1 cuffdata.py --- a/cuffdata.py Fri Nov 15 14:13:14 2013 -0600 +++ b/cuffdata.py Mon Nov 18 16:43:15 2013 -0600 @@ -2,7 +2,9 @@ CuffData """ import logging -import os,os.path,re +import os,os.path,sys,re +import tempfile +from subprocess import Popen import galaxy.datatypes.data from galaxy.datatypes.images import Html from galaxy.datatypes.binary import Binary @@ -110,6 +112,67 @@ return False class CuffDataDB( Binary ): - file_ext = 'cuffdata' + file_ext = 'cuffdatadb' is_binary = True allow_datatype_change = False + MetadataElement( name="sample_names", default=[], desc="Sample names", readonly=True, visible=True, optional=True, no_value=[] ) + MetadataElement( name="replicate_names", default=[], desc="Replicate names", readonly=True, visible=True, optional=True, no_value=[] ) + MetadataElement( name="gene_ids", default=[], desc="Gene Ids", readonly=True, visible=True, optional=True, no_value=[] ) + + def __init__( self, **kwd ): + Binary.__init__( self, **kwd ) + log.info('Creating cummeRbund CuffDataDB') + + def set_meta( self, dataset, **kwd ): + def get_contents(fname): + contents = '' + with open(fname,'r') as fh: + contents = fh.read() + return contents + if not dataset.has_data(): + return + try: + ## Create a tmpdir + ## create an Rscript to write out info about the CuffData, e.g. samples replicates gene_ids + ## define file names to use as sinks for each type of data + # tmp_dir = tempfile.mkdtemp() + tmp_dir = '/tmp/gx/cuffdb' + if not os.path.isdir(tmp_dir): + os.makedirs(tmp_dir) + rscript = tempfile.NamedTemporaryFile( dir=tmp_dir,suffix='.r' ).name + rscript_fh = open( rscript, 'wb' ) + rscript_fh.write('library(cummeRbund)\n') + rscript_fh.write('cuff<-readCufflinks(dir = "", dbFile = "%s", rebuild = F)\n' % (dataset.file_name)) + rscript_fh.write('sink("%s")\n' % ("out.blurb")) + rscript_fh.write('print(cuff)\n') + rscript_fh.write('sink()\n') + rscript_fh.write('sink("%s")\n' % ("out.samples")) + rscript_fh.write('cat(samples(cuff)[[2]],sep=",")\n') + rscript_fh.write('sink()\n') + rscript_fh.write('sink("%s")\n' % ("out.replicates")) + rscript_fh.write('cat(replicates(cuff)[[4]],sep=",")\n') + rscript_fh.write('sink()\n') + rscript_fh.write('sink("%s")\n' % ("out.gene_ids")) + rscript_fh.write('cat(annotation(genes(cuff))[[1]],sep=",")\n') + rscript_fh.write('sink()\n') + rscript_fh.close() + cmd = ( "Rscript --vanilla %s" % rscript ) + tmp_stderr_name = tempfile.NamedTemporaryFile( dir=tmp_dir,suffix='.err' ).name + tmp_stderr = open( tmp_stderr_name, 'wb' ) + proc = Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + flist = os.listdir(tmp_dir) + for i,fname in enumerate(flist): + sfname = os.path.split(fname)[-1] + if sfname == 'out.blurb': + dataset.blurb = get_contents(os.path.join(tmp_dir,fname)) + elif sfname == 'out.samples': + dataset.metadata.sample_names = get_contents(os.path.join(tmp_dir,fname)).split(',') + elif sfname == 'out.replicates': + dataset.metadata.replicate_names = get_contents(os.path.join(tmp_dir,fname)).split(',') + elif sfname == 'out.gene_ids': + dataset.metadata.gene_ids = get_contents(os.path.join(tmp_dir,fname)).split(',') + except Exception, e: + log.error('Error setting cummeRbund CuffDataDB metadata : %s' % str(e)) + diff -r 2bb88bf1c1dd -r 137aab1d9ac1 cummerbund_wrapper.xml --- a/cummerbund_wrapper.xml Fri Nov 15 14:13:14 2013 -0600 +++ b/cummerbund_wrapper.xml Mon Nov 18 16:43:15 2013 -0600 @@ -301,6 +301,7 @@ replicates(cuff) print("FEATURES:") print(annotation(genes(cuff))) +cat(annotation(genes(cuff))[[1]],sep=",") sink() #for $i, $p in enumerate($plots, start=1): @@ -355,11 +356,15 @@ ## Heatmap ## #elif $p.plot['type'] == "heatmap": + #if $p.plot.genes and len($p.plot.genes) > 0: myGeneIds <- c() #for $g in $p.plot.genes: myGeneIds <- c(myGeneIds, "$g['gene_id']") #end for myGenes <- getGenes(cuff, myGeneIds) + #else +myGenes <- getGenes(cuff,annotation(genes(cuff))[[1]]) + #end if csHeatmap(get_features(myGenes, "${p.plot.features}"), clustering="${p.plot.clustering}", labCol="${p.plot.labcol}", labRow="${p.plot.labrow}", border="${p.plot.border}") ## Cluster ##