changeset 6:137aab1d9ac1

Add metadata to datatype: CuffDataDB
author Jim Johnson <jj@umn.edu>
date Mon, 18 Nov 2013 16:43:15 -0600
parents 2bb88bf1c1dd
children b5562b9a55c7
files README cuffdata.py cummerbund_wrapper.xml
diffstat 3 files changed, 75 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/README	Fri Nov 15 14:13:14 2013 -0600
+++ b/README	Mon Nov 18 16:43:15 2013 -0600
@@ -1,8 +1,9 @@
 CummeRbund is an R package that is designed to aid and simplify the task of analyzing Cufflinks RNA-Seq output.
 ( http://compbio.mit.edu/cummeRbund/ )  
+( http://www.bioconductor.org/packages/release/bioc/html/cummeRbund.html )
 
 
-Prerequisites for installing cumme=Rbund:
+Prerequisites for installing cummeRbund:
 The linux package: libxml2-dev
 In ubuntu:   sudo apt-get install libxml2-dev
 
@@ -14,9 +15,8 @@
    source("http://bioconductor.org/biocLite.R")
    biocLite("cummeRbund")
 
-This galaxy tool package includes a replacement variation of the cuffdiff wrapper that will generate an output that can be used directly in cummeRbund.
+This galaxy tool package includes a replacement variation of the cuffdiff wrapper that will generate an SQLite database output that can be used directly in cummeRbund.
+This cuffdiff wrapper builds the cummeRbund database from the cuffdiff output files in the working directory, which include the run.info output, 
+so it can populate the replicate information.  
 
 
-
-
-
--- a/cuffdata.py	Fri Nov 15 14:13:14 2013 -0600
+++ b/cuffdata.py	Mon Nov 18 16:43:15 2013 -0600
@@ -2,7 +2,9 @@
 CuffData 
 """
 import logging
-import os,os.path,re
+import os,os.path,sys,re
+import tempfile
+from subprocess import Popen
 import galaxy.datatypes.data
 from galaxy.datatypes.images import Html
 from galaxy.datatypes.binary import Binary
@@ -110,6 +112,67 @@
         return False
 
 class CuffDataDB( Binary ):
-    file_ext = 'cuffdata'
+    file_ext = 'cuffdatadb'
     is_binary = True
     allow_datatype_change = False
+    MetadataElement( name="sample_names", default=[], desc="Sample names", readonly=True, visible=True, optional=True, no_value=[] )
+    MetadataElement( name="replicate_names", default=[], desc="Replicate names", readonly=True, visible=True, optional=True, no_value=[] )
+    MetadataElement( name="gene_ids", default=[], desc="Gene Ids", readonly=True, visible=True, optional=True, no_value=[] )
+
+    def __init__( self, **kwd ):
+        Binary.__init__( self, **kwd )
+        log.info('Creating cummeRbund CuffDataDB')
+
+    def set_meta( self, dataset, **kwd ):
+        def get_contents(fname):
+            contents = ''
+            with open(fname,'r') as fh:
+                contents = fh.read()
+            return contents
+        if not dataset.has_data():
+            return
+        try:
+            ## Create a tmpdir
+            ## create an Rscript to write out info about the CuffData, e.g. samples replicates gene_ids
+            ## define file names to use as sinks for each type of data
+            # tmp_dir = tempfile.mkdtemp()
+            tmp_dir = '/tmp/gx/cuffdb'
+            if not os.path.isdir(tmp_dir):
+                os.makedirs(tmp_dir)
+            rscript = tempfile.NamedTemporaryFile( dir=tmp_dir,suffix='.r' ).name
+            rscript_fh = open( rscript, 'wb' )
+            rscript_fh.write('library(cummeRbund)\n')
+            rscript_fh.write('cuff<-readCufflinks(dir = "", dbFile = "%s", rebuild = F)\n' % (dataset.file_name))
+            rscript_fh.write('sink("%s")\n' % ("out.blurb"))
+            rscript_fh.write('print(cuff)\n')
+            rscript_fh.write('sink()\n')
+            rscript_fh.write('sink("%s")\n' % ("out.samples"))
+            rscript_fh.write('cat(samples(cuff)[[2]],sep=",")\n')
+            rscript_fh.write('sink()\n')
+            rscript_fh.write('sink("%s")\n' % ("out.replicates"))
+            rscript_fh.write('cat(replicates(cuff)[[4]],sep=",")\n')
+            rscript_fh.write('sink()\n')
+            rscript_fh.write('sink("%s")\n' % ("out.gene_ids"))
+            rscript_fh.write('cat(annotation(genes(cuff))[[1]],sep=",")\n')
+            rscript_fh.write('sink()\n')
+            rscript_fh.close()
+            cmd = ( "Rscript --vanilla %s" % rscript )
+            tmp_stderr_name = tempfile.NamedTemporaryFile( dir=tmp_dir,suffix='.err' ).name
+            tmp_stderr = open( tmp_stderr_name, 'wb' )
+            proc = Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+            returncode = proc.wait()
+            tmp_stderr.close()
+            flist = os.listdir(tmp_dir)
+            for i,fname in enumerate(flist):
+                sfname = os.path.split(fname)[-1]
+                if sfname == 'out.blurb':
+                    dataset.blurb = get_contents(os.path.join(tmp_dir,fname))
+                elif sfname == 'out.samples':
+                    dataset.metadata.sample_names = get_contents(os.path.join(tmp_dir,fname)).split(',')
+                elif sfname == 'out.replicates':
+                    dataset.metadata.replicate_names = get_contents(os.path.join(tmp_dir,fname)).split(',')
+                elif sfname == 'out.gene_ids':
+                    dataset.metadata.gene_ids = get_contents(os.path.join(tmp_dir,fname)).split(',')
+        except Exception, e:
+            log.error('Error setting cummeRbund CuffDataDB metadata : %s' % str(e))
+
--- a/cummerbund_wrapper.xml	Fri Nov 15 14:13:14 2013 -0600
+++ b/cummerbund_wrapper.xml	Mon Nov 18 16:43:15 2013 -0600
@@ -301,6 +301,7 @@
 replicates(cuff)
 print("FEATURES:")
 print(annotation(genes(cuff)))
+cat(annotation(genes(cuff))[[1]],sep=",")
 sink()
 
 #for $i, $p in enumerate($plots, start=1):
@@ -355,11 +356,15 @@
 
     ## Heatmap ##
 	#elif $p.plot['type'] == "heatmap":
+            #if $p.plot.genes and len($p.plot.genes) > 0:
 myGeneIds &lt;- c()
 		#for $g in $p.plot.genes:
 myGeneIds &lt;- c(myGeneIds, "$g['gene_id']")
 		#end for
 myGenes &lt;- getGenes(cuff, myGeneIds)
+            #else 
+myGenes &lt;- getGenes(cuff,annotation(genes(cuff))[[1]])
+            #end if
 csHeatmap(get_features(myGenes, "${p.plot.features}"), clustering="${p.plot.clustering}", labCol="${p.plot.labcol}", labRow="${p.plot.labrow}", border="${p.plot.border}")
 
     ## Cluster ##