# HG changeset patch
# User diego
# Date 1335058341 14400
# Node ID d50638ebd809cfbed4946a9190c10f5b77a225b9
Uploaded
diff -r 000000000000 -r d50638ebd809 rtg.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rtg.py Sat Apr 21 21:32:21 2012 -0400
@@ -0,0 +1,239 @@
+"""
+rtg datatypes
+"""
+
+import data
+from galaxy.datatypes import sequence
+import logging, os, sys, time, tempfile, shutil, string, glob, re, subprocess
+import galaxy.model
+from galaxy.datatypes import metadata
+from galaxy.datatypes.metadata import MetadataElement
+from galaxy import util
+from galaxy.datatypes.images import Html
+from galaxy.datatypes.sequence import Sequence
+from galaxy.datatypes.binary import Binary
+from sniff import *
+from pprint import pprint
+from ConfigParser import ConfigParser
+
log = logging.getLogger(__name__)
# rtg-galaxy.cfg lives under tools/rtg/, three levels above this
# datatypes module; resolve it to an absolute path at import time.
basepath = os.path.dirname(__file__)
rtgcfg = os.path.abspath(os.path.join(basepath, "..", "..", "..", "tools", "rtg", "rtg-galaxy.cfg"))
+
class FakeSecHead(object):
    """File-object wrapper that prepends a dummy '[asection]' header.

    ConfigParser refuses files without a section header; wrapping the
    config file in this class makes the first readline() return the
    fake header, after which reads are delegated to the real file.
    Only readline() is required by ConfigParser.readfp().
    """

    def __init__(self, fp):
        self.fp = fp
        self.sechead = '[asection]\n'

    def readline(self):
        """Return the fake section header once, then delegate to fp."""
        header = self.sechead
        if header is None:
            return self.fp.readline()
        self.sechead = None
        return header
+
# Parse the header-less rtg-galaxy.cfg at import time: FakeSecHead
# injects the '[asection]' header that ConfigParser requires.
cfg = ConfigParser()
# BUGFIX: the original leaked the open file handle; close it explicitly.
_cfg_fp = open(rtgcfg)
try:
    cfg.readfp(FakeSecHead(_cfg_fp))
finally:
    _cfg_fp.close()
+
class Sdf( Html ):
    """Composite datatype for an RTG SDF (sequence data) directory.

    An SDF dataset is a directory of fixed-name index/data files
    produced by RTG; metadata is harvested in set_meta() by running
    ``rtg sdfstats`` on the directory.
    """
    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'sdf'

    # Metadata fields populated from `rtg sdfstats` output (see set_meta).
    MetadataElement(name="sdfId", desc="SDF Id", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="source", desc="Source", readonly="true", values=[('UNKNOWN', 'Unknown'), ('CG', 'Complete Genomics'), ('SOLEXA', 'Solexa')], param=metadata.SelectParameter)
    MetadataElement(name="sequences", desc="Number of Sequences", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="hasQuality", desc="Has Quality", readonly="true", values=[('FALSE', 'False'), ('TRUE', 'True')], param=metadata.SelectParameter)
    MetadataElement(name="type", desc="Type", readonly="true", values=[('DNA', 'DNA'), ('PROTEIN', 'Protein')], param=metadata.SelectParameter)
    MetadataElement(name="paired", desc="Paired-End", readonly="true", values=[('FALSE', 'False'), ('TRUE', 'True')], param=metadata.SelectParameter)
    MetadataElement(name="maxLength", desc="Maximum sequence length", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="minLength", desc="Minimum sequence length", readonly="true", param=metadata.MetadataParameter)

    def __init__( self, **kwd ):
        """Register the fixed set of member files of an SDF directory."""
        Html.__init__( self, **kwd )
        log.debug( "Rtg log info %s" % ' __init__')
        # Plain-text bookkeeping files.
        self.add_composite_file( 'format.log', mimetype = 'text/plain', description = 'Log', substitute_name_with_metadata = None, is_binary = False )
        self.add_composite_file( 'done', mimetype = 'text/plain', description = 'Completion', substitute_name_with_metadata = None, is_binary = False )
        self.add_composite_file( 'progress', mimetype = 'text/plain', description = 'Progress', substitute_name_with_metadata = None, is_binary = False )
        # Binary index and sequence-data files.
        self.add_composite_file( 'mainIndex', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'nameIndex0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'namedata0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'namepointer0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'seqdata0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'seqpointer0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )

    def generate_primary_file( self, dataset = None ):
        """Build the primary (HTML) file listing the composite members.

        NOTE(review): several string literals below are garbled — the
        HTML markup appears to have been stripped out of the source
        (broken quotes spanning lines, '%s' counts that do not match
        the argument tuples).  Restore from the upstream rtg.py before
        relying on this method.
        """
        log.debug( "Rtg log info %s %s" % ('generate_primary_file',dataset))
        rval = ['
RTG SDF Dataset ']
        rval.append('This SDF dataset is composed of the following files:
')
        # NOTE(review): dict.iteritems() is Python 2 only.
        for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems():
            fn = composite_name
            log.debug( "Rtg log info %s %s %s" % ('generate_primary_file',fn,composite_file))
            opt_text = ''
            if composite_file.optional:
                opt_text = ' (optional)'
            if composite_file.get('description'):
                rval.append( '- %s (%s)%s
' % ( fn, fn, composite_file.get('description'), opt_text ) )
            else:
                rval.append( '- %s%s
' % ( fn, fn, opt_text ) )
        rval.append( '
' )
        return "\n".join( rval )

    def regenerate_primary_file(self,dataset):
        """
        cannot do this until we are setting metadata
        """
        # Rewrite the primary (HTML) file from the current contents of
        # the dataset's extra_files_path.
        # NOTE(review): the literals below look garbled (stripped markup;
        # '%s' count does not match the argument tuple on the append),
        # and file() is Python 2 only — confirm against upstream.
        log.debug( "Rtg log info %s %s" % ('regenerate_primary_file',dataset))
        bn = dataset.metadata.base_name
        flist = os.listdir(dataset.extra_files_path)
        rval = ['Files for RTG SDF Dataset %sComprises the following files:' % (bn)]
        for i,fname in enumerate(flist):
            sfname = os.path.split(fname)[-1]
            rval.append( '- %s' % ( sfname, sfname ) )
        rval.append( '
' )
        f = file(dataset.file_name,'w')
        f.write("\n".join( rval ))
        f.write('\n')
        f.close()

    def set_meta( self, dataset, **kwd ):
        """Populate SDF metadata by parsing `rtg sdfstats` output."""
        Html.set_meta( self, dataset, **kwd )
        self.regenerate_primary_file(dataset)
        # Paired-end SDFs keep the two arms in left/right subdirectories;
        # stats are read from the left arm in that case.
        if (os.path.isdir(dataset.extra_files_path + '/left')):
            sdfDir = dataset.extra_files_path + '/left'
            dataset.metadata.paired = 'TRUE'
        else:
            sdfDir = dataset.extra_files_path
            dataset.metadata.paired = 'FALSE'
        # Scan the sdfstats report line by line for the fields we keep.
        p = os.popen(cfg.get('asection', 'rtg') + ' sdfstats ' + sdfDir,"r")
        while 1:
            line = p.readline()
            if not line:
                break
            if line.startswith('SDF-ID'):
                dataset.metadata.sdfId = line.split(':', 1)[1].strip()
            elif line.startswith('Number of sequences'):
                dataset.metadata.sequences = line.split(':', 1)[1].strip()
            elif line.startswith('Type'):
                dataset.metadata.type = line.split(':', 1)[1].strip()
            elif line.startswith('Source'):
                dataset.metadata.source = line.split(':', 1)[1].strip()
            elif line.startswith('Quality scores available'):
                dataset.metadata.hasQuality = 'TRUE'
            elif line.startswith('Maximum length'):
                dataset.metadata.maxLength = line.split(':', 1)[1].strip()
            elif line.startswith('Minimum length'):
                dataset.metadata.minLength = line.split(':', 1)[1].strip()
        # Absence of the 'Quality scores available' line means no quality data.
        if dataset.metadata.hasQuality != 'TRUE':
            dataset.metadata.hasQuality = 'FALSE'
+
# Run this module's doctests when executed directly as a script.
if __name__ == '__main__':
    import sys
    import doctest
    doctest.testmod(sys.modules[__name__])
+
class Cgtsv ( Sequence ):
    """Class representing a generic Complete Genomics TSV sequence file."""
    file_ext = "tsvcg"

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences (non-comment, non-header data lines)
        in dataset metadata.  Skipped (metadata set to None) when the
        file exceeds the optional-metadata size limit.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.sequences = None
            return
        sequences = 0
        # BUGFIX: use open() (the file() builtin is Python 2 only) and
        # close the handle deterministically.
        with open( dataset.file_name ) as fh:
            for line in fh:
                line = line.strip()
                # Blank lines, comments ('#') and headers ('>') are not
                # sequence data; everything else counts as one read.
                if line and not line.startswith( ( '#', '>' ) ):
                    sequences += 1
        dataset.metadata.sequences = sequences

    def sniff ( self, filename ):
        """
        Determine whether the file is in CG TSV format.
        For details, see http://media.completegenomics.com/documents/DataFileFormats.pdf

        Accepts the file once 5 well-formed rows have been seen, or when
        every available row is well-formed.
        """
        # BUGFIX: anchor the pattern at both ends.  The original
        # '^[NGTAC]*' matched a zero-length prefix of any string, so the
        # base-composition check could never reject a row.
        bases_regexp = re.compile( "^[NGTAC]*$" )
        headers = get_headers( filename, '\t' )
        try:
            count = 0
            if len(headers) < 2:
                return False
            for hdr in headers:
                if len( hdr ) > 1 and hdr[0]:
                    if hdr[0].startswith( '#' ):
                        # Comment line - ignore.
                        continue
                    # Every real row has exactly three tab-separated fields.
                    if len(hdr) != 3:
                        return False
                    if hdr[0].startswith( '>' ):
                        # Column-header row must be '>flags reads scores'.
                        if hdr[0] != ">flags":
                            return False
                        if hdr[1] != "reads":
                            return False
                    else:
                        # Data row: integer flags field, then a base string.
                        try:
                            int( hdr[0] )
                        except ValueError:
                            return False
                        if not bases_regexp.match(hdr[1]):
                            return False
                    count += 1
                    if count >= 5:
                        return True
        except Exception:
            # Malformed/undecodable content is simply not this format.
            return False
        # If we haven't yet returned False, then...
        return True
+
class Samix( Binary ):
    """Class describing a tabix-indexed, compressed SAM file."""
    file_ext = "sam.gz"
    MetadataElement( name="sam_index", desc="SAM Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )

    def init_meta( self, dataset, copy_from=None ):
        Binary.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Create the tabix index for the SAM file and store it in metadata.

        These metadata values are not accessible by users, so they are
        always overwritten.  Raises Exception when `rtg index` exits
        non-zero with output on stderr.
        """
        index_file = dataset.metadata.sam_index
        if not index_file:
            index_file = dataset.metadata.spec['sam_index'].param.new_file( dataset = dataset )
        # Run `rtg index -f sam`, capturing stderr in a temp file.
        stderr_name = tempfile.NamedTemporaryFile( prefix = "sam_index_stderr" ).name
        command = cfg.get('asection', 'rtg') + (' index -f sam %s' % ( dataset.file_name))
        proc = subprocess.Popen( args=command, shell=True, stderr=open( stderr_name, 'wb' ) )
        exit_code = proc.wait()
        # Did the indexing succeed?
        stderr = open( stderr_name ).read().strip()
        if stderr:
            if exit_code != 0:
                os.unlink( stderr_name )  # clean up
                # BUGFIX: removed a stray `f.close()` that referenced a
                # commented-out debug handle and would have raised
                # NameError here, masking the real error.  Exception is
                # raised with call syntax so the code is also valid
                # under Python 3.
                raise Exception( "Error Setting tabix-ed SAM Metadata: %s" % stderr )
            else:
                # Non-fatal warnings from rtg: surface them and continue.
                print( stderr )
        # rtg writes the .tbi next to the data file; move it into the
        # managed metadata file.
        shutil.move(dataset.file_name + '.tbi', index_file.file_name)
        dataset.metadata.sam_index = index_file
        # Remove temp file
        os.unlink( stderr_name )

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the short peek/blurb shown for the dataset."""
        if not dataset.dataset.purged:
            dataset.peek = "Tabix-ed sam alignments file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, falling back to a generic description."""
        try:
            return dataset.peek
        except Exception:
            return "Tabix-ed sam alignments file (%s)" % ( data.nice_size( dataset.get_size() ) )