Mercurial > repos > diego > rtg_investigator
view lib/galaxy/datatypes/rtg.py @ 1:8593828f91e7 default tip
Full galaxy wrapper
author | diego |
---|---|
date | Sat, 21 Apr 2012 21:36:15 -0400 |
parents | |
children |
line wrap: on
line source
""" rtg datatypes """ import data from galaxy.datatypes import sequence import logging, os, sys, time, tempfile, shutil, string, glob, re, subprocess import galaxy.model from galaxy.datatypes import metadata from galaxy.datatypes.metadata import MetadataElement from galaxy import util from galaxy.datatypes.images import Html from galaxy.datatypes.sequence import Sequence from galaxy.datatypes.binary import Binary from sniff import * from pprint import pprint from ConfigParser import ConfigParser log = logging.getLogger(__name__) basepath = os.path.dirname(__file__) rtgcfg = os.path.abspath(os.path.join(basepath, "..", "..", "..", "tools", "rtg", "rtg-galaxy.cfg")) class FakeSecHead(object): def __init__(self, fp): self.fp = fp self.sechead = '[asection]\n' def readline(self): if self.sechead: try: return self.sechead finally: self.sechead = None else: return self.fp.readline() cfg = ConfigParser() cfg.readfp(FakeSecHead(open(rtgcfg))) class Sdf( Html ): composite_type = 'auto_primary_file' allow_datatype_change = False file_ext = 'sdf' MetadataElement(name="sdfId", desc="SDF Id", readonly="true", param=metadata.MetadataParameter) MetadataElement(name="source", desc="Source", readonly="true", values=[('UNKNOWN', 'Unknown'), ('CG', 'Complete Genomics'), ('SOLEXA', 'Solexa')], param=metadata.SelectParameter) MetadataElement(name="sequences", desc="Number of Sequences", readonly="true", param=metadata.MetadataParameter) MetadataElement(name="hasQuality", desc="Has Quality", readonly="true", values=[('FALSE', 'False'), ('TRUE', 'True')], param=metadata.SelectParameter) MetadataElement(name="type", desc="Type", readonly="true", values=[('DNA', 'DNA'), ('PROTEIN', 'Protein')], param=metadata.SelectParameter) MetadataElement(name="paired", desc="Paired-End", readonly="true", values=[('FALSE', 'False'), ('TRUE', 'True')], param=metadata.SelectParameter) MetadataElement(name="maxLength", desc="Maximum sequence length", readonly="true", param=metadata.MetadataParameter) MetadataElement(name="minLength", desc="Minimum sequence length", readonly="true", param=metadata.MetadataParameter) def __init__( self, **kwd ): Html.__init__( self, **kwd ) log.debug( "Rtg log info %s" % ' __init__') self.add_composite_file( 'format.log', mimetype = 'text/plain', description = 'Log', substitute_name_with_metadata = None, is_binary = False ) self.add_composite_file( 'done', mimetype = 'text/plain', description = 'Completion', substitute_name_with_metadata = None, is_binary = False ) self.add_composite_file( 'progress', mimetype = 'text/plain', description = 'Progress', substitute_name_with_metadata = None, is_binary = False ) self.add_composite_file( 'mainIndex', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True ) self.add_composite_file( 'nameIndex0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True ) self.add_composite_file( 'namedata0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True ) self.add_composite_file( 'namepointer0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True ) self.add_composite_file( 'seqdata0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True ) self.add_composite_file( 'seqpointer0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True ) def generate_primary_file( self, dataset = None ): log.debug( "Rtg log info %s %s" % ('generate_primary_file',dataset)) rval = ['<html><head><title>RTG SDF Dataset </title></head><p/>'] rval.append('<div>This SDF dataset is composed of the following files:<p/><ul>') for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems(): fn = composite_name log.debug( "Rtg log info %s %s %s" % ('generate_primary_file',fn,composite_file)) opt_text = '' if composite_file.optional: opt_text = ' (optional)' if composite_file.get('description'): rval.append( '<li><a href="%s" type="application/octet-stream">%s (%s)</a>%s</li>' % ( fn, fn, composite_file.get('description'), opt_text ) ) else: rval.append( '<li><a href="%s" type="application/octet-stream">%s</a>%s</li>' % ( fn, fn, opt_text ) ) rval.append( '</ul></div></html>' ) return "\n".join( rval ) def regenerate_primary_file(self,dataset): """ cannot do this until we are setting metadata """ log.debug( "Rtg log info %s %s" % ('regenerate_primary_file',dataset)) bn = dataset.metadata.base_name flist = os.listdir(dataset.extra_files_path) rval = ['<html><head><title>Files for RTG SDF Dataset %s</title></head><p/>Comprises the following files:<p/><ul>' % (bn)] for i,fname in enumerate(flist): sfname = os.path.split(fname)[-1] rval.append( '<li><a href="%s">%s</a>' % ( sfname, sfname ) ) rval.append( '</ul></html>' ) f = file(dataset.file_name,'w') f.write("\n".join( rval )) f.write('\n') f.close() def set_meta( self, dataset, **kwd ): Html.set_meta( self, dataset, **kwd ) self.regenerate_primary_file(dataset) if (os.path.isdir(dataset.extra_files_path + '/left')): sdfDir = dataset.extra_files_path + '/left' dataset.metadata.paired = 'TRUE' else: sdfDir = dataset.extra_files_path dataset.metadata.paired = 'FALSE' p = os.popen(cfg.get('asection', 'rtg') + ' sdfstats ' + sdfDir,"r") while 1: line = p.readline() if not line: break if line.startswith('SDF-ID'): dataset.metadata.sdfId = line.split(':', 1)[1].strip() elif line.startswith('Number of sequences'): dataset.metadata.sequences = line.split(':', 1)[1].strip() elif line.startswith('Type'): dataset.metadata.type = line.split(':', 1)[1].strip() elif line.startswith('Source'): dataset.metadata.source = line.split(':', 1)[1].strip() elif line.startswith('Quality scores available'): dataset.metadata.hasQuality = 'TRUE' elif line.startswith('Maximum length'): dataset.metadata.maxLength = line.split(':', 1)[1].strip() elif line.startswith('Minimum length'): dataset.metadata.minLength = line.split(':', 1)[1].strip() if dataset.metadata.hasQuality != 'TRUE': dataset.metadata.hasQuality = 'FALSE' if __name__ == '__main__': import doctest, sys doctest.testmod(sys.modules[__name__]) class Cgtsv ( Sequence ): """Class representing a generic CG TSV sequence""" file_ext = "tsvcg" def set_meta( self, dataset, **kwd ): """ Set the number of sequences and the number of data lines in dataset. """ if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize: dataset.metadata.sequences = None return sequences = 0 for line in file( dataset.file_name ): line = line.strip() if line: if len(line) == 0 or line.startswith( '#' ) or line.startswith( '>' ): # We don't count comment lines for sequence data types continue sequences += 1 dataset.metadata.sequences = sequences def sniff ( self, filename ): """ Determines whether the file is in CG TSV format For details, see http://media.completegenomics.com/documents/DataFileFormats.pdf """ bases_regexp = re.compile( "^[NGTAC]*" ) headers = get_headers( filename, '\t' ) try: count = 0 if len(headers) < 2: return False for hdr in headers: if len( hdr ) > 1 and hdr[0]: if hdr[0].startswith( '#' ): continue if len(hdr) != 3: return False if hdr[0].startswith( '>' ): if hdr[0] != ">flags": return False if hdr[1] != "reads": return False else: try: map( int, [hdr[0]] ) if not bases_regexp.match(hdr[1]): return False except: return False count += 1 if count >= 5: return True # Do other necessary checking here... except: return False # If we haven't yet returned False, then... return True class Samix( Binary ): """Class describing a tabix-ed SAM file""" file_ext = "sam.gz" MetadataElement( name="sam_index", desc="SAM Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True ) def init_meta( self, dataset, copy_from=None ): Binary.init_meta( self, dataset, copy_from=copy_from ) def set_meta( self, dataset, overwrite = True, **kwd ): """ Creates the index for the SAM file. """ # These metadata values are not accessible by users, always overwrite #f = open('/home/alan/galtmp', 'w') index_file = dataset.metadata.sam_index if not index_file: index_file = dataset.metadata.spec['sam_index'].param.new_file( dataset = dataset ) # print >>f, 'idx file ', index_file, '\n' # Create the Sam index stderr_name = tempfile.NamedTemporaryFile( prefix = "sam_index_stderr" ).name command = cfg.get('asection', 'rtg') + (' index -f sam %s' % ( dataset.file_name)) #print >>f, 'idx cmd ', command, '\n' proc = subprocess.Popen( args=command, shell=True, stderr=open( stderr_name, 'wb' ) ) exit_code = proc.wait() #Did index succeed? stderr = open( stderr_name ).read().strip() if stderr: if exit_code != 0: os.unlink( stderr_name ) #clean up f.close(); raise Exception, "Error Setting tabix-ed SAM Metadata: %s" % stderr else: print stderr #print >>f, 'move ', dataset.file_name, '.tbi to ', index_file.file_name shutil.move(dataset.file_name + '.tbi', index_file.file_name) dataset.metadata.sam_index = index_file # f.close(); # Remove temp file os.unlink( stderr_name ) def set_peek( self, dataset, is_multi_byte=False ): if not dataset.dataset.purged: dataset.peek = "Tabix-ed sam alignments file" dataset.blurb = data.nice_size( dataset.get_size() ) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' def display_peek( self, dataset ): try: return dataset.peek except: return "Tabix-ed sam alignments file (%s)" % ( data.nice_size( dataset.get_size() ) )