lib/galaxy/datatypes/rtg.py @ 1:8593828f91e7 (default, tip)
changeset: "Full galaxy wrapper", author: diego, date: Sat, 21 Apr 2012 21:36:15 -0400

"""
rtg datatypes
"""

import data
import logging, os, sys, tempfile, shutil, re, subprocess
from galaxy.datatypes import metadata
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes.images import Html
from galaxy.datatypes.sequence import Sequence
from galaxy.datatypes.binary import Binary
from sniff import *
from ConfigParser import ConfigParser

log = logging.getLogger(__name__)
basepath = os.path.dirname(__file__)
rtgcfg = os.path.abspath(os.path.join(basepath, "..", "..", "..", "tools", "rtg", "rtg-galaxy.cfg"))

class FakeSecHead(object):
    """Wrap a section-less config file so ConfigParser can read it by
    injecting a fake [asection] header ahead of the first real line."""
    def __init__(self, fp):
        self.fp = fp
        self.sechead = '[asection]\n'

    def readline(self):
        if self.sechead:
            try:
                return self.sechead
            finally:
                self.sechead = None
        else:
            return self.fp.readline()

cfg = ConfigParser()
cfg.readfp(FakeSecHead(open(rtgcfg)))
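# rtg-galaxy.cfg is assumed to be a plain key=value file with no section
# header (which is why FakeSecHead injects one for ConfigParser). Only the
# 'rtg' key, the path to the RTG executable, is read in this module; the
# value shown here is purely illustrative:
#
#     rtg=/opt/rtg/rtg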

class Sdf( Html ):
    """Composite datatype representing an RTG SDF dataset directory."""
    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'sdf'

    MetadataElement(name="sdfId", desc="SDF Id", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="source", desc="Source", readonly="true", values=[('UNKNOWN', 'Unknown'), ('CG', 'Complete Genomics'), ('SOLEXA', 'Solexa')], param=metadata.SelectParameter)
    MetadataElement(name="sequences", desc="Number of Sequences", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="hasQuality", desc="Has Quality", readonly="true", values=[('FALSE', 'False'), ('TRUE', 'True')], param=metadata.SelectParameter)
    MetadataElement(name="type", desc="Type", readonly="true", values=[('DNA', 'DNA'), ('PROTEIN', 'Protein')], param=metadata.SelectParameter)
    MetadataElement(name="paired", desc="Paired-End", readonly="true", values=[('FALSE', 'False'), ('TRUE', 'True')], param=metadata.SelectParameter)
    MetadataElement(name="maxLength", desc="Maximum sequence length", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="minLength", desc="Minimum sequence length", readonly="true", param=metadata.MetadataParameter)

    def __init__( self, **kwd ):
        Html.__init__( self, **kwd )
        log.debug( "Rtg log info  %s" % ' __init__')
        self.add_composite_file( 'format.log', mimetype = 'text/plain', description = 'Log', substitute_name_with_metadata = None, is_binary = False )
        self.add_composite_file( 'done', mimetype = 'text/plain', description = 'Completion', substitute_name_with_metadata = None, is_binary = False )
        self.add_composite_file( 'progress', mimetype = 'text/plain', description = 'Progress', substitute_name_with_metadata = None, is_binary = False )
        self.add_composite_file( 'mainIndex', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'nameIndex0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'namedata0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'namepointer0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'seqdata0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'seqpointer0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )

    def generate_primary_file( self, dataset = None ):
        log.debug( "Rtg log info  %s %s" % ('generate_primary_file',dataset))
        rval = ['<html><head><title>RTG SDF Dataset </title></head><p/>']
        rval.append('<div>This SDF dataset is composed of the following files:<p/><ul>')
        for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems():
            fn = composite_name
            log.debug( "Rtg log info  %s %s %s" % ('generate_primary_file',fn,composite_file))
            opt_text = ''
            if composite_file.optional:
                opt_text = ' (optional)'
            if composite_file.get('description'):
                rval.append( '<li><a href="%s" type="application/octet-stream">%s (%s)</a>%s</li>' % ( fn, fn, composite_file.get('description'), opt_text ) )
            else:
                rval.append( '<li><a href="%s" type="application/octet-stream">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )

    def regenerate_primary_file(self, dataset):
        """Regenerate the primary HTML file so it lists the files currently
        present in the dataset's extra_files_path."""
        log.debug( "Rtg log info  %s %s" % ('regenerate_primary_file', dataset))
        flist = os.listdir(dataset.extra_files_path)
        rval = ['<html><head><title>Files for RTG SDF Dataset</title></head><p/>Comprises the following files:<p/><ul>']
        for fname in flist:
            sfname = os.path.split(fname)[-1]
            rval.append( '<li><a href="%s">%s</a>' % ( sfname, sfname ) )
        rval.append( '</ul></html>' )
        f = file(dataset.file_name, 'w')
        f.write("\n".join( rval ))
        f.write('\n')
        f.close()

    def set_meta( self, dataset, **kwd ):
        Html.set_meta( self, dataset, **kwd )
        self.regenerate_primary_file(dataset)
        if os.path.isdir( os.path.join( dataset.extra_files_path, 'left' ) ):
            # Paired-end data is stored with a 'left' sub-SDF; read stats from it
            sdfDir = os.path.join( dataset.extra_files_path, 'left' )
            dataset.metadata.paired = 'TRUE'
        else:
            sdfDir = dataset.extra_files_path
            dataset.metadata.paired = 'FALSE'
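        # The loop below scrapes the plain-text report printed by
        # '<rtg> sdfstats <dir>'. An illustrative (not verbatim) sketch of the
        # lines it looks for; the field names come from the parsing below, the
        # values are made up:
        #
        #     SDF-ID              : 8a7c44bc-1c2f-4b58-b52a-0d6e8e4c9f11
        #     Number of sequences : 1000000
        #     Type                : DNA
        #     Source              : SOLEXA
        #     Quality scores available
        #     Maximum length      : 100
        #     Minimum length      : 100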
        p = os.popen(cfg.get('asection', 'rtg') + ' sdfstats ' + sdfDir,"r")
        while 1:
            line = p.readline()
            if not line: 
                break
            if line.startswith('SDF-ID'):
                dataset.metadata.sdfId = line.split(':', 1)[1].strip()
            elif line.startswith('Number of sequences'):
                dataset.metadata.sequences = line.split(':', 1)[1].strip()
            elif line.startswith('Type'):
                dataset.metadata.type = line.split(':', 1)[1].strip()
            elif line.startswith('Source'):
                dataset.metadata.source = line.split(':', 1)[1].strip()
            elif line.startswith('Quality scores available'):
                dataset.metadata.hasQuality = 'TRUE'
            elif line.startswith('Maximum length'):
                dataset.metadata.maxLength = line.split(':', 1)[1].strip()
            elif line.startswith('Minimum length'):
                dataset.metadata.minLength = line.split(':', 1)[1].strip()
        if dataset.metadata.hasQuality != 'TRUE':
            dataset.metadata.hasQuality = 'FALSE'


class Cgtsv ( Sequence ):
    """Class representing a generic CG TSV sequence"""
    file_ext = "tsvcg"

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines
        in dataset.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.sequences = None
            return
        sequences = 0
        for line in file( dataset.file_name ):
            line = line.strip()
            if not line or line.startswith( '#' ) or line.startswith( '>' ):
                # Don't count blank lines, comments or the header line
                continue
            sequences += 1
        dataset.metadata.sequences = sequences

    def sniff( self, filename ):
        """
        Determines whether the file is in CG TSV format.
        For details, see http://media.completegenomics.com/documents/DataFileFormats.pdf
        """
        # Read columns must consist solely of bases/N; '*' would match any
        # string, so require at least one base and anchor the end.
        bases_regexp = re.compile( "^[NGTAC]+$" )
        headers = get_headers( filename, '\t' )
        try:
            count = 0
            if len(headers) < 2:
                return False
            for hdr in headers:
                if len( hdr ) > 1 and hdr[0]:
                    if hdr[0].startswith( '#' ):
                        continue
                    if len(hdr) != 3:
                        return False
                    if hdr[0].startswith( '>' ):
                        # Column header line: first column '>flags', second column 'reads'
                        if hdr[0] != ">flags":
                            return False
                        if hdr[1] != "reads":
                            return False
                    else:
                        # Data lines: integer flags followed by the read bases
                        try:
                            int( hdr[0] )
                            if not bases_regexp.match( hdr[1] ):
                                return False
                        except ValueError:
                            return False
                    count += 1
                    if count >= 5:
                        # Five well-formed lines is enough evidence
                        return True
        except:
            return False
        # If we haven't yet returned False, then...
        return True
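
# For reference, an illustrative fragment of the tab-separated layout that the
# Cgtsv sniffer above accepts. The values and the name of the third column are
# assumptions; the first two header names and the three-column shape come from
# the checks in sniff():
#
#     >flags    reads         scores
#     5         AAACGGTCCT    887798766A
#     0         TTTCAGGCAT    9998877665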

class Samix( Binary ):
    """Class describing a tabix-ed SAM file"""
    file_ext = "sam.gz"
    MetadataElement( name="sam_index", desc="SAM Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )
    def init_meta( self, dataset, copy_from=None ):
        Binary.init_meta( self, dataset, copy_from=copy_from )
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Creates the tabix index for the bgzipped SAM file."""
        # These metadata values are not accessible by users, always overwrite
        index_file = dataset.metadata.sam_index
        if not index_file:
            index_file = dataset.metadata.spec['sam_index'].param.new_file( dataset = dataset )
        # Create the SAM index; 'rtg index -f sam' writes <file>.tbi next to the input
        stderr_name = tempfile.NamedTemporaryFile( prefix = "sam_index_stderr" ).name
        command = cfg.get('asection', 'rtg') + (' index -f sam %s' % ( dataset.file_name ))
        proc = subprocess.Popen( args=command, shell=True, stderr=open( stderr_name, 'wb' ) )
        exit_code = proc.wait()
        # Did indexing succeed?
        stderr = open( stderr_name ).read().strip()
        if stderr:
            if exit_code != 0:
                os.unlink( stderr_name ) # clean up
                raise Exception, "Error Setting tabix-ed SAM Metadata: %s" % stderr
            else:
                print stderr
        # Move the generated .tbi index into the metadata file
        shutil.move( dataset.file_name + '.tbi', index_file.file_name )
        dataset.metadata.sam_index = index_file
        # Remove temp file
        os.unlink( stderr_name )
    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = "Tabix-ed sam alignments file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
    def display_peek( self, dataset ):
        try:
            return dataset.peek
        except:
            return "Tabix-ed sam alignments file (%s)" % ( data.nice_size( dataset.get_size() ) )

if __name__ == '__main__':
    import doctest, sys
    doctest.testmod(sys.modules[__name__])