lib/galaxy/datatypes/rtg.py @ 1:8593828f91e7 (default, tip)
Full galaxy wrapper
author: diego
date:   Sat, 21 Apr 2012 21:36:15 -0400
1 """
2 rtg datatypes
3 """
4
5 import data
6 from galaxy.datatypes import sequence
7 import logging, os, sys, time, tempfile, shutil, string, glob, re, subprocess
8 import galaxy.model
9 from galaxy.datatypes import metadata
10 from galaxy.datatypes.metadata import MetadataElement
11 from galaxy import util
12 from galaxy.datatypes.images import Html
13 from galaxy.datatypes.sequence import Sequence
14 from galaxy.datatypes.binary import Binary
15 from sniff import *
16 from pprint import pprint
17 from ConfigParser import ConfigParser
18
19 log = logging.getLogger(__name__)
20 basepath = os.path.dirname(__file__)
21 rtgcfg = os.path.abspath(os.path.join(basepath, "..", "..", "..", "tools", "rtg", "rtg-galaxy.cfg"))
22
class FakeSecHead(object):
    """File wrapper that prepends a fake [asection] header so ConfigParser
    can read the section-less rtg-galaxy.cfg file."""
    def __init__(self, fp):
        self.fp = fp
        self.sechead = '[asection]\n'

    def readline(self):
        if self.sechead:
            try:
                return self.sechead
            finally:
                self.sechead = None
        else:
            return self.fp.readline()

cfg = ConfigParser()
cfg.readfp(FakeSecHead(open(rtgcfg)))
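
# Illustrative only: rtg-galaxy.cfg is assumed to be a plain key=value file with
# no section headers (which is why FakeSecHead injects one). The only key read
# in this module is 'rtg', the path to the RTG executable, e.g.:
#
#   rtg=/usr/local/rtg/rtg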

class Sdf( Html ):
    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'sdf'

    MetadataElement(name="sdfId", desc="SDF Id", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="source", desc="Source", readonly="true", values=[('UNKNOWN', 'Unknown'), ('CG', 'Complete Genomics'), ('SOLEXA', 'Solexa')], param=metadata.SelectParameter)
    MetadataElement(name="sequences", desc="Number of Sequences", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="hasQuality", desc="Has Quality", readonly="true", values=[('FALSE', 'False'), ('TRUE', 'True')], param=metadata.SelectParameter)
    MetadataElement(name="type", desc="Type", readonly="true", values=[('DNA', 'DNA'), ('PROTEIN', 'Protein')], param=metadata.SelectParameter)
    MetadataElement(name="paired", desc="Paired-End", readonly="true", values=[('FALSE', 'False'), ('TRUE', 'True')], param=metadata.SelectParameter)
    MetadataElement(name="maxLength", desc="Maximum sequence length", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="minLength", desc="Minimum sequence length", readonly="true", param=metadata.MetadataParameter)

    def __init__( self, **kwd ):
        Html.__init__( self, **kwd )
        log.debug( "Rtg log info %s" % ' __init__' )
        self.add_composite_file( 'format.log', mimetype = 'text/plain', description = 'Log', substitute_name_with_metadata = None, is_binary = False )
        self.add_composite_file( 'done', mimetype = 'text/plain', description = 'Completion', substitute_name_with_metadata = None, is_binary = False )
        self.add_composite_file( 'progress', mimetype = 'text/plain', description = 'Progress', substitute_name_with_metadata = None, is_binary = False )
        self.add_composite_file( 'mainIndex', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'nameIndex0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'namedata0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'namepointer0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'seqdata0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )
        self.add_composite_file( 'seqpointer0', mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )

    def generate_primary_file( self, dataset = None ):
        log.debug( "Rtg log info %s %s" % ( 'generate_primary_file', dataset ) )
        rval = ['<html><head><title>RTG SDF Dataset</title></head><p/>']
        rval.append( '<div>This SDF dataset is composed of the following files:<p/><ul>' )
        for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems():
            fn = composite_name
            log.debug( "Rtg log info %s %s %s" % ( 'generate_primary_file', fn, composite_file ) )
            opt_text = ''
            if composite_file.optional:
                opt_text = ' (optional)'
            if composite_file.get( 'description' ):
                rval.append( '<li><a href="%s" type="application/octet-stream">%s (%s)</a>%s</li>' % ( fn, fn, composite_file.get( 'description' ), opt_text ) )
            else:
                rval.append( '<li><a href="%s" type="application/octet-stream">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )

    def regenerate_primary_file( self, dataset ):
        """
        Regenerate the primary (HTML index) file; cannot do this until we are setting metadata.
        """
        log.debug( "Rtg log info %s %s" % ( 'regenerate_primary_file', dataset ) )
        bn = dataset.metadata.base_name
        flist = os.listdir( dataset.extra_files_path )
        rval = ['<html><head><title>Files for RTG SDF Dataset %s</title></head><p/>Comprises the following files:<p/><ul>' % ( bn )]
        for fname in flist:
            sfname = os.path.split( fname )[-1]
            rval.append( '<li><a href="%s">%s</a>' % ( sfname, sfname ) )
        rval.append( '</ul></html>' )
        f = open( dataset.file_name, 'w' )
        f.write( "\n".join( rval ) )
        f.write( '\n' )
        f.close()

    def set_meta( self, dataset, **kwd ):
        Html.set_meta( self, dataset, **kwd )
        self.regenerate_primary_file( dataset )
        if os.path.isdir( dataset.extra_files_path + '/left' ):
            # Paired-end SDF: reads live in 'left'/'right' subdirectories
            sdfDir = dataset.extra_files_path + '/left'
            dataset.metadata.paired = 'TRUE'
        else:
            sdfDir = dataset.extra_files_path
            dataset.metadata.paired = 'FALSE'
        # Parse the output of 'rtg sdfstats' to populate the metadata
        p = os.popen( cfg.get( 'asection', 'rtg' ) + ' sdfstats ' + sdfDir, "r" )
        for line in p:
            if line.startswith( 'SDF-ID' ):
                dataset.metadata.sdfId = line.split( ':', 1 )[1].strip()
            elif line.startswith( 'Number of sequences' ):
                dataset.metadata.sequences = line.split( ':', 1 )[1].strip()
            elif line.startswith( 'Type' ):
                dataset.metadata.type = line.split( ':', 1 )[1].strip()
            elif line.startswith( 'Source' ):
                dataset.metadata.source = line.split( ':', 1 )[1].strip()
            elif line.startswith( 'Quality scores available' ):
                dataset.metadata.hasQuality = 'TRUE'
            elif line.startswith( 'Maximum length' ):
                dataset.metadata.maxLength = line.split( ':', 1 )[1].strip()
            elif line.startswith( 'Minimum length' ):
                dataset.metadata.minLength = line.split( ':', 1 )[1].strip()
        p.close()
        if dataset.metadata.hasQuality != 'TRUE':
            dataset.metadata.hasQuality = 'FALSE'

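# Illustrative sketch of 'rtg sdfstats' output parsed by Sdf.set_meta above
# (values and exact layout are made up; only the line prefixes matter here):
#
#   SDF-ID              : 8a2c1f-example
#   Number of sequences : 1000000
#   Type                : DNA
#   Source              : SOLEXA
#   Maximum length      : 100
#   Minimum length      : 100
#   Quality scores available ...
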
if __name__ == '__main__':
    import doctest, sys
    doctest.testmod( sys.modules[__name__] )

class Cgtsv( Sequence ):
    """Class representing a generic Complete Genomics (CG) TSV read file"""
    file_ext = "tsvcg"

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences (data lines) in the dataset.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.sequences = None
            return
        sequences = 0
        for line in open( dataset.file_name ):
            line = line.strip()
            if line:
                if line.startswith( '#' ) or line.startswith( '>' ):
                    # We don't count comment or header lines for sequence data types
                    continue
                sequences += 1
        dataset.metadata.sequences = sequences

    def sniff( self, filename ):
        """
        Determine whether the file is in CG TSV format.
        For details, see http://media.completegenomics.com/documents/DataFileFormats.pdf
        """
        # Reads column should contain only unambiguous bases or N
        bases_regexp = re.compile( "^[NGTAC]+$" )
        headers = get_headers( filename, '\t' )
        try:
            count = 0
            if len( headers ) < 2:
                return False
            for hdr in headers:
                if len( hdr ) > 1 and hdr[0]:
                    if hdr[0].startswith( '#' ):
                        continue
                    if len( hdr ) != 3:
                        return False
                    if hdr[0].startswith( '>' ):
                        # Header line must declare the 'flags' and 'reads' columns
                        if hdr[0] != ">flags":
                            return False
                        if hdr[1] != "reads":
                            return False
                    else:
                        try:
                            # First column is a numeric flags field
                            int( hdr[0] )
                            if not bases_regexp.match( hdr[1] ):
                                return False
                        except:
                            return False
                        count += 1
                        if count >= 5:
                            return True
            # Do other necessary checking here...
        except:
            return False
        # If we haven't yet returned False, then...
        return True

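# Illustrative sketch of a CG TSV fragment that Cgtsv.sniff would accept.
# Columns are tab-separated; the 'flags' and 'reads' header names are checked
# above, while the third column name and all values below are assumptions:
#
#   >flags  reads                 scores
#   5       ACGTACGTACGTNACGTACG  <quality string>
#   3       TTGACCGTAAGGNACGTACG  <quality string>
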
class Samix( Binary ):
    """Class describing a tabix-ed SAM file"""
    file_ext = "sam.gz"
    MetadataElement( name="sam_index", desc="SAM Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )

    def init_meta( self, dataset, copy_from=None ):
        Binary.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """ Creates the tabix index for the SAM file. """
        # These metadata values are not accessible by users, always overwrite
        index_file = dataset.metadata.sam_index
        if not index_file:
            index_file = dataset.metadata.spec['sam_index'].param.new_file( dataset = dataset )
        # Create the SAM index with 'rtg index'
        stderr_name = tempfile.NamedTemporaryFile( prefix = "sam_index_stderr" ).name
        command = cfg.get( 'asection', 'rtg' ) + ( ' index -f sam %s' % ( dataset.file_name ) )
        proc = subprocess.Popen( args=command, shell=True, stderr=open( stderr_name, 'wb' ) )
        exit_code = proc.wait()
        # Did the index succeed?
        stderr = open( stderr_name ).read().strip()
        if stderr:
            if exit_code != 0:
                os.unlink( stderr_name )  # clean up
                raise Exception( "Error Setting tabix-ed SAM Metadata: %s" % stderr )
            else:
                print stderr
        # Move the generated .tbi file into the metadata index file
        shutil.move( dataset.file_name + '.tbi', index_file.file_name )
        dataset.metadata.sam_index = index_file
        # Remove temp file
        os.unlink( stderr_name )
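
    # Note (illustrative paths only): for a dataset stored at, say,
    # /galaxy/database/files/000/dataset_42.dat the indexing step above runs
    #   <rtg> index -f sam /galaxy/database/files/000/dataset_42.dat
    # and the resulting dataset_42.dat.tbi is moved into the metadata index file.
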
    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = "Tabix-ed sam alignments file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        try:
            return dataset.peek
        except:
            return "Tabix-ed sam alignments file (%s)" % ( data.nice_size( dataset.get_size() ) )
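
# Minimal registration sketch (an assumption, not part of this changeset): these
# datatypes would typically be wired into Galaxy via datatypes_conf.xml, e.g.:
#
#   <datatype extension="sdf"    type="galaxy.datatypes.rtg:Sdf"   mimetype="text/html" display_in_upload="true"/>
#   <datatype extension="tsvcg"  type="galaxy.datatypes.rtg:Cgtsv" display_in_upload="true"/>
#   <datatype extension="sam.gz" type="galaxy.datatypes.rtg:Samix" display_in_upload="true"/>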