1
|
1 """
|
|
2 rtg datatypes
|
|
3 """
|
|
4
|
|
5 import data
|
|
6 from galaxy.datatypes import sequence
|
|
7 import logging, os, sys, time, tempfile, shutil, string, glob, re, subprocess
|
|
8 import galaxy.model
|
|
9 from galaxy.datatypes import metadata
|
|
10 from galaxy.datatypes.metadata import MetadataElement
|
|
11 from galaxy import util
|
|
12 from galaxy.datatypes.images import Html
|
|
13 from galaxy.datatypes.sequence import Sequence
|
|
14 from galaxy.datatypes.binary import Binary
|
|
15 from sniff import *
|
|
16 from pprint import pprint
|
|
17 from ConfigParser import ConfigParser
|
|
18
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Directory that contains this datatype module.
basepath = os.path.dirname(__file__)

# Absolute path of the RTG configuration file: three directory levels above
# this module, then tools/rtg/rtg-galaxy.cfg.
_rtgcfg_relative = os.path.join(
    basepath, "..", "..", "..", "tools", "rtg", "rtg-galaxy.cfg"
)
rtgcfg = os.path.abspath(_rtgcfg_relative)
|
|
22
|
|
class FakeSecHead(object):
    """File-like wrapper that prepends a fake ``[asection]`` section header.

    ConfigParser refuses to parse files whose options are not inside a
    section.  Wrapping such a file in this class makes the first line read
    from it ``'[asection]\\n'``; every following line comes from the wrapped
    file object unchanged.

    Both the ``readline()`` protocol (used by ``ConfigParser.readfp``) and
    the iteration protocol (used by ``ConfigParser.read_file``) are
    supported.
    """

    def __init__(self, fp):
        """Wrap ``fp``, any object that provides ``readline()``."""
        self.fp = fp
        # Pending fake header; reset to None once it has been handed out.
        self.sechead = '[asection]\n'

    def readline(self):
        """Return the fake header on the first call, then lines from ``fp``."""
        if self.sechead:
            header = self.sechead
            self.sechead = None
            return header
        return self.fp.readline()

    def __iter__(self):
        """Yield lines (fake header first) until ``readline()`` returns an empty value."""
        line = self.readline()
        while line:
            yield line
            line = self.readline()
|
|
32
|
|
# Parse the RTG configuration file once at import time.  The file has no
# section headers, so it is wrapped in FakeSecHead, which places every option
# in the 'asection' section.  The handle is closed as soon as parsing is done.
cfg = ConfigParser()
with open(rtgcfg) as _rtgcfg_handle:
    cfg.readfp(FakeSecHead(_rtgcfg_handle))
|
|
35
|
|
class Sdf( Html ):
    """Composite datatype for an RTG SDF directory.

    The primary file is an auto-generated HTML page that links to the
    component files; the metadata elements are filled in by ``set_meta``
    from the output of ``rtg sdfstats``.
    """
    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'sdf'

    MetadataElement(name="sdfId", desc="SDF Id", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="source", desc="Source", readonly="true", values=[('UNKNOWN', 'Unknown'), ('CG', 'Complete Genomics'), ('SOLEXA', 'Solexa')], param=metadata.SelectParameter)
    MetadataElement(name="sequences", desc="Number of Sequences", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="hasQuality", desc="Has Quality", readonly="true", values=[('FALSE', 'False'), ('TRUE', 'True')], param=metadata.SelectParameter)
    MetadataElement(name="type", desc="Type", readonly="true", values=[('DNA', 'DNA'), ('PROTEIN', 'Protein')], param=metadata.SelectParameter)
    MetadataElement(name="paired", desc="Paired-End", readonly="true", values=[('FALSE', 'False'), ('TRUE', 'True')], param=metadata.SelectParameter)
    MetadataElement(name="maxLength", desc="Maximum sequence length", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="minLength", desc="Minimum sequence length", readonly="true", param=metadata.MetadataParameter)

    def __init__( self, **kwd ):
        """Register the component files of the composite dataset."""
        Html.__init__( self, **kwd )
        log.debug( "Rtg log info %s" % ' __init__')
        # Plain-text status files, registered in the original order.
        for name, description in ( ( 'format.log', 'Log' ), ( 'done', 'Completion' ), ( 'progress', 'Progress' ) ):
            self.add_composite_file( name, mimetype = 'text/plain', description = description, substitute_name_with_metadata = None, is_binary = False )
        # Binary files, all described as 'Index'.
        for name in ( 'mainIndex', 'nameIndex0', 'namedata0', 'namepointer0', 'seqdata0', 'seqpointer0' ):
            self.add_composite_file( name, mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )

    def generate_primary_file( self, dataset = None ):
        """Return the HTML text listing every registered composite file.

        Each file becomes a list item linking to itself; the description is
        appended when present and optional files are marked ' (optional)'.
        """
        log.debug( "Rtg log info %s %s" % ('generate_primary_file',dataset))
        rval = ['<html><head><title>RTG SDF Dataset </title></head><p/>']
        rval.append('<div>This SDF dataset is composed of the following files:<p/><ul>')
        # .items() instead of .iteritems(): same iteration, also valid on Python 3.
        for composite_name, composite_file in self.get_composite_files( dataset = dataset ).items():
            fn = composite_name
            log.debug( "Rtg log info %s %s %s" % ('generate_primary_file',fn,composite_file))
            opt_text = ''
            if composite_file.optional:
                opt_text = ' (optional)'
            description = composite_file.get('description')
            if description:
                rval.append( '<li><a href="%s" type="application/octet-stream">%s (%s)</a>%s</li>' % ( fn, fn, description, opt_text ) )
            else:
                rval.append( '<li><a href="%s" type="application/octet-stream">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )

    def regenerate_primary_file(self,dataset):
        """Rewrite the primary HTML file from the contents of the extra files directory.

        This cannot be done until metadata is being set; it is called from
        ``set_meta``.  The page lists every entry returned by
        ``os.listdir(dataset.extra_files_path)`` and is written to
        ``dataset.file_name``.
        """
        log.debug( "Rtg log info %s %s" % ('regenerate_primary_file',dataset))
        bn = dataset.metadata.base_name
        flist = os.listdir(dataset.extra_files_path)
        rval = ['<html><head><title>Files for RTG SDF Dataset %s</title></head><p/>Comprises the following files:<p/><ul>' % (bn)]
        for fname in flist:
            sfname = os.path.split(fname)[-1]
            rval.append( '<li><a href="%s">%s</a>' % ( sfname, sfname ) )
        rval.append( '</ul></html>' )
        # The context manager closes the file even if a write fails.
        with open(dataset.file_name, 'w') as f:
            f.write("\n".join( rval ))
            f.write('\n')

    def set_meta( self, dataset, **kwd ):
        """Populate the SDF metadata by parsing the output of ``rtg sdfstats``.

        The dataset is treated as paired-end when its extra files directory
        contains a 'left' subdirectory; in that case the statistics are read
        from that subdirectory.  The rtg executable is taken from option
        'rtg' in section 'asection' of the module configuration.
        """
        Html.set_meta( self, dataset, **kwd )
        self.regenerate_primary_file(dataset)
        left_dir = dataset.extra_files_path + '/left'
        if os.path.isdir(left_dir):
            sdfDir = left_dir
            dataset.metadata.paired = 'TRUE'
        else:
            sdfDir = dataset.extra_files_path
            dataset.metadata.paired = 'FALSE'
        # Line prefix in the sdfstats output -> metadata attribute receiving
        # the text after the first ':'.  No prefix is a prefix of another, so
        # the lookup order does not matter.
        value_fields = (
            ('SDF-ID', 'sdfId'),
            ('Number of sequences', 'sequences'),
            ('Type', 'type'),
            ('Source', 'source'),
            ('Maximum length', 'maxLength'),
            ('Minimum length', 'minLength'),
        )
        # Tracked locally so a stale 'TRUE' from an earlier run cannot survive.
        has_quality = False
        p = os.popen(cfg.get('asection', 'rtg') + ' sdfstats ' + sdfDir,"r")
        try:
            for line in p:
                if line.startswith('Quality scores available'):
                    has_quality = True
                    continue
                for prefix, attribute in value_fields:
                    if line.startswith(prefix):
                        setattr(dataset.metadata, attribute, line.split(':', 1)[1].strip())
                        break
        finally:
            # Always release the pipe to the child process.
            p.close()
        if has_quality:
            dataset.metadata.hasQuality = 'TRUE'
        else:
            dataset.metadata.hasQuality = 'FALSE'
|
|
127
|
|
# Run this module's doctests when it is executed directly as a script.
if __name__ == '__main__':
    import doctest
    import sys
    this_module = sys.modules[__name__]
    doctest.testmod(this_module)
|
|
131
|
|
class Cgtsv ( Sequence ):
    """Class representing a generic CG TSV sequence"""
    file_ext = "tsvcg"

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences in dataset.

        Blank lines and lines starting with '#' or '>' are not counted.
        When the dataset is larger than max_optional_metadata_filesize the
        count is skipped and the metadata value is set to None.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.sequences = None
            return
        sequences = 0
        # The context manager closes the file once counting is finished.
        with open( dataset.file_name ) as handle:
            for line in handle:
                line = line.strip()
                # We don't count blank, comment or header lines for sequence data types
                if not line or line.startswith( ( '#', '>' ) ):
                    continue
                sequences += 1
        dataset.metadata.sequences = sequences

    def sniff ( self, filename ):
        """
        Determines whether the file is in CG TSV format
        For details, see http://media.completegenomics.com/documents/DataFileFormats.pdf

        Returns False when fewer than two header lines are available, when a
        non-comment row does not have exactly three columns, when a '>' row
        is not '>flags' followed by 'reads', or when a data row does not have
        an integer first column and a second column made only of the letters
        N, G, T, A and C.  Returns True after five valid data rows, or when
        all available rows have been checked without a failure.
        """
        # Anchored at both ends: without the trailing '$' the pattern matches
        # every string and the bases check could never fail.
        bases_regexp = re.compile( r"^[NGTAC]*$" )
        headers = get_headers( filename, '\t' )
        try:
            count = 0
            if len(headers) < 2:
                return False
            for hdr in headers:
                if len( hdr ) > 1 and hdr[0]:
                    if hdr[0].startswith( '#' ):
                        continue
                    if len(hdr) != 3:
                        return False
                    if hdr[0].startswith( '>' ):
                        if hdr[0] != ">flags":
                            return False
                        if hdr[1] != "reads":
                            return False
                    else:
                        try:
                            # int() is evaluated eagerly; a lazy map() would
                            # never raise on Python 3.
                            int( hdr[0] )
                        except ValueError:
                            return False
                        if not bases_regexp.match(hdr[1]):
                            return False
                        # NOTE(review): only data rows are counted here (the
                        # '>flags' header row is not) -- confirm this matches
                        # the intended nesting.
                        count += 1
                        if count >= 5:
                            return True
                # Do other necessary checking here...
        except Exception:
            return False
        # If we haven't yet returned False, then...
        return True
|
|
190
|
|
class Samix( Binary ):
    """Class describing a tabix-ed SAM file"""
    file_ext = "sam.gz"
    MetadataElement( name="sam_index", desc="SAM Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )

    def init_meta( self, dataset, copy_from=None ):
        """Delegate metadata initialisation to Binary."""
        Binary.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Creates the index for the SAM file.

        Runs '<rtg> index -f sam <dataset file>' (the rtg executable comes
        from option 'rtg' in section 'asection' of the module configuration),
        then moves the resulting '<dataset file>.tbi' into the 'sam_index'
        metadata file.

        Raises Exception when the command exits non-zero and wrote to stderr.
        """
        # These metadata values are not accessible by users, always overwrite
        index_file = dataset.metadata.sam_index
        if not index_file:
            index_file = dataset.metadata.spec['sam_index'].param.new_file( dataset = dataset )
        # Create the Sam index
        command = cfg.get('asection', 'rtg') + (' index -f sam %s' % ( dataset.file_name))
        # The temporary file stays open for the whole run and is removed
        # automatically when the block exits, so there is no window in which
        # its name is unclaimed and no handle is left open.
        with tempfile.NamedTemporaryFile( mode = 'w+', prefix = "sam_index_stderr" ) as stderr_file:
            proc = subprocess.Popen( args=command, shell=True, stderr=stderr_file )
            exit_code = proc.wait()
            # The child wrote through the shared descriptor; rewind to read it.
            stderr_file.seek( 0 )
            stderr = stderr_file.read().strip()
        # Did index succeed?
        if stderr:
            if exit_code != 0:
                raise Exception( "Error Setting tabix-ed SAM Metadata: %s" % stderr )
            # Non-fatal diagnostics are passed through to stdout.
            print( stderr )
        shutil.move(dataset.file_name + '.tbi', index_file.file_name)
        dataset.metadata.sam_index = index_file

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb texts, depending on whether the dataset is purged."""
        if not dataset.dataset.purged:
            dataset.peek = "Tabix-ed sam alignments file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, or a generic description with the file size if that fails."""
        try:
            return dataset.peek
        except Exception:
            return "Tabix-ed sam alignments file (%s)" % ( data.nice_size( dataset.get_size() ) )
|
|
239
|