Mercurial > repos > diego > rtg_investigator
comparison lib/galaxy/datatypes/rtg.py @ 1:8593828f91e7 default tip
Full galaxy wrapper
author | diego |
---|---|
date | Sat, 21 Apr 2012 21:36:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:d50638ebd809 | 1:8593828f91e7 |
---|---|
1 """ | |
2 rtg datatypes | |
3 """ | |
4 | |
5 import data | |
6 from galaxy.datatypes import sequence | |
7 import logging, os, sys, time, tempfile, shutil, string, glob, re, subprocess | |
8 import galaxy.model | |
9 from galaxy.datatypes import metadata | |
10 from galaxy.datatypes.metadata import MetadataElement | |
11 from galaxy import util | |
12 from galaxy.datatypes.images import Html | |
13 from galaxy.datatypes.sequence import Sequence | |
14 from galaxy.datatypes.binary import Binary | |
15 from sniff import * | |
16 from pprint import pprint | |
17 from ConfigParser import ConfigParser | |
18 | |
# Module-level logger named after this module.
log = logging.getLogger(__name__)

# Directory that contains this module file.
basepath = os.path.dirname(__file__)

# Absolute path of the RTG wrapper configuration: three directories above
# this module, then tools/rtg/rtg-galaxy.cfg.
rtgcfg = os.path.abspath(
    os.path.join(
        basepath,
        os.pardir, os.pardir, os.pardir,
        "tools", "rtg", "rtg-galaxy.cfg",
    )
)
22 | |
class FakeSecHead(object):
    """
    Minimal file-like wrapper that yields a synthetic '[asection]' header
    line before any line of the wrapped file object.

    Only readline() is provided. The first call returns the header; every
    later call is forwarded to the wrapped object's readline().
    """

    def __init__(self, fp):
        # fp: any object exposing readline()
        self.fp = fp
        # Header still waiting to be handed out; None once consumed.
        self.sechead = '[asection]\n'

    def readline(self):
        """Return the pending header once, then lines from the wrapped file."""
        pending = self.sechead
        if not pending:
            return self.fp.readline()
        self.sechead = None
        return pending
32 | |
# Parse the RTG configuration file. FakeSecHead supplies an '[asection]'
# header line ahead of the file's own content, so the options are read
# from the 'asection' section. The file handle is closed explicitly, even
# when parsing fails, instead of being left for the garbage collector.
cfg = ConfigParser()
_rtgcfg_fp = open(rtgcfg)
try:
    cfg.readfp(FakeSecHead(_rtgcfg_fp))
finally:
    _rtgcfg_fp.close()
35 | |
class Sdf( Html ):
    """
    Composite datatype for an RTG SDF directory.

    The primary file is an HTML page listing the component files; the
    metadata elements are filled in by set_meta() from the output of the
    configured 'rtg' command's 'sdfstats' subcommand.
    """
    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'sdf'

    MetadataElement(name="sdfId", desc="SDF Id", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="source", desc="Source", readonly="true", values=[('UNKNOWN', 'Unknown'), ('CG', 'Complete Genomics'), ('SOLEXA', 'Solexa')], param=metadata.SelectParameter)
    MetadataElement(name="sequences", desc="Number of Sequences", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="hasQuality", desc="Has Quality", readonly="true", values=[('FALSE', 'False'), ('TRUE', 'True')], param=metadata.SelectParameter)
    MetadataElement(name="type", desc="Type", readonly="true", values=[('DNA', 'DNA'), ('PROTEIN', 'Protein')], param=metadata.SelectParameter)
    MetadataElement(name="paired", desc="Paired-End", readonly="true", values=[('FALSE', 'False'), ('TRUE', 'True')], param=metadata.SelectParameter)
    MetadataElement(name="maxLength", desc="Maximum sequence length", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="minLength", desc="Minimum sequence length", readonly="true", param=metadata.MetadataParameter)

    def __init__( self, **kwd ):
        """Register the component files that make up an SDF dataset."""
        Html.__init__( self, **kwd )
        log.debug( "Rtg log info %s" % ' __init__')
        # Plain-text bookkeeping files, registered in the original order.
        for name, description in ( ('format.log', 'Log'), ('done', 'Completion'), ('progress', 'Progress') ):
            self.add_composite_file( name, mimetype = 'text/plain', description = description, substitute_name_with_metadata = None, is_binary = False )
        # Binary files, all described as 'Index'.
        for name in ( 'mainIndex', 'nameIndex0', 'namedata0', 'namepointer0', 'seqdata0', 'seqpointer0' ):
            self.add_composite_file( name, mimetype = 'application/octet-stream', description = 'Index', substitute_name_with_metadata = None, is_binary = True )

    def generate_primary_file( self, dataset = None ):
        """
        Return the HTML text of the primary file: a list with one link per
        composite file, annotated with its description and an '(optional)'
        marker where applicable.
        """
        log.debug( "Rtg log info %s %s" % ('generate_primary_file',dataset))
        rval = ['<html><head><title>RTG SDF Dataset </title></head><p/>']
        rval.append('<div>This SDF dataset is composed of the following files:<p/><ul>')
        for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems():
            fn = composite_name
            log.debug( "Rtg log info %s %s %s" % ('generate_primary_file',fn,composite_file))
            opt_text = ''
            if composite_file.optional:
                opt_text = ' (optional)'
            description = composite_file.get('description')
            if description:
                rval.append( '<li><a href="%s" type="application/octet-stream">%s (%s)</a>%s</li>' % ( fn, fn, description, opt_text ) )
            else:
                rval.append( '<li><a href="%s" type="application/octet-stream">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )

    def regenerate_primary_file(self,dataset):
        """
        Rewrite the primary HTML file so that it lists whatever is actually
        present in the dataset's extra files directory.

        Cannot be done until metadata is being set (it is called from
        set_meta()).
        """
        log.debug( "Rtg log info %s %s" % ('regenerate_primary_file',dataset))
        bn = dataset.metadata.base_name
        flist = os.listdir(dataset.extra_files_path)
        rval = ['<html><head><title>Files for RTG SDF Dataset %s</title></head><p/>Comprises the following files:<p/><ul>' % (bn)]
        for fname in flist:
            sfname = os.path.split(fname)[-1]
            rval.append( '<li><a href="%s">%s</a>' % ( sfname, sfname ) )
        rval.append( '</ul></html>' )
        f = open(dataset.file_name,'w')
        try:
            f.write("\n".join( rval ))
            f.write('\n')
        finally:
            # Release the handle even when a write fails.
            f.close()

    def set_meta( self, dataset, **kwd ):
        """
        Regenerate the primary file and populate the SDF metadata by parsing
        the output of '<rtg> sdfstats <sdf directory>', where <rtg> is the
        'rtg' option of the 'asection' config section.

        When the extra files directory contains a 'left' sub-directory the
        dataset is flagged as paired and the statistics are read from that
        sub-directory.
        """
        Html.set_meta( self, dataset, **kwd )
        self.regenerate_primary_file(dataset)
        left_dir = dataset.extra_files_path + '/left'
        if os.path.isdir( left_dir ):
            sdfDir = left_dir
            dataset.metadata.paired = 'TRUE'
        else:
            sdfDir = dataset.extra_files_path
            dataset.metadata.paired = 'FALSE'
        # Line prefix -> metadata attribute receiving the text after the
        # first ':' on that line. No prefix is a prefix of another one, so
        # the lookup order does not matter.
        valued_fields = (
            ('SDF-ID', 'sdfId'),
            ('Number of sequences', 'sequences'),
            ('Type', 'type'),
            ('Source', 'source'),
            ('Maximum length', 'maxLength'),
            ('Minimum length', 'minLength'),
        )
        p = os.popen(cfg.get('asection', 'rtg') + ' sdfstats ' + sdfDir,"r")
        try:
            for line in iter( p.readline, '' ):
                if line.startswith('Quality scores available'):
                    dataset.metadata.hasQuality = 'TRUE'
                    continue
                for prefix, attr in valued_fields:
                    if line.startswith( prefix ):
                        # Skip a matching line that carries no value rather
                        # than failing with IndexError.
                        if ':' in line:
                            setattr( dataset.metadata, attr, line.split(':', 1)[1].strip() )
                        break
        finally:
            # Always close the pipe so the descriptor is not leaked.
            p.close()
        if dataset.metadata.hasQuality != 'TRUE':
            dataset.metadata.hasQuality = 'FALSE'
127 | |
if __name__ == '__main__':
    # Executed as a script: run the doctests found in this module.
    import doctest
    import sys
    this_module = sys.modules[__name__]
    doctest.testmod(this_module)
131 | |
class Cgtsv ( Sequence ):
    """Class representing a generic CG TSV sequence"""
    file_ext = "tsvcg"

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences in dataset.

        Blank lines and lines starting with '#' or '>' are not counted.
        When the file is larger than max_optional_metadata_filesize (and
        that limit is not negative) the count is set to None instead.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.sequences = None
            return
        sequences = 0
        fh = open( dataset.file_name )
        try:
            for line in fh:
                line = line.strip()
                # We don't count blank, comment or header lines for
                # sequence data types
                if not line or line.startswith( '#' ) or line.startswith( '>' ):
                    continue
                sequences += 1
        finally:
            # Close the handle explicitly instead of leaking it.
            fh.close()
        dataset.metadata.sequences = sequences

    def sniff ( self, filename ):
        """
        Determines whether the file is in CG TSV format
        For details, see http://media.completegenomics.com/documents/DataFileFormats.pdf

        Rows whose first field starts with '#' are skipped. Every other
        non-empty row must have exactly three tab-separated fields and be
        either the '>flags' / 'reads' column header or a data row whose
        first field is an integer and whose second field consists only of
        the letters N, G, T, A, C. Returns True once five rows have passed,
        or when all sampled rows have passed.
        """
        # Anchored at both ends: without the trailing '$' the pattern
        # matches the empty prefix of any string and never rejects anything.
        bases_regexp = re.compile( "^[NGTAC]*$" )
        headers = get_headers( filename, '\t' )
        try:
            count = 0
            if len(headers) < 2:
                return False
            for hdr in headers:
                if len( hdr ) > 1 and hdr[0]:
                    if hdr[0].startswith( '#' ):
                        continue
                    if len(hdr) != 3:
                        return False
                    if hdr[0].startswith( '>' ):
                        if hdr[0] != ">flags":
                            return False
                        if hdr[1] != "reads":
                            return False
                    else:
                        try:
                            int( hdr[0] )
                        except ValueError:
                            return False
                        if not bases_regexp.match(hdr[1]):
                            return False
                    count += 1
                    if count >= 5:
                        return True
        except Exception:
            # Any unexpected problem while inspecting the rows means the
            # file is not treated as CG TSV.
            return False
        # If we haven't yet returned False, then...
        return True
190 | |
class Samix( Binary ):
    """Class describing a tabix-ed SAM file"""
    file_ext = "sam.gz"
    MetadataElement( name="sam_index", desc="SAM Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )

    def init_meta( self, dataset, copy_from=None ):
        """Delegate metadata initialisation to Binary."""
        Binary.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Creates the index for the SAM file.

        Runs '<rtg> index -f sam <dataset file>', where <rtg> is the 'rtg'
        option of the 'asection' config section, then moves the resulting
        '<dataset file>.tbi' into the 'sam_index' metadata file.

        Raises Exception when the command exits non-zero and wrote to
        stderr. Stderr output from a successful run is printed.
        """
        # These metadata values are not accessible by users, always overwrite
        index_file = dataset.metadata.sam_index
        if not index_file:
            index_file = dataset.metadata.spec['sam_index'].param.new_file( dataset = dataset )
        # Create the Sam index, capturing stderr in a temp file. mkstemp
        # keeps the file in existence for the whole run; taking only the
        # .name of a NamedTemporaryFile leaves a deleted, reusable name.
        stderr_fd, stderr_name = tempfile.mkstemp( prefix = "sam_index_stderr" )
        try:
            command = cfg.get('asection', 'rtg') + (' index -f sam %s' % ( dataset.file_name))
            stderr_out = os.fdopen( stderr_fd, 'wb' )
            try:
                proc = subprocess.Popen( args=command, shell=True, stderr=stderr_out )
                exit_code = proc.wait()
            finally:
                stderr_out.close()
            stderr_in = open( stderr_name )
            try:
                stderr = stderr_in.read().strip()
            finally:
                stderr_in.close()
        finally:
            # Remove temp file on every path
            os.unlink( stderr_name )
        # Did index succeed?
        if stderr:
            if exit_code != 0:
                # The original called close() on an undefined name here,
                # which hid this error behind a NameError.
                raise Exception( "Error Setting tabix-ed SAM Metadata: %s" % stderr )
            else:
                print( stderr )
        shutil.move(dataset.file_name + '.tbi', index_file.file_name)
        dataset.metadata.sam_index = index_file

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set a fixed peek text and a size blurb, or purge notices."""
        if not dataset.dataset.purged:
            dataset.peek = "Tabix-ed sam alignments file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, or a generic description with the size."""
        try:
            return dataset.peek
        except Exception:
            return "Tabix-ed sam alignments file (%s)" % ( data.nice_size( dataset.get_size() ) )
239 |