9
|
1 """
|
|
2 Proteomics format classes
|
|
3 """
|
|
4 import logging
|
|
5 import re
|
|
6 import binascii
|
|
7
|
|
8 from galaxy.datatypes.sniff import *
|
|
9 from galaxy.datatypes import data
|
|
10 from galaxy.datatypes.data import Text
|
|
11 from galaxy.datatypes.xml import GenericXml
|
|
12 from galaxy.datatypes.binary import Binary
|
|
13 from galaxy.datatypes.tabular import Tabular
|
|
14 from galaxy.datatypes.interval import Gff
|
|
15
|
|
16 log = logging.getLogger(__name__)
|
|
17
|
|
18
|
|
class Wiff(Binary):
    """Class for wiff files.

    Composite datatype made of a ``wiff`` component and an optional
    ``wiff_scan`` component; the primary file is an auto-generated HTML index.
    """
    file_ext = 'wiff'
    allow_datatype_change = False
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        """Initialise the datatype and declare its two component files."""
        Binary.__init__(self, **kwd)
        self.add_composite_file(
            'wiff',
            description='AB SCIEX files in .wiff format. This can contain all needed information or only metadata.',
            is_binary=True)
        # ``optional`` is a real boolean here (it was the string 'True'),
        # so consumers comparing against True/False behave as expected.
        self.add_composite_file(
            'wiff_scan',
            description='AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.',
            optional=True,
            is_binary=True)

    def generate_primary_file(self, dataset=None):
        """Return the HTML index page listing every component file.

        Each component is rendered as a list item linking to the component
        name, followed by its description (when one is set) and an
        ``(optional)`` marker for optional components.
        """
        rval = ['<html><head><title>Wiff Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # .items() works on both Python 2 and Python 3 mappings.
        for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get('description')
            if description:
                rval.append('<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % (composite_name, composite_name, description, opt_text))
            else:
                rval.append('<li><a href="%s" type="text/plain">%s</a>%s</li>' % (composite_name, composite_name, opt_text))
        rval.append('</ul></div></html>')
        return "\n".join(rval)
|
|
48
|
|
49
|
|
50
|
|
# Register 'wiff' as an unsniffable binary extension, but only when the
# Binary base class actually exposes the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('wiff')
|
|
53
|
|
54
|
|
class IdpDB(Binary):
    """Binary datatype for files carrying the idpDB extension."""
    file_ext = "idpDB"
|
|
57
|
|
# Register 'idpDB' as an unsniffable binary extension, but only when the
# Binary base class actually exposes the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('idpDB')
|
|
60
|
|
61
|
|
class PepXmlReport(Tabular):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            # Header typo fixed (was 'Interprophet Probabaility').
            'Interprophet Probability',
        ]

    def display_peek(self, dataset):
        """Return the peek as an HTML table labelled with ``column_names``."""
        return Tabular.make_html_table(self, dataset, column_names=self.column_names)
|
|
73
|
|
74
|
|
class ProtXmlReport(Tabular):
    """Tabular report produced by converting a protXML file."""
    file_ext = "tsv"
    comment_lines = 1

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        self.column_names = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]

    def display_peek(self, dataset):
        """Return the peek as an HTML table labelled with ``column_names``."""
        headers = self.column_names
        return Tabular.make_html_table(self, dataset, column_names=headers)
|
|
87
|
|
class ProteomicsXml(GenericXml):
    """An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    ``sniff`` reads ``self.root`` (a regex fragment matching the root element
    name) and ``set_peek`` reads ``self.blurb``; neither is defined here, so
    subclasses are expected to supply them.
    """

    def sniff(self, filename):
        """Determine whether the file is the correct XML type.

        Lines starting with ``<?`` (XML declaration / processing
        instructions) and blank lines are skipped. The first remaining line
        must open the element named by ``self.root``, optionally prefixed
        with a namespace (``<root`` or ``<ns:root``).

        Returns True on a match, False otherwise (including for files that
        contain nothing but skipped lines).
        """
        # Raw string so that \w is a regex class, not a string escape.
        pattern = r'^<(\w*:)?%s' % self.root
        with open(filename, 'r') as contents:
            for line in contents:
                # Whitespace-only lines are legal between the XML prolog and
                # the root element, so they must not abort the sniff.
                if line.startswith('<?') or not line.strip():
                    continue
                return re.match(pattern, line) is not None
        return False

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
        dataset.blurb = self.blurb
|
|
110
|
|
111
|
|
class PepXml(ProteomicsXml):
    """pepXML datatype; the root element is ``msms_pipeline_analysis``."""
    root = "msms_pipeline_analysis"
    blurb = 'pepXML data'
    file_ext = "pepxml"
|
|
117
|
|
118
|
|
class MzML(ProteomicsXml):
    """mzML datatype; the root element is ``mzML`` or ``indexedmzML``."""
    root = "(mzML|indexedmzML)"
    blurb = 'mzML Mass Spectrometry data'
    file_ext = "mzml"
|
|
124
|
|
125
|
|
class ProtXML(ProteomicsXml):
    """protXML datatype; the root element is ``protein_summary``."""
    root = "protein_summary"
    blurb = 'prot XML Search Results'
    file_ext = "protxml"
|
|
131
|
|
132
|
|
class MzXML(ProteomicsXml):
    """mzXML datatype; the root element is ``mzXML``."""
    root = "mzXML"
    blurb = "mzXML Mass Spectrometry data"
    file_ext = "mzxml"
|
|
138
|
|
139 ## PSI datatypes
|
|
class MzIdentML(ProteomicsXml):
    """mzIdentML datatype; the root element is ``MzIdentML``."""
    root = "MzIdentML"
    blurb = "XML identified peptides and proteins."
    file_ext = "mzid"
|
|
144
|
|
145
|
|
class TraML(ProteomicsXml):
    """TraML transition list datatype; the root element is ``TraML``."""
    root = "TraML"
    blurb = "TraML transition list"
    file_ext = "traml"
|
|
150
|
|
151
|
|
class MzQuantML(ProteomicsXml):
    """mzQuantML quantification datatype; the root element is ``MzQuantML``."""
    root = "MzQuantML"
    blurb = "XML quantification data"
    file_ext = "mzq"
|
|
156
|
|
157
|
|
class ConsensusXML(ProteomicsXml):
    """OpenMS consensusXML datatype; the root element is ``consensusXML``."""
    root = "consensusXML"
    blurb = "OpenMS multiple LC-MS map alignment file"
    file_ext = "consensusxml"
|
|
162
|
|
163
|
|
class FeatureXML(ProteomicsXml):
    """OpenMS feature file datatype; the root element is ``featureMap``."""
    root = "featureMap"
    blurb = "OpenMS feature file"
    file_ext = "featurexml"
|
|
168
|
|
169
|
|
class IdXML(ProteomicsXml):
    """OpenMS identification file datatype; the root element is ``IdXML``."""
    root = "IdXML"
    blurb = "OpenMS identification file"
    file_ext = "idxml"
|
|
174
|
|
175
|
|
class Mgf(Text):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
        dataset.blurb = 'mgf Mascot Generic Format'

    def sniff(self, filename):
        """Return True when a ``BEGIN IONS`` line appears near the top of the file.

        Scanning stops with False once the line index exceeds ``max_lines``;
        a file that ends before that without a match also yields False.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() inside a with-block: the handle is closed deterministically,
        # and unlike the file() builtin it exists on Python 3 as well.
        with open(filename, 'r') as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mgf_begin_ions:
                    return True
                if i > max_lines:
                    return False
        # Explicit result for short files instead of an implicit None.
        return False
|
|
199
|
|
200
|
|
class MascotDat(Text):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
        dataset.blurb = 'mascotdat Mascot Search Results'

    def sniff(self, filename):
        """Return True when the Mascot MIME-Version line appears near the top of the file.

        Scanning stops with False once the line index exceeds ``max_lines``;
        a file that ends before that without a match also yields False.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() inside a with-block: the handle is closed deterministically,
        # and unlike the file() builtin it exists on Python 3 as well.
        with open(filename, 'r') as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mime_version:
                    return True
                if i > max_lines:
                    return False
        # Explicit result for short files instead of an implicit None.
        return False
|
|
225
|
|
226
|
|
class RAW(Binary):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff(self, filename):
        """Return True when the file header carries the Finnigan signature.

        Any failure while reading the file results in False rather than an
        exception.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode: no newline translation or text decoding of the header.
            with open(filename, 'rb') as handle:
                header = handle.read(20)
        except Exception:
            # Sniffing is best-effort; an unreadable file is simply "not RAW".
            return False
        # Direct byte search; comparing hex dumps could match at a
        # half-byte (nibble) offset and report a false positive.
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek(self, dataset, is_multi_byte=False):
        """Set a fixed peek text and use the dataset size as the blurb."""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = "Thermo Finnigan RAW file"
        dataset.blurb = data.nice_size(dataset.get_size())

    def display_peek(self, dataset):
        """Return the stored peek, or a generated description if it cannot be read."""
        try:
            return dataset.peek
        except Exception:
            return "Thermo Finnigan RAW file (%s)" % (data.nice_size(dataset.get_size()))
|
|
256
|
|
257
|
|
# Register the RAW sniffer under the 'raw' format, but only when the Binary
# base class actually exposes the registration hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('raw', 'raw', RAW)
|
|
260
|
|
261
|
|
class Msp(Text):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Read one line from ``contents`` and report whether it begins with ``prefix``."""
        candidate = contents.readline()
        if candidate is None:
            return False
        return candidate.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            # The second line is only read when the first one matched.
            if not Msp.next_line_starts_with(contents, "Name:"):
                return False
            return Msp.next_line_starts_with(contents, "MW:")
|
|
283
|
|
class Ms2(Text):
    """Text datatype for ms2 files, recognised by their ``H`` header lines."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        Collects the leading run of lines starting with ``H<TAB>`` and
        requires a header line for each of CreationDate, Extractor,
        ExtractorVersion and ExtractorOptions.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        required_fields = ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the file stops at EOF. The previous readline() loop
            # never terminated on empty or header-only files, because an
            # empty read fell into a branch that just kept looping.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        for header_field in required_fields:
            wanted = 'H\t%s' % (header_field)
            if not any(header_line.startswith(wanted) for header_line in header_lines):
                return False
        return True
|
|
318
|
|
319 # unsniffable binary format, should do something about this
|
|
class XHunterAslFormat(Binary):
    """Annotated spectra in the HLF format.

    Format description: http://www.thegpm.org/HUNTER/format_2006_09_15.html
    """
    file_ext = "hlf"
|
|
323
|
|
# Register 'hlf' as an unsniffable binary extension, but only when the
# Binary base class actually exposes the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('hlf')
|