0
|
1 """
|
|
2 Proteomics format classes
|
|
3 """
|
|
4 import logging
|
|
5 import re
|
5
|
6 import binascii
|
|
7
|
0
|
8 from galaxy.datatypes.sniff import *
|
5
|
9 from galaxy.datatypes.data import Text
|
|
10 from galaxy.datatypes.xml import GenericXml
|
|
11 from galaxy.datatypes.binary import Binary
|
|
12 from galaxy.datatypes.tabular import Tabular
|
|
13 from galaxy.datatypes.interval import Gff
|
0
|
14
|
|
15 log = logging.getLogger(__name__)
|
|
16
|
4
|
class ProtGff( Gff ):
    """Tab delimited data in Gff format"""
    file_ext = "prot_gff"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text on ``dataset``.

        A purged dataset gets fixed placeholder texts; otherwise the peek is
        taken from the head of the file and the blurb is a fixed label.
        """
        # NOTE(review): ``data`` is not imported by name in the visible import
        # block -- confirm it is supplied by the star import from
        # galaxy.datatypes.sniff.
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'Proteogenomics GFF'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if one of the first three lines starts with ``##gff-version``.

        Leading and trailing whitespace is stripped from each line before
        matching.
        """
        gff_version_re = re.compile("^##gff-version")
        # The context manager closes the handle on every exit path, including
        # when reading raises.
        with open(filename) as handle:
            for _ in range(3):
                if gff_version_re.match(handle.readline().strip()):
                    return True
        return False
|
|
40
|
0
|
41
|
|
class Xls( Binary ):
    """Class describing a binary excel spreadsheet file"""
    file_ext = "xls"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set a fixed peek text and a size-based blurb on ``dataset``.

        ``is_multi_byte`` is accepted for signature compatibility and is not
        used here.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Excel Spreadsheet file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, or a generic size description if it cannot be read."""
        try:
            return dataset.peek
        except Exception:
            # Best-effort fallback; unlike a bare ``except`` this lets
            # SystemExit / KeyboardInterrupt propagate.
            return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) )
|
|
58
|
5
|
class IdpDB( Binary ):
    """Binary datatype registered under the ``idpDB`` extension."""
    file_ext = "idpDB"


# Only register when the Binary base class offers the registration hook.
if hasattr( Binary, 'register_unsniffable_binary_ext' ):
    Binary.register_unsniffable_binary_ext( IdpDB.file_ext )
|
|
64
|
|
65
|
|
class PepXmlReport( Tabular ):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the tabular base and set the fixed report column names."""
        Tabular.__init__( self, **kwd )
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            'Interprophet Probabaility',
        ]

    def display_peek( self, dataset ):
        """Return the formatted HTML peek table, headed by the report column names."""
        headers = self.column_names
        return Tabular.make_html_table( self, dataset, column_names=headers )
|
|
77
|
|
78
|
|
class ProtXmlReport( Tabular ):
    """protxml converted to tabular report"""
    file_ext = "tsv"
    comment_lines = 1

    def __init__(self, **kwd):
        """Initialise the tabular base and set the fixed report column names."""
        Tabular.__init__( self, **kwd )
        self.column_names = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]

    def display_peek( self, dataset ):
        """Return the formatted HTML peek table, headed by the report column names."""
        headers = self.column_names
        return Tabular.make_html_table( self, dataset, column_names=headers )
|
|
91
|
|
class ProteomicsXml( GenericXml ):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    ``sniff`` reads ``self.root`` (a regex fragment naming the root element)
    and ``set_peek`` reads ``self.blurb``; neither is defined on this class,
    so subclasses are expected to provide both.
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Leading lines that start with ``<?`` (XML declaration / processing
        instructions) are skipped; the first other line must begin with
        ``<root`` or ``<ns:root``. A root tag sharing a line with the
        declaration is therefore not recognised.
        """
        with open(filename, 'r') as contents:
            line = contents.readline()
            # readline() returns '' at EOF, which also ends this loop.
            while line.startswith('<?'):
                line = contents.readline()
        # Raw string so that \w reaches the regex engine untouched.
        pattern = r'^<(\w*:)?%s' % self.root  # pattern match <root or <ns:root for any ns string
        return re.match(pattern, line) is not None

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
|
|
114
|
5
|
115
|
0
|
class PepXml( ProteomicsXml ):
    """pepXML data, recognised by its ``msms_pipeline_analysis`` root element."""
    # Root element name used by the inherited sniffer.
    root = 'msms_pipeline_analysis'
    file_ext = 'pepxml'
    blurb = "pepXML data"
|
5
|
121
|
0
|
122
|
|
class MzML( ProteomicsXml ):
    """mzML data; the root element may be ``mzML`` or ``indexedmzML``."""
    # Regex alternation interpolated into the inherited sniffer's pattern.
    root = '(mzML|indexedmzML)'
    file_ext = 'mzml'
    blurb = "mzML Mass Spectrometry data"
|
|
128
|
|
129
|
|
class ProtXML( ProteomicsXml ):
    """protXML data, recognised by its ``protein_summary`` root element."""
    # Root element name used by the inherited sniffer.
    root = 'protein_summary'
    file_ext = 'protxml'
    blurb = "prot XML Search Results"
|
|
135
|
|
136
|
|
class MzXML( ProteomicsXml ):
    """mzXML data, recognised by its ``mzXML`` root element."""
    # Root element name used by the inherited sniffer.
    root = 'mzXML'
    file_ext = 'mzxml'
    blurb = 'mzXML Mass Spectrometry data'
|
|
142
|
|
143 ## PSI datatypes
|
|
class MzIdentML( ProteomicsXml ):
    """XML identified peptides and proteins, with a ``MzIdentML`` root element."""
    # Root element name used by the inherited sniffer.
    root = 'MzIdentML'
    file_ext = 'mzid'
    blurb = 'XML identified peptides and proteins.'
|
5
|
148
|
0
|
149
|
|
class TraML( ProteomicsXml ):
    """TraML transition list, recognised by its ``TraML`` root element."""
    # Root element name used by the inherited sniffer.
    root = 'TraML'
    file_ext = 'traml'
    blurb = 'TraML transition list'
|
|
154
|
|
155
|
|
class MzQuantML( ProteomicsXml ):
    """XML quantification data, recognised by its ``MzQuantML`` root element."""
    # Root element name used by the inherited sniffer.
    root = 'MzQuantML'
    file_ext = 'mzq'
    blurb = 'XML quantification data'
|
|
160
|
5
|
161
|
|
class ConsensusXML( ProteomicsXml ):
    """OpenMS multiple LC-MS map alignment file (``consensusXML`` root element)."""
    # Root element name used by the inherited sniffer.
    root = 'consensusXML'
    file_ext = 'consensusxml'
    blurb = 'OpenMS multiple LC-MS map alignment file'
|
|
166
|
|
167
|
|
class FeatureXML( ProteomicsXml ):
    """OpenMS feature file, recognised by its ``featureMap`` root element."""
    # Root element name used by the inherited sniffer.
    root = 'featureMap'
    file_ext = 'featurexml'
    blurb = 'OpenMS feature file'
|
|
172
|
|
173
|
|
class IdXML( ProteomicsXml ):
    """OpenMS identification file, recognised by its ``IdXML`` root element."""
    # Root element name used by the inherited sniffer.
    root = 'IdXML'
    file_ext = 'idxml'
    blurb = 'OpenMS identification file'
|
|
178
|
|
179
|
0
|
class Mgf( Text ):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if a line near the top of the file is exactly ``BEGIN IONS``.

        Only line endings are stripped before comparing, so surrounding
        spaces make a line not match. Scanning stops once the line index
        exceeds ``max_lines``.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() replaces the Python-2-only file() builtin, and the context
        # manager closes the handle on every exit path.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mgf_begin_ions:
                    return True
                if i > max_lines:
                    return False
        # File ended before the limit without a marker line.
        return False
|
5
|
203
|
|
204
|
0
|
class MascotDat( Text ):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if a line near the top of the file is the Mascot MIME header.

        The line must equal the expected header exactly once line endings are
        stripped. Scanning stops once the line index exceeds ``max_lines``.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() replaces the Python-2-only file() builtin, and the context
        # manager closes the handle on every exit path.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mime_version:
                    return True
                if i > max_lines:
                    return False
        # File ended before the limit without the header line.
        return False
|
|
229
|
|
230
|
|
class RAW( Binary ):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff( self, filename ):
        """Return True if the file header contains the ``Finnigan`` signature.

        Returns False when the file cannot be opened or read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        finnigan = b'F\0i\0n\0n\0i\0g\0a\0n'
        try:
            # Binary mode: the header is raw bytes, not text.
            with open( filename, 'rb' ) as handle:
                header = handle.read(20)
        except (IOError, OSError):
            return False
        # Comparing bytes directly avoids hex-string matches that start in the
        # middle of a byte.
        return finnigan in header

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set a fixed peek text and a size-based blurb on ``dataset``.

        ``is_multi_byte`` is accepted for signature compatibility and is not
        used here.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, or a generic size description if it cannot be read."""
        try:
            return dataset.peek
        except Exception:
            # Best-effort fallback; unlike a bare ``except`` this lets
            # SystemExit / KeyboardInterrupt propagate.
            return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )


# Only register when the Binary base class offers the registration hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('raw', 'raw', RAW)
|
0
|
264
|
|
265
|
5
|
class Msp( Text ):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Read one line from ``contents`` and report whether it begins with ``prefix``."""
        candidate = contents.readline()
        if candidate is None:
            return False
        return candidate.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            # The second line is only read when the first one matched.
            if not Msp.next_line_starts_with(contents, "Name:"):
                return False
            return Msp.next_line_starts_with(contents, "MW:")
|
|
287
|
|
class Ms2(Text):
    """Text datatype registered under the ``ms2`` extension."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        The leading run of lines starting with ``H<TAB>`` is collected as the
        header; the file is accepted only if each of the four required
        header fields appears at the start of at least one of those lines.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the file stops at EOF, so an empty or header-only
            # file can no longer spin forever on repeated '' reads.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']:
            prefix = 'H\t%s' % (header_field)
            if not any(header_line.startswith(prefix) for header_line in header_lines):
                return False

        return True
|
|
322
|
|
323 # unsniffable binary format, should do something about this
|
5
|
class XHunterAslFormat( Binary ):
    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """
    file_ext = "hlf"


# Only register when the Binary base class offers the registration hook.
if hasattr( Binary, 'register_unsniffable_binary_ext' ):
    Binary.register_unsniffable_binary_ext( XHunterAslFormat.file_ext )
|