0
|
1 """
|
|
2 Proteomics format classes
|
|
3 """
|
|
4 import logging
|
|
5 import re
|
|
6 from galaxy.datatypes.data import *
|
|
7 from galaxy.datatypes.xml import *
|
|
8 from galaxy.datatypes.sniff import *
|
|
9 from galaxy.datatypes.binary import *
|
4
|
10 from galaxy.datatypes.interval import *
|
0
|
11
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
|
13
|
4
|
class ProtGff( Gff ):
    """Tab delimited data in Gff format"""
    file_ext = "prot_gff"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text on ``dataset``.

        When the underlying dataset has not been purged, the peek is taken
        from the start of the file; otherwise placeholder text is stored.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'Proteogenomics GFF'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if one of the first three lines starts with ``##gff-version``.

        Each line is stripped of surrounding whitespace before matching.
        """
        gff_version_re = re.compile("^##gff-version")
        # The context manager guarantees the handle is closed even if a read fails.
        with open(filename) as handle:
            for _ in range(3):
                if gff_version_re.match(handle.readline().strip()):
                    return True
        return False
|
|
37
|
0
|
38
|
|
class Xls( Binary ):
    """Class describing a binary excel spreadsheet file"""
    file_ext = "xls"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set a fixed peek text and a blurb holding the human-readable file size.

        Placeholder text is stored instead when the dataset has been purged.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Excel Spreadsheet file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, falling back to a generic size description."""
        try:
            return dataset.peek
        except Exception:
            # Best-effort fallback; unlike a bare ``except`` this lets
            # KeyboardInterrupt / SystemExit propagate.
            return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) )
|
|
55
|
|
class ProteomicsXml(GenericXml):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    Subclasses provide ``root`` (a regular-expression fragment naming the
    expected root element) and ``blurb`` (the text stored by ``set_peek``).
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Leading lines that start with ``<?`` (XML declaration / processing
        instructions) are skipped; the first other line must begin with
        ``<root`` or ``<ns:root``, where ``root`` is ``self.root``.
        An empty file, or one containing only ``<?`` lines, does not match.
        """
        with open(filename, 'r') as contents:
            line = contents.readline()
            # readline() returns '' at EOF, which also ends this loop.
            while line.startswith('<?'):
                line = contents.readline()
        # Raw string keeps ``\w`` a regex escape instead of an invalid string escape.
        pattern = r'^<(\w*:)?%s' % self.root  # pattern match <root or <ns:root for any ns string
        return re.match(pattern, line) is not None

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text on ``dataset``.

        The blurb is the class-level ``blurb``; placeholder text is stored
        instead when the dataset has been purged.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
|
|
78
|
|
class PepXml(ProteomicsXml):
    """pepXML datatype; the inherited sniffer looks for an
    ``msms_pipeline_analysis`` root element."""
    root = "msms_pipeline_analysis"
    blurb = 'pepXML data'
    file_ext = "pepxml"
|
|
84
|
|
85
|
|
class MzML(ProteomicsXml):
    """mzML datatype; the inherited sniffer accepts either an ``mzML`` or an
    ``indexedmzML`` root element (``root`` is a regex alternation)."""
    root = "(mzML|indexedmzML)"
    blurb = 'mzML Mass Spectrometry data'
    file_ext = "mzml"
|
|
91
|
|
92
|
|
class ProtXML(ProteomicsXml):
    """protXML datatype; the inherited sniffer looks for a
    ``protein_summary`` root element."""
    root = "protein_summary"
    blurb = 'prot XML Search Results'
    file_ext = "protxml"
|
|
98
|
|
99
|
|
class MzXML(ProteomicsXml):
    """mzXML datatype; the inherited sniffer looks for an ``mzXML`` root element."""
    root = "mzXML"
    blurb = "mzXML Mass Spectrometry data"
    file_ext = "mzXML"
|
|
105
|
|
106 ## PSI datatypes
|
|
class MzIdentML(ProteomicsXml):
    """mzIdentML datatype; the inherited sniffer looks for a ``MzIdentML`` root element."""
    root = "MzIdentML"
    blurb = "XML identified peptides and proteins."
    file_ext = "mzid"
|
|
111
|
|
112
|
|
class TraML(ProteomicsXml):
    """TraML datatype; the inherited sniffer looks for a ``TraML`` root element."""
    root = "TraML"
    blurb = "TraML transition list"
    file_ext = "traML"
|
|
117
|
|
118
|
|
class MzQuantML(ProteomicsXml):
    """mzQuantML datatype; the inherited sniffer looks for a ``MzQuantML`` root element."""
    root = "MzQuantML"
    blurb = "XML quantification data"
    file_ext = "mzq"
|
|
123
|
|
124
|
|
class Mgf( Text ):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text on ``dataset``.

        Placeholder text is stored instead when the dataset has been purged.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if a line near the top of the file is exactly ``BEGIN IONS``.

        Scanning stops with False once the zero-based line index exceeds
        ``max_lines``; a file that ends before that without a match also
        yields False.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # ``open`` replaces the Python 2-only ``file`` builtin; the context
        # manager closes the handle on every exit path.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mgf_begin_ions:
                    return True
                if i > max_lines:
                    return False
        return False
|
|
149
|
|
150
|
|
class MascotDat( Text ):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text on ``dataset``.

        Placeholder text is stored instead when the dataset has been purged.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if a line near the top of the file is exactly the Mascot MIME header.

        Scanning stops with False once the zero-based line index exceeds
        ``max_lines``; a file that ends before that without a match also
        yields False.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # ``open`` replaces the Python 2-only ``file`` builtin; the context
        # manager closes the handle on every exit path.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mime_version:
                    return True
                if i > max_lines:
                    return False
        return False
|
|
175
|
|
176
|
|
class RAW( Binary ):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff( self, filename ):
        """Return True if the first 20 bytes contain the UTF-16-style ``Finnigan`` marker.

        Any error while reading the file is treated as "not a RAW file".
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode avoids newline translation / text decoding of the header.
            with open( filename, 'rb' ) as handle:
                header = handle.read(20)
        except Exception:
            return False
        # Byte-level containment; a hex-string search could match on a
        # misaligned nibble boundary.
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set a fixed peek text and a blurb holding the human-readable file size.

        Placeholder text is stored instead when the dataset has been purged.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return the stored peek, falling back to a generic size description."""
        try:
            return dataset.peek
        except Exception:
            # Best-effort fallback; unlike a bare ``except`` this lets
            # KeyboardInterrupt / SystemExit propagate.
            return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )
|
|
206
|
|
207
|
|
# Register RAW as a sniffable binary format, but only when Binary offers this
# hook, so the module still imports where the hook is absent.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('RAW', 'RAW', RAW)
|
|
210
|
|
211
|
|
class Msp(Text):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Consume one line from ``contents`` and report whether it begins with ``prefix``."""
        line = contents.readline()
        if line is None:
            return False
        return line.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            # The second line is only read when the first one matched.
            if not Msp.next_line_starts_with(contents, "Name:"):
                return False
            return Msp.next_line_starts_with(contents, "MW:")
|
|
233
|
|
class Ms2(Text):
    """MS2 spectra file, recognised by its leading ``H<TAB>`` header lines."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        The leading run of lines starting with ``H<TAB>`` is collected, and
        every required header field must appear as the prefix of at least
        one of them. An empty file, or one without such header lines, is
        rejected.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """

        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the file stops at EOF, so a file consisting only of
            # header lines (or an empty file) can no longer loop forever.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']:
            # Prefix match: e.g. an ``ExtractorVersion`` line also satisfies ``Extractor``.
            prefix = 'H\t%s' % (header_field)
            if not any(header_line.startswith(prefix) for header_line in header_lines):
                return False

        return True
|
|
268
|
|
269 # unsniffable binary format, should do something about this
|
|
class XHunterAslFormat(Binary):
    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html

    No sniffer is defined here; the datatype is identified by its ``hlf`` extension.
    """
    file_ext = "hlf"
|
|
273
|
|
274
|
|
# Register the 'hlf' extension as an unsniffable binary format, but only when
# Binary offers this hook, so the module still imports where it is absent.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('hlf')
|