comparison proteomics.py @ 9:6ca516faacfc draft

Uploaded
author iracooke
date Thu, 05 Jun 2014 18:08:35 -0400
parents
children ef74edade8be
comparison
equal deleted inserted replaced
8:58469754fd82 9:6ca516faacfc
1 """
2 Proteomics format classes
3 """
4 import logging
5 import re
6 import binascii
7
8 from galaxy.datatypes.sniff import *
9 from galaxy.datatypes import data
10 from galaxy.datatypes.data import Text
11 from galaxy.datatypes.xml import GenericXml
12 from galaxy.datatypes.binary import Binary
13 from galaxy.datatypes.tabular import Tabular
14 from galaxy.datatypes.interval import Gff
15
16 log = logging.getLogger(__name__)
17
18
class Wiff( Binary ):
    """Class for wiff files.

    A composite datatype made of a required ``wiff`` file and a
    ``wiff_scan`` file that is flagged optional.
    """
    file_ext = 'wiff'
    allow_datatype_change = False
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        """Initialise the datatype and declare its two composite files."""
        Binary.__init__(self, **kwd)
        self.add_composite_file(
            'wiff',
            description='AB SCIEX files in .wiff format. This can contain all needed information or only metadata.',
            is_binary=True)
        # NOTE(review): ``optional`` is passed as the string 'True', not a bool.
        # It is truthy, so the check in generate_primary_file() still works;
        # confirm how add_composite_file interprets it before changing it.
        self.add_composite_file(
            'wiff_scan',
            description='AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.',
            optional='True', is_binary=True)

    def generate_primary_file(self, dataset=None):
        """Build the HTML index page listing the composite files of ``dataset``.

        Every composite file becomes a list item that links to the file by
        name, shows its description when it has one, and is marked
        ``(optional)`` when it is flagged optional.

        Returns the page as one newline-joined string.
        """
        rval = ['<html><head><title>Wiff Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # items() instead of the Python 2-only iteritems(): it exists on both
        # Python 2 and Python 3 mappings and yields the same pairs.
        for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get('description')
            if description:
                rval.append('<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % (composite_name, composite_name, description, opt_text))
            else:
                rval.append('<li><a href="%s" type="text/plain">%s</a>%s</li>' % (composite_name, composite_name, opt_text))
        rval.append('</ul></div></html>')
        return "\n".join(rval)
48
49
50
# Register 'wiff' as an unsniffable binary extension, but only when the
# Binary class actually exposes the registration hook (presumably it is
# missing on some Galaxy versions -- confirm).
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('wiff')
53
54
class IdpDB( Binary ):
    """Binary datatype stored under the ``idpDB`` file extension."""
    # NOTE(review): presumably an IDPicker database -- confirm.
    file_ext = 'idpDB'
57
# Register 'idpDB' as an unsniffable binary extension, but only when the
# Binary class actually exposes the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('idpDB')
60
61
class PepXmlReport( Tabular ):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the fixed column headers."""
        Tabular.__init__(self, **kwd)
        # The last header used to read 'Interprophet Probabaility'; the
        # misspelling was shown to users in the peek table and is fixed here.
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            'Interprophet Probability',
        ]

    def display_peek(self, dataset):
        """Return the formatted HTML peek table, labelled with ``column_names``."""
        return Tabular.make_html_table(self, dataset, column_names=self.column_names)
73
74
class ProtXmlReport( Tabular ):
    """protxml converted to tabular report"""
    file_ext = "tsv"
    comment_lines = 1

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the fixed column headers."""
        Tabular.__init__(self, **kwd)
        headers = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]
        self.column_names = headers

    def display_peek(self, dataset):
        """Return the HTML peek table, labelled with ``column_names``."""
        names = self.column_names
        return Tabular.make_html_table(self, dataset, column_names=names)
87
class ProteomicsXml( GenericXml ):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    Subclasses set ``root`` to a regular-expression fragment naming the
    document's root element and ``blurb`` to the text shown as the dataset
    blurb.
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Leading lines that start with ``<?`` (the XML declaration and other
        processing instructions) are skipped. The first remaining line must
        open with ``<root`` or ``<ns:root`` for any namespace prefix ``ns``.

        Returns False when the class defines no ``root`` or when the file has
        no line other than ``<?`` lines.
        """
        root = getattr(self, 'root', None)
        if root is None:
            # The base class itself has no root element to look for.
            return False
        with open(filename, 'r') as contents:
            while True:
                line = contents.readline()
                # readline() returns '' at end of file, and '' does not start
                # with '<?', so the loop always terminates.
                if not line.startswith('<?'):
                    break
        # Raw string so that \w reaches the regex engine instead of being
        # treated as an (invalid) string escape.
        pattern = r'^<(\w*:)?%s' % root  # pattern match <root or <ns:root for any ns string
        return re.match(pattern, line) is not None

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
        dataset.blurb = self.blurb
110
111
class PepXml(ProteomicsXml):
    """pepXML data"""
    # Root element matched by the inherited ProteomicsXml.sniff().
    root = 'msms_pipeline_analysis'
    # Blurb assigned by the inherited ProteomicsXml.set_peek().
    blurb = "pepXML data"
    file_ext = 'pepxml'
117
118
class MzML(ProteomicsXml):
    """mzML data"""
    # Regex alternation: the inherited sniff() accepts either root element.
    root = '(mzML|indexedmzML)'
    blurb = "mzML Mass Spectrometry data"
    file_ext = 'mzml'
124
125
class ProtXML(ProteomicsXml):
    """protXML data"""
    # Root element matched by the inherited ProteomicsXml.sniff().
    root = 'protein_summary'
    blurb = "prot XML Search Results"
    file_ext = 'protxml'
131
132
class MzXML(ProteomicsXml):
    """mzXML data"""
    # Root element matched by the inherited ProteomicsXml.sniff().
    root = 'mzXML'
    blurb = 'mzXML Mass Spectrometry data'
    file_ext = 'mzxml'
138
139 ## PSI datatypes
class MzIdentML(ProteomicsXml):
    """XML datatype whose root element is ``MzIdentML`` (extension ``mzid``)."""
    # Root element matched by the inherited ProteomicsXml.sniff().
    root = 'MzIdentML'
    blurb = 'XML identified peptides and proteins.'
    file_ext = 'mzid'
144
145
class TraML(ProteomicsXml):
    """XML datatype whose root element is ``TraML`` (extension ``traml``)."""
    # Root element matched by the inherited ProteomicsXml.sniff().
    root = 'TraML'
    blurb = 'TraML transition list'
    file_ext = 'traml'
150
151
class MzQuantML(ProteomicsXml):
    """XML datatype whose root element is ``MzQuantML`` (extension ``mzq``)."""
    # Root element matched by the inherited ProteomicsXml.sniff().
    root = 'MzQuantML'
    blurb = 'XML quantification data'
    file_ext = 'mzq'
156
157
class ConsensusXML(ProteomicsXml):
    """XML datatype whose root element is ``consensusXML`` (extension ``consensusxml``)."""
    # Root element matched by the inherited ProteomicsXml.sniff().
    root = 'consensusXML'
    blurb = 'OpenMS multiple LC-MS map alignment file'
    file_ext = 'consensusxml'
162
163
class FeatureXML(ProteomicsXml):
    """XML datatype whose root element is ``featureMap`` (extension ``featurexml``)."""
    # Root element matched by the inherited ProteomicsXml.sniff().
    root = 'featureMap'
    blurb = 'OpenMS feature file'
    file_ext = 'featurexml'
168
169
class IdXML(ProteomicsXml):
    """XML datatype whose root element is ``IdXML`` (extension ``idxml``)."""
    # Root element matched by the inherited ProteomicsXml.sniff().
    root = 'IdXML'
    blurb = 'OpenMS identification file'
    file_ext = 'idxml'
174
175
class Mgf( Text ):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Return True if a ``BEGIN IONS`` line occurs near the top of the file.

        A line matches when it equals ``BEGIN IONS`` exactly once trailing
        CR/LF characters are stripped. Scanning stops shortly after
        ``max_lines`` lines. Always returns a bool, including for files that
        end before the limit without a match.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() in a with-block replaces the Python 2-only file() builtin and
        # guarantees the handle is closed on every exit path.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mgf_begin_ions:
                    return True
                # The bound is tested after the comparison, so the line at
                # index max_lines + 1 is still examined (as before).
                if i > max_lines:
                    break
        return False
199
200
class MascotDat( Text ):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Return True if the Mascot MIME-Version line occurs near the top of the file.

        A line matches when it equals the expected header exactly once
        trailing CR/LF characters are stripped. Scanning stops shortly after
        ``max_lines`` lines. Always returns a bool, including for files that
        end before the limit without a match.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() in a with-block replaces the Python 2-only file() builtin and
        # guarantees the handle is closed on every exit path.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mime_version:
                    return True
                # The bound is tested after the comparison, so the line at
                # index max_lines + 1 is still examined (as before).
                if i > max_lines:
                    break
        return False
225
226
class RAW( Binary ):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff(self, filename):
        """Return True if the file header carries the Finnigan signature.

        Returns False when the signature is absent or the file cannot be read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        finnigan = b'F\0i\0n\0n\0i\0g\0a\0n'
        try:
            # Binary mode: the header is raw bytes, not text to be decoded or
            # newline-translated.
            with open(filename, 'rb') as handle:
                header = handle.read(20)
        except (IOError, OSError):
            return False
        # Searching the bytes directly (rather than their hex dump) cannot
        # produce a match that straddles half a byte.
        return finnigan in header

    def set_peek(self, dataset, is_multi_byte=False):
        """Set a fixed peek text and use the dataset size as the blurb."""
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, or a generated description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            # Best-effort fallback, but no longer swallowing SystemExit or
            # KeyboardInterrupt as the former bare except did.
            return "Thermo Finnigan RAW file (%s)" % (data.nice_size(dataset.get_size()))
256
257
# Register RAW as a sniffable binary format under the name and extension
# 'raw', but only when the Binary class actually exposes the hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('raw', 'raw', RAW)
260
261
class Msp( Text ):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Read one line from ``contents`` and report whether it starts with ``prefix``."""
        candidate = contents.readline()
        if candidate is None:
            return False
        return candidate.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            # The second line is only read when the first one matched.
            if not Msp.next_line_starts_with(contents, "Name:"):
                return False
            return Msp.next_line_starts_with(contents, "MW:")
283
class Ms2(Text):
    """Text datatype stored under the ``ms2`` file extension."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        The leading run of lines starting with ``H<TAB>`` is collected as the
        header. The file is accepted when that header has a line for each of
        CreationDate, Extractor, ExtractorVersion and ExtractorOptions.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the handle stops at end of file. The former readline()
            # loop kept spinning on the '' returned at EOF, so an empty or
            # header-only file never returned.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        # NOTE(review): this is a prefix match, so an ``ExtractorVersion``
        # line also satisfies the ``Extractor`` requirement (unchanged).
        for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']:
            prefix = 'H\t%s' % (header_field)
            if not any(header_line.startswith(prefix) for header_line in header_lines):
                return False

        return True
318
319 # unsniffable binary format, should do something about this
class XHunterAslFormat( Binary ):
    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """
    # Extension under which this binary datatype is stored.
    file_ext = 'hlf'
323
# Register 'hlf' as an unsniffable binary extension, but only when the
# Binary class actually exposes the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('hlf')