annotate proteomics.py @ 5:df8b867ab71a draft

Uploaded
author bgruening
date Fri, 07 Feb 2014 09:21:23 -0500
parents 09b89b345de2
children b82d4034e0f8
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
1 """
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
2 Proteomics format classes
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
3 """
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
4 import logging
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
5 import re
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
6 import binascii
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
7
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
8 from galaxy.datatypes.sniff import *
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
9 from galaxy.datatypes.data import Text
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
10 from galaxy.datatypes.xml import GenericXml
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
11 from galaxy.datatypes.binary import Binary
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
12 from galaxy.datatypes.tabular import Tabular
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
13 from galaxy.datatypes.interval import Gff
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
14
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
15 log = logging.getLogger(__name__)
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
16
4
Ira Cooke <iracooke@gmail.com>
parents: 0
diff changeset
class ProtGff(Gff):
    """Tab delimited data in Gff format"""
    file_ext = "prot_gff"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text on ``dataset``.

        For a dataset that has not been purged the peek is taken from the
        file itself; otherwise fixed placeholder strings are stored.
        """
        if not dataset.dataset.purged:
            # NOTE(review): ``data`` is not bound by an explicit import in the
            # visible import block; presumably it is supplied by the star
            # import from galaxy.datatypes.sniff -- confirm.
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'Proteogenomics GFF'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Return True if one of the first three lines starts with ``##gff-version``.

        Each line is stripped of surrounding whitespace before the comparison.
        The file is opened in a ``with`` block so the handle is closed even
        when reading raises.
        """
        with open(filename) as handle:
            for _ in range(3):
                if handle.readline().strip().startswith("##gff-version"):
                    return True
        return False
Ira Cooke <iracooke@gmail.com>
parents: 0
diff changeset
40
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
41
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class Xls(Binary):
    """Class describing a binary excel spreadsheet file"""
    file_ext = "xls"

    def set_peek(self, dataset, is_multi_byte=False):
        """Store a fixed peek string and a size-based blurb on ``dataset``.

        ``is_multi_byte`` is accepted for signature compatibility and is not
        used here.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Excel Spreadsheet file"
            # NOTE(review): ``data`` is not bound by an explicit import in the
            # visible import block; presumably it comes from the star import
            # of galaxy.datatypes.sniff -- confirm.
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, or a size-based description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            # ``except Exception`` instead of a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return "Binary xls file (%s)" % (data.nice_size(dataset.get_size()))
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
58
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class IdpDB( Binary ):
    # Binary datatype identified only by its "idpDB" file extension; no
    # sniffer is defined on this class.
    # NOTE(review): presumably an IDPicker database file -- confirm.
    file_ext = "idpDB"
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
61
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
# Register the "idpDB" extension as an unsniffable binary format, but only
# when the Binary class in use actually provides the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('idpDB')
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
64
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
65
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class PepXmlReport(Tabular):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the Tabular base and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            # Header previously read 'Interprophet Probabaility' (typo).
            'Interprophet Probability',
        ]

    def display_peek(self, dataset):
        """Return the peek as built by ``Tabular.make_html_table`` with these column names."""
        return Tabular.make_html_table(self, dataset, column_names=self.column_names)
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
77
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
78
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class ProtXmlReport(Tabular):
    """protxml converted to tabular report"""
    comment_lines = 1
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the Tabular base, then attach the report's column headers."""
        Tabular.__init__(self, **kwd)
        headers = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]
        self.column_names = headers

    def display_peek(self, dataset):
        """Return the peek as built by ``Tabular.make_html_table`` with these column names."""
        names = self.column_names
        return Tabular.make_html_table(self, dataset, column_names=names)
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
91
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class ProteomicsXml(GenericXml):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    This class reads ``self.root`` (in ``sniff``) and ``self.blurb`` (in
    ``set_peek``) but does not define either attribute itself; the subclasses
    in this module supply them as class attributes.
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Leading lines that start with ``<?`` are skipped. The first other
        line must begin with ``<root`` or ``<ns:root``, where ``root`` is
        ``self.root`` interpolated into the regular expression. An empty file,
        or one containing only ``<?`` lines, yields False.
        """
        with open(filename, 'r') as contents:
            # readline() returns '' (never None) at end of file, and ''
            # does not start with '<?', so this loop always terminates.
            line = contents.readline()
            while line.startswith('<?'):
                line = contents.readline()
        # Raw string so that \w reaches the regex engine unchanged.
        pattern = r'^<(\w*:)?%s' % self.root  # pattern match <root or <ns:root for any ns string
        return re.match(pattern, line) is not None

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text on ``dataset``.

        Uses ``self.blurb`` as the blurb unless the dataset has been purged,
        in which case fixed placeholder strings are stored.
        """
        if not dataset.dataset.purged:
            # NOTE(review): ``data`` is not bound by an explicit import in the
            # visible import block; presumably it comes from the star import
            # of galaxy.datatypes.sniff -- confirm.
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
114
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
115
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class PepXml(ProteomicsXml):
    """pepXML data"""
    # Root element name; ProteomicsXml.sniff interpolates it into its regex.
    root = "msms_pipeline_analysis"
    blurb = "pepXML data"
    file_ext = "pepxml"
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
121
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
122
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class MzML(ProteomicsXml):
    """mzML data"""
    # Regex alternation: ProteomicsXml.sniff interpolates this into its
    # pattern, so either root element name is accepted.
    root = "(mzML|indexedmzML)"
    blurb = "mzML Mass Spectrometry data"
    file_ext = "mzml"
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
128
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
129
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class ProtXML(ProteomicsXml):
    """protXML data"""
    # Root element name; ProteomicsXml.sniff interpolates it into its regex.
    root = "protein_summary"
    blurb = "prot XML Search Results"
    file_ext = "protxml"
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
135
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
136
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class MzXML(ProteomicsXml):
    """mzXML data"""
    # Root element name; ProteomicsXml.sniff interpolates it into its regex.
    root = 'mzXML'
    blurb = 'mzXML Mass Spectrometry data'
    file_ext = 'mzxml'
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
142
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
143 ## PSI datatypes
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class MzIdentML(ProteomicsXml):
    """XML datatype with root element ``MzIdentML`` and extension ``mzid``."""
    root = 'MzIdentML'
    blurb = 'XML identified peptides and proteins.'
    file_ext = 'mzid'
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
148
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
149
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class TraML(ProteomicsXml):
    """XML datatype with root element ``TraML`` and extension ``traml``."""
    root = 'TraML'
    blurb = 'TraML transition list'
    file_ext = 'traml'
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
154
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
155
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class MzQuantML(ProteomicsXml):
    """XML datatype with root element ``MzQuantML`` and extension ``mzq``."""
    root = 'MzQuantML'
    blurb = 'XML quantification data'
    file_ext = 'mzq'
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
160
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
161
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class ConsensusXML(ProteomicsXml):
    """XML datatype with root element ``consensusXML`` and extension ``consensusxml``."""
    root = 'consensusXML'
    blurb = 'OpenMS multiple LC-MS map alignment file'
    file_ext = 'consensusxml'
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
166
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
167
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class FeatureXML(ProteomicsXml):
    """XML datatype with root element ``featureMap`` and extension ``featurexml``."""
    root = 'featureMap'
    blurb = 'OpenMS feature file'
    file_ext = 'featurexml'
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
172
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
173
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class IdXML(ProteomicsXml):
    """XML datatype with root element ``IdXML`` and extension ``idxml``."""
    root = 'IdXML'
    blurb = 'OpenMS identification file'
    file_ext = 'idxml'
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
178
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
179
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class Mgf(Text):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text on ``dataset``."""
        if not dataset.dataset.purged:
            # NOTE(review): ``data`` is not bound by an explicit import in the
            # visible import block; presumably it comes from the star import
            # of galaxy.datatypes.sniff -- confirm.
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Return True if a line near the top of the file is exactly ``BEGIN IONS``.

        Trailing newline / carriage-return characters are removed before the
        comparison. Scanning stops after the line whose zero-based index
        exceeds ``max_lines``. Always returns a bool, including for files
        shorter than the scan window.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() in a with-block replaces the Python-2-only file() builtin
        # and guarantees the handle is closed.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mgf_begin_ions:
                    return True
                if i > max_lines:
                    break
        return False
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
203
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
204
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class MascotDat(Text):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text on ``dataset``."""
        if not dataset.dataset.purged:
            # NOTE(review): ``data`` is not bound by an explicit import in the
            # visible import block; presumably it comes from the star import
            # of galaxy.datatypes.sniff -- confirm.
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Return True if a line near the top of the file is the Mascot MIME-Version header.

        The comparison is an exact match against the ``mime_version`` string
        below, after removing trailing newline / carriage-return characters.
        Scanning stops after the line whose zero-based index exceeds
        ``max_lines``. Always returns a bool, including for files shorter
        than the scan window.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() in a with-block replaces the Python-2-only file() builtin
        # and guarantees the handle is closed.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mime_version:
                    return True
                if i > max_lines:
                    break
        return False
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
229
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
230
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class RAW(Binary):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff(self, filename):
        """Return True if the first 20 bytes contain the UTF-16-style ``Finnigan`` marker.

        Returns False when the file cannot be opened or read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode: the header is raw bytes, not text.
            with open(filename, 'rb') as handle:
                header = handle.read(20)
        except (IOError, OSError):
            return False
        # Byte-wise search; unlike searching hex strings this cannot match
        # on a half-byte (nibble) boundary.
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek(self, dataset, is_multi_byte=False):
        """Store a fixed peek string and a size-based blurb on ``dataset``.

        ``is_multi_byte`` is accepted for signature compatibility and is not
        used here.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            # NOTE(review): ``data`` is not bound by an explicit import in the
            # visible import block; presumably it comes from the star import
            # of galaxy.datatypes.sniff -- confirm.
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, or a size-based description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            # ``except Exception`` instead of a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return "Thermo Finnigan RAW file (%s)" % (data.nice_size(dataset.get_size()))
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
260
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
261
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
# Register RAW as a sniffable binary format, but only when the Binary class
# in use actually provides the registration hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('raw', 'raw', RAW)
0
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
264
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
265
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class Msp(Text):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Read one line from ``contents`` and report whether it starts with ``prefix``.

        Consumes exactly one ``readline()`` call on ``contents``.
        """
        next_line = contents.readline()
        # Identity comparison is the correct test for None; a regular file's
        # readline() returns '' at EOF, which simply fails startswith().
        return next_line is not None and next_line.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            return Msp.next_line_starts_with(contents, "Name:") and Msp.next_line_starts_with(contents, "MW:")
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
287
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
class Ms2(Text):
    """Text datatype with extension ``ms2``, recognised by its ``H<TAB>`` header lines."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        Collects the leading run of lines that start with ``H<TAB>`` and
        requires each of the header fields listed below to prefix at least
        one of them. Because the test is a prefix match, an
        ``H<TAB>ExtractorVersion`` line also satisfies the ``Extractor`` field.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the file ends at EOF. The previous readline() loop
            # never terminated for an empty file or one consisting solely of
            # header lines, because '' was ignored instead of ending the scan.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        required_fields = ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']
        for header_field in required_fields:
            prefix = 'H\t%s' % (header_field)
            if not any(header_line.startswith(prefix) for header_line in header_lines):
                return False
        return True
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
322
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
323 # unsniffable binary format, should do something about this
5
df8b867ab71a Uploaded
bgruening
parents: 4
diff changeset
class XHunterAslFormat( Binary ):
    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html

    Identified only by its ``hlf`` file extension; no sniffer is defined on
    this class.
    """
    file_ext = "hlf"
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
327
c10a62c886b8 Uploaded
iracooke
parents:
diff changeset
# Register the "hlf" extension as an unsniffable binary format, but only
# when the Binary class in use actually provides the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('hlf')