comparison mqparam.py @ 0:256cc0e17454 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/maxquant commit ab4e4f1817080cbe8a031a82cb180610ff140847
author galaxyp
date Sat, 20 Jul 2019 04:53:23 -0400
parents
children 2d67fb758956
comparison
equal deleted inserted replaced
-1:000000000000 0:256cc0e17454
1 """
2 Create a project-specific MaxQuant parameter file.
3
4 TODO: check validity of parsed experimental design template
5 add support for parameter groups
6 add reporter ion MS2
7
8 Author: Damian Glaetzer <d.glaetzer@mailbox.org>
9 """
10
11 import ntpath
12 import os
13 import re
14 import xml.etree.ElementTree as ET
15 from itertools import zip_longest
16 from xml.dom import minidom
17
18
19 class MQParam:
20 """Represents a mqpar.xml and provides methods to modify
21 some of its parameters.
22 """
23
24 fasta_template = """<FastaFileInfo>
25 <fastaFilePath></fastaFilePath>
26 <identifierParseRule></identifierParseRule>
27 <descriptionParseRule></descriptionParseRule>
28 <taxonomyParseRule></taxonomyParseRule>
29 <variationParseRule></variationParseRule>
30 <modificationParseRule></modificationParseRule>
31 <taxonomyId></taxonomyId>
32 </FastaFileInfo>"""
33
34 def __init__(self, mqpar_out, mqpar_in, exp_design,
35 substitution_rx=r'[^\s\S]'): # no sub by default
36 """Initialize MQParam class. mqpar_in can either be a template
37 or a already suitable mqpar file.
38 >>> t = MQParam("test", './test-data/template.xml', None)
39 >>> t.root.tag
40 'MaxQuantParams'
41 >>> (t.root.find('maxQuantVersion')).text
42 '1.6.3.4'
43 """
44
45 self.orig_mqpar = mqpar_in
46 self.exp_design = exp_design
47 self.mqpar_out = mqpar_out
48 self.root = ET.parse(mqpar_in).getroot()
49 self.version = self.root.find('maxQuantVersion').text
50 # regex for substitution of certain file name characters
51 self.substitution_rx = substitution_rx
52
53 @staticmethod
54 def _add_child(el, name, text, attrib=None):
55 """Add a child element to an element.
56
57 >>> t = MQParam("test", './test-data/template.xml', None)
58 >>> MQParam._add_child(t.root, "test", "test")
59 >>> t.root.find('test').text == "test"
60 True
61 """
62
63 child = ET.SubElement(el, name, attrib=attrib if attrib else {})
64 child.text = str(text)
65
66 def _make_exp_design(self, infiles):
67 """Create a dict representing an experimental design from
68 an experimental design template and a list of input files.
69 If the experimental design template is None, create a default
70 design with one experiment for each input file, no fractions and
71 parameter group 0 for all files.
72 >>> t2 = MQParam("test", './test-data/template.xml', \
73 './test-data/two/exp_design_template.txt')
74 >>> design = t2._make_exp_design(['./test-data/BSA_min_21.mzXML', \
75 './test-data/BSA_min_22.mzXML'])
76 >>> design['Name']
77 ['./test-data/BSA_min_21.mzXML', './test-data/BSA_min_22.mzXML']
78 >>> design['Fraction']
79 ['1', '2']
80 """
81 design = {s: [] for s in ("Name", "PTM", "Fraction", "Experiment")}
82 if not self.exp_design:
83 design["Name"] = infiles
84 design["Fraction"] = ('32767',) * len(infiles)
85 design["Experiment"] = [os.path.split(f)[1] for f in infiles]
86 design["PTM"] = ('False',) * len(infiles)
87 else:
88 with open(self.exp_design) as design_file:
89 index_line = design_file.readline().strip()
90 index = []
91 for i in index_line.split('\t'):
92 if i in design:
93 index.append(i)
94 else:
95 raise Exception("Invalid comlumn index in experimental"
96 + " design template: {}".format(i))
97 for line in design_file:
98 row = line.strip().split('\t')
99 for e, i in zip_longest(row, index):
100 design[i].append(e)
101
102 # map infiles to names in exp. design template
103 names = []
104 names_to_paths = {}
105 # strip path and extension
106 for f in infiles:
107 b = os.path.basename(f)
108 basename = b[:-6] if b.endswith('.mzXML') else b[:-11]
109 names_to_paths[basename] = f
110 for name in design['Name']:
111 # same substitution as in maxquant.xml,
112 # when passing the element identifiers
113 fname = re.sub(self.substitution_rx, '_', name)
114 names.append(names_to_paths[fname] if fname in names_to_paths
115 else None)
116 # replace orig. file names with matching links to galaxy datasets
117 design['Name'] = names
118
119 return design
120
121 def add_infiles(self, infiles, interactive):
122 """Add a list of raw/mzxml files to the mqpar.xml.
123 If experimental design template was specified,
124 modify other parameters accordingly.
125 The files must be specified as absolute paths
126 for maxquant to find them.
127 >>> t1 = MQParam("test", './test-data/template.xml', None)
128 >>> t1.add_infiles(('test1', ), True)
129 >>> t1.root.find("filePaths")[0].text
130 'test1'
131 >>> t1.root.find("fractions")[0].text
132 '32767'
133 >>> len(t1.root.find("fractions"))
134 1
135 >>> t2 = MQParam("test", './test-data/template.xml', \
136 './test-data/exp_design_test.txt')
137 >>> t2.add_infiles(('test-data/QEplus021874.thermo.raw', \
138 'test-data/QEplus021876.thermo.raw'), True)
139 >>> len(t2.root.find("filePaths"))
140 2
141 >>> t2.root.find("filePaths")[1].text
142 'test-data/QEplus021876.thermo.raw'
143 >>> t2.root.find("experiments")[1].text
144 '2'
145 >>> t2.root.find("fractions")[0].text
146 '3'
147 """
148
149 # Create experimental design for interactive mode.
150 # In non-interactive mode only filepaths are modified, but
151 # their order from the original mqpar must be kept.
152 if interactive:
153 index = range(len(infiles))
154 nodenames = ('filePaths', 'experiments', 'fractions',
155 'ptms', 'paramGroupIndices', 'referenceChannel')
156 design = self._make_exp_design(infiles)
157 else:
158 index = [-1] * len(infiles)
159 # kind of a BUG: fails if filename starts with '.'
160 infilenames = [os.path.basename(f).split('.')[0] for f in infiles]
161 i = 0
162 for child in self.root.find('filePaths'):
163 # either windows or posix path
164 win = ntpath.basename(child.text)
165 posix = os.path.basename(child.text)
166 basename = win if len(win) < len(posix) else posix
167 basename_with_sub = re.sub(self.substitution_rx, '_',
168 basename.split('.')[0])
169 # match infiles to their names in mqpar.xml,
170 # ignore files missing in mqpar.xml
171 if basename_with_sub in infilenames:
172 index[i] = infilenames.index(basename_with_sub)
173 i += 1
174 else:
175 raise ValueError("no matching infile found for "
176 + child.text)
177
178 nodenames = ('filePaths', )
179 design = {'Name': infiles}
180
181 # Get parent nodes from document
182 nodes = dict()
183 for nodename in nodenames:
184 node = self.root.find(nodename)
185 if node is None:
186 raise ValueError('Element {} not found in parameter file'
187 .format(nodename))
188 nodes[nodename] = node
189 node.clear()
190 node.tag = nodename
191
192 # Append sub-elements to nodes (one per file)
193 for i in index:
194 if i > -1 and design['Name'][i]:
195 MQParam._add_child(nodes['filePaths'], 'string',
196 design['Name'][i])
197 if interactive:
198 MQParam._add_child(nodes['experiments'], 'string',
199 design['Experiment'][i])
200 MQParam._add_child(nodes['fractions'], 'short',
201 design['Fraction'][i])
202 MQParam._add_child(nodes['ptms'], 'boolean',
203 design['PTM'][i])
204 MQParam._add_child(nodes['paramGroupIndices'], 'int', 0)
205 MQParam._add_child(nodes['referenceChannel'], 'string', '')
206
207 def add_fasta_files(self, files,
208 identifier=r'>([^\s]*)',
209 description=r'>(.*)'):
210 """Add fasta file groups.
211 >>> t = MQParam('test', './test-data/template.xml', None)
212 >>> t.add_fasta_files(('test1', 'test2'))
213 >>> len(t.root.find('fastaFiles'))
214 2
215 >>> t.root.find('fastaFiles')[0].find("fastaFilePath").text
216 'test1'
217 """
218 fasta_node = self.root.find("fastaFiles")
219 fasta_node.clear()
220 fasta_node.tag = "fastaFiles"
221
222 for index in range(len(files)):
223 filepath = '<fastaFilePath>' + files[index]
224 fasta = self.fasta_template.replace('<fastaFilePath>', filepath)
225 fasta = fasta.replace('<identifierParseRule>',
226 '<identifierParseRule>' + identifier)
227 fasta = fasta.replace('<descriptionParseRule>',
228 '<descriptionParseRule>' + description)
229 ff_node = self.root.find('.fastaFiles')
230 fastaentry = ET.fromstring(fasta)
231 ff_node.append(fastaentry)
232
233 def set_simple_param(self, key, value):
234 """Set a simple parameter.
235 >>> t = MQParam(None, './test-data/template.xml', None)
236 >>> t.set_simple_param('min_unique_pep', 4)
237 >>> t.root.find('.minUniquePeptides').text
238 '4'
239 """
240 # map simple params to their node in the xml tree
241 simple_params = {'missed_cleavages':
242 '.parameterGroups/parameterGroup/maxMissedCleavages',
243 'min_unique_pep': '.minUniquePeptides',
244 'num_threads': 'numThreads',
245 'calc_peak_properties': '.calcPeakProperties',
246 'write_mztab': 'writeMzTab',
247 'min_peptide_len': 'minPepLen',
248 'max_peptide_mass': 'maxPeptideMass',
249 'ibaq': 'ibaq', # lfq global options
250 'ibaq_log_fit': 'ibaqLogFit',
251 'separate_lfq': 'separateLfq',
252 'lfq_stabilize_large_ratios':
253 'lfqStabilizeLargeRatios',
254 'lfq_require_msms': 'lfqRequireMsms',
255 'advanced_site_intensities':
256 'advancedSiteIntensities',
257 'lfq_mode': # lfq param group options
258 '.parameterGroups/parameterGroup/lfqMode',
259 'lfq_skip_norm':
260 '.parameterGroups/parameterGroup/lfqSkipNorm',
261 'lfq_min_edges_per_node':
262 '.parameterGroups/parameterGroup/lfqMinEdgesPerNode',
263 'lfq_avg_edges_per_node':
264 '.parameterGroups/parameterGroup/lfqAvEdgesPerNode',
265 'lfq_min_ratio_count':
266 '.parameterGroups/parameterGroup/lfqMinRatioCount'}
267
268 if key in simple_params:
269 node = self.root.find(simple_params[key])
270 if node is None:
271 raise ValueError('Element {} not found in parameter file'
272 .format(simple_params[key]))
273 node.text = str(value)
274 else:
275 raise ValueError("Parameter not found.")
276
277 def set_silac(self, light_mods, medium_mods, heavy_mods):
278 """Set label modifications.
279 >>> t1 = MQParam('test', './test-data/template.xml', None)
280 >>> t1.set_silac(None, ('test1', 'test2'), None)
281 >>> t1.root.find('.parameterGroups/parameterGroup/maxLabeledAa').text
282 '2'
283 >>> t1.root.find('.parameterGroups/parameterGroup/multiplicity').text
284 '3'
285 >>> t1.root.find('.parameterGroups/parameterGroup/labelMods')[1].text
286 'test1;test2'
287 >>> t1.root.find('.parameterGroups/parameterGroup/labelMods')[2].text
288 ''
289 """
290 multiplicity = 3 if medium_mods else 2 if heavy_mods else 1
291 max_label = str(max(len(light_mods) if light_mods else 0,
292 len(medium_mods) if medium_mods else 0,
293 len(heavy_mods) if heavy_mods else 0))
294 multiplicity_node = self.root.find('.parameterGroups/parameterGroup/'
295 + 'multiplicity')
296 multiplicity_node.text = str(multiplicity)
297 max_label_node = self.root.find('.parameterGroups/parameterGroup/'
298 + 'maxLabeledAa')
299 max_label_node.text = max_label
300
301 node = self.root.find('.parameterGroups/parameterGroup/labelMods')
302 node[0].text = ';'.join(light_mods) if light_mods else ''
303 if multiplicity == 3:
304 MQParam._add_child(node, name='string', text=';'.join(medium_mods))
305 if multiplicity > 1:
306 MQParam._add_child(node, name='string',
307 text=';'.join(heavy_mods) if heavy_mods else '')
308
309 def set_list_params(self, key, vals):
310 """Set a list parameter.
311 >>> t = MQParam(None, './test-data/template.xml', None)
312 >>> t.set_list_params('proteases', ('test 1', 'test 2'))
313 >>> len(t.root.find('.parameterGroups/parameterGroup/enzymes'))
314 2
315 >>> t.set_list_params('var_mods', ('Oxidation (M)', ))
316 >>> var_mods = '.parameterGroups/parameterGroup/variableModifications'
317 >>> t.root.find(var_mods)[0].text
318 'Oxidation (M)'
319 """
320
321 params = {'var_mods':
322 '.parameterGroups/parameterGroup/variableModifications',
323 'fixed_mods':
324 '.parameterGroups/parameterGroup/fixedModifications',
325 'proteases':
326 '.parameterGroups/parameterGroup/enzymes'}
327
328 if key in params:
329 node = self.root.find(params[key])
330 if node is None:
331 raise ValueError('Element {} not found in parameter file'
332 .format(params[key]))
333 node.clear()
334 node.tag = params[key].split('/')[-1]
335 for e in vals:
336 MQParam._add_child(node, name='string', text=e)
337 else:
338 raise ValueError("Parameter {} not found.".format(key))
339
340 def write(self):
341 rough_string = ET.tostring(self.root, 'utf-8', short_empty_elements=False)
342 reparsed = minidom.parseString(rough_string)
343 pretty = reparsed.toprettyxml(indent="\t")
344 even_prettier = re.sub(r"\n\s+\n", r"\n", pretty)
345 with open(self.mqpar_out, 'w') as f:
346 print(even_prettier, file=f)