comparison mqparam.py @ 1:8bac3cc5c5de draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/maxquant commit ab4e4f1817080cbe8a031a82cb180610ff140847
author galaxyp
date Sat, 20 Jul 2019 05:01:05 -0400
parents
children 175e062b6a17
comparison
equal deleted inserted replaced
0:d4b6c9eae635 1:8bac3cc5c5de
1 """
2 Create a project-specific MaxQuant parameter file.
3
4 TODO: check validity of parsed experimental design template
5 add support for parameter groups
6 add reporter ion MS2
7 add label free quantification
8 don't hardcode parse rules for fasta files
9
10 Author: Damian Glaetzer <d.glaetzer@mailbox.org>
11 """
12
13 import ntpath
14 import os
15 import re
16 import xml.etree.ElementTree as ET
17 from itertools import zip_longest
18 from xml.dom import minidom
19
20
21 class MQParam:
22 """Represents a mqpar.xml and provides methods to modify
23 some of its parameters.
24 """
25
26 fasta_template = """<FastaFileInfo>
27 <fastaFilePath></fastaFilePath>
28 <identifierParseRule></identifierParseRule>
29 <descriptionParseRule></descriptionParseRule>
30 <taxonomyParseRule></taxonomyParseRule>
31 <variationParseRule></variationParseRule>
32 <modificationParseRule></modificationParseRule>
33 <taxonomyId></taxonomyId>
34 </FastaFileInfo>"""
35
36 def __init__(self, mqpar_out, mqpar_in, exp_design,
37 substitution_rx=r'[^\s\S]'): # no sub by default
38 """Initialize MQParam class. mqpar_in can either be a template
39 or a already suitable mqpar file.
40 >>> t = MQParam("test", './test-data/template.xml', None)
41 >>> t.root.tag
42 'MaxQuantParams'
43 >>> (t.root.find('maxQuantVersion')).text
44 '1.6.3.4'
45 """
46
47 self.orig_mqpar = mqpar_in
48 self.exp_design = exp_design
49 self.mqpar_out = mqpar_out
50 self.root = ET.parse(mqpar_in).getroot()
51 self.version = self.root.find('maxQuantVersion').text
52 # regex for substitution of certain file name characters
53 self.substitution_rx = substitution_rx
54
55 @staticmethod
56 def _add_child(el, name, text, attrib=None):
57 """Add a child element to an element.
58
59 >>> t = MQParam("test", './test-data/template.xml', None)
60 >>> MQParam._add_child(t.root, "test", "test")
61 >>> t.root.find('test').text == "test"
62 True
63 """
64
65 child = ET.SubElement(el, name, attrib=attrib if attrib else {})
66 child.text = str(text)
67
68 def _make_exp_design(self, infiles):
69 """Create a dict representing an experimental design from
70 an experimental design template and a list of input files.
71 If the experimental design template is None, create a default
72 design with one experiment for each input file, no fractions and
73 parameter group 0 for all files.
74 >>> t2 = MQParam("test", './test-data/template.xml', \
75 './test-data/two/exp_design_template.txt')
76 >>> design = t2._make_exp_design(['./test-data/BSA_min_21.mzXML', \
77 './test-data/BSA_min_22.mzXML'])
78 >>> design['Name']
79 ['./test-data/BSA_min_21.mzXML', './test-data/BSA_min_22.mzXML']
80 >>> design['Fraction']
81 ['1', '2']
82 """
83 design = {s: [] for s in ("Name", "PTM", "Fraction", "Experiment")}
84 if not self.exp_design:
85 design["Name"] = infiles
86 design["Fraction"] = ('32767',) * len(infiles)
87 design["Experiment"] = [os.path.split(f)[1] for f in infiles]
88 design["PTM"] = ('False',) * len(infiles)
89 else:
90 with open(self.exp_design) as design_file:
91 index_line = design_file.readline().strip()
92 index = []
93 for i in index_line.split('\t'):
94 if i in design:
95 index.append(i)
96 else:
97 raise Exception("Invalid comlumn index in experimental"
98 + " design template: {}".format(i))
99 for line in design_file:
100 row = line.strip().split('\t')
101 for e, i in zip_longest(row, index):
102 design[i].append(e)
103
104 # map infiles to names in exp. design template
105 names = []
106 names_to_paths = {}
107 # strip path and extension
108 for f in infiles:
109 b = os.path.basename(f)
110 basename = b[:-6] if b.endswith('.mzXML') else b[:-11]
111 names_to_paths[basename] = f
112 for name in design['Name']:
113 # same substitution as in maxquant.xml,
114 # when passing the element identifiers
115 fname = re.sub(self.substitution_rx, '_', name)
116 names.append(names_to_paths[fname] if fname in names_to_paths
117 else None)
118 # replace orig. file names with matching links to galaxy datasets
119 design['Name'] = names
120
121 return design
122
123 def add_infiles(self, infiles, interactive):
124 """Add a list of raw/mzxml files to the mqpar.xml.
125 If experimental design template was specified,
126 modify other parameters accordingly.
127 The files must be specified as absolute paths
128 for maxquant to find them.
129 >>> t1 = MQParam("test", './test-data/template.xml', None)
130 >>> t1.add_infiles(('test1', ), True)
131 >>> t1.root.find("filePaths")[0].text
132 'test1'
133 >>> t1.root.find("fractions")[0].text
134 '32767'
135 >>> len(t1.root.find("fractions"))
136 1
137 >>> t2 = MQParam("test", './test-data/template.xml', \
138 './test-data/exp_design_test.txt')
139 >>> t2.add_infiles(('test-data/QEplus021874.thermo.raw', \
140 'test-data/QEplus021876.thermo.raw'), True)
141 >>> len(t2.root.find("filePaths"))
142 2
143 >>> t2.root.find("filePaths")[1].text
144 'test-data/QEplus021876.thermo.raw'
145 >>> t2.root.find("experiments")[1].text
146 '2'
147 >>> t2.root.find("fractions")[0].text
148 '3'
149 """
150
151 # Create experimental design for interactive mode.
152 # In non-interactive mode only filepaths are modified, but
153 # their order from the original mqpar must be kept.
154 if interactive:
155 index = range(len(infiles))
156 nodenames = ('filePaths', 'experiments', 'fractions',
157 'ptms', 'paramGroupIndices', 'referenceChannel')
158 design = self._make_exp_design(infiles)
159 else:
160 index = [-1] * len(infiles)
161 # kind of a BUG: fails if filename starts with '.'
162 infilenames = [os.path.basename(f).split('.')[0] for f in infiles]
163 i = 0
164 for child in self.root.find('filePaths'):
165 # either windows or posix path
166 win = ntpath.basename(child.text)
167 posix = os.path.basename(child.text)
168 basename = win if len(win) < len(posix) else posix
169 basename_with_sub = re.sub(self.substitution_rx, '_',
170 basename.split('.')[0])
171 # match infiles to their names in mqpar.xml,
172 # ignore files missing in mqpar.xml
173 if basename_with_sub in infilenames:
174 index[i] = infilenames.index(basename_with_sub)
175 i += 1
176 else:
177 raise ValueError("no matching infile found for "
178 + child.text)
179
180 nodenames = ('filePaths', )
181 design = {'Name': infiles}
182
183 # Get parent nodes from document
184 nodes = dict()
185 for nodename in nodenames:
186 node = self.root.find(nodename)
187 if node is None:
188 raise ValueError('Element {} not found in parameter file'
189 .format(nodename))
190 nodes[nodename] = node
191 node.clear()
192 node.tag = nodename
193
194 # Append sub-elements to nodes (one per file)
195 for i in index:
196 if i > -1 and design['Name'][i]:
197 MQParam._add_child(nodes['filePaths'], 'string',
198 design['Name'][i])
199 if interactive:
200 MQParam._add_child(nodes['experiments'], 'string',
201 design['Experiment'][i])
202 MQParam._add_child(nodes['fractions'], 'short',
203 design['Fraction'][i])
204 MQParam._add_child(nodes['ptms'], 'boolean',
205 design['PTM'][i])
206 MQParam._add_child(nodes['paramGroupIndices'], 'int', 0)
207 MQParam._add_child(nodes['referenceChannel'], 'string', '')
208
209 def add_fasta_files(self, files,
210 identifier=r'>([^\s]*)',
211 description=r'>(.*)'):
212 """Add fasta file groups.
213 >>> t = MQParam('test', './test-data/template.xml', None)
214 >>> t.add_fasta_files(('test1', 'test2'))
215 >>> len(t.root.find('fastaFiles'))
216 2
217 >>> t.root.find('fastaFiles')[0].find("fastaFilePath").text
218 'test1'
219 """
220 fasta_node = self.root.find("fastaFiles")
221 fasta_node.clear()
222 fasta_node.tag = "fastaFiles"
223
224 for index in range(len(files)):
225 filepath = '<fastaFilePath>' + files[index]
226 fasta = self.fasta_template.replace('<fastaFilePath>', filepath)
227 fasta = fasta.replace('<identifierParseRule>',
228 '<identifierParseRule>' + identifier)
229 fasta = fasta.replace('<descriptionParseRule>',
230 '<descriptionParseRule>' + description)
231 ff_node = self.root.find('.fastaFiles')
232 fastaentry = ET.fromstring(fasta)
233 ff_node.append(fastaentry)
234
235 def set_simple_param(self, key, value):
236 """Set a simple parameter.
237 >>> t = MQParam(None, './test-data/template.xml', None)
238 >>> t.set_simple_param('min_unique_pep', 4)
239 >>> t.root.find('.minUniquePeptides').text
240 '4'
241 """
242 # map simple params to their node in the xml tree
243 simple_params = {'missed_cleavages':
244 '.parameterGroups/parameterGroup/maxMissedCleavages',
245 'min_unique_pep': '.minUniquePeptides',
246 'num_threads': 'numThreads',
247 'calc_peak_properties': '.calcPeakProperties',
248 'write_mztab': 'writeMzTab',
249 'min_peptide_len': 'minPepLen',
250 'max_peptide_mass': 'maxPeptideMass',
251 'match_between_runs': 'matchBetweenRuns',
252 'ibaq': 'ibaq', # lfq global options
253 'ibaq_log_fit': 'ibaqLogFit',
254 'separate_lfq': 'separateLfq',
255 'lfq_stabilize_large_ratios':
256 'lfqStabilizeLargeRatios',
257 'lfq_require_msms': 'lfqRequireMsms',
258 'advanced_site_intensities':
259 'advancedSiteIntensities',
260 'lfq_mode': # lfq param group options
261 '.parameterGroups/parameterGroup/lfqMode',
262 'lfq_skip_norm':
263 '.parameterGroups/parameterGroup/lfqSkipNorm',
264 'lfq_min_edges_per_node':
265 '.parameterGroups/parameterGroup/lfqMinEdgesPerNode',
266 'lfq_avg_edges_per_node':
267 '.parameterGroups/parameterGroup/lfqAvEdgesPerNode',
268 'lfq_min_ratio_count':
269 '.parameterGroups/parameterGroup/lfqMinRatioCount'}
270
271 if key in simple_params:
272 node = self.root.find(simple_params[key])
273 if node is None:
274 raise ValueError('Element {} not found in parameter file'
275 .format(simple_params[key]))
276 node.text = str(value)
277 else:
278 raise ValueError("Parameter not found.")
279
280 def set_silac(self, light_mods, medium_mods, heavy_mods):
281 """Set label modifications.
282 >>> t1 = MQParam('test', './test-data/template.xml', None)
283 >>> t1.set_silac(None, ('test1', 'test2'), None)
284 >>> t1.root.find('.parameterGroups/parameterGroup/maxLabeledAa').text
285 '2'
286 >>> t1.root.find('.parameterGroups/parameterGroup/multiplicity').text
287 '3'
288 >>> t1.root.find('.parameterGroups/parameterGroup/labelMods')[1].text
289 'test1;test2'
290 >>> t1.root.find('.parameterGroups/parameterGroup/labelMods')[2].text
291 ''
292 """
293 multiplicity = 3 if medium_mods else 2 if heavy_mods else 1
294 max_label = str(max(len(light_mods) if light_mods else 0,
295 len(medium_mods) if medium_mods else 0,
296 len(heavy_mods) if heavy_mods else 0))
297 multiplicity_node = self.root.find('.parameterGroups/parameterGroup/'
298 + 'multiplicity')
299 multiplicity_node.text = str(multiplicity)
300 max_label_node = self.root.find('.parameterGroups/parameterGroup/'
301 + 'maxLabeledAa')
302 max_label_node.text = max_label
303
304 node = self.root.find('.parameterGroups/parameterGroup/labelMods')
305 node[0].text = ';'.join(light_mods) if light_mods else ''
306 if multiplicity == 3:
307 MQParam._add_child(node, name='string', text=';'.join(medium_mods))
308 if multiplicity > 1:
309 MQParam._add_child(node, name='string',
310 text=';'.join(heavy_mods) if heavy_mods else '')
311
312 def set_list_params(self, key, vals):
313 """Set a list parameter.
314 >>> t = MQParam(None, './test-data/template.xml', None)
315 >>> t.set_list_params('proteases', ('test 1', 'test 2'))
316 >>> len(t.root.find('.parameterGroups/parameterGroup/enzymes'))
317 2
318 >>> t.set_list_params('var_mods', ('Oxidation (M)', ))
319 >>> var_mods = '.parameterGroups/parameterGroup/variableModifications'
320 >>> t.root.find(var_mods)[0].text
321 'Oxidation (M)'
322 """
323
324 params = {'var_mods':
325 '.parameterGroups/parameterGroup/variableModifications',
326 'fixed_mods':
327 '.parameterGroups/parameterGroup/fixedModifications',
328 'proteases':
329 '.parameterGroups/parameterGroup/enzymes'}
330
331 if key in params:
332 node = self.root.find(params[key])
333 if node is None:
334 raise ValueError('Element {} not found in parameter file'
335 .format(params[key]))
336 node.clear()
337 node.tag = params[key].split('/')[-1]
338 for e in vals:
339 MQParam._add_child(node, name='string', text=e)
340 else:
341 raise ValueError("Parameter {} not found.".format(key))
342
343 def write(self):
344 rough_string = ET.tostring(self.root, 'utf-8', short_empty_elements=False)
345 reparsed = minidom.parseString(rough_string)
346 pretty = reparsed.toprettyxml(indent="\t")
347 even_prettier = re.sub(r"\n\s+\n", r"\n", pretty)
348 with open(self.mqpar_out, 'w') as f:
349 print(even_prettier, file=f)