Mercurial > repos > galaxyp > maxquant
comparison mqparam.py @ 1:8bac3cc5c5de draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/maxquant commit ab4e4f1817080cbe8a031a82cb180610ff140847
| author | galaxyp |
|---|---|
| date | Sat, 20 Jul 2019 05:01:05 -0400 |
| parents | |
| children | 175e062b6a17 |
comparison
equal
deleted
inserted
replaced
| 0:d4b6c9eae635 | 1:8bac3cc5c5de |
|---|---|
| 1 """ | |
| 2 Create a project-specific MaxQuant parameter file. | |
| 3 | |
| 4 TODO: check validity of parsed experimental design template | |
| 5 add support for parameter groups | |
| 6 add reporter ion MS2 | |
| 7 add label free quantification | |
| 8 don't hardcode parse rules for fasta files | |
| 9 | |
| 10 Author: Damian Glaetzer <d.glaetzer@mailbox.org> | |
| 11 """ | |
| 12 | |
| 13 import ntpath | |
| 14 import os | |
| 15 import re | |
| 16 import xml.etree.ElementTree as ET | |
| 17 from itertools import zip_longest | |
| 18 from xml.dom import minidom | |
| 19 | |
| 20 | |
| 21 class MQParam: | |
| 22 """Represents a mqpar.xml and provides methods to modify | |
| 23 some of its parameters. | |
| 24 """ | |
| 25 | |
| 26 fasta_template = """<FastaFileInfo> | |
| 27 <fastaFilePath></fastaFilePath> | |
| 28 <identifierParseRule></identifierParseRule> | |
| 29 <descriptionParseRule></descriptionParseRule> | |
| 30 <taxonomyParseRule></taxonomyParseRule> | |
| 31 <variationParseRule></variationParseRule> | |
| 32 <modificationParseRule></modificationParseRule> | |
| 33 <taxonomyId></taxonomyId> | |
| 34 </FastaFileInfo>""" | |
| 35 | |
| 36 def __init__(self, mqpar_out, mqpar_in, exp_design, | |
| 37 substitution_rx=r'[^\s\S]'): # no sub by default | |
| 38 """Initialize MQParam class. mqpar_in can either be a template | |
| 39 or a already suitable mqpar file. | |
| 40 >>> t = MQParam("test", './test-data/template.xml', None) | |
| 41 >>> t.root.tag | |
| 42 'MaxQuantParams' | |
| 43 >>> (t.root.find('maxQuantVersion')).text | |
| 44 '1.6.3.4' | |
| 45 """ | |
| 46 | |
| 47 self.orig_mqpar = mqpar_in | |
| 48 self.exp_design = exp_design | |
| 49 self.mqpar_out = mqpar_out | |
| 50 self.root = ET.parse(mqpar_in).getroot() | |
| 51 self.version = self.root.find('maxQuantVersion').text | |
| 52 # regex for substitution of certain file name characters | |
| 53 self.substitution_rx = substitution_rx | |
| 54 | |
| 55 @staticmethod | |
| 56 def _add_child(el, name, text, attrib=None): | |
| 57 """Add a child element to an element. | |
| 58 | |
| 59 >>> t = MQParam("test", './test-data/template.xml', None) | |
| 60 >>> MQParam._add_child(t.root, "test", "test") | |
| 61 >>> t.root.find('test').text == "test" | |
| 62 True | |
| 63 """ | |
| 64 | |
| 65 child = ET.SubElement(el, name, attrib=attrib if attrib else {}) | |
| 66 child.text = str(text) | |
| 67 | |
| 68 def _make_exp_design(self, infiles): | |
| 69 """Create a dict representing an experimental design from | |
| 70 an experimental design template and a list of input files. | |
| 71 If the experimental design template is None, create a default | |
| 72 design with one experiment for each input file, no fractions and | |
| 73 parameter group 0 for all files. | |
| 74 >>> t2 = MQParam("test", './test-data/template.xml', \ | |
| 75 './test-data/two/exp_design_template.txt') | |
| 76 >>> design = t2._make_exp_design(['./test-data/BSA_min_21.mzXML', \ | |
| 77 './test-data/BSA_min_22.mzXML']) | |
| 78 >>> design['Name'] | |
| 79 ['./test-data/BSA_min_21.mzXML', './test-data/BSA_min_22.mzXML'] | |
| 80 >>> design['Fraction'] | |
| 81 ['1', '2'] | |
| 82 """ | |
| 83 design = {s: [] for s in ("Name", "PTM", "Fraction", "Experiment")} | |
| 84 if not self.exp_design: | |
| 85 design["Name"] = infiles | |
| 86 design["Fraction"] = ('32767',) * len(infiles) | |
| 87 design["Experiment"] = [os.path.split(f)[1] for f in infiles] | |
| 88 design["PTM"] = ('False',) * len(infiles) | |
| 89 else: | |
| 90 with open(self.exp_design) as design_file: | |
| 91 index_line = design_file.readline().strip() | |
| 92 index = [] | |
| 93 for i in index_line.split('\t'): | |
| 94 if i in design: | |
| 95 index.append(i) | |
| 96 else: | |
| 97 raise Exception("Invalid comlumn index in experimental" | |
| 98 + " design template: {}".format(i)) | |
| 99 for line in design_file: | |
| 100 row = line.strip().split('\t') | |
| 101 for e, i in zip_longest(row, index): | |
| 102 design[i].append(e) | |
| 103 | |
| 104 # map infiles to names in exp. design template | |
| 105 names = [] | |
| 106 names_to_paths = {} | |
| 107 # strip path and extension | |
| 108 for f in infiles: | |
| 109 b = os.path.basename(f) | |
| 110 basename = b[:-6] if b.endswith('.mzXML') else b[:-11] | |
| 111 names_to_paths[basename] = f | |
| 112 for name in design['Name']: | |
| 113 # same substitution as in maxquant.xml, | |
| 114 # when passing the element identifiers | |
| 115 fname = re.sub(self.substitution_rx, '_', name) | |
| 116 names.append(names_to_paths[fname] if fname in names_to_paths | |
| 117 else None) | |
| 118 # replace orig. file names with matching links to galaxy datasets | |
| 119 design['Name'] = names | |
| 120 | |
| 121 return design | |
| 122 | |
| 123 def add_infiles(self, infiles, interactive): | |
| 124 """Add a list of raw/mzxml files to the mqpar.xml. | |
| 125 If experimental design template was specified, | |
| 126 modify other parameters accordingly. | |
| 127 The files must be specified as absolute paths | |
| 128 for maxquant to find them. | |
| 129 >>> t1 = MQParam("test", './test-data/template.xml', None) | |
| 130 >>> t1.add_infiles(('test1', ), True) | |
| 131 >>> t1.root.find("filePaths")[0].text | |
| 132 'test1' | |
| 133 >>> t1.root.find("fractions")[0].text | |
| 134 '32767' | |
| 135 >>> len(t1.root.find("fractions")) | |
| 136 1 | |
| 137 >>> t2 = MQParam("test", './test-data/template.xml', \ | |
| 138 './test-data/exp_design_test.txt') | |
| 139 >>> t2.add_infiles(('test-data/QEplus021874.thermo.raw', \ | |
| 140 'test-data/QEplus021876.thermo.raw'), True) | |
| 141 >>> len(t2.root.find("filePaths")) | |
| 142 2 | |
| 143 >>> t2.root.find("filePaths")[1].text | |
| 144 'test-data/QEplus021876.thermo.raw' | |
| 145 >>> t2.root.find("experiments")[1].text | |
| 146 '2' | |
| 147 >>> t2.root.find("fractions")[0].text | |
| 148 '3' | |
| 149 """ | |
| 150 | |
| 151 # Create experimental design for interactive mode. | |
| 152 # In non-interactive mode only filepaths are modified, but | |
| 153 # their order from the original mqpar must be kept. | |
| 154 if interactive: | |
| 155 index = range(len(infiles)) | |
| 156 nodenames = ('filePaths', 'experiments', 'fractions', | |
| 157 'ptms', 'paramGroupIndices', 'referenceChannel') | |
| 158 design = self._make_exp_design(infiles) | |
| 159 else: | |
| 160 index = [-1] * len(infiles) | |
| 161 # kind of a BUG: fails if filename starts with '.' | |
| 162 infilenames = [os.path.basename(f).split('.')[0] for f in infiles] | |
| 163 i = 0 | |
| 164 for child in self.root.find('filePaths'): | |
| 165 # either windows or posix path | |
| 166 win = ntpath.basename(child.text) | |
| 167 posix = os.path.basename(child.text) | |
| 168 basename = win if len(win) < len(posix) else posix | |
| 169 basename_with_sub = re.sub(self.substitution_rx, '_', | |
| 170 basename.split('.')[0]) | |
| 171 # match infiles to their names in mqpar.xml, | |
| 172 # ignore files missing in mqpar.xml | |
| 173 if basename_with_sub in infilenames: | |
| 174 index[i] = infilenames.index(basename_with_sub) | |
| 175 i += 1 | |
| 176 else: | |
| 177 raise ValueError("no matching infile found for " | |
| 178 + child.text) | |
| 179 | |
| 180 nodenames = ('filePaths', ) | |
| 181 design = {'Name': infiles} | |
| 182 | |
| 183 # Get parent nodes from document | |
| 184 nodes = dict() | |
| 185 for nodename in nodenames: | |
| 186 node = self.root.find(nodename) | |
| 187 if node is None: | |
| 188 raise ValueError('Element {} not found in parameter file' | |
| 189 .format(nodename)) | |
| 190 nodes[nodename] = node | |
| 191 node.clear() | |
| 192 node.tag = nodename | |
| 193 | |
| 194 # Append sub-elements to nodes (one per file) | |
| 195 for i in index: | |
| 196 if i > -1 and design['Name'][i]: | |
| 197 MQParam._add_child(nodes['filePaths'], 'string', | |
| 198 design['Name'][i]) | |
| 199 if interactive: | |
| 200 MQParam._add_child(nodes['experiments'], 'string', | |
| 201 design['Experiment'][i]) | |
| 202 MQParam._add_child(nodes['fractions'], 'short', | |
| 203 design['Fraction'][i]) | |
| 204 MQParam._add_child(nodes['ptms'], 'boolean', | |
| 205 design['PTM'][i]) | |
| 206 MQParam._add_child(nodes['paramGroupIndices'], 'int', 0) | |
| 207 MQParam._add_child(nodes['referenceChannel'], 'string', '') | |
| 208 | |
| 209 def add_fasta_files(self, files, | |
| 210 identifier=r'>([^\s]*)', | |
| 211 description=r'>(.*)'): | |
| 212 """Add fasta file groups. | |
| 213 >>> t = MQParam('test', './test-data/template.xml', None) | |
| 214 >>> t.add_fasta_files(('test1', 'test2')) | |
| 215 >>> len(t.root.find('fastaFiles')) | |
| 216 2 | |
| 217 >>> t.root.find('fastaFiles')[0].find("fastaFilePath").text | |
| 218 'test1' | |
| 219 """ | |
| 220 fasta_node = self.root.find("fastaFiles") | |
| 221 fasta_node.clear() | |
| 222 fasta_node.tag = "fastaFiles" | |
| 223 | |
| 224 for index in range(len(files)): | |
| 225 filepath = '<fastaFilePath>' + files[index] | |
| 226 fasta = self.fasta_template.replace('<fastaFilePath>', filepath) | |
| 227 fasta = fasta.replace('<identifierParseRule>', | |
| 228 '<identifierParseRule>' + identifier) | |
| 229 fasta = fasta.replace('<descriptionParseRule>', | |
| 230 '<descriptionParseRule>' + description) | |
| 231 ff_node = self.root.find('.fastaFiles') | |
| 232 fastaentry = ET.fromstring(fasta) | |
| 233 ff_node.append(fastaentry) | |
| 234 | |
| 235 def set_simple_param(self, key, value): | |
| 236 """Set a simple parameter. | |
| 237 >>> t = MQParam(None, './test-data/template.xml', None) | |
| 238 >>> t.set_simple_param('min_unique_pep', 4) | |
| 239 >>> t.root.find('.minUniquePeptides').text | |
| 240 '4' | |
| 241 """ | |
| 242 # map simple params to their node in the xml tree | |
| 243 simple_params = {'missed_cleavages': | |
| 244 '.parameterGroups/parameterGroup/maxMissedCleavages', | |
| 245 'min_unique_pep': '.minUniquePeptides', | |
| 246 'num_threads': 'numThreads', | |
| 247 'calc_peak_properties': '.calcPeakProperties', | |
| 248 'write_mztab': 'writeMzTab', | |
| 249 'min_peptide_len': 'minPepLen', | |
| 250 'max_peptide_mass': 'maxPeptideMass', | |
| 251 'match_between_runs': 'matchBetweenRuns', | |
| 252 'ibaq': 'ibaq', # lfq global options | |
| 253 'ibaq_log_fit': 'ibaqLogFit', | |
| 254 'separate_lfq': 'separateLfq', | |
| 255 'lfq_stabilize_large_ratios': | |
| 256 'lfqStabilizeLargeRatios', | |
| 257 'lfq_require_msms': 'lfqRequireMsms', | |
| 258 'advanced_site_intensities': | |
| 259 'advancedSiteIntensities', | |
| 260 'lfq_mode': # lfq param group options | |
| 261 '.parameterGroups/parameterGroup/lfqMode', | |
| 262 'lfq_skip_norm': | |
| 263 '.parameterGroups/parameterGroup/lfqSkipNorm', | |
| 264 'lfq_min_edges_per_node': | |
| 265 '.parameterGroups/parameterGroup/lfqMinEdgesPerNode', | |
| 266 'lfq_avg_edges_per_node': | |
| 267 '.parameterGroups/parameterGroup/lfqAvEdgesPerNode', | |
| 268 'lfq_min_ratio_count': | |
| 269 '.parameterGroups/parameterGroup/lfqMinRatioCount'} | |
| 270 | |
| 271 if key in simple_params: | |
| 272 node = self.root.find(simple_params[key]) | |
| 273 if node is None: | |
| 274 raise ValueError('Element {} not found in parameter file' | |
| 275 .format(simple_params[key])) | |
| 276 node.text = str(value) | |
| 277 else: | |
| 278 raise ValueError("Parameter not found.") | |
| 279 | |
| 280 def set_silac(self, light_mods, medium_mods, heavy_mods): | |
| 281 """Set label modifications. | |
| 282 >>> t1 = MQParam('test', './test-data/template.xml', None) | |
| 283 >>> t1.set_silac(None, ('test1', 'test2'), None) | |
| 284 >>> t1.root.find('.parameterGroups/parameterGroup/maxLabeledAa').text | |
| 285 '2' | |
| 286 >>> t1.root.find('.parameterGroups/parameterGroup/multiplicity').text | |
| 287 '3' | |
| 288 >>> t1.root.find('.parameterGroups/parameterGroup/labelMods')[1].text | |
| 289 'test1;test2' | |
| 290 >>> t1.root.find('.parameterGroups/parameterGroup/labelMods')[2].text | |
| 291 '' | |
| 292 """ | |
| 293 multiplicity = 3 if medium_mods else 2 if heavy_mods else 1 | |
| 294 max_label = str(max(len(light_mods) if light_mods else 0, | |
| 295 len(medium_mods) if medium_mods else 0, | |
| 296 len(heavy_mods) if heavy_mods else 0)) | |
| 297 multiplicity_node = self.root.find('.parameterGroups/parameterGroup/' | |
| 298 + 'multiplicity') | |
| 299 multiplicity_node.text = str(multiplicity) | |
| 300 max_label_node = self.root.find('.parameterGroups/parameterGroup/' | |
| 301 + 'maxLabeledAa') | |
| 302 max_label_node.text = max_label | |
| 303 | |
| 304 node = self.root.find('.parameterGroups/parameterGroup/labelMods') | |
| 305 node[0].text = ';'.join(light_mods) if light_mods else '' | |
| 306 if multiplicity == 3: | |
| 307 MQParam._add_child(node, name='string', text=';'.join(medium_mods)) | |
| 308 if multiplicity > 1: | |
| 309 MQParam._add_child(node, name='string', | |
| 310 text=';'.join(heavy_mods) if heavy_mods else '') | |
| 311 | |
| 312 def set_list_params(self, key, vals): | |
| 313 """Set a list parameter. | |
| 314 >>> t = MQParam(None, './test-data/template.xml', None) | |
| 315 >>> t.set_list_params('proteases', ('test 1', 'test 2')) | |
| 316 >>> len(t.root.find('.parameterGroups/parameterGroup/enzymes')) | |
| 317 2 | |
| 318 >>> t.set_list_params('var_mods', ('Oxidation (M)', )) | |
| 319 >>> var_mods = '.parameterGroups/parameterGroup/variableModifications' | |
| 320 >>> t.root.find(var_mods)[0].text | |
| 321 'Oxidation (M)' | |
| 322 """ | |
| 323 | |
| 324 params = {'var_mods': | |
| 325 '.parameterGroups/parameterGroup/variableModifications', | |
| 326 'fixed_mods': | |
| 327 '.parameterGroups/parameterGroup/fixedModifications', | |
| 328 'proteases': | |
| 329 '.parameterGroups/parameterGroup/enzymes'} | |
| 330 | |
| 331 if key in params: | |
| 332 node = self.root.find(params[key]) | |
| 333 if node is None: | |
| 334 raise ValueError('Element {} not found in parameter file' | |
| 335 .format(params[key])) | |
| 336 node.clear() | |
| 337 node.tag = params[key].split('/')[-1] | |
| 338 for e in vals: | |
| 339 MQParam._add_child(node, name='string', text=e) | |
| 340 else: | |
| 341 raise ValueError("Parameter {} not found.".format(key)) | |
| 342 | |
| 343 def write(self): | |
| 344 rough_string = ET.tostring(self.root, 'utf-8', short_empty_elements=False) | |
| 345 reparsed = minidom.parseString(rough_string) | |
| 346 pretty = reparsed.toprettyxml(indent="\t") | |
| 347 even_prettier = re.sub(r"\n\s+\n", r"\n", pretty) | |
| 348 with open(self.mqpar_out, 'w') as f: | |
| 349 print(even_prettier, file=f) |
