comparison toolfactory/rgToolFactory2.py @ 30:6f48315c32c1 draft

Uploaded
author fubar
date Fri, 07 Aug 2020 07:54:23 -0400
parents
children 4d578c8c1613
comparison
equal deleted inserted replaced
29:6db39cbc3242 30:6f48315c32c1
1 # rgToolFactory.py
2 # see https://github.com/fubar2/toolfactory
3 #
4 # copyright ross lazarus (ross stop lazarus at gmail stop com) May 2012
5 #
6 # all rights reserved
7 # Licensed under the LGPL
8 # suggestions for improvement and bug fixes welcome at https://github.com/fubar2/toolfactory
9 #
10 # July 2020: BCC was fun and I feel like rip van winkle after 5 years.
11 # Decided to
12 # 1. Fix the toolfactory so it works - done for simplest case
13 # 2. Fix planemo so the toolfactory function works
14 # 3. Rewrite bits using galaxyxml functions where that makes sense - done
15 #
16 # removed all the old complications including making the new tool use this same script
17 # galaxyxml now generates the tool xml https://github.com/hexylena/galaxyxml
18 # No support for automatic HTML file creation from arbitrary outputs
19 # TODO: add option to run that code as a post execution hook
20 # TODO: add additional history input parameters - currently only one
21
22 import sys
23 import subprocess
24 import shutil
25 import os
26 import time
27 import tempfile
28 import argparse
29 import tarfile
30 import re
31 import galaxyxml.tool as gxt
32 import galaxyxml.tool.parameters as gxtp
33 import logging
34
35
# Name of this script as invoked, taken from the command line.
progname = os.path.split(sys.argv[0])[1]
myversion = 'V2.1 July 2020'
verbose = True
debug = True
toolFactoryURL = 'https://github.com/fubar2/toolfactory'
# Separator between the fields packed into each --input_files,
# --output_files and --additional_parameters command line value.
ourdelim = '~~~'

# Field indexes into each record after it has been split on ourdelim.
# The last index of each group is not part of the command line value:
# ScriptRunner.cleanuppar appends a copy of the original CL field to every
# record before it rewrites the CL field in place.

# --input_files="$input_files~~~$CL~~~$input_formats~~~$input_label~~~$input_help"
IPATHPOS = 0
ICLPOS = 1
IFMTPOS = 2
ILABPOS = 3
IHELPOS = 4
IOCLPOS = 5  # original CL value appended by cleanuppar

# --output_files "$otab.history_name~~~$otab.history_format~~~$otab.CL
ONAMEPOS = 0
OFMTPOS = 1
OCLPOS = 2
OOCLPOS = 3  # original CL value appended by cleanuppar

# --additional_parameters="$i.param_name~~~$i.param_value~~~$i.param_label~~~$i.param_help~~~$i.param_type~~~$i.CL"
ANAMEPOS = 0
AVALPOS = 1
ALABPOS = 2
AHELPPOS = 3
ATYPEPOS = 4
ACLPOS = 5
AOCLPOS = 6  # original CL value appended by cleanuppar
64
def timenow():
    """Return the current local time as a 'dd/mm/YYYY HH:MM:SS' string."""
    # strftime uses the current local time when no time tuple is given.
    return time.strftime('%d/%m/%Y %H:%M:%S')
69
70
def quote_non_numeric(s):
    """Return s unchanged if it parses as a float, else wrapped in double quotes.

    Intended for passing parameters to interpreters such as perl or Rscript,
    where non-numeric values need to arrive pre-quoted.
    """
    try:
        float(s)
    except ValueError:
        return '"%s"' % s
    return s
80
81
# Single character -> replacement text used by html_escape.
# "$" is backslash-escaped rather than turned into an entity; html_unescape
# reverses each of these four replacements.
html_escape_table = {
    "&": "&amp;",
    ">": "&gt;",
    "<": "&lt;",
    "$": r"\$",
}


def html_escape(text):
    """Return text with &, >, < and $ replaced per html_escape_table.

    Every other character is passed through unchanged.
    """
    return "".join(html_escape_table.get(c, c) for c in text)
93
94
def html_unescape(text):
    """Revert the entities and escapes produced by html_escape.

    The targets are multi-character so str.replace is used. '&amp;' is
    replaced last: doing it first would turn an escaped literal such as
    '&amp;gt;' into '&gt;' and then wrongly into '>'.
    """
    t = text.replace('&gt;', '>')
    t = t.replace('&lt;', '<')
    t = t.replace('\\$', '$')
    t = t.replace('&amp;', '&')
    return t
102
103
def parse_citations(citations_text):
    """Split a citations blob into (kind, text) tuples.

    citations_text holds entries separated by the literal marker
    "**ENTRY**". Each entry that is not blank yields one tuple: entries
    starting with "doi" become ("doi", rest) and everything else becomes
    ("bibtex", rest), with a leading "bibtex" tag removed when present.

    Surrounding whitespace is stripped before the tag is examined, so an
    entry written as "**ENTRY** doi 10.1/x" is still recognised as a doi,
    and an untagged entry is kept whole instead of losing its first six
    characters.
    """
    citation_tuples = []
    for raw_entry in citations_text.split("**ENTRY**"):
        citation = raw_entry.strip()
        if not citation:
            continue
        if citation.startswith("doi"):
            citation_tuples.append(("doi", citation[len("doi"):].strip()))
        else:
            if citation.startswith("bibtex"):
                citation = citation[len("bibtex"):]
            citation_tuples.append(("bibtex", citation.strip()))
    return citation_tuples
116
117
class ScriptRunner:
    """Wrapper for an arbitrary script or executable.

    Builds two parallel command lines from the parsed arguments:
    self.cl, which is executed here by run(), and self.xmlcl, which is
    handed to galaxyxml as the command line of the generated tool.
    makeXML() writes the tool XML and makeTooltar() packages it, with
    sample inputs and outputs, as a gzipped tarball.
    """

    def __init__(self, args=None):
        """Prepare the command line for running the tool here and the
        elements needed for galaxyxml tool generation.

        args is the argparse namespace built in main(). In script mode the
        script is read from args.script_path, written to a temporary file
        (self.sfile) that is what actually gets run, and also saved in the
        working directory as <tool_name>.<interpreter_name>.

        Raises AssertionError for an unknown args.parampass or an empty
        script; cleanuppar() may also raise it for non-numeric positional
        ordinals.
        """
        self.infiles = [x.split(ourdelim) for x in args.input_files]
        self.outfiles = [x.split(ourdelim) for x in args.output_files]
        self.addpar = [x.split(ourdelim) for x in args.additional_parameters]
        self.args = args
        self.cleanuppar()
        self.lastclredirect = None
        self.lastxclredirect = None
        self.cl = []
        self.xmlcl = []
        aCL = self.cl.append
        assert args.parampass in ['0', 'argparse', 'positional'], 'Parameter passing in args.parampass must be "0","positional" or "argparse"'
        self.tool_name = re.sub('[^a-zA-Z0-9_]+', '', args.tool_name)
        self.tool_id = self.tool_name
        self.xmlfile = '%s.xml' % self.tool_name
        if self.args.runmode in ("Executable", "system"):
            # binary - no script to stage, this little CL will just run
            aCL(self.args.exe_package)
        else:
            with open(self.args.script_path, 'r') as script_in:
                rx = [x.rstrip() for x in script_in]
            assert any(x.strip() for x in rx), "Supplied script is empty. Cannot run"
            self.script = '\n'.join(rx)
            fhandle, self.sfile = tempfile.mkstemp(
                prefix=self.tool_name, suffix=".%s" % (args.interpreter_name))
            # Write through the descriptor mkstemp returned so that it is
            # closed instead of being leaked.
            with os.fdopen(fhandle, 'w') as tscript:
                tscript.write(self.script)
            self.indentedScript = " %s" % '\n'.join(
                [' %s' % html_escape(x) for x in rx])
            self.escapedScript = "%s" % '\n'.join(
                [' %s' % html_escape(x) for x in rx])
            art = '%s.%s' % (self.tool_name, args.interpreter_name)
            with open(art, 'wb') as artifact:
                artifact.write(bytes(self.script, "utf8"))
            aCL(self.args.interpreter_name)
            aCL(self.sfile)
        self.elog = "%s_error_log.txt" % self.tool_name
        self.tlog = "%s_runner_log.txt" % self.tool_name

        if self.args.parampass == '0':
            self.clsimple()
        else:
            # Each suffix entry is [original CL, rewritten CL, value].
            clsuffix = []
            xclsuffix = []
            for p in self.infiles:
                clsuffix.append([p[IOCLPOS], p[ICLPOS], p[IPATHPOS]])
                xclsuffix.append([p[IOCLPOS], p[ICLPOS], '$%s' % p[ICLPOS]])
            for p in self.outfiles:
                if p[OOCLPOS] == "STDOUT":
                    # captured by redirecting stdout, not passed as a parameter
                    self.lastclredirect = ['>', p[ONAMEPOS]]
                    self.lastxclredirect = ['>', '$%s' % p[OCLPOS]]
                else:
                    clsuffix.append([p[OOCLPOS], p[OCLPOS], p[ONAMEPOS]])
                    xclsuffix.append([p[OOCLPOS], p[OCLPOS], '$%s' % p[ONAMEPOS]])
            for p in self.addpar:
                clsuffix.append([p[AOCLPOS], p[ACLPOS], p[AVALPOS]])
                xclsuffix.append([p[AOCLPOS], p[ACLPOS], '"$%s"' % p[ANAMEPOS]])
            # Numeric ordinals must sort as numbers: a plain string sort
            # would put "10" before "2".
            clsuffix.sort(key=self._clsort_key)
            xclsuffix.sort(key=self._clsort_key)
            self.xclsuffix = xclsuffix
            self.clsuffix = clsuffix
            if self.args.parampass == 'positional':
                self.clpositional()
            else:
                self.clargparse()

    @staticmethod
    def _clsort_key(entry):
        """Sort key for [original CL, CL, value] entries.

        Entries whose original CL is an integer sort first, in numeric
        order; all others keep plain string order. The entry itself is the
        final tie-breaker.
        """
        try:
            return (0, int(entry[0]), entry)
        except ValueError:
            return (1, 0, entry)

    def cleanuppar(self):
        """Normalise the CL field of every input, output and parameter.

        Positional parameters are complicated by their numeric ordinal:
        the original CL value is appended to each record (see the *OCLPOS
        constants) and a numeric CL is then replaced by a usable name.

        Raises AssertionError when parampass is 'positional' and a CL
        field is not an integer (STDOUT is allowed for outputs).
        """
        is_positional = self.args.parampass == 'positional'
        for i, p in enumerate(self.infiles):
            if is_positional:
                assert p[ICLPOS].isdigit(), "Positional parameters must be ordinal integers - got %s for %s" % (p[ICLPOS], p[ILABPOS])
            p.append(p[ICLPOS])
            if p[ICLPOS].isdigit() or self.args.parampass == "0":
                p[ICLPOS] = 'input%d' % (i + 1)
        for p in self.outfiles:
            # trying to automagically gather using extensions
            if is_positional and p[OCLPOS] != "STDOUT":
                assert p[OCLPOS].isdigit(), "Positional parameters must be ordinal integers - got %s for %s" % (p[OCLPOS], p[ONAMEPOS])
            p.append(p[OCLPOS])
            if p[OCLPOS].isdigit() or p[OCLPOS] == "STDOUT":
                p[OCLPOS] = p[ONAMEPOS]
        for p in self.addpar:
            if is_positional:
                assert p[ACLPOS].isdigit(), "Positional parameters must be ordinal integers - got %s for %s" % (p[ACLPOS], p[ANAMEPOS])
            p.append(p[ACLPOS])
            if p[ACLPOS].isdigit():
                p[ACLPOS] = 'input%s' % p[ACLPOS]

    def clsimple(self):
        """No parameters - uses < and > for i/o.

        Only the first input and the first output are used.
        """
        self.cl.extend([
            '<', self.infiles[0][IPATHPOS],
            '>', self.outfiles[0][OCLPOS],
        ])
        self.xmlcl.extend([
            '<', '$%s' % self.infiles[0][ICLPOS],
            '>', '$%s' % self.outfiles[0][ONAMEPOS],
        ])

    def clpositional(self):
        """Positional style: values only, in sorted ordinal order.

        A STDOUT output is added to the XML command line as a trailing
        redirect; for the local run, run() redirects stdout itself.
        """
        # inputs in order then params
        self.cl.extend(v for (o_v, k, v) in self.clsuffix)
        self.xmlcl.extend(v for (o_v, k, v) in self.xclsuffix)
        if self.lastxclredirect:
            self.xmlcl.extend(self.lastxclredirect[:2])

    def clargparse(self):
        """Argparse style: each value is preceded by -k or --name.

        Single character names get one dash, longer names two. The same
        prefix is applied to the XML command line so that the generated
        tool calls the script exactly as it is called here.
        """
        # inputs then params in argparse named form
        for target, suffix in ((self.xmlcl, self.xclsuffix),
                               (self.cl, self.clsuffix)):
            for (o_v, k, v) in suffix:
                if len(k.strip()) == 1:
                    flag = '-%s' % k
                else:
                    flag = '--%s' % k
                target.append(flag)
                target.append(v)

    def makeXML(self):
        """Create a Galaxy xml tool wrapper for the new script.

        Uses galaxyxml. The command line comes from self.xmlcl through
        command_line_override; inputs, outputs, additional parameters and
        one test are declared to match it. The XML is written to
        self.xmlfile in the working directory.

        Raises AssertionError if neither an interpreter nor an executable
        was given or, for parampass "0", if there is not exactly one input,
        and ValueError for an unknown additional parameter type.
        """
        if self.args.interpreter_name:
            exe = "$runMe"
            interp = self.args.interpreter_name
        else:
            interp = None
            exe = self.args.exe_package
        assert exe is not None, 'No interpeter or executable passed in to makeXML'
        tool = gxt.Tool(self.args.tool_name, self.tool_id,
                        self.args.tool_version, self.args.tool_desc, exe)
        tool.command_line_override = self.xmlcl
        print('#### tool cl override=', self.xmlcl)
        if interp:
            tool.interpreter = interp
        if self.args.help_text:
            with open(self.args.help_text, 'r') as help_in:
                tool.help = ''.join(html_escape(x) for x in help_in)
        else:
            # One literal: a backslash continuation inside the string
            # would embed the source indentation in the help text.
            tool.help = ('Please ask the tool author (%s) for help '
                         'as none was supplied at tool generation\n'
                         % (self.args.user_email))
        tool.version_command = None  # do not want
        tinputs = gxtp.Inputs()
        toutputs = gxtp.Outputs()
        requirements = gxtp.Requirements()
        testparam = []
        is_positional = (self.args.parampass == 'positional')
        if self.args.interpreter_name:
            # bash and sh are not declared as package requirements
            if self.args.interpreter_name not in ['bash', 'sh']:
                requirements.append(gxtp.Requirement(
                    'package', self.args.interpreter_name,
                    self.args.interpreter_version))
        else:
            # NOTE(review): parampass is never "system" (see the assert in
            # __init__), so the second test is always true - runmode may
            # have been intended. Left unchanged.
            if self.args.exe_package and self.args.parampass != "system":
                requirements.append(gxtp.Requirement(
                    'package', self.args.exe_package,
                    self.args.exe_package_version))
        tool.requirements = requirements
        if self.args.parampass == '0':
            max1s = 'Maximum one input if parampass is 0 - more than one input files supplied - %s' % str(self.infiles)
            assert len(self.infiles) == 1, max1s
            alab = self.infiles[0][ILABPOS]
            if len(alab) == 0:
                alab = self.infiles[0][ICLPOS]
            newname = self.infiles[0][ICLPOS]
            aninput = gxtp.DataParam(
                newname, optional=False, label=alab,
                help=self.infiles[0][IHELPOS],
                format=self.infiles[0][IFMTPOS], multiple=False, num_dashes=0)
            aninput.command_line_override = '< $%s' % newname
            aninput.positional = is_positional
            tinputs.append(aninput)
            testparam.append(
                gxtp.TestParam(name=newname, value='%s_sample' % newname))
            newname = self.outfiles[0][OCLPOS]
            newfmt = self.outfiles[0][OFMTPOS]
            anout = gxtp.OutputData(newname, format=newfmt, num_dashes=0)
            anout.command_line_override = '> $%s' % newname
            anout.positional = is_positional
            toutputs.append(anout)
            testparam.append(gxtp.TestOutput(
                name=newname, value='%s_sample' % newname, format=newfmt))
        else:
            for p in self.outfiles:
                newname, newfmt, newcl, oldcl = p
                if is_positional:
                    ndash = 0
                elif len(newcl) < 2:
                    ndash = 1
                else:
                    ndash = 2
                aparm = gxtp.OutputData(newcl, format=newfmt, num_dashes=ndash)
                aparm.positional = is_positional
                if is_positional:
                    if oldcl == "STDOUT":
                        # very large ordinal for the stdout redirect
                        aparm.positional = 9999999
                        aparm.command_line_override = "> $%s" % newcl
                    else:
                        aparm.positional = int(oldcl)
                        aparm.command_line_override = '$%s' % newcl
                toutputs.append(aparm)
                testparam.append(gxtp.TestOutput(
                    name=newcl, value='%s_sample' % newcl, format=newfmt))
            for p in self.infiles:
                newname = p[ICLPOS]
                newfmt = p[IFMTPOS]
                if is_positional:
                    ndash = 0
                elif len(newname) > 1:
                    ndash = 2
                else:
                    ndash = 1
                # fall back to the parameter name when no label was given
                alab = p[ILABPOS] if len(p[ILABPOS]) > 0 else p[ICLPOS]
                aninput = gxtp.DataParam(
                    newname, optional=False, label=alab, help=p[IHELPOS],
                    format=newfmt, multiple=False, num_dashes=ndash)
                aninput.positional = is_positional
                tinputs.append(aninput)
                testparam.append(
                    gxtp.TestParam(name=newname, value='%s_sample' % newname))
            for p in self.addpar:
                newname, newval, newlabel, newhelp, newtype, newcl, oldcl = p
                if not len(newlabel) > 0:
                    newlabel = newname
                if is_positional:
                    ndash = 0
                elif len(newname) > 1:
                    ndash = 2
                else:
                    ndash = 1
                if newtype == "text":
                    aparm = gxtp.TextParam(
                        newname, label=newlabel, help=newhelp,
                        value=newval, num_dashes=ndash)
                elif newtype == "integer":
                    aparm = gxtp.IntegerParam(
                        newname, label=newlabel, help=newhelp,
                        value=newval, num_dashes=ndash)
                elif newtype == "float":
                    aparm = gxtp.FloatParam(
                        newname, label=newlabel, help=newhelp,
                        value=newval, num_dashes=ndash)
                else:
                    raise ValueError(
                        'Unrecognised parameter type "%s" for '
                        'additional parameter %s in makeXML'
                        % (newtype, newname))
                aparm.positional = is_positional
                if is_positional:
                    # the ordinal belongs to this parameter, not to the
                    # last data input built in the loop above
                    aparm.positional = int(oldcl)
                tinputs.append(aparm)
                testparam.append(gxtp.TestParam(newname, value=newval))
        tool.outputs = toutputs
        tool.inputs = tinputs
        if self.args.runmode not in ['Executable', 'system']:
            # embed the script in the tool XML as the $runMe configfile
            configfiles = gxtp.Configfiles()
            configfiles.append(gxtp.Configfile(name="runMe", text=self.script))
            tool.configfiles = configfiles
        tests = gxtp.Tests()
        test_a = gxtp.Test()
        for tp in testparam:
            test_a.append(tp)
        tests.append(test_a)
        tool.tests = tests
        tool.add_comment('Created by %s at %s using the Galaxy Tool Factory.' % (
            self.args.user_email, timenow()))
        tool.add_comment('Source in git at: %s' % (toolFactoryURL))
        tool.add_comment(
            'Cite: Creating re-usable tools from scripts doi: 10.1093/bioinformatics/bts573')
        exml = tool.export()
        with open(self.xmlfile, 'w') as xf:
            xf.write(exml)
            xf.write('\n')
        # ready for the tarball

    def makeTooltar(self):
        """Run the tool once, then package it as a gzipped tarball.

        a tool is a gz tarball with eg
        /toolname/tool.xml /toolname/tool.py /toolname/test-data/test1_in.foo ...
        NOTE names for test inputs and outputs are munged here so must
        correspond to actual input and output names used on the generated cl

        Exits the process with status 1 if the run fails. Otherwise the
        tarball is written to toolfactory_<tool_name>.tgz, copied to
        args.new_tool, and the (zero) return code of the run is returned.
        """
        retval = self.run()
        if retval:
            sys.stderr.write(
                '## Run failed. Cannot build yet. Please fix and retry')
            sys.exit(1)
        # NOTE(review): the staging directory is hard-coded; args.tfout is
        # not consulted here.
        tdir = 'tfout'
        if not os.path.exists(tdir):
            os.mkdir(tdir)
        self.makeXML()
        testdir = os.path.join(tdir, 'test-data')
        if not os.path.exists(testdir):
            os.mkdir(testdir)  # make tests directory
        for p in self.infiles:
            shutil.copyfile(
                p[IPATHPOS], os.path.join(testdir, '%s_sample' % p[ICLPOS]))
        for p in self.outfiles:
            if p[OOCLPOS] == 'STDOUT' or self.args.parampass == "0":
                pth = p[ONAMEPOS]
            else:
                pth = p[OCLPOS]
            # each output becomes both a test sample and a top level file
            shutil.copyfile(pth, os.path.join(testdir, '%s_sample' % pth))
            shutil.copyfile(pth, os.path.join(tdir, pth))

        if os.path.exists(self.tlog) and os.stat(self.tlog).st_size > 0:
            shutil.copyfile(self.tlog, os.path.join(
                testdir, 'test1_log.txt'))
        if self.args.runmode not in ['Executable', 'system']:
            # self.sfile is an absolute mkstemp path; joining it to tdir
            # unchanged would yield that same absolute path and the script
            # would never be copied, so only its file name is used.
            stname = os.path.join(tdir, os.path.basename(self.sfile))
            if not os.path.exists(stname):
                shutil.copyfile(self.sfile, stname)
        xtname = os.path.join(tdir, self.xmlfile)
        if not os.path.exists(xtname):
            shutil.copyfile(self.xmlfile, xtname)
        tarpath = 'toolfactory_%s.tgz' % self.tool_name
        with tarfile.open(tarpath, "w:gz") as tf:
            tf.add(name=tdir, arcname=self.tool_name)
        shutil.copyfile(tarpath, self.args.new_tool)
        return retval

    def run(self):
        """Execute self.cl once and return the exit code of the process.

        For parampass "0" the first input is connected to stdin and the
        first output to stdout. Otherwise stderr goes to self.elog and
        stdout goes either to the STDOUT output file or to self.tlog.
        Log files left empty are removed, and captured stderr is echoed to
        sys.stderr when the exit code is non-zero.
        """
        logging.debug('run cl=%s' % str(self.cl))
        scl = ' '.join(self.cl)
        err = None
        if self.args.parampass != '0':
            if self.lastclredirect:
                # is name of an output file - no header line is added to it
                stdout_path = self.lastclredirect[1]
                header = None
            else:
                stdout_path = self.tlog
                header = '## Executing Toolfactory generated command line = %s\n' % scl
            with open(self.elog, 'wb') as ste, open(stdout_path, 'wb') as sto:
                if header is not None:
                    sto.write(bytes(header, "utf8"))
                sto.flush()
                p = subprocess.run(self.cl, shell=False, stdout=sto,
                                   stderr=ste)
            # Decode rather than str() the bytes, which would produce a
            # "b'...'" repr instead of the actual message text.
            with open(self.elog, 'rb') as tmp_stderr:
                err = tmp_stderr.read().decode('utf8', errors='replace')
        else:
            # work around special case of simple scripts that take stdin
            # and write to stdout
            # NOTE(review): self.cl still holds the '<' and '>' tokens added
            # by clsimple; with shell=False they reach the script as plain
            # arguments while the redirection itself is done here.
            with open(self.infiles[0][IPATHPOS], 'rb') as sti, \
                    open(self.outfiles[0][ONAMEPOS], 'wb') as sto:
                p = subprocess.run(self.cl, shell=False, stdout=sto,
                                   stdin=sti)
        retval = p.returncode
        for logpath in (self.tlog, self.elog):
            if os.path.isfile(logpath) and os.stat(logpath).st_size == 0:
                os.unlink(logpath)
        if retval != 0 and err:  # problem
            sys.stderr.write(err)
        logging.debug('run done')
        return retval
556
557
def main():
    """
    This is a Galaxy wrapper. It expects to be called by a special purpose tool.xml as:
    <command interpreter="python">rgBaseScriptWrapper.py --script_path "$scriptPath" --tool_name "foo" --interpreter "Rscript"
    </command>

    Parses the command line, validates it, builds a ScriptRunner and then
    either packages a new tool (--make_Tool) or just runs the script.
    Exits with the return code of the run when it is non-zero.

    Raises AssertionError when validation fails. The checks are explicit
    raises rather than assert statements so that they, and the bad_user
    authorisation check in particular, still run under "python -O".
    """
    parser = argparse.ArgumentParser()
    a = parser.add_argument
    a('--script_path', default='')
    a('--tool_name', default=None)
    a('--interpreter_name', default=None)
    a('--interpreter_version', default=None)
    a('--exe_package', default=None)
    a('--exe_package_version', default=None)
    a('--input_files', default=[], action="append")
    a('--output_files', default=[], action="append")
    a('--user_email', default='Unknown')
    a('--bad_user', default=None)
    a('--make_Tool', default=None)
    a('--help_text', default=None)
    a('--tool_desc', default=None)
    a('--tool_version', default=None)
    a('--citations', default=None)
    a('--additional_parameters', action='append', default=[])
    a('--edit_additional_parameters', action="store_true", default=False)
    a('--parampass', default="positional")
    a('--tfout', default="./tfout")
    a('--new_tool', default="new_tool")
    a('--runmode', default=None)
    args = parser.parse_args()
    if args.bad_user:
        raise AssertionError(
            'UNAUTHORISED: %s is NOT authorized to use this tool until Galaxy admin adds %s to "admin_users" in the Galaxy configuration file' % (
                args.bad_user, args.bad_user))
    if not args.tool_name:
        raise AssertionError(
            '## Tool Factory expects a tool name - eg --tool_name=DESeq')
    if not (args.interpreter_name or args.exe_package):
        raise AssertionError(
            '## Tool Factory wrapper expects an interpreter - eg --interpreter_name=Rscript or an executable package findable by the dependency management package')
    has_script = len(args.script_path) > 0 and os.path.isfile(args.script_path)
    if not (args.exe_package or has_script):
        raise AssertionError(
            '## Tool Factory wrapper expects a script path - eg --script_path=foo.R if no executable')
    args.input_files = [x.replace('"', '').replace("'", '')
                        for x in args.input_files]
    # remove quotes we need to deal with spaces in CL params
    args.additional_parameters = [
        x.replace('"', '') for x in args.additional_parameters]
    r = ScriptRunner(args)
    if args.make_Tool:
        retcode = r.makeTooltar()
    else:
        retcode = r.run()
    if retcode:
        sys.exit(retcode)  # indicate failure to job runner


if __name__ == "__main__":
    main()