# HG changeset patch # User galaxyp # Date 1371740867 14400 # Node ID e9981e6af6668304ce93b6f2accd00dec872d9a2 Improved some datatype handling diff -r 000000000000 -r e9981e6af666 LICENSE --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/LICENSE Thu Jun 20 11:07:47 2013 -0400 @@ -0,0 +1,11 @@ +--2012-09-19 08:46:18-- http://www.apache.org/licenses/LICENSE-2.0.txt +Resolving www.apache.org... 140.211.11.131, 192.87.106.229, 2001:610:1:80bc:192:87:106:229 +Connecting to www.apache.org|140.211.11.131|:80... connected. +HTTP request sent, awaiting response... 200 OK +Length: 11358 (11K) [text/plain] +Saving to: “LICENSE-2.0.txt” + + 0K .......... . 100% 200K=0.06s + +2012-09-19 08:46:18 (200 KB/s) - “LICENSE-2.0.txt” saved [11358/11358] + diff -r 000000000000 -r e9981e6af666 README.md --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Thu Jun 20 11:07:47 2013 -0400 @@ -0,0 +1,23 @@ +Tool wrapper for the commercial proteomics application Scaffold. +# Obtaining Tools + +Repositories for all Galaxy-P tools can be found at +https://bitbucket.org/galaxyp/. + +# Contact + +Please send suggestions for improvements and bug reports to +jmchilton@gmail.com. + +# License + +All Galaxy-P tools are licensed under the Apache License Version 2.0 +unless otherwise documented. + +# Tool Versioning + +Galaxy-P tools will have versions of the form X.Y.Z. Versions +differing only after the second decimal should be completely +compatible with each other. Breaking changes should result in an +increment of the number before and/or after the first decimal. All +tools of version less than 1.0.0 should be considered beta. diff -r 000000000000 -r e9981e6af666 README_GALAXYP.md --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README_GALAXYP.md Thu Jun 20 11:07:47 2013 -0400 @@ -0,0 +1,22 @@ +# Obtaining Tools + +Repositories for all Galaxy-P tools can be found at +https://bitbucket.org/galaxyp/. 
+ +# Contact + +Please send suggestions for improvements and bug reports to +jmchilton@gmail.com. + +# License + +All Galaxy-P tools are licensed under the Apache License Version 2.0 +unless otherwise documented. + +# Tool Versioning + +Galaxy-P tools will have versions of the form X.Y.Z. Versions +differing only after the second decimal should be completely +compatible with each other. Breaking changes should result in an +increment of the number before and/or after the first decimal. All +tools of version less than 1.0.0 should be considered beta. diff -r 000000000000 -r e9981e6af666 README_REPO.md --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README_REPO.md Thu Jun 20 11:07:47 2013 -0400 @@ -0,0 +1,1 @@ +Tool wrapper for the commercial proteomics application Scaffold. diff -r 000000000000 -r e9981e6af666 datatypes_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes_conf.xml Thu Jun 20 11:07:47 2013 -0400 @@ -0,0 +1,9 @@ + + + + + + + + + diff -r 000000000000 -r e9981e6af666 macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Thu Jun 20 11:07:47 2013 -0400 @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r 000000000000 -r e9981e6af666 scaffold.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scaffold.py Thu Jun 20 11:07:47 2013 -0400 @@ -0,0 +1,8 @@ +from galaxy.datatypes.binary import Binary + + +class Sf3(Binary): + """Class describing a Scaffold SF3 file.""" + file_ext = "sf3" + +Binary.register_unsniffable_binary_ext('sf3') diff -r 000000000000 -r e9981e6af666 scaffold.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scaffold.xml Thu Jun 20 11:07:47 2013 -0400 @@ -0,0 +1,139 @@ + + + Visualize and Validate Complex MS/MS Proteomics Experiments + + + # Simple format group:group_name followed by pairs of name:name and path:path lines +#if $sample_mode.mode == "full" +#set $samples = $sample_mode.samples +#for $sample in $samples: +#if 
$sample.category.specify +#set $category = $sample.category.name +#else +#set $category = $sample.sample_name +#end if +sample:$sample.sample_name +mudpit:$sample.mudpit +category:$category +#for $sample_input in $sample.sample_inputs: +name:${sample_input.display_name} +path:${sample_input} +ext:${sample_input.ext} +#end for +#end for +#elif $sample_mode.mode == "sample_per_file": +#for $sample_input in $sample_mode.sample_inputs: +sample:${sample_input.display_name} +mudpit:false +category:${sample_input.display_name} +name:${sample_input.display_name} +path:${sample_input} +ext:${sample_input.ext} +#end for +#end if + + + + scaffold_wrapper.py run \ + --samples $sample_config \ + --database $database \ + --database_name '$database.display_name'\ + --output $output \ + --database_type $database_type \ + --database_decoy_regex '$database_decoy_regex' \ + #if $thresholds.specify + --protein_probability '$thresholds.protein_probability' \ + --peptide_probability '$thresholds.peptide_probability' \ + #end if + #if $advanced.specify + #if $advanced.output_driver + --output_driver $output_driver \ + #end if + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (advanced['specify'] and advanced["output_driver"]) + + + + + scaffold + + +**What it does** + +Merges multiple protein identification search results together into a single SF3 file for viewing. A free viewer for Scaffold SF3 files can be obtained from Proteome software at http://www.proteomesoftware.com/Scaffold/Scaffold_viewer.htm. + +------ + + +**Citation** + +For the underlying tool, please cite `TODO` + +If you use this tool in Galaxy, please cite Chilton J, et al. 
https://bitbucket.org/galaxyp/galaxyp-toolshed-scaffold + + \ No newline at end of file diff -r 000000000000 -r e9981e6af666 scaffold_export.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scaffold_export.xml Thu Jun 20 11:07:47 2013 -0400 @@ -0,0 +1,75 @@ + + + Export summary from Scaffold SF3 file. + + + macros.xml + + + scaffold_wrapper.py export \ + --sf3 $sf3_input \ + --output $output \ + --export_type $export.export_type \ + ## Begin Threshold Parameters + #set $threshold_type = $threshold.type + #set $threshold_options = $threshold + #if $threshold_type != "none" + --protein_probability=$threshold_options.protein_probability \ + --peptide_probability=$threshold_options.peptide_probability \ + --minimum_peptide_count=$threshold_options.minimum_peptide_count \ + #if $threshold_type != "simple" + $threshold_options.ignore_charge_1 \ + $threshold_options.ignore_charge_2 \ + $threshold_options.ignore_charge_3 \ + $threshold_options.ignore_charge_4 \ + --minimum_ntt=$threshold_options.minimum_ntt \ + --minimum_peptide_length=$threshold_options.minimum_peptide_length \ + #end if + #end if + ## End Threshold Parameters + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + scaffold + + +**What it does** + +Export data out of Scaffold's binary data format (sf3) into tabular reports or XML. + +------ + + +**Citation** + +For the underlying tool, please cite `TODO` + +If you use this tool in Galaxy, please cite Chilton J, et al. 
https://bitbucket.org/galaxyp/galaxyp-toolshed-scaffold + + \ No newline at end of file diff -r 000000000000 -r e9981e6af666 scaffold_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scaffold_wrapper.py Thu Jun 20 11:07:47 2013 -0400 @@ -0,0 +1,319 @@ +#!/usr/bin/env python +import optparse +import os +import shutil +import sys +import tempfile +import subprocess +import logging +from string import Template +from xml.sax.saxutils import escape + +log = logging.getLogger(__name__) + +DEBUG = True + +working_directory = os.getcwd() +tmp_stderr_name = tempfile.NamedTemporaryFile(dir=working_directory, suffix='.stderr').name +tmp_stdout_name = tempfile.NamedTemporaryFile(dir=working_directory, suffix='.stdout').name + + +def stop_err(msg): + sys.stderr.write("%s\n" % msg) + sys.exit(1) # error path: exit status must be non-zero + + +def read_stderr(): + stderr = '' + if(os.path.exists(tmp_stderr_name)): + with open(tmp_stderr_name, 'rb') as tmp_stderr: + buffsize = 1048576 + try: + while True: + chunk = tmp_stderr.read(buffsize) + stderr += chunk + if len(chunk) < buffsize: # short or empty read means EOF + break + except OverflowError: + pass + return stderr + + +def execute(command, stdin=None): + try: + with open(tmp_stderr_name, 'wb') as tmp_stderr: + with open(tmp_stdout_name, 'wb') as tmp_stdout: + proc = subprocess.Popen(args=command, shell=True, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno(), stdin=stdin, env=os.environ) + returncode = proc.wait() + if returncode != 0: + raise Exception("Program returned with non-zero exit code %d. 
stderr: %s" % (returncode, read_stderr())) + finally: + print open(tmp_stderr_name, "r").read(64000) + print open(tmp_stdout_name, "r").read(64000) + + +def delete_file(path): + if os.path.exists(path): + try: + os.remove(path) + except OSError: + pass + + +def delete_directory(directory): + if os.path.exists(directory): + try: + shutil.rmtree(directory) + except Exception: + pass + + +def symlink(source, link_name): + import platform + if platform.system() == 'Windows': + try: + import win32file + win32file.CreateSymbolicLink(link_name, source, 0) # (link path, target path, flags); 0 = file link + except Exception: + shutil.copy(source, link_name) + else: + os.symlink(source, link_name) + + +def copy_to_working_directory(data_file, relative_path): + if os.path.abspath(data_file) != os.path.abspath(relative_path): + shutil.copy(data_file, relative_path) + return relative_path + + +def __main__(): + run_script() + + +# Extra database attributes: name, databaseAccessionRegEx, databaseDescriptionRegEx, decoyProteinRegEx +# Extra export types: protxml, spectrum-report, statistics, peptide-report, protein-report, experiment-report +RUN_TEMPLATE = """ + + +$samples +$display_thresholds + + + +""" + +EXPORT_TEMPLATE = """ + +$display_thresholds + + + +""" + +def parse_groups(inputs_file, group_parts=["group"], input_parts=["name", "path"]): + inputs_lines = [line.strip() for line in open(inputs_file, "r").readlines()] + inputs_lines = [line for line in inputs_lines if line and not line.startswith("#")] + cur_group = None + i = 0 + group_prefixes = ["%s:" % group_part for group_part in group_parts] + input_prefixes = ["%s:" % input_part for input_part in input_parts] + groups = {} + while i < len(inputs_lines): + line = inputs_lines[i] + if line.startswith(group_prefixes[0]): + # Start new group + cur_group = line[len(group_prefixes[0]):] + group_data = {} + for j, group_prefix in enumerate(group_prefixes): + group_line = inputs_lines[i + j] + group_data[group_parts[j]] = group_line[len(group_prefix):] + i += len(group_prefixes) + elif 
line.startswith(input_prefixes[0]): + input = [] + for j, input_prefix in enumerate(input_prefixes): + part_line = inputs_lines[i + j] + part = part_line[len(input_prefixes[j]):] + input.append(part) + if cur_group not in groups: + groups[cur_group] = {"group_data": group_data, "inputs": []} + groups[cur_group]["inputs"].append(input) + i += len(input_prefixes) + else: + # Skip empty line + i += 1 + return groups + + +def build_samples(samples_file): + group_data = parse_groups(samples_file, group_parts=["sample", "mudpit", "category"], input_parts=["name", "path", "ext"]) + samples_description = "" + for sample_name, sample_data in group_data.iteritems(): + files = sample_data["inputs"] + mudpit = sample_data["group_data"]["mudpit"] + category = sample_data["group_data"]["category"] + samples_description += """\n""" % (sample_name, mudpit, category) + for (name, path, ext) in files: + name = os.path.basename(name) + if not name.lower().endswith(ext.lower()): + name = "%s.%s" % (name, ext) + symlink(path, name) + samples_description += "%s\n" % os.path.abspath(name) + samples_description += """\n""" + return samples_description + + +def run_script(): + action = sys.argv[1] + if action == "run": + proc = scaffold_run + elif action == "export": + proc = scaffold_export + proc() + + +def scaffold_export(): + parser = optparse.OptionParser() + parser.add_option("--sf3") + parser.add_option("--output") + parser.add_option("--export_type") + populate_threshold_options(parser) + (options, args) = parser.parse_args() + + template_parameters = {} + + template_parameters["sf3_path"] = options.sf3 + template_parameters["export_options"] = """ type="%s" """ % options.export_type + template_parameters["display_thresholds"] = build_display_thresholds(options) + + execute_scaffold(options, EXPORT_TEMPLATE, template_parameters) + + +def build_display_thresholds(options): + attributes = ['id="thresh"'] + if options.protein_probability is not None: + 
attributes.append('proteinProbability="%s"' % options.protein_probability) + if options.peptide_probability is not None: + attributes.append('peptideProbability="%s"' % options.peptide_probability) + if options.minimum_peptide_count is not None: + attributes.append('minimumPeptideCount="%s"' % options.minimum_peptide_count) + if options.minimum_peptide_length is not None: + attributes.append('minimumPeptideLength="%s"' % options.minimum_peptide_length) + if options.minimum_ntt is not None: + attributes.append('minimumNTT="%s"' % options.minimum_ntt) + attributes.append('useCharge="%s"' % build_use_charge_option(options)) + tag_open = "" + tag_body = "".join([f(options) for f in [tandem_opts, omssa_opts]]) + tag_close = "" + return tag_open + tag_body + tag_close + + +def tandem_opts(options): + element = "" + tandem_score = options.tandem_score + if tandem_score: + element = '' % ((tandem_score,) * 4) + return element + + +def omssa_opts(options): + return "" + + +def build_use_charge_option(options): + use_charge_array = [] + for i in ["1", "2", "3", "4"]: + use_charge_i = getattr(options, "use_charge_%s" % i, True) + use_charge_array.append("true" if use_charge_i else "false") + return ",".join(use_charge_array) + + +def populate_threshold_options(option_parser): + option_parser.add_option("--protein_probability", default=None) + option_parser.add_option("--peptide_probability", default=None) + option_parser.add_option("--minimum_peptide_count", default=None) + option_parser.add_option("--ignore_charge_1", action="store_false", dest="use_charge_1", default=True) + option_parser.add_option("--ignore_charge_2", action="store_false", dest="use_charge_2", default=True) + option_parser.add_option("--ignore_charge_3", action="store_false", dest="use_charge_3", default=True) + option_parser.add_option("--ignore_charge_4", action="store_false", dest="use_charge_4", default=True) + option_parser.add_option("--minimum_peptide_length", default=None) + 
option_parser.add_option("--minimum_ntt", default=None) + option_parser.add_option("--tandem_score", default=None) + option_parser.add_option("--omssa_peptide_probability", default=None) + option_parser.add_option("--omssa_log_expect_score", default=None) + + +def database_rules(database_type): + rules_dict = { + "ESTNR": (">(gi\\|[0-9]*)", ">[^ ]* (.*)"), + "IPI": (">IPI:([^\\| .]*)", ">[^ ]* Tax_Id=[0-9]* (.*)"), + "SWISSPROT": (">([^ ]*)", ">[^ ]* \$[^ ]*\$ (.*)"), + "UNIPROT": (">[^ ]*\\|([^ ]*)", ">[^ ]*\\|[^ ]* (.*)"), + "UNIREF": (">UniRef100_([^ ]*)", ">[^ ]* (.*)"), + "ENSEMBL": (">(ENS[^ ]*)", ">[^ ]* (.*)"), + "MSDB": (">([^ ]*)", ">[^ ]* (.*)"), + "GENERIC": (">([^ ]*)", ">[^ ]* (.*)"), + } + database_type = database_type if database_type in rules_dict else "GENERIC" + return rules_dict[database_type] + + +def scaffold_run(): + parser = optparse.OptionParser() + parser.add_option("--samples") + parser.add_option("--database") + parser.add_option("--database_name") + parser.add_option("--database_type") + parser.add_option("--database_decoy_regex") + parser.add_option("--output") + parser.add_option("--output_driver") + populate_threshold_options(parser) + (options, args) = parser.parse_args() + + template_parameters = {} + + # Read samples from config file and convert to XML + template_parameters["samples"] = build_samples(options.samples) + template_parameters["display_thresholds"] = build_display_thresholds(options) + + # Setup database parameters + database_path = options.database + database_name = options.database_name + database_type = options.database_type + database_decoy_regex = options.database_decoy_regex + + (accession_regex, description_regex) = database_rules(database_type) + + template_parameters["database_path"] = database_path + template_parameters["database_name"] = database_name + template_parameters["database_accession_regex"] = escape(accession_regex) + template_parameters["database_description_regex"] = escape(description_regex) + 
template_parameters["database_decoy_regex"] = escape(database_decoy_regex) + + execute_scaffold(options, RUN_TEMPLATE, template_parameters) + + if options.output_driver: + shutil.copy("driver.xml", options.output_driver) + + +def execute_scaffold(options, template, template_parameters): + # Setup output parameter + output_path = options.output + template_parameters["output_path"] = output_path + + # Prepare and create driver file + driver_contents = Template(template).substitute(template_parameters) + print driver_contents + driver_path = os.path.abspath("driver.xml") + open(driver_path, "w").write(driver_contents) + + # Run Scaffold + execute("ScaffoldBatch3 '%s'" % driver_path) + +if __name__ == '__main__': + __main__() diff -r 000000000000 -r e9981e6af666 update.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/update.sh Thu Jun 20 11:07:47 2013 -0400 @@ -0,0 +1,35 @@ +#!/bin/bash + +LICENSE_FILE=LICENSE +# Ensure repository contains license file. +if [ ! -e "$LICENSE_FILE" ]; +then + wget http://www.apache.org/licenses/LICENSE-2.0.txt -O "$LICENSE_FILE" +fi + +# Run repository specific update actions. +if [ -f update_repo.sh ]; +then + ./update_repo.sh +fi + +wget https://raw.github.com/gist/3749747/README_GALAXYP.md -O README_GALAXYP.md + +# Create repository README +if [ ! -e README_REPO.md ]; +then + echo "TODO: Document this tool repository." > README_REPO.md +fi +cat README_REPO.md README_GALAXYP.md > README.md + + +# If version file exists, update all tools to this version +VERSION_FILE=version +if [ -e "$VERSION_FILE" ]; +then + VERSION=`cat $VERSION_FILE` + + # Replace tool version in each tool XML file ` + find -iname "*xml" -exec sed -i'' -e '0,/version="$.\+$"/s/version="$.\+$"/version="'$VERSION'"/1g' {} \; + +fi