changeset 0:c7a363d7ab26 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/protease_prediction commit e933135e5dc9aa8c96800fd10b62b256ac3a8523-dirty
author bgruening
date Sat, 12 Mar 2016 19:28:41 -0500
parents
children
files datatypes_conf.xml macros.xml protease.py protease.xml test-data/CTSL_test.fasta test-data/CTSL_train.fasta test-data/model test-data/predictions.txt
diffstat 8 files changed, 541 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml	Sat Mar 12 19:28:41 2016 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<datatypes>
+    <registration>
+        <!-- Registers the "eden_model" extension as a subclass of Galaxy's
+             binary CompressedArchive datatype and lists it in the upload form. -->
+        <datatype extension="eden_model" type="galaxy.datatypes.binary:CompressedArchive" subclass="True" display_in_upload="True"/>
+    </registration>
+</datatypes>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Sat Mar 12 19:28:41 2016 -0500
@@ -0,0 +1,34 @@
+<macros>
+    <!-- Tool version; substituted wherever @VERSION@ appears in the tool XML. -->
+    <token name="@VERSION@">0.9</token>
+    <!-- Package dependency of the tool: eden, version 0.2.1b. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="0.2.1b">eden</requirement>
+        </requirements>
+    </xml>
+    <!-- Any exit code of 1 or above is reported as a fatal tool error. -->
+    <xml name="stdio">
+        <stdio>
+            <exit_code range="1:" level="fatal" description="Error occurred. Please check Tool Standard Error" />
+        </stdio>
+    </xml>
+    <!-- Task selector for the tool form: "fit" takes FASTA training samples,
+         "predict" takes a stored model plus FASTA sequences to classify.
+         Elements nested inside the expanding element are inserted at the
+         yield marker, i.e. into the "fit" branch. -->
+    <xml name="loadConditional">
+        <conditional name="selected_tasks">
+          <param name="selected_task" type="select" label="Select a Classification Task">
+              <option value="fit" selected="true">Train a model</option>
+              <option value="predict">Load a model and predict</option>
+          </param>
+          <when value="predict">
+              <param name="infile_model" type="data" format="eden_model" label="Models" help="Select a model file." />
+              <param name="infile_data" type="data" format="fasta" label="Data (fasta)" help="Select the FASTA sequences you want to classify."/>
+          </when>
+          <when value="fit">
+            <param name="infile_train" type="data" format="fasta" label="Training samples (FASTA)" />
+            <yield />
+          </when>
+        </conditional>
+    </xml>
+    <!-- Citation emitted by the tool, referenced by DOI. -->
+    <xml name="eden_citation">
+        <citations>
+            <citation type="doi">10.5281/zenodo.27945</citation>
+        </citations>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/protease.py	Sat Mar 12 19:28:41 2016 -0500
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+# Program description; passed to main_script() at the bottom of this file.
+description = """
+Explicit Decomposition with Neighborhood (EDeN) utility program.
+Protease modelling driver.
+"""
+
+# Epilog text (authorship, license, citation); also passed to main_script().
+epilog = """
+Author: Fabrizio Costa
+Copyright: 2015
+License: GPL
+Maintainer: Fabrizio Costa
+Email: costa@informatik.uni-freiburg.de
+Status: Production
+
+Cite:  Costa, Fabrizio, and Kurt De Grave, 'Fast neighborhood subgraph pairwise
+distance kernel', Proceedings of the 26th International Conference on Machine
+Learning. 2010. """
+
+import os
+import logging
+
+from eden.graph import Vectorizer
+from eden.model_base import ModelInitializerBase, main_script
+from eden.converter.fasta import fasta_to_sequence
+from eden.modifier.seq import seq_to_seq
+from eden.modifier.seq import shuffle_modifier
+from eden.modifier.seq import mark_modifier
+from eden.converter.fasta import sequence_to_eden
+
+
+class ModelInitializer(ModelInitializerBase):
+
+    def load_data(self, args):
+        seqs = fasta_to_sequence(args.input_file)
+        return seqs
+
+    def load_positive_data(self, args):
+        return self.load_data(args)
+
+    def load_negative_data(self, args):
+        seqs = self.load_data(args)
+        return seq_to_seq(seqs,
+                          modifier=shuffle_modifier,
+                          times=args.negative_ratio,
+                          order=args.shuffle_order)
+
+    def pre_processor_init(self, args):
+        def pre_processor(seqs, **args):
+            seqs = seq_to_seq(seqs, modifier=mark_modifier, position=0.5, mark='%')
+            seqs = seq_to_seq(seqs, modifier=mark_modifier, position=0.0, mark='@')
+            seqs = seq_to_seq(seqs, modifier=mark_modifier, position=1.0, mark='*')
+            graphs = sequence_to_eden(seqs)
+            return graphs
+
+        pre_processor_parameters = {}
+        return pre_processor, pre_processor_parameters
+
+    def vectorizer_init(self, args):
+        vectorizer = Vectorizer()
+        vectorizer_parameters = {'complexity': [2, 3, 4, 5, 6]}
+        return vectorizer, vectorizer_parameters
+
+    def add_arguments(self, parser):
+        parser.add_argument('--version', action='version', version='0.1')
+        return parser
+
+    def add_arguments_fit(self, parser):
+        parser.add_argument("-i", "--input-file",
+                            dest="input_file",
+                            help="Path to FASTA file containing input sequences.",
+                            required=True)
+        parser.add_argument("--negative-ratio",
+                            dest="negative_ratio",
+                            type=int,
+                            help="Relative size ration for the randomly permuted negative instances w.r.t.\
+                            the positive instances.",
+                            default=2)
+        parser.add_argument("--shuffle-order",
+                            dest="shuffle_order",
+                            type=int,
+                            help="Order of the k-mer for the random shuffling procedure.",
+                            default=2)
+        return parser
+
+    def add_arguments_estimate(self, parser):
+        return self.add_arguments_fit(parser)
+
+if __name__ == "__main__":
+    model_initializer = ModelInitializer()
+    main_script(model_initializer=model_initializer,
+                description=description,
+                epilog=epilog,
+                prog_name=os.path.basename(__file__),
+                logger=logging.getLogger())
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/protease.xml	Sat Mar 12 19:28:41 2016 -0500
@@ -0,0 +1,81 @@
+<tool id="eden_protease_prediction" name="Protease prediction" version="@VERSION@">
+    <description>based on cleavage sites</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <expand macro="stdio"/>
+    <version_command>echo "@VERSION@"</version_command>
+    <!-- Runs protease.py in "fit" or "predict" mode depending on the selected
+         task. The script path and all dataset paths are single-quoted so that
+         paths containing spaces or shell metacharacters are passed as single
+         arguments. -->
+    <command><![CDATA[
+    python '$__tool_directory__/protease.py'
+    #if $selected_tasks.selected_task == 'fit':
+        fit
+        -i '$selected_tasks.infile_train'
+        --negative-ratio $selected_tasks.options.negative_ratio
+        --shuffle-order $selected_tasks.options.shuffle_order
+        -r  $selected_tasks.options.random_state
+    #else:
+        predict
+        -m '$selected_tasks.infile_model'
+        -i '$selected_tasks.infile_data'
+    #end if
+]]>
+    </command>
+    <inputs>
+        <!-- The task selector comes from the loadConditional macro; the section
+             below is injected into its "fit" branch.
+             NOTE(review): negative_ratio and shuffle_order are declared
+             optional, yet the command template always passes them; confirm
+             that an emptied field is handled. -->
+        <expand macro="loadConditional">
+            <section name="options" title="Advanced Options" expanded="False">
+                <param name="negative_ratio" type="integer" optional="true" value="2" label="Negative to positive instance ratio"
+                    help="Relative size ratio for the randomly permuted negative instances w.r.t. the positive instances." />
+                <param name="shuffle_order" type="integer" optional="true" value="2" label="Order of k-mer shuffling"
+                    help="Order of the k-mer for the random shuffling procedure." />
+                <param name="random_state" type="integer" value="1" label="Random seed" />
+            </section>
+        </expand>
+    </inputs>
+    <outputs>
+        <!-- One output per run, selected by the chosen task. Both files are
+             collected from the "out" directory of the job working directory;
+             presumably that is where protease.py writes by default (not
+             visible in this tool definition). -->
+        <data format="tabular" name="outfile_predict" from_work_dir="out/predictions.txt">
+            <filter>selected_tasks['selected_task'] == 'predict'</filter>
+        </data>
+        <data format="eden_model" name="outfile_fit" from_work_dir="out/model">
+            <filter>selected_tasks['selected_task'] == 'fit'</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Train on the bundled CTSL set. The model file is compared by size
+             only (sim_size with a delta), and its datatype is checked. -->
+        <test>
+            <param name="infile_train" value="CTSL_train.fasta" ftype="fasta"/>
+            <param name="selected_task" value="fit"/>
+            <param name="shuffle_order" value="3"/>
+            <output name="outfile_fit" file="model" ftype="eden_model" compare="sim_size" delta="100000"/>
+        </test>
+        <!-- Predict with the bundled model; the output must match
+             predictions.txt and be of type tabular. -->
+        <test>
+            <param name="infile_model" value="model" ftype="eden_model"/>
+            <param name="infile_data" value="CTSL_test.fasta" ftype="fasta"/>
+            <param name="selected_task" value="predict"/>
+            <output name="outfile_predict" file="predictions.txt" ftype="tabular"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+This tool can learn the cleavage specificity of a given class of protease. In a second step this can be used to predict proteases given a cleavage site.
+The method assumes that the candidate cleavage point is between the two amino acids adjacent to the central position.
+The method is based on an efficient string kernel implemented in the Explicit Decomposition with Neighbourhood (EDeN) library.
+This approach uses the notion of k-mers with gaps to enumerate all possible substrings of increasing order which are used as features in an efficient linear binary classification estimator.
+
+**Example Input**
+
+::
+
+  >CTSL1
+  SSFVSNWD
+  >CTSL1
+  SSIQATTA
+  >CTSL1
+  SSLAGCQI
+  >CTSL1
+  SSLGGTVV
+
+
+    ]]></help>
+    <expand macro="eden_citation"/>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/CTSL_test.fasta	Sat Mar 12 19:28:41 2016 -0500
@@ -0,0 +1,150 @@
+>a
+SSFVSNWD
+>b
+SSIQATTA
+>c
+SSLAGCQI
+>d
+SSLGGTVV
+>e
+SSLQDCLH
+>f
+SSPAGGHA
+>g
+SSVGNVAD
+>h
+SSYVHGGV
+>i
+STFEERSY
+>j
+TFPKASVP
+>k
+TFVNITPA
+>l
+TGFAGIDS
+>m
+TGFEISSS
+>n
+TGFGMIYD
+>o
+TGLRDPFN
+>p
+TGLTQIET
+>q
+THYFLPPD
+>r
+TKAQAAAP
+>s
+TLIVRPDN
+>t
+TLLNQAPD
+>u
+TLVQTQVE
+>v
+TLWTSDMQ
+>w
+TPFAATSS
+>x
+TPVATSPT
+>y
+TQVHGTIT
+>z
+TRVSHFLP
+>aa
+TSFNGHKP
+>ab
+TSVGSVNP
+>ac
+TSYQSPHG
+>ad
+TTLSGTAP
+>ae
+TTMGGPLP
+>af
+TTVNGQSP
+>ag
+TTVSNSQQ
+>ah
+TVFAEHIS
+>ai
+TVFFDIAV
+>aj
+TVIGGGDT
+>ak
+TVVMASKG
+>al
+TYPQWQPP
+>am
+VAFCDAQS
+>an
+VAFTQVNS
+>ao
+VAVAGCCH
+>ap
+VAVSAAPG
+>aq
+VAYVSFGP
+>ar
+VDIEAIFS
+>as
+VDLSHPGV
+>at
+VELNGNQP
+>au
+VEVLAGHG
+>av
+VFFDIAVD
+>aw
+VFVGGLSP
+>ax
+VGAGGPAP
+>ay
+VGFLEGGK
+>az
+VGFSSGTE
+>bb
+VGINYQPP
+>rr
+VGLTSIAN
+>ss
+VGVSGSET
+>ee
+VHIQAGQC
+>ww
+VHYGEVTN
+>qq
+VIFQGTDH
+>tt
+VIISAPSA
+>zz
+VIITGPPE
+>uu
+VILESDPQ
+>ii
+VILGSEAA
+>oo
+VILHLKED
+>pp
+VLAMSGDP
+>ll
+VLIEHIGN
+>kk
+VLLEGNPD
+>jj
+VLLQAGAD
+>hh
+VLPRSAKE
+>gg
+VLVERSAA
+>ff
+VMIQDGPQ
+>nn
+VMLGETNP
+>bb
+VNIGSIST
+>bn
+VNLQHLDL
+>mm
+VPLGSEKP
+>cc
+VPVTGIPP
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/CTSL_train.fasta	Sat Mar 12 19:28:41 2016 -0500
@@ -0,0 +1,100 @@
+>CTSL1
+AALAAAPA
+>CTSL1
+AALAHISG
+>CTSL1
+AAMAASPH
+>CTSL1
+AAPGSAAP
+>CTSL1
+AARKSAPA
+>CTSL1
+AASGSPGP
+>CTSL1
+AATQGAAA
+>CTSL1
+AAVGGVFD
+>CTSL1
+ACLEKPLL
+>CTSL1
+ADYESVNE
+>CTSL1
+AEIGQNHQ
+>CTSL1
+AESESLVN
+>CTSL1
+AFVNQHLC
+>CTSL1
+AGCTSAGP
+>CTSL1
+AGIATHFV
+>CTSL1
+AGIQHSCQ
+>CTSL1
+AGLESGAE
+>CTSL1
+AGLVSPSL
+>CTSL1
+AGSFGGAG
+>CTSL1
+AGVGEFEA
+>CTSL1
+AGVNTVTT
+>CTSL1
+AGWMGLDC
+>CTSL1
+AGYLGQVT
+>CTSL1
+AHFGIHEE
+>CTSL1
+AHLDITPN
+>CTSL1
+AHLKNSQE
+>CTSL1
+AHLMEIQV
+>CTSL1
+AHLQTSHK
+>CTSL1
+AIFGRPVV
+>CTSL1
+AIICGSGL
+>CTSL1
+AIPMSIPP
+>CTSL1
+AIYEGQLG
+>CTSL1
+AKVKAQTA
+>CTSL1
+ALEYATDT
+>CTSL1
+ALGHRPIP
+>CTSL1
+ALKPMYSM
+>CTSL1
+ALLELQLE
+>CTSL1
+ALLGGHQG
+>CTSL1
+ALLSSAVD
+>CTSL1
+ALVAEEHL
+>CTSL1
+ALVLGGVD
+>CTSL1
+ALVQHQEW
+>CTSL1
+ALVTGGEI
+>CTSL1
+ALWDTAGQ
+>CTSL1
+ALYLVCGE
+>CTSL1
+AMLGNSED
+>CTSL1
+AMLSGPGQ
+>CTSL1
+ANIAHGNS
+>CTSL1
+ANLTQSQI
+>CTSL1
+ANVGAVPS
Binary file test-data/model has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/predictions.txt	Sat Mar 12 19:28:41 2016 -0500
@@ -0,0 +1,75 @@
+1	714865162965.0	a
+-1	-2.0633772184e+12	b
+-1	-2.63278832465e+12	c
+-1	-2.30657489269e+12	d
+-1	-1.60666238581e+12	e
+-1	-5.64892007591e+12	f
+1	1.19958430313e+12	g
+-1	-4.81891904858e+12	h
+-1	-4.26115839421e+12	i
+-1	-2.01451585778e+12	j
+-1	-3.18448213118e+12	k
+-1	-5.32148298316e+12	l
+-1	-4.25594148364e+12	m
+-1	-5.05361918097e+12	n
+-1	-2.81407147475e+12	o
+-1	-743476285794.0	p
+-1	-1.28450200191e+12	q
+-1	-6.82098953196e+12	r
+-1	-911697110363.0	s
+-1	-1.41018885051e+12	t
+1	1.54489789585e+12	u
+1	15904035492.6	v
+-1	-7.1604898574e+12	w
+-1	-291097086285.0	x
+-1	-2.94082503016e+12	y
+-1	-1.73028072922e+12	z
+-1	-1.92238905582e+12	aa
+-1	-635673300943.0	ab
+-1	-486766774604.0	ac
+-1	-1.11318146795e+12	ad
+-1	-3.65821042965e+12	ae
+-1	-114610205054.0	af
+-1	-510138596388.0	ag
+-1	-6.65599199641e+12	ah
+-1	-4.13413986663e+12	ai
+-1	-5.8294381292e+12	aj
+-1	-3.52307285487e+12	ak
+-1	-1.63846242641e+12	al
+-1	-6.2381237974e+12	am
+1	1.56329451125e+12	an
+-1	-3.41757523005e+12	ao
+-1	-3.69981770962e+12	ap
+-1	-1.26491397758e+12	aq
+-1	-6.1732488464e+12	ar
+-1	-2.93027667881e+12	as
+-1	-1.23589278355e+12	at
+-1	-7.81321990096e+12	au
+-1	-3.37867184582e+12	av
+1	1.81255065566e+12	aw
+-1	-5.8103087454e+12	ax
+-1	-7.64938989051e+12	ay
+-1	-2.56010386139e+12	az
+-1	-2.19510046853e+12	bb
+-1	-1.38509574184e+12	rr
+-1	-1.82551763609e+12	ss
+-1	-2.22551450346e+12	ee
+-1	-4.51255078762e+12	ww
+-1	-3.36285574975e+12	qq
+-1	-3.07010023516e+12	tt
+-1	-1.27965891837e+12	zz
+-1	-1.32001091916e+12	uu
+-1	-1.91484366367e+12	ii
+-1	-3.10115319124e+12	oo
+-1	-7.10850199103e+12	pp
+-1	-4.95385785405e+12	ll
+-1	-1.40493423999e+12	kk
+-1	-3.5605949667e+12	jj
+-1	-2.88491677858e+12	hh
+-1	-3.71463321771e+12	gg
+-1	-3.30053101487e+12	ff
+-1	-3.04988922726e+12	nn
+1	50271977527.9	bb
+1	772008596347.0	bn
+-1	-554618958861.0	mm
+-1	-1.30728155546e+12	cc