Repository 'protease_prediction'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/protease_prediction

Changeset 0:c7a363d7ab26 (2016-03-12)
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/protease_prediction commit e933135e5dc9aa8c96800fd10b62b256ac3a8523-dirty
added:
datatypes_conf.xml
macros.xml
protease.py
protease.xml
test-data/CTSL_test.fasta
test-data/CTSL_train.fasta
test-data/model
test-data/predictions.txt
b
diff -r 000000000000 -r c7a363d7ab26 datatypes_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml Sat Mar 12 19:28:41 2016 -0500
b
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<datatypes>
+    <registration>
+        <datatype extension="eden_model" type="galaxy.datatypes.binary:CompressedArchive" subclass="True" display_in_upload="True"/> <!-- "eden_model" is the format used for the model input/output of protease.xml -->
+    </registration>
+</datatypes>
b
diff -r 000000000000 -r c7a363d7ab26 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Sat Mar 12 19:28:41 2016 -0500
b
@@ -0,0 +1,34 @@
+<macros> <!-- shared snippets; a tool pulls them in with <import>macros.xml</import> and <expand macro="..."/> -->
+    <token name="@VERSION@">0.9</token> <!-- value substituted wherever @VERSION@ appears in the importing tool -->
+    <xml name="requirements"> <!-- dependency on the "eden" package, pinned to version 0.2.1b -->
+        <requirements>
+            <requirement type="package" version="0.2.1b">eden</requirement>
+        </requirements>
+    </xml>
+    <xml name="stdio"> <!-- every exit code from 1 upwards is reported as a fatal error -->
+        <stdio>
+            <exit_code range="1:" level="fatal" description="Error occurred. Please check Tool Standard Error" />
+        </stdio>
+    </xml>
+    <xml name="loadConditional"> <!-- task selector: "fit" (default) trains a model, "predict" applies an existing one -->
+        <conditional name="selected_tasks">
+          <param name="selected_task" type="select" label="Select a Classification Task">
+              <option value="fit" selected="true">Train a model</option>
+              <option value="predict">Load a model and predict</option>
+          </param>
+          <when value="predict">
+              <param name="infile_model" type="data" format="eden_model" label="Models" help="Select a model file." />
+              <param name="infile_data" type="data" format="fasta" label="Data (fasta)" help="Select the FASTA sequences you want to classify."/>
+          </when>
+          <when value="fit">
+            <param name="infile_train" type="data" format="fasta" label="Training samples (FASTA)" />
+            <yield /> <!-- replaced by the content nested inside the expanding tool's <expand> element -->
+          </when>
+        </conditional>
+    </xml>
+    <xml name="eden_citation"> <!-- DOI citation block -->
+        <citations>
+            <citation type="doi">10.5281/zenodo.27945</citation>
+        </citations>
+    </xml>
+</macros>
b
diff -r 000000000000 -r c7a363d7ab26 protease.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/protease.py Sat Mar 12 19:28:41 2016 -0500
[
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+
+description = """
+Explicit Decomposition with Neighborhood (EDeN) utility program.
+Protease modelling driver.
+"""
+
+epilog = """
+Author: Fabrizio Costa
+Copyright: 2015
+License: GPL
+Maintainer: Fabrizio Costa
+Email: costa@informatik.uni-freiburg.de
+Status: Production
+
+Cite:  Costa, Fabrizio, and Kurt De Grave, 'Fast neighborhood subgraph pairwise
+distance kernel', Proceedings of the 26th International Conference on Machine
+Learning. 2010. """
+
+import os
+import logging
+
+from eden.graph import Vectorizer
+from eden.model_base import ModelInitializerBase, main_script
+from eden.converter.fasta import fasta_to_sequence
+from eden.modifier.seq import seq_to_seq
+from eden.modifier.seq import shuffle_modifier
+from eden.modifier.seq import mark_modifier
+from eden.converter.fasta import sequence_to_eden
+
+
+class ModelInitializer(ModelInitializerBase):
+    """Protease-specific hooks handed to EDeN's ``main_script`` driver.
+
+    Positive instances are the sequences of the input FASTA file; negative
+    instances are derived from those same sequences with ``shuffle_modifier``.
+    """
+
+    def load_data(self, args):
+        """Read the sequences from the FASTA file ``args.input_file``."""
+        return fasta_to_sequence(args.input_file)
+
+    def load_positive_data(self, args):
+        """Return the input sequences unchanged as the positive class."""
+        return self.load_data(args)
+
+    def load_negative_data(self, args):
+        """Return shuffled input sequences as the negative class.
+
+        ``args.negative_ratio`` and ``args.shuffle_order`` are forwarded to
+        ``seq_to_seq`` as ``times`` and ``order`` respectively.
+        """
+        seqs = self.load_data(args)
+        return seq_to_seq(seqs,
+                          modifier=shuffle_modifier,
+                          times=args.negative_ratio,
+                          order=args.shuffle_order)
+
+    def pre_processor_init(self, args):
+        """Return the sequence-to-graph pre-processor and its (empty) parameters."""
+        # Extra keyword arguments are accepted but unused; they are named
+        # ``kwargs`` so that they do not shadow the enclosing ``args``.
+        def pre_processor(seqs, **kwargs):
+            # Marks are applied at relative positions 0.5 ('%'), 0.0 ('@') and
+            # 1.0 ('*') -- presumably centre, start and end; see mark_modifier.
+            seqs = seq_to_seq(seqs, modifier=mark_modifier, position=0.5, mark='%')
+            seqs = seq_to_seq(seqs, modifier=mark_modifier, position=0.0, mark='@')
+            seqs = seq_to_seq(seqs, modifier=mark_modifier, position=1.0, mark='*')
+            return sequence_to_eden(seqs)
+
+        pre_processor_parameters = {}
+        return pre_processor, pre_processor_parameters
+
+    def vectorizer_init(self, args):
+        """Return a default ``Vectorizer`` and its candidate ``complexity`` values."""
+        vectorizer = Vectorizer()
+        vectorizer_parameters = {'complexity': [2, 3, 4, 5, 6]}
+        return vectorizer, vectorizer_parameters
+
+    def add_arguments(self, parser):
+        """Register the ``--version`` option on ``parser`` and return it."""
+        parser.add_argument('--version', action='version', version='0.1')
+        return parser
+
+    def add_arguments_fit(self, parser):
+        """Register the FASTA input and negative-sampling options on ``parser``."""
+        parser.add_argument("-i", "--input-file",
+                            dest="input_file",
+                            help="Path to FASTA file containing input sequences.",
+                            required=True)
+        parser.add_argument("--negative-ratio",
+                            dest="negative_ratio",
+                            type=int,
+                            help="Relative size ratio for the randomly permuted negative "
+                                 "instances w.r.t. the positive instances.",
+                            default=2)
+        parser.add_argument("--shuffle-order",
+                            dest="shuffle_order",
+                            type=int,
+                            help="Order of the k-mer for the random shuffling procedure.",
+                            default=2)
+        return parser
+
+    def add_arguments_estimate(self, parser):
+        """Register the same options as ``add_arguments_fit``."""
+        return self.add_arguments_fit(parser)
+
+
+if __name__ == "__main__":
+    model_initializer = ModelInitializer()
+    main_script(model_initializer=model_initializer,
+                description=description,
+                epilog=epilog,
+                prog_name=os.path.basename(__file__),
+                logger=logging.getLogger())
b
diff -r 000000000000 -r c7a363d7ab26 protease.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/protease.xml Sat Mar 12 19:28:41 2016 -0500
[
@@ -0,0 +1,85 @@
+<tool id="eden_protease_prediction" name="Protease prediction" version="@VERSION@">
+    <description>based on cleavage sites</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <expand macro="stdio"/>
+    <version_command>echo "@VERSION@"</version_command>
+    <command><![CDATA[
+    ## Paths are single-quoted so that unusual characters in them cannot
+    ## split or alter the command line.
+    python '$__tool_directory__/protease.py'
+    #if $selected_tasks.selected_task == 'fit':
+        fit
+        -i '$selected_tasks.infile_train'
+        --negative-ratio $selected_tasks.options.negative_ratio
+        --shuffle-order $selected_tasks.options.shuffle_order
+        -r  $selected_tasks.options.random_state
+    #else:
+        predict
+        -m '$selected_tasks.infile_model'
+        -i '$selected_tasks.infile_data'
+    #end if
+]]>
+    </command>
+    <inputs>
+        <expand macro="loadConditional">
+            <section name="options" title="Advanced Options" expanded="False">
+                <param name="negative_ratio" type="integer" optional="true" value="2" label="Negative to positive instance ratio"
+                    help="Relative size ratio for the randomly permuted negative instances w.r.t. the positive instances." />
+                <param name="shuffle_order" type="integer" optional="true" value="2" label="Order of k-mer shuffling"
+                    help="Order of the k-mer for the random shuffling procedure." />
+                <param name="random_state" type="integer" value="1" label="Random seed" />
+            </section>
+        </expand>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="outfile_predict" from_work_dir="out/predictions.txt">
+            <filter>selected_tasks['selected_task'] == 'predict'</filter>
+        </data>
+        <data format="eden_model" name="outfile_fit" from_work_dir="out/model">
+            <filter>selected_tasks['selected_task'] == 'fit'</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Train on the bundled FASTA file; the resulting model is compared by size only (sim_size). -->
+        <test>
+            <param name="infile_train" value="CTSL_train.fasta" ftype="fasta"/>
+            <param name="selected_task" value="fit"/>
+            <param name="shuffle_order" value="3"/>
+            <output name="outfile_fit" file="model" ftype="eden_model" compare="sim_size" delta="100000"/>
+        </test>
+        <!-- Predict with the bundled model and compare against the stored predictions. -->
+        <test>
+            <param name="infile_model" value="model" ftype="eden_model"/>
+            <param name="infile_data" value="CTSL_test.fasta" ftype="fasta"/>
+            <param name="selected_task" value="predict"/>
+            <output name="outfile_predict" file="predictions.txt" ftype="tabular"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+This tool can learn the cleavage specificity of a given class of protease. In a second step this can be used to predict proteases given a cleavage site.
+The method assumes that the candidate cleavage point is between the two amino acids adjacent to the central position.
+The method is based on an efficient string kernel implemented in the Explicit Decomposition with Neighbourhood (EDeN) library.
+This approach uses the notion of k-mers with gaps to enumerate all possible substrings of increasing order which are used as features in an efficient linear binary classification estimator.
+
+**Example Input**
+
+::
+
+  >CTSL1
+  SSFVSNWD
+  >CTSL1
+  SSIQATTA
+  >CTSL1
+  SSLAGCQI
+  >CTSL1
+  SSLGGTVV
+
+
+    ]]></help>
+    <expand macro="eden_citation"/>
+</tool>
b
diff -r 000000000000 -r c7a363d7ab26 test-data/CTSL_test.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/CTSL_test.fasta Sat Mar 12 19:28:41 2016 -0500
b
@@ -0,0 +1,150 @@
+>a
+SSFVSNWD
+>b
+SSIQATTA
+>c
+SSLAGCQI
+>d
+SSLGGTVV
+>e
+SSLQDCLH
+>f
+SSPAGGHA
+>g
+SSVGNVAD
+>h
+SSYVHGGV
+>i
+STFEERSY
+>j
+TFPKASVP
+>k
+TFVNITPA
+>l
+TGFAGIDS
+>m
+TGFEISSS
+>n
+TGFGMIYD
+>o
+TGLRDPFN
+>p
+TGLTQIET
+>q
+THYFLPPD
+>r
+TKAQAAAP
+>s
+TLIVRPDN
+>t
+TLLNQAPD
+>u
+TLVQTQVE
+>v
+TLWTSDMQ
+>w
+TPFAATSS
+>x
+TPVATSPT
+>y
+TQVHGTIT
+>z
+TRVSHFLP
+>aa
+TSFNGHKP
+>ab
+TSVGSVNP
+>ac
+TSYQSPHG
+>ad
+TTLSGTAP
+>ae
+TTMGGPLP
+>af
+TTVNGQSP
+>ag
+TTVSNSQQ
+>ah
+TVFAEHIS
+>ai
+TVFFDIAV
+>aj
+TVIGGGDT
+>ak
+TVVMASKG
+>al
+TYPQWQPP
+>am
+VAFCDAQS
+>an
+VAFTQVNS
+>ao
+VAVAGCCH
+>ap
+VAVSAAPG
+>aq
+VAYVSFGP
+>ar
+VDIEAIFS
+>as
+VDLSHPGV
+>at
+VELNGNQP
+>au
+VEVLAGHG
+>av
+VFFDIAVD
+>aw
+VFVGGLSP
+>ax
+VGAGGPAP
+>ay
+VGFLEGGK
+>az
+VGFSSGTE
+>bb
+VGINYQPP
+>rr
+VGLTSIAN
+>ss
+VGVSGSET
+>ee
+VHIQAGQC
+>ww
+VHYGEVTN
+>qq
+VIFQGTDH
+>tt
+VIISAPSA
+>zz
+VIITGPPE
+>uu
+VILESDPQ
+>ii
+VILGSEAA
+>oo
+VILHLKED
+>pp
+VLAMSGDP
+>ll
+VLIEHIGN
+>kk
+VLLEGNPD
+>jj
+VLLQAGAD
+>hh
+VLPRSAKE
+>gg
+VLVERSAA
+>ff
+VMIQDGPQ
+>nn
+VMLGETNP
+>bb
+VNIGSIST
+>bn
+VNLQHLDL
+>mm
+VPLGSEKP
+>cc
+VPVTGIPP
b
diff -r 000000000000 -r c7a363d7ab26 test-data/CTSL_train.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/CTSL_train.fasta Sat Mar 12 19:28:41 2016 -0500
b
@@ -0,0 +1,100 @@
+>CTSL1
+AALAAAPA
+>CTSL1
+AALAHISG
+>CTSL1
+AAMAASPH
+>CTSL1
+AAPGSAAP
+>CTSL1
+AARKSAPA
+>CTSL1
+AASGSPGP
+>CTSL1
+AATQGAAA
+>CTSL1
+AAVGGVFD
+>CTSL1
+ACLEKPLL
+>CTSL1
+ADYESVNE
+>CTSL1
+AEIGQNHQ
+>CTSL1
+AESESLVN
+>CTSL1
+AFVNQHLC
+>CTSL1
+AGCTSAGP
+>CTSL1
+AGIATHFV
+>CTSL1
+AGIQHSCQ
+>CTSL1
+AGLESGAE
+>CTSL1
+AGLVSPSL
+>CTSL1
+AGSFGGAG
+>CTSL1
+AGVGEFEA
+>CTSL1
+AGVNTVTT
+>CTSL1
+AGWMGLDC
+>CTSL1
+AGYLGQVT
+>CTSL1
+AHFGIHEE
+>CTSL1
+AHLDITPN
+>CTSL1
+AHLKNSQE
+>CTSL1
+AHLMEIQV
+>CTSL1
+AHLQTSHK
+>CTSL1
+AIFGRPVV
+>CTSL1
+AIICGSGL
+>CTSL1
+AIPMSIPP
+>CTSL1
+AIYEGQLG
+>CTSL1
+AKVKAQTA
+>CTSL1
+ALEYATDT
+>CTSL1
+ALGHRPIP
+>CTSL1
+ALKPMYSM
+>CTSL1
+ALLELQLE
+>CTSL1
+ALLGGHQG
+>CTSL1
+ALLSSAVD
+>CTSL1
+ALVAEEHL
+>CTSL1
+ALVLGGVD
+>CTSL1
+ALVQHQEW
+>CTSL1
+ALVTGGEI
+>CTSL1
+ALWDTAGQ
+>CTSL1
+ALYLVCGE
+>CTSL1
+AMLGNSED
+>CTSL1
+AMLSGPGQ
+>CTSL1
+ANIAHGNS
+>CTSL1
+ANLTQSQI
+>CTSL1
+ANVGAVPS
b
diff -r 000000000000 -r c7a363d7ab26 test-data/model
b
Binary file test-data/model has changed
b
diff -r 000000000000 -r c7a363d7ab26 test-data/predictions.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/predictions.txt Sat Mar 12 19:28:41 2016 -0500
b
@@ -0,0 +1,75 @@
+1 714865162965.0 a
+-1 -2.0633772184e+12 b
+-1 -2.63278832465e+12 c
+-1 -2.30657489269e+12 d
+-1 -1.60666238581e+12 e
+-1 -5.64892007591e+12 f
+1 1.19958430313e+12 g
+-1 -4.81891904858e+12 h
+-1 -4.26115839421e+12 i
+-1 -2.01451585778e+12 j
+-1 -3.18448213118e+12 k
+-1 -5.32148298316e+12 l
+-1 -4.25594148364e+12 m
+-1 -5.05361918097e+12 n
+-1 -2.81407147475e+12 o
+-1 -743476285794.0 p
+-1 -1.28450200191e+12 q
+-1 -6.82098953196e+12 r
+-1 -911697110363.0 s
+-1 -1.41018885051e+12 t
+1 1.54489789585e+12 u
+1 15904035492.6 v
+-1 -7.1604898574e+12 w
+-1 -291097086285.0 x
+-1 -2.94082503016e+12 y
+-1 -1.73028072922e+12 z
+-1 -1.92238905582e+12 aa
+-1 -635673300943.0 ab
+-1 -486766774604.0 ac
+-1 -1.11318146795e+12 ad
+-1 -3.65821042965e+12 ae
+-1 -114610205054.0 af
+-1 -510138596388.0 ag
+-1 -6.65599199641e+12 ah
+-1 -4.13413986663e+12 ai
+-1 -5.8294381292e+12 aj
+-1 -3.52307285487e+12 ak
+-1 -1.63846242641e+12 al
+-1 -6.2381237974e+12 am
+1 1.56329451125e+12 an
+-1 -3.41757523005e+12 ao
+-1 -3.69981770962e+12 ap
+-1 -1.26491397758e+12 aq
+-1 -6.1732488464e+12 ar
+-1 -2.93027667881e+12 as
+-1 -1.23589278355e+12 at
+-1 -7.81321990096e+12 au
+-1 -3.37867184582e+12 av
+1 1.81255065566e+12 aw
+-1 -5.8103087454e+12 ax
+-1 -7.64938989051e+12 ay
+-1 -2.56010386139e+12 az
+-1 -2.19510046853e+12 bb
+-1 -1.38509574184e+12 rr
+-1 -1.82551763609e+12 ss
+-1 -2.22551450346e+12 ee
+-1 -4.51255078762e+12 ww
+-1 -3.36285574975e+12 qq
+-1 -3.07010023516e+12 tt
+-1 -1.27965891837e+12 zz
+-1 -1.32001091916e+12 uu
+-1 -1.91484366367e+12 ii
+-1 -3.10115319124e+12 oo
+-1 -7.10850199103e+12 pp
+-1 -4.95385785405e+12 ll
+-1 -1.40493423999e+12 kk
+-1 -3.5605949667e+12 jj
+-1 -2.88491677858e+12 hh
+-1 -3.71463321771e+12 gg
+-1 -3.30053101487e+12 ff
+-1 -3.04988922726e+12 nn
+1 50271977527.9 bb
+1 772008596347.0 bn
+-1 -554618958861.0 mm
+-1 -1.30728155546e+12 cc