Mercurial > repos > bgruening > protease_prediction
changeset 0:c7a363d7ab26 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/protease_prediction commit e933135e5dc9aa8c96800fd10b62b256ac3a8523-dirty
author | bgruening |
---|---|
date | Sat, 12 Mar 2016 19:28:41 -0500 |
parents | |
children | |
files | datatypes_conf.xml macros.xml protease.py protease.xml test-data/CTSL_test.fasta test-data/CTSL_train.fasta test-data/model test-data/predictions.txt |
diffstat | 8 files changed, 541 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes_conf.xml Sat Mar 12 19:28:41 2016 -0500 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<datatypes> + <registration> + <datatype extension="eden_model" type="galaxy.datatypes.binary:CompressedArchive" subclass="True" display_in_upload="True"/> + </registration> +</datatypes>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Sat Mar 12 19:28:41 2016 -0500 @@ -0,0 +1,34 @@ +<macros> + <token name="@VERSION@">0.9</token> + <xml name="requirements"> + <requirements> + <requirement type="package" version="0.2.1b">eden</requirement> + </requirements> + </xml> + <xml name="stdio"> + <stdio> + <exit_code range="1:" level="fatal" description="Error occurred. Please check Tool Standard Error" /> + </stdio> + </xml> + <xml name="loadConditional"> + <conditional name="selected_tasks"> + <param name="selected_task" type="select" label="Select a Classification Task"> + <option value="fit" selected="true">Train a model</option> + <option value="predict">Load a model and predict</option> + </param> + <when value="predict"> + <param name="infile_model" type="data" format="eden_model" label="Models" help="Select a model file." /> + <param name="infile_data" type="data" format="fasta" label="Data (fasta)" help="Select the FASTA sequences you want to classify."/> + </when> + <when value="fit"> + <param name="infile_train" type="data" format="fasta" label="Training samples (FASTA)" /> + <yield /> + </when> + </conditional> + </xml> + <xml name="eden_citation"> + <citations> + <citation type="doi">10.5281/zenodo.27945</citation> + </citations> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/protease.py Sat Mar 12 19:28:41 2016 -0500 @@ -0,0 +1,95 @@ +#!/usr/bin/env python + +description = """ +Explicit Decomposition with Neighborhood (EDeN) utility program. +Protease modelling driver. +""" + +epilog = """ +Author: Fabrizio Costa +Copyright: 2015 +License: GPL +Maintainer: Fabrizio Costa +Email: costa@informatik.uni-freiburg.de +Status: Production + +Cite: Costa, Fabrizio, and Kurt De Grave, 'Fast neighborhood subgraph pairwise +distance kernel', Proceedings of the 26th International Conference on Machine +Learning. 2010. """ + +import os +import logging + +from eden.graph import Vectorizer +from eden.model_base import ModelInitializerBase, main_script +from eden.converter.fasta import fasta_to_sequence +from eden.modifier.seq import seq_to_seq +from eden.modifier.seq import shuffle_modifier +from eden.modifier.seq import mark_modifier +from eden.converter.fasta import sequence_to_eden + + +class ModelInitializer(ModelInitializerBase): + + def load_data(self, args): + seqs = fasta_to_sequence(args.input_file) + return seqs + + def load_positive_data(self, args): + return self.load_data(args) + + def load_negative_data(self, args): + seqs = self.load_data(args) + return seq_to_seq(seqs, + modifier=shuffle_modifier, + times=args.negative_ratio, + order=args.shuffle_order) + + def pre_processor_init(self, args): + def pre_processor(seqs, **args): + seqs = seq_to_seq(seqs, modifier=mark_modifier, position=0.5, mark='%') + seqs = seq_to_seq(seqs, modifier=mark_modifier, position=0.0, mark='@') + seqs = seq_to_seq(seqs, modifier=mark_modifier, position=1.0, mark='*') + graphs = sequence_to_eden(seqs) + return graphs + + pre_processor_parameters = {} + return pre_processor, pre_processor_parameters + + def vectorizer_init(self, args): + vectorizer = Vectorizer() + vectorizer_parameters = {'complexity': [2, 3, 4, 5, 6]} + return vectorizer, vectorizer_parameters + + def add_arguments(self, parser): + 
parser.add_argument('--version', action='version', version='0.1') + return parser + + def add_arguments_fit(self, parser): + parser.add_argument("-i", "--input-file", + dest="input_file", + help="Path to FASTA file containing input sequences.", + required=True) + parser.add_argument("--negative-ratio", + dest="negative_ratio", + type=int, + help="Relative size ratio for the randomly permuted negative instances w.r.t.\ + the positive instances.", + default=2) + parser.add_argument("--shuffle-order", + dest="shuffle_order", + type=int, + help="Order of the k-mer for the random shuffling procedure.", + default=2) + return parser + + def add_arguments_estimate(self, parser): + return self.add_arguments_fit(parser) + +if __name__ == "__main__": + model_initializer = ModelInitializer() + main_script(model_initializer=model_initializer, + description=description, + epilog=epilog, + prog_name=os.path.basename(__file__), + logger=logging.getLogger())
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/protease.xml Sat Mar 12 19:28:41 2016 -0500 @@ -0,0 +1,81 @@ +<tool id="eden_protease_prediction" name="Protease prediction" version="@VERSION@"> + <description>based on cleavage sites</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"/> + <expand macro="stdio"/> + <version_command>echo "@VERSION@"</version_command> + <command><![CDATA[ + python $__tool_directory__/protease.py + #if $selected_tasks.selected_task == 'fit': + fit + -i $selected_tasks.infile_train + --negative-ratio $selected_tasks.options.negative_ratio + --shuffle-order $selected_tasks.options.shuffle_order + -r $selected_tasks.options.random_state + #else: + predict + -m $selected_tasks.infile_model + -i $selected_tasks.infile_data + #end if +]]> + </command> + <inputs> + <expand macro="loadConditional"> + <section name="options" title="Advanced Options" expanded="False"> + <param name="negative_ratio" type="integer" optional="true" value="2" label="Negative to positive instance ratio" + help="Relative size ratio for the randomly permuted negative instances w.r.t. the positive instances." /> + <param name="shuffle_order" type="integer" optional="true" value="2" label="Order of k-mer shuffling" + help="Order of the k-mer for the random shuffling procedure." 
/> + <param name="random_state" type="integer" value="1" label="Random seed" /> + </section> + </expand> + </inputs> + <outputs> + <data format="tabular" name="outfile_predict" from_work_dir="out/predictions.txt"> + <filter>selected_tasks['selected_task'] == 'predict'</filter> + </data> + <data format="eden_model" name="outfile_fit" from_work_dir="out/model"> + <filter>selected_tasks['selected_task'] == 'fit'</filter> + </data> + </outputs> + <tests> + <test> + <param name="infile_train" value="CTSL_train.fasta" ftype="fasta"/> + <param name="selected_task" value="fit"/> + <param name="shuffle_order" value="3"/> + <output name="outfile_fit" file="model" ftype="eden_model" compare="sim_size" delta="100000"/> + </test> + <test> + <param name="infile_model" value="model" ftype="eden_model"/> + <param name="infile_data" value="CTSL_test.fasta" ftype="fasta"/> + <param name="selected_task" value="predict"/> + <output name="outfile_predict" file="predictions.txt" ftype="tabular"/> + </test> + </tests> + <help><![CDATA[ +**What it does** + +This tool can learn the cleavage specificity of a given class of protease. In a second step this can be used to predict proteases given a cleavage site. +The method assumes that the candidate cleavage point is between the two amino acids adjacent to the central position. +The method is based on an efficient string kernel implemented in the Explicit Decomposition with Neighbourhood (EDeN) library. +This approach uses the notion of k-mers with gaps to enumerate all possible substrings of increasing order which are used as features in an efficient linear binary classification estimator. + +**Example Input** + +:: + + >CTSL1 + SSFVSNWD + >CTSL1 + SSIQATTA + >CTSL1 + SSLAGCQI + >CTSL1 + SSLGGTVV + + + ]]></help> + <expand macro="eden_citation"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/CTSL_test.fasta Sat Mar 12 19:28:41 2016 -0500 @@ -0,0 +1,150 @@ +>a +SSFVSNWD +>b +SSIQATTA +>c +SSLAGCQI +>d +SSLGGTVV +>e +SSLQDCLH +>f +SSPAGGHA +>g +SSVGNVAD +>h +SSYVHGGV +>i +STFEERSY +>j +TFPKASVP +>k +TFVNITPA +>l +TGFAGIDS +>m +TGFEISSS +>n +TGFGMIYD +>o +TGLRDPFN +>p +TGLTQIET +>q +THYFLPPD +>r +TKAQAAAP +>s +TLIVRPDN +>t +TLLNQAPD +>u +TLVQTQVE +>v +TLWTSDMQ +>w +TPFAATSS +>x +TPVATSPT +>y +TQVHGTIT +>z +TRVSHFLP +>aa +TSFNGHKP +>ab +TSVGSVNP +>ac +TSYQSPHG +>ad +TTLSGTAP +>ae +TTMGGPLP +>af +TTVNGQSP +>ag +TTVSNSQQ +>ah +TVFAEHIS +>ai +TVFFDIAV +>aj +TVIGGGDT +>ak +TVVMASKG +>al +TYPQWQPP +>am +VAFCDAQS +>an +VAFTQVNS +>ao +VAVAGCCH +>ap +VAVSAAPG +>aq +VAYVSFGP +>ar +VDIEAIFS +>as +VDLSHPGV +>at +VELNGNQP +>au +VEVLAGHG +>av +VFFDIAVD +>aw +VFVGGLSP +>ax +VGAGGPAP +>ay +VGFLEGGK +>az +VGFSSGTE +>bb +VGINYQPP +>rr +VGLTSIAN +>ss +VGVSGSET +>ee +VHIQAGQC +>ww +VHYGEVTN +>qq +VIFQGTDH +>tt +VIISAPSA +>zz +VIITGPPE +>uu +VILESDPQ +>ii +VILGSEAA +>oo +VILHLKED +>pp +VLAMSGDP +>ll +VLIEHIGN +>kk +VLLEGNPD +>jj +VLLQAGAD +>hh +VLPRSAKE +>gg +VLVERSAA +>ff +VMIQDGPQ +>nn +VMLGETNP +>bb +VNIGSIST +>bn +VNLQHLDL +>mm +VPLGSEKP +>cc +VPVTGIPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/CTSL_train.fasta Sat Mar 12 19:28:41 2016 -0500 @@ -0,0 +1,100 @@ +>CTSL1 +AALAAAPA +>CTSL1 +AALAHISG +>CTSL1 +AAMAASPH +>CTSL1 +AAPGSAAP +>CTSL1 +AARKSAPA +>CTSL1 +AASGSPGP +>CTSL1 +AATQGAAA +>CTSL1 +AAVGGVFD +>CTSL1 +ACLEKPLL +>CTSL1 +ADYESVNE +>CTSL1 +AEIGQNHQ +>CTSL1 +AESESLVN +>CTSL1 +AFVNQHLC +>CTSL1 +AGCTSAGP +>CTSL1 +AGIATHFV +>CTSL1 +AGIQHSCQ +>CTSL1 +AGLESGAE +>CTSL1 +AGLVSPSL +>CTSL1 +AGSFGGAG +>CTSL1 +AGVGEFEA +>CTSL1 +AGVNTVTT +>CTSL1 +AGWMGLDC +>CTSL1 +AGYLGQVT +>CTSL1 +AHFGIHEE +>CTSL1 +AHLDITPN +>CTSL1 +AHLKNSQE +>CTSL1 +AHLMEIQV +>CTSL1 +AHLQTSHK +>CTSL1 +AIFGRPVV +>CTSL1 +AIICGSGL +>CTSL1 +AIPMSIPP +>CTSL1 +AIYEGQLG +>CTSL1 +AKVKAQTA +>CTSL1 +ALEYATDT +>CTSL1 +ALGHRPIP +>CTSL1 +ALKPMYSM +>CTSL1 +ALLELQLE +>CTSL1 +ALLGGHQG +>CTSL1 +ALLSSAVD +>CTSL1 +ALVAEEHL +>CTSL1 +ALVLGGVD +>CTSL1 +ALVQHQEW +>CTSL1 +ALVTGGEI +>CTSL1 +ALWDTAGQ +>CTSL1 +ALYLVCGE +>CTSL1 +AMLGNSED +>CTSL1 +AMLSGPGQ +>CTSL1 +ANIAHGNS +>CTSL1 +ANLTQSQI +>CTSL1 +ANVGAVPS
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/predictions.txt Sat Mar 12 19:28:41 2016 -0500 @@ -0,0 +1,75 @@ +1 714865162965.0 a +-1 -2.0633772184e+12 b +-1 -2.63278832465e+12 c +-1 -2.30657489269e+12 d +-1 -1.60666238581e+12 e +-1 -5.64892007591e+12 f +1 1.19958430313e+12 g +-1 -4.81891904858e+12 h +-1 -4.26115839421e+12 i +-1 -2.01451585778e+12 j +-1 -3.18448213118e+12 k +-1 -5.32148298316e+12 l +-1 -4.25594148364e+12 m +-1 -5.05361918097e+12 n +-1 -2.81407147475e+12 o +-1 -743476285794.0 p +-1 -1.28450200191e+12 q +-1 -6.82098953196e+12 r +-1 -911697110363.0 s +-1 -1.41018885051e+12 t +1 1.54489789585e+12 u +1 15904035492.6 v +-1 -7.1604898574e+12 w +-1 -291097086285.0 x +-1 -2.94082503016e+12 y +-1 -1.73028072922e+12 z +-1 -1.92238905582e+12 aa +-1 -635673300943.0 ab +-1 -486766774604.0 ac +-1 -1.11318146795e+12 ad +-1 -3.65821042965e+12 ae +-1 -114610205054.0 af +-1 -510138596388.0 ag +-1 -6.65599199641e+12 ah +-1 -4.13413986663e+12 ai +-1 -5.8294381292e+12 aj +-1 -3.52307285487e+12 ak +-1 -1.63846242641e+12 al +-1 -6.2381237974e+12 am +1 1.56329451125e+12 an +-1 -3.41757523005e+12 ao +-1 -3.69981770962e+12 ap +-1 -1.26491397758e+12 aq +-1 -6.1732488464e+12 ar +-1 -2.93027667881e+12 as +-1 -1.23589278355e+12 at +-1 -7.81321990096e+12 au +-1 -3.37867184582e+12 av +1 1.81255065566e+12 aw +-1 -5.8103087454e+12 ax +-1 -7.64938989051e+12 ay +-1 -2.56010386139e+12 az +-1 -2.19510046853e+12 bb +-1 -1.38509574184e+12 rr +-1 -1.82551763609e+12 ss +-1 -2.22551450346e+12 ee +-1 -4.51255078762e+12 ww +-1 -3.36285574975e+12 qq +-1 -3.07010023516e+12 tt +-1 -1.27965891837e+12 zz +-1 -1.32001091916e+12 uu +-1 -1.91484366367e+12 ii +-1 -3.10115319124e+12 oo +-1 -7.10850199103e+12 pp +-1 -4.95385785405e+12 ll +-1 -1.40493423999e+12 kk +-1 -3.5605949667e+12 jj +-1 -2.88491677858e+12 hh +-1 -3.71463321771e+12 gg +-1 -3.30053101487e+12 ff +-1 -3.04988922726e+12 nn +1 50271977527.9 bb +1 772008596347.0 bn +-1 -554618958861.0 mm +-1 -1.30728155546e+12 cc