comparison pre_process.xml @ 0:29899feb4d44 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tools/sklearn commit 0e582cf1f3134c777cce3aa57d71b80ed95e6ba9
author bgruening
date Fri, 16 Feb 2018 09:18:41 -0500
parents
children dad38f036e83
comparison
equal deleted inserted replaced
-1:000000000000 0:29899feb4d44
1 <tool id="sklearn_data_preprocess" name="Preprocess" version="@VERSION@">
2 <description>raw feature vectors into standardized datasets</description>
3 <macros>
4 <import>main_macros.xml</import>
5 </macros> <!-- main_macros.xml is not visible here; it presumably defines the macros expanded below and the @VERSION@ token -->
6 <expand macro="python_requirements"/>
7 <expand macro="macro_stdio"/>
8 <version_command>echo "@VERSION@"</version_command>
9 <command>
10 <![CDATA[
11 python '$pre_processor_script' '$inputs'
12 ]]>
13 </command> <!-- Runs the generated configfile script; its only argument is the path of the JSON parameter dump. Both paths are single-quoted so the shell never expands anything inside them. -->
14 <configfiles> <!-- "inputs" makes Galaxy write the tool parameters to a JSON file; the script below receives that file's path as its first argument. -->
15 <inputs name="inputs" />
16 <configfile name="pre_processor_script">
17 <![CDATA[
18 import sys
19 import json
20 import pandas
21 import pickle
22 import numpy as np
23 from scipy.io import mmread
24 from scipy.io import mmwrite
25 from sklearn import preprocessing
26 ## Fit the selected sklearn.preprocessing class on the training input (X), apply it to the dataset chosen for transformation (y), then write the result and, optionally, the pickled estimator.
27 input_json_path = sys.argv[1]
28 with open(input_json_path, "r") as param_handle:
29     params = json.load(param_handle)
30 #if $input_type.selected_input_type == "sparse":
31 X = mmread("$input_type.infile")
32 #else:
33 X = pandas.read_csv("$input_type.infile", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None)
34 #end if
35 ## y must be read from infile_transform (the dataset whose extension is tested here), not from the training file.
36 #if $input_type.pre_processors.infile_transform.ext == 'txt':
37 y = mmread("$input_type.pre_processors.infile_transform")
38 #else:
39 y = pandas.read_csv("$input_type.pre_processors.infile_transform", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None)
40 #end if
41 ## The preprocessor class is looked up by name; its keyword arguments are taken unchanged from the "options" section of the form.
42 preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"]
43 options = params["input_type"]["pre_processors"]["options"]
44
45 my_class = getattr(preprocessing, preprocessor)
46 estimator = my_class(**options)
47 estimator.fit(X)
48 result = estimator.transform(y)
49 #if $input_type.pre_processors.infile_transform.ext == 'txt':
50 with open("$outfile_transform", 'wb') as transform_handle:
51     mmwrite(transform_handle, result)  ## binary handle on purpose: mmwrite emits bytes and, given a path instead, may append ".mtx" to it
52 #else:
53 res = pandas.DataFrame(result)
54 res.to_csv(path_or_buf = "$outfile_transform", sep="\t", index=False, header=None)
55 #end if
56 #if $save:
57 with open("$outfile_fit", 'wb') as model_handle:
58     pickle.dump(estimator, model_handle, pickle.HIGHEST_PROTOCOL)  ## HIGHEST_PROTOCOL is a binary format, so the file must be opened in binary mode
59 #end if
60 ]]>
61 </configfile>
62 </configfiles>
63 <inputs> <!-- The input type is chosen first; the tabular branch offers four preprocessors in addition to the macro-provided list, the sparse branch offers the macro-provided list only. -->
64 <conditional name="input_type">
65 <param name="selected_input_type" type="select" label="Select the type of your input data:">
66 <option value="tabular" selected="true">Tabular</option>
67 <option value="sparse">Sparse</option>
68 </param>
69 <when value="tabular">
70 <param name="infile" type="data" format="tabular" label="Select a tabular file you want to train your preprocessor on its data:"/>
71 <conditional name="pre_processors"> <!-- the two expands below supply the selector and its per-choice options; their bodies add the tabular-only choices -->
72 <expand macro="sparse_preprocessors">
73 <option value="KernelCenterer">Kernel Centerer (Centers a kernel matrix)</option>
74 <option value="MinMaxScaler">Minmax Scaler (Scales features to a range)</option>
75 <option value="PolynomialFeatures">Polynomial Features (Generates polynomial and interaction features)</option>
76 <option value="RobustScaler">Robust Scaler (Scales features using outlier-invariance statistics)</option>
77 </expand>
78 <expand macro="sparse_preprocessor_options">
79 <when value="KernelCenterer">
80 <expand macro="multitype_input"/>
81 <section name="options" title="Advanced Options" expanded="False">
82 </section>
83 </when>
84 <when value="MinMaxScaler">
85 <expand macro="multitype_input"/>
86 <section name="options" title="Advanced Options" expanded="False">
87 <!-- NOTE(review): feature_range is not exposed as a parameter in this section -->
88 <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"
89 label="Use a copy of data for precomputing normalization" help=" "/>
90 </section>
91 </when>
92 <when value="PolynomialFeatures">
93 <expand macro="multitype_input"/>
94 <section name="options" title="Advanced Options" expanded="False">
95 <param argument="degree" type="integer" optional="true" value="2" label="The degree of the polynomial features " help=""/>
96 <param argument="interaction_only" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="false" label="Produce interaction features only" help="(Features that are products of at most degree distinct input features) "/>
97 <param argument="include_bias" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Include a bias column" help="Feature in which all polynomial powers are zero "/>
98 </section>
99 </when>
100 <when value="RobustScaler">
101 <expand macro="multitype_input"/>
102 <section name="options" title="Advanced Options" expanded="False">
103 <!-- presumably the estimator signature: with_centering=True, with_scaling=True, copy=True; the three params below are all checked by default -->
104 <param argument="with_centering" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"
105 label="Center the data before scaling" help=" "/>
106 <param argument="with_scaling" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"
107 label="Scale the data to interquartile range" help=" "/>
108 <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"
109 label="Use a copy of data for inplace scaling" help=" "/>
110 </section>
111 </when>
112 </expand>
113 </conditional>
114 </when>
115 <when value="sparse">
116 <param name="infile" type="data" format="txt" label="Select a sparse representation you want to train your preprocessor on its data:"/>
117 <conditional name="pre_processors">
118 <expand macro="sparse_preprocessors"/>
119 <expand macro="sparse_preprocessor_options"/>
120 </conditional>
121 </when>
122 </conditional>
123 <param name="save" type="boolean" truevalue="booltrue" falsevalue="boolflase" checked="false"
124 label="Save the preprocessor"
125 help="Saves the preprocessor after fitting to the data. The preprocessor can then be passed to other tools and used in later operations."/> <!-- NOTE(review): "boolflase" looks like a misspelling of "boolfalse"; it is used consistently in this file and possibly in main_macros.xml, so confirm before renaming -->
126 </inputs>
127 <outputs> <!-- outfile_transform holds the transformed data; outfile_fit holds the pickled estimator and is only kept when "save" is set -->
128 <data format="tabular" name="outfile_transform" from_work_dir="./output"/> <!-- NOTE(review): the configfile script writes straight to $outfile_transform and never creates ./output, so from_work_dir looks unused; the format also stays "tabular" when the script writes MatrixMarket output - confirm -->
129 <data format="zip" name="outfile_fit">
130 <filter>save</filter>
131 </data> <!-- NOTE(review): declared as zip although the script writes the file with pickle.dump - confirm the intended datatype -->
132 </outputs>
133 <tests> <!-- Every test uses the same file for fitting and transforming and saves the estimator; the saved model is compared by size only (sim_size, delta 500). -->
134 <test> <!-- KernelCenterer on tabular input -->
135 <param name="infile" value="train.tabular" ftype="tabular"/>
136 <param name="infile_transform" value="train.tabular" ftype="tabular"/>
137 <param name="selected_input_type" value="tabular"/>
138 <param name="selected_pre_processor" value="KernelCenterer"/>
139 <param name="save" value="true"/>
140 <output name="outfile_transform" file="prp_result01" ftype="tabular"/>
141 <output name="outfile_fit" file="prp_model01" ftype="zip" compare="sim_size" delta="500"/>
142 </test>
143 <test> <!-- MinMaxScaler on tabular input -->
144 <param name="infile" value="train.tabular" ftype="tabular"/>
145 <param name="infile_transform" value="train.tabular" ftype="tabular"/>
146 <param name="selected_input_type" value="tabular"/>
147 <param name="selected_pre_processor" value="MinMaxScaler"/>
148 <param name="save" value="true"/>
149 <output name="outfile_transform" file="prp_result02" ftype="tabular"/>
150 <output name="outfile_fit" file="prp_model02" ftype="zip" compare="sim_size" delta="500"/>
151 </test>
152 <test> <!-- PolynomialFeatures on tabular input -->
153 <param name="infile" value="train.tabular" ftype="tabular"/>
154 <param name="infile_transform" value="train.tabular" ftype="tabular"/>
155 <param name="selected_input_type" value="tabular"/>
156 <param name="selected_pre_processor" value="PolynomialFeatures"/>
157 <param name="save" value="true"/>
158 <output name="outfile_transform" file="prp_result03" ftype="tabular"/>
159 <output name="outfile_fit" file="prp_model03" ftype="zip" compare="sim_size" delta="500"/>
160 </test>
161 <test> <!-- RobustScaler on tabular input -->
162 <param name="infile" value="train.tabular" ftype="tabular"/>
163 <param name="infile_transform" value="train.tabular" ftype="tabular"/>
164 <param name="selected_input_type" value="tabular"/>
165 <param name="selected_pre_processor" value="RobustScaler"/>
166 <param name="save" value="true"/>
167 <output name="outfile_transform" file="prp_result04" ftype="tabular"/>
168 <output name="outfile_fit" file="prp_model04" ftype="zip" compare="sim_size" delta="500"/>
169 </test>
170 <test> <!-- Binarizer on sparse input; this choice is not listed in this file, so it presumably comes from the sparse_preprocessors macro -->
171 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>
172 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>
173 <param name="selected_input_type" value="sparse"/>
174 <param name="selected_pre_processor" value="Binarizer"/>
175 <param name="save" value="true"/>
176 <output name="outfile_transform" file="prp_result05" ftype="tabular"/>
177 <output name="outfile_fit" file="prp_model05" ftype="zip" compare="sim_size" delta="500"/>
178 </test>
179 <test> <!-- Imputer on sparse input -->
180 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>
181 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>
182 <param name="selected_input_type" value="sparse"/>
183 <param name="selected_pre_processor" value="Imputer"/>
184 <param name="save" value="true"/>
185 <param name="axis" value="true"/> <!-- NOTE(review): "axis" is not defined in this file; presumably it is declared by the sparse_preprocessor_options macro -->
186 <output name="outfile_transform" file="prp_result06" ftype="tabular"/>
187 <output name="outfile_fit" file="prp_model06" ftype="zip" compare="sim_size" delta="500"/>
188 </test>
189 <test> <!-- StandardScaler on tabular input; not one of the tabular-only options above, so presumably a macro-provided choice -->
190 <param name="infile" value="train.tabular" ftype="tabular"/>
191 <param name="infile_transform" value="train.tabular" ftype="tabular"/>
192 <param name="selected_input_type" value="tabular"/>
193 <param name="selected_pre_processor" value="StandardScaler"/>
194 <param name="save" value="true"/>
195 <output name="outfile_transform" file="prp_result07" ftype="tabular"/>
196 <output name="outfile_fit" file="prp_model07" ftype="zip" compare="sim_size" delta="500"/>
197 </test>
198 <test> <!-- MaxAbsScaler on sparse input -->
199 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>
200 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>
201 <param name="selected_input_type" value="sparse"/>
202 <param name="selected_pre_processor" value="MaxAbsScaler"/>
203 <param name="save" value="true"/>
204 <output name="outfile_transform" file="prp_result08" ftype="tabular"/>
205 <output name="outfile_fit" file="prp_model08" ftype="zip" compare="sim_size" delta="500"/>
206 </test>
207 <test> <!-- Normalizer on sparse input -->
208 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>
209 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>
210 <param name="selected_input_type" value="sparse"/>
211 <param name="selected_pre_processor" value="Normalizer"/>
212 <param name="save" value="true"/>
213 <output name="outfile_transform" file="prp_result09" ftype="tabular"/>
214 <output name="outfile_fit" file="prp_model09" ftype="zip" compare="sim_size" delta="500"/>
215 </test>
216 </tests>
217 <help>
218 <![CDATA[
219 **What it does**
220
221 This tool provides several transformer classes that change raw feature vectors into a representation more suitable for downstream estimators. The classes are provided by the ``sklearn.preprocessing`` package.
222
223 For information about preprocessing classes and parameter settings please refer to `Scikit-learn preprocessing`_.
224
225 .. _`Scikit-learn preprocessing`: http://scikit-learn.org/stable/modules/preprocessing.html
226 ]]>
227 </help>
228 <expand macro="sklearn_citation"/> <!-- citation block; the macro is not defined in this file and presumably comes from main_macros.xml -->
229 </tool>