comparison pre_process.xml @ 15:dad38f036e83 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit f54ff2ba2f8e7542d68966ce5a6b17d7f624ac48
author bgruening
date Fri, 13 Jul 2018 03:55:44 -0400
parents 29899feb4d44
children f196d4715cfb
comparison
equal deleted inserted replaced
14:f9def78f6cd5 15:dad38f036e83
22 import numpy as np 22 import numpy as np
23 from scipy.io import mmread 23 from scipy.io import mmread
24 from scipy.io import mmwrite 24 from scipy.io import mmwrite
25 from sklearn import preprocessing 25 from sklearn import preprocessing
26 26
27 @COLUMNS_FUNCTION@
28
27 input_json_path = sys.argv[1] 29 input_json_path = sys.argv[1]
28 params = json.load(open(input_json_path, "r")) 30 with open(input_json_path, "r") as param_handler:
31 params = json.load(param_handler)
29 32
30 #if $input_type.selected_input_type == "sparse": 33 #if $input_type.selected_input_type == "sparse":
31 X = mmread(open("$infile", 'r')) 34 X = mmread("$infile")
32 #else: 35 #else:
33 X = pandas.read_csv("$infile", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) 36 header = 'infer' if params["input_type"]["header1"] else None
34 #end if 37 column_option = params["input_type"]["column_selector_options_1"]["selected_column_selector_option"]
35 38 if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
36 #if $input_type.pre_processors.infile_transform.ext == 'txt': 39 c = params["input_type"]["column_selector_options_1"]["col1"]
37 y = mmread(open("$infile", 'r')) 40 else:
38 #else: 41 c = None
39 y = pandas.read_csv("$infile", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) 42 X = read_columns(
43 "$input_type.infile",
44 c = c,
45 c_option = column_option,
46 sep='\t',
47 header=header,
48 parse_dates=True,
49 encoding=None,
50 index_col=None,
51 tupleize_cols=False
52 )
40 #end if 53 #end if
41 54
42 preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"] 55 preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"]
43 options = params["input_type"]["pre_processors"]["options"] 56 options = params["input_type"]["pre_processors"]["options"]
44 57
45 my_class = getattr(preprocessing, preprocessor) 58 my_class = getattr(preprocessing, preprocessor)
46 estimator = my_class(**options) 59 estimator = my_class(**options)
47 estimator.fit(X) 60 estimator.fit(X)
48 result = estimator.transform(y) 61 result = estimator.transform(X)
49 62
50 #if $input_type.pre_processors.infile_transform.ext == 'txt': 63 #if $input_type.selected_input_type == "sparse":
51 mmwrite(open("$outfile_transform" , 'w+'), result) 64 with open("$outfile_transform", "w+") as transform_handler:
65 mmwrite(transform_handler, result)
52 #else: 66 #else:
53 res = pandas.DataFrame(result) 67 res = pandas.DataFrame(result)
54 res.to_csv(path_or_buf = "$outfile_transform", sep="\t", index=False, header=None) 68 res.to_csv(path_or_buf = "$outfile_transform", sep="\t", index=False, header=None)
55 #end if 69 #end if
56 70
57 #if $save: 71 #if $save:
58 pickle.dump(estimator,open("$outfile_fit", 'w+'), pickle.HIGHEST_PROTOCOL) 72 with open("$outfile_fit", 'wb') as out_handler:
73 pickle.dump(estimator, out_handler, pickle.HIGHEST_PROTOCOL)
59 #end if 74 #end if
60 ]]> 75 ]]>
61 </configfile> 76 </configfile>
62 </configfiles> 77 </configfiles>
63 <inputs> 78 <inputs>
65 <param name="selected_input_type" type="select" label="Select the type of your input data:"> 80 <param name="selected_input_type" type="select" label="Select the type of your input data:">
66 <option value="tabular" selected="true">Tabular</option> 81 <option value="tabular" selected="true">Tabular</option>
67 <option value="sparse">Sparse</option> 82 <option value="sparse">Sparse</option>
68 </param> 83 </param>
69 <when value="tabular"> 84 <when value="tabular">
70 <param name="infile" type="data" format="tabular" label="Select a tabular file you want to train your preprocessor on its data:"/> 85 <param name="infile" type="data" format="tabular" label="Select a tabular file you want to train your preprocessor on its data:" />
86 <param name="header1" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Does the dataset contain header:" />
87 <conditional name="column_selector_options_1">
88 <expand macro="samples_column_selector_options" multiple="true" column_option="selected_column_selector_option" col_name="col1" infile="infile"/>
89 </conditional>
71 <conditional name="pre_processors"> 90 <conditional name="pre_processors">
72 <expand macro="sparse_preprocessors"> 91 <expand macro="sparse_preprocessors_ext" />
73 <option value="KernelCenterer">Kernel Centerer (Centers a kernel matrix)</option> 92 <expand macro="sparse_preprocessor_options_ext" />
74 <option value="MinMaxScaler">Minmax Scaler (Scales features to a range)</option>
75 <option value="PolynomialFeatures">Polynomial Features (Generates polynomial and interaction features)</option>
76 <option value="RobustScaler">Robust Scaler (Scales features using outlier-invariance statistics)</option>
77 </expand>
78 <expand macro="sparse_preprocessor_options">
79 <when value="KernelCenterer">
80 <expand macro="multitype_input"/>
81 <section name="options" title="Advanced Options" expanded="False">
82 </section>
83 </when>
84 <when value="MinMaxScaler">
85 <expand macro="multitype_input"/>
86 <section name="options" title="Advanced Options" expanded="False">
87 <!--feature_range-->
88 <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"
89 label="Use a copy of data for precomputing normalization" help=" "/>
90 </section>
91 </when>
92 <when value="PolynomialFeatures">
93 <expand macro="multitype_input"/>
94 <section name="options" title="Advanced Options" expanded="False">
95 <param argument="degree" type="integer" optional="true" value="2" label="The degree of the polynomial features " help=""/>
96 <param argument="interaction_only" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="false" label="Produce interaction features only" help="(Features that are products of at most degree distinct input features) "/>
97 <param argument="include_bias" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Include a bias column" help="Feature in which all polynomial powers are zero "/>
98 </section>
99 </when>
100 <when value="RobustScaler">
101 <expand macro="multitype_input"/>
102 <section name="options" title="Advanced Options" expanded="False">
103 <!--=True, =True, copy=True-->
104 <param argument="with_centering" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"
105 label="Center the data before scaling" help=" "/>
106 <param argument="with_scaling" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"
107 label="Scale the data to interquartile range" help=" "/>
108 <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"
109 label="Use a copy of data for inplace scaling" help=" "/>
110 </section>
111 </when>
112 </expand>
113 </conditional> 93 </conditional>
114 </when> 94 </when>
115 <when value="sparse"> 95 <when value="sparse">
116 <param name="infile" type="data" format="txt" label="Select a sparse representation you want to train your preprocessor on its data:"/> 96 <param name="infile" type="data" format="txt" label="Select a sparse representation you want to train your preprocessor on its data:"/>
117 <conditional name="pre_processors"> 97 <conditional name="pre_processors">
131 </data> 111 </data>
132 </outputs> 112 </outputs>
133 <tests> 113 <tests>
134 <test> 114 <test>
135 <param name="infile" value="train.tabular" ftype="tabular"/> 115 <param name="infile" value="train.tabular" ftype="tabular"/>
136 <param name="infile_transform" value="train.tabular" ftype="tabular"/> 116 <param name="selected_column_selector_option" value="all_columns"/>
137 <param name="selected_input_type" value="tabular"/> 117 <param name="selected_input_type" value="tabular"/>
138 <param name="selected_pre_processor" value="KernelCenterer"/> 118 <param name="selected_pre_processor" value="KernelCenterer"/>
139 <param name="save" value="true"/> 119 <param name="save" value="true"/>
140 <output name="outfile_transform" file="prp_result01" ftype="tabular"/> 120 <output name="outfile_transform" file="prp_result01" ftype="tabular"/>
141 <output name="outfile_fit" file="prp_model01" ftype="zip" compare="sim_size" delta="500"/> 121 <output name="outfile_fit" file="prp_model01" ftype="zip" compare="sim_size" delta="500"/>
142 </test> 122 </test>
143 <test> 123 <test>
144 <param name="infile" value="train.tabular" ftype="tabular"/> 124 <param name="infile" value="train.tabular" ftype="tabular"/>
145 <param name="infile_transform" value="train.tabular" ftype="tabular"/> 125 <param name="selected_column_selector_option" value="all_columns"/>
146 <param name="selected_input_type" value="tabular"/> 126 <param name="selected_input_type" value="tabular"/>
147 <param name="selected_pre_processor" value="MinMaxScaler"/> 127 <param name="selected_pre_processor" value="MinMaxScaler"/>
148 <param name="save" value="true"/> 128 <param name="save" value="true"/>
149 <output name="outfile_transform" file="prp_result02" ftype="tabular"/> 129 <output name="outfile_transform" file="prp_result02" ftype="tabular"/>
150 <output name="outfile_fit" file="prp_model02" ftype="zip" compare="sim_size" delta="500"/> 130 <output name="outfile_fit" file="prp_model02" ftype="zip" compare="sim_size" delta="500"/>
151 </test> 131 </test>
152 <test> 132 <test>
153 <param name="infile" value="train.tabular" ftype="tabular"/> 133 <param name="infile" value="train.tabular" ftype="tabular"/>
154 <param name="infile_transform" value="train.tabular" ftype="tabular"/> 134 <param name="selected_column_selector_option" value="all_columns"/>
155 <param name="selected_input_type" value="tabular"/> 135 <param name="selected_input_type" value="tabular"/>
156 <param name="selected_pre_processor" value="PolynomialFeatures"/> 136 <param name="selected_pre_processor" value="PolynomialFeatures"/>
157 <param name="save" value="true"/> 137 <param name="save" value="true"/>
158 <output name="outfile_transform" file="prp_result03" ftype="tabular"/> 138 <output name="outfile_transform" file="prp_result03" ftype="tabular"/>
159 <output name="outfile_fit" file="prp_model03" ftype="zip" compare="sim_size" delta="500"/> 139 <output name="outfile_fit" file="prp_model03" ftype="zip" compare="sim_size" delta="500"/>
160 </test> 140 </test>
161 <test> 141 <test>
162 <param name="infile" value="train.tabular" ftype="tabular"/> 142 <param name="infile" value="train.tabular" ftype="tabular"/>
163 <param name="infile_transform" value="train.tabular" ftype="tabular"/> 143 <param name="selected_column_selector_option" value="all_columns"/>
164 <param name="selected_input_type" value="tabular"/> 144 <param name="selected_input_type" value="tabular"/>
165 <param name="selected_pre_processor" value="RobustScaler"/> 145 <param name="selected_pre_processor" value="RobustScaler"/>
166 <param name="save" value="true"/> 146 <param name="save" value="true"/>
167 <output name="outfile_transform" file="prp_result04" ftype="tabular"/> 147 <output name="outfile_transform" file="prp_result04" ftype="tabular"/>
168 <output name="outfile_fit" file="prp_model04" ftype="zip" compare="sim_size" delta="500"/> 148 <output name="outfile_fit" file="prp_model04" ftype="zip" compare="sim_size" delta="500"/>
169 </test> 149 </test>
170 <test> 150 <test>
171 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> 151 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>
172 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>
173 <param name="selected_input_type" value="sparse"/> 152 <param name="selected_input_type" value="sparse"/>
174 <param name="selected_pre_processor" value="Binarizer"/> 153 <param name="selected_pre_processor" value="Binarizer"/>
175 <param name="save" value="true"/> 154 <param name="save" value="true"/>
176 <output name="outfile_transform" file="prp_result05" ftype="tabular"/> 155 <output name="outfile_transform" file="prp_result05" ftype="tabular"/>
177 <output name="outfile_fit" file="prp_model05" ftype="zip" compare="sim_size" delta="500"/> 156 <output name="outfile_fit" file="prp_model05" ftype="zip" compare="sim_size" delta="500"/>
178 </test> 157 </test>
179 <test> 158 <test>
180 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> 159 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>
181 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>
182 <param name="selected_input_type" value="sparse"/> 160 <param name="selected_input_type" value="sparse"/>
183 <param name="selected_pre_processor" value="Imputer"/> 161 <param name="selected_pre_processor" value="Imputer"/>
184 <param name="save" value="true"/> 162 <param name="save" value="true"/>
185 <param name="axis" value="true"/> 163 <param name="axis" value="true"/>
186 <output name="outfile_transform" file="prp_result06" ftype="tabular"/> 164 <output name="outfile_transform" file="prp_result06" ftype="tabular"/>
187 <output name="outfile_fit" file="prp_model06" ftype="zip" compare="sim_size" delta="500"/> 165 <output name="outfile_fit" file="prp_model06" ftype="zip" compare="sim_size" delta="500"/>
188 </test> 166 </test>
189 <test> 167 <test>
190 <param name="infile" value="train.tabular" ftype="tabular"/> 168 <param name="infile" value="train.tabular" ftype="tabular"/>
191 <param name="infile_transform" value="train.tabular" ftype="tabular"/> 169 <param name="selected_input_type" value="tabular"/>
192 <param name="selected_input_type" value="tabular"/> 170 <param name="selected_column_selector_option" value="all_columns"/>
193 <param name="selected_pre_processor" value="StandardScaler"/> 171 <param name="selected_pre_processor" value="StandardScaler"/>
194 <param name="save" value="true"/> 172 <param name="save" value="true"/>
195 <output name="outfile_transform" file="prp_result07" ftype="tabular"/> 173 <output name="outfile_transform" file="prp_result07" ftype="tabular"/>
196 <output name="outfile_fit" file="prp_model07" ftype="zip" compare="sim_size" delta="500"/> 174 <output name="outfile_fit" file="prp_model07" ftype="zip" compare="sim_size" delta="500"/>
197 </test> 175 </test>
198 <test> 176 <test>
199 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> 177 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>
200 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>
201 <param name="selected_input_type" value="sparse"/> 178 <param name="selected_input_type" value="sparse"/>
202 <param name="selected_pre_processor" value="MaxAbsScaler"/> 179 <param name="selected_pre_processor" value="MaxAbsScaler"/>
203 <param name="save" value="true"/> 180 <param name="save" value="true"/>
204 <output name="outfile_transform" file="prp_result08" ftype="tabular"/> 181 <output name="outfile_transform" file="prp_result08" ftype="tabular"/>
205 <output name="outfile_fit" file="prp_model08" ftype="zip" compare="sim_size" delta="500"/> 182 <output name="outfile_fit" file="prp_model08" ftype="zip" compare="sim_size" delta="500"/>
206 </test> 183 </test>
207 <test> 184 <test>
208 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> 185 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>
209 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>
210 <param name="selected_input_type" value="sparse"/> 186 <param name="selected_input_type" value="sparse"/>
211 <param name="selected_pre_processor" value="Normalizer"/> 187 <param name="selected_pre_processor" value="Normalizer"/>
212 <param name="save" value="true"/> 188 <param name="save" value="true"/>
213 <output name="outfile_transform" file="prp_result09" ftype="tabular"/> 189 <output name="outfile_transform" file="prp_result09" ftype="tabular"/>
214 <output name="outfile_fit" file="prp_model09" ftype="zip" compare="sim_size" delta="500"/> 190 <output name="outfile_fit" file="prp_model09" ftype="zip" compare="sim_size" delta="500"/>