view feature_selection.xml @ 16:328a8d547ca2 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c64ccc5850c8e061a95fb64e07ed388384e82393
author bgruening
date Thu, 11 Oct 2018 03:34:39 -0400
parents 026667802750
children 2bbbac61e48d
line wrap: on
line source

<tool id="sklearn_feature_selection" name="Feature Selection" version="@VERSION@.1">
    <description>module, including univariate filter selection methods and recursive feature elimination algorithm</description>
    <macros>
        <import>main_macros.xml</import>
    </macros>
    <expand macro="python_requirements"/>
    <expand macro="macro_stdio"/>
    <version_command>echo "@VERSION@"</version_command>
    <command>
        <![CDATA[
        python "$feature_selection_script" '$inputs'
        ]]>
    </command>
    <configfiles>
        <inputs name="inputs" />
        <configfile name="feature_selection_script">
            <![CDATA[
import sys
import os
import json
import pandas
import sklearn.feature_selection

with open("$__tool_directory__/sk_whitelist.json", "r") as f:
    sk_whitelist = json.load(f)
exec(open("$__tool_directory__/utils.py").read(), globals())

safe_eval = SafeEval()

input_json_path = sys.argv[1]
with open(input_json_path, "r") as param_handler:
    params = json.load(param_handler)

#handle cheetah
#if $fs_algorithm_selector.selected_algorithm == "SelectFromModel"\
        and $fs_algorithm_selector.model_inputter.input_mode == "prefitted":
params['fs_algorithm_selector']['model_inputter']['fitted_estimator'] =\
        "$fs_algorithm_selector.model_inputter.fitted_estimator"
#end if

# Read features
features_has_header = params["input_options"]["header1"]
input_type = params["input_options"]["selected_input"]
if input_type=="tabular":
    header = 'infer' if features_has_header else None
    column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
    if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
        c = params["input_options"]["column_selector_options_1"]["col1"]
    else:
        c = None
    X, input_df = read_columns(
            "$input_options.infile1",
            c = c,
            c_option = column_option,
            return_df = True,
            sep='\t',
            header=header,
            parse_dates=True
    )
else:
    X = mmread("$input_options.infile1")

# Read labels
header = 'infer' if params["input_options"]["header2"] else None
column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
    c = params["input_options"]["column_selector_options_2"]["col2"]
else:
    c = None
y = read_columns(
        "$input_options.infile2",
        c = c,
        c_option = column_option,
        sep='\t',
        header=header,
        parse_dates=True
)
y=y.ravel()

# Create feature selector
new_selector = feature_selector(params['fs_algorithm_selector'])
if params['fs_algorithm_selector']['selected_algorithm'] != 'SelectFromModel'\
        or params['fs_algorithm_selector']['model_inputter']['input_mode'] != 'prefitted' :
    new_selector.fit(X, y)

## Transform to select features
selected_names = None
if "$output_method_selector.selected_method" == "fit_transform":
    res = new_selector.transform(X)
    if features_has_header:
        selected_names = input_df.columns[new_selector.get_support(indices=True)]
else:
    res = new_selector.get_support(params["output_method_selector"]["indices"])

res = pandas.DataFrame(res, columns = selected_names)
res.to_csv(path_or_buf="$outfile", sep='\t', index=False)


            ]]>
        </configfile>
    </configfiles>
    <!-- Tool form, assembled entirely from macro expansions (the macro
         definitions are not in this file; presumably they live in the
         imported main_macros.xml). The embedded script reads the resulting
         parameter sections "fs_algorithm_selector", "output_method_selector"
         and "input_options". -->
    <inputs>
        <expand macro="feature_selection_all">
            <expand macro="fs_selectfrommodel_prefitted"/>
        </expand>
        <expand macro="feature_selection_output_mothods" />
        <expand macro="sl_mixed_input"/>
    </inputs>
    <outputs>
        <data format="tabular" name="outfile"/>
    </outputs>
    <tests>
        <!-- SelectFromModel with a newly built RandomForestRegressor; features are columns 1 to 5 and the label is column 6 of a header-less table. -->
        <test>
            <param name="selected_algorithm" value="SelectFromModel"/>
            <param name="input_mode" value="new"/>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="RandomForestRegressor"/>
            <param name="text_params" value="n_estimators=10, random_state=10"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="header1" value="false"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="6"/>
            <param name="header2" value="false"/>
            <output name="outfile" file="feature_selection_result01"/>
        </test>
        <!-- GenericUnivariateSelect with param set to 20 on the 17-column regression_X table (with header) and regression_y labels. -->
        <test>
            <param name="selected_algorithm" value="GenericUnivariateSelect"/>
            <param name="param" value="20"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="True"/>
            <output name="outfile" file="feature_selection_result02"/>
        </test>
        <!-- SelectPercentile without overriding any algorithm option. -->
        <test>
            <param name="selected_algorithm" value="SelectPercentile"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="True"/>
            <output name="outfile" file="feature_selection_result03"/>
        </test>
        <!-- SelectKBest without overriding any algorithm option. -->
        <test>
            <param name="selected_algorithm" value="SelectKBest"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="True"/>
            <output name="outfile" file="feature_selection_result04"/>
        </test>
        <!-- SelectFpr with alpha set to 0.05. -->
        <test>
            <param name="selected_algorithm" value="SelectFpr"/>
            <param name="alpha" value="0.05"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="True"/>
            <output name="outfile" file="feature_selection_result05"/>
        </test>
        <!-- SelectFdr with alpha set to 0.05. -->
        <test>
            <param name="selected_algorithm" value="SelectFdr"/>
            <param name="alpha" value="0.05"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="True"/>
            <output name="outfile" file="feature_selection_result06"/>
        </test>
        <!-- SelectFwe with alpha set to 0.05. -->
        <test>
            <param name="selected_algorithm" value="SelectFwe"/>
            <param name="alpha" value="0.05"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="True"/>
            <output name="outfile" file="feature_selection_result07"/>
        </test>
        <!-- RFE wrapped around a newly built RandomForestRegressor; header-less training table, label in column 6. -->
        <test>
            <param name="selected_algorithm" value="RFE"/>
            <param name="input_mode" value="new"/>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="RandomForestRegressor"/>
            <param name="text_params" value="n_estimators=10, random_state=10"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="header1" value="false"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="6"/>
            <param name="header2" value="false"/>
            <output name="outfile" file="feature_selection_result08"/>
        </test>
        <!-- RFECV wrapped around a newly built RandomForestRegressor; same data layout as the RFE test. -->
        <test>
            <param name="selected_algorithm" value="RFECV"/>
            <param name="input_mode" value="new"/>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="RandomForestRegressor"/>
            <param name="text_params" value="n_estimators=10, random_state=10"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="header1" value="false"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="6"/>
            <param name="header2" value="false"/>
            <output name="outfile" file="feature_selection_result09"/>
        </test>
        <!-- VarianceThreshold with threshold set to 0.1. -->
        <test>
            <param name="selected_algorithm" value="VarianceThreshold"/>
            <param name="threshold" value="0.1"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="True"/>
            <output name="outfile" file="feature_selection_result10"/>
        </test>
        <!-- SelectKBest with k set to 3, selecting columns by header name: every column except "target" as features, "target" as the label. -->
        <test>
            <param name="selected_algorithm" value="SelectKBest"/>
            <param name="k" value="3"/>
            <param name="infile1" value="test3.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="selected_column_selector_option" value="all_but_by_header_name"/>
            <param name="col1" value="target"/>
            <param name="infile2" value="test3.tabular" ftype="tabular"/>
            <param name="header2" value="True"/>
            <param name="selected_column_selector_option2" value="by_header_name"/>
            <param name="col2" value="target"/>
            <output name="outfile" file="feature_selection_result11"/>
        </test>
        <!-- SelectFromModel with the pre-fitted estimator rfr_model01; the script does not fit the selector in this mode. -->
        <test>
            <param name="selected_algorithm" value="SelectFromModel"/>
            <param name="input_mode" value="prefitted"/>
            <param name="fitted_estimator" value="rfr_model01" ftype="zip"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="header1" value="false"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="false"/>
            <output name="outfile" file="feature_selection_result12"/>
        </test>
    </tests>
    <help>
        <![CDATA[
**What it does**
This tool provides several feature selection methods, including univariate filter selection methods and the recursive feature elimination algorithm. It either returns the input data reduced to the selected features or reports which features were selected. This tool is based on the
sklearn.feature_selection package.
For information about the feature selection methods and their parameter settings please refer to `Scikit-learn feature selection`_.

.. _`Scikit-learn feature selection`: http://scikit-learn.org/stable/modules/feature_selection.html
        ]]>
    </help>
    <expand macro="sklearn_citation">
        <expand macro="skrebate_citation"/>
        <expand macro="xgboost_citation"/>
    </expand>
</tool>