Mercurial > repos > bgruening > sklearn_clf_metrics

<tool id="sklearn_clf_metrics" name="Calculate metrics" version="@VERSION@">
    <description>for classification performance</description>
    <macros>
        <import>main_macros.xml</import>
    </macros>
    <expand macro="python_requirements"/>
    <expand macro="macro_stdio"/>
    <version_command>echo "@VERSION@"</version_command>
    <!-- Runs the rendered metrics script; its only argument is the path of the JSON parameter dump. -->
    <command>
        <![CDATA[
        python '$clf_metrics_script' '$inputs'
        ]]>
    </command>
    <configfiles>
        <!-- JSON dump of the tool parameters; the script receives its path as argv[1]. -->
        <inputs name="inputs" />
        <!-- Python script rendered per job; dataset paths and the tool directory are substituted into it before it runs. -->
        <configfile name="clf_metrics_script">
            <![CDATA[
import sys
import json
import pandas
import numpy as np
from sklearn import metrics

# Execute the shared helper module in this namespace so that read_columns
# is available below. exec(open(...).read()) runs on both Python 2 and
# Python 3, whereas execfile() exists only on Python 2.
# NOTE(review): pandas and numpy are not used directly in this script;
# utils.py presumably relies on them being imported here. Confirm before
# removing either import.
with open("$__tool_directory__/utils.py") as utils_handler:
    exec(utils_handler.read())

input_json_path = sys.argv[1]
with open(input_json_path, "r") as param_handler:
    params = json.load(param_handler)

# Column selector modes for which a column list is read from the parameters.
COLUMN_LIST_SELECTORS = ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]


def load_columns(path, header_key, selector_key, option_key, column_key):
    """Read the selected columns of one tab-separated input via read_columns.

    path: file to read.
    header_key: key in the clf_metrics parameters; a truthy value means the
        header is inferred from the file, otherwise no header is assumed.
    selector_key: key of the column selector group in the parameters.
    option_key: key, inside that group, holding the selector mode.
    column_key: key, inside that group, holding the column list; only read
        for the modes listed in COLUMN_LIST_SELECTORS.

    Returns whatever read_columns returns for the selection.
    """
    settings = params["clf_metrics"]
    header = 'infer' if settings.get(header_key, None) else None
    column_option = settings[selector_key][option_key]
    if column_option in COLUMN_LIST_SELECTORS:
        c = settings[selector_key][column_key]
    else:
        c = None
    return read_columns(
        path,
        c=c,
        c_option=column_option,
        sep='\t',
        header=header,
        parse_dates=True
    )


# y_t is handed to the metric as its first argument, y_p as its second.
# Note that the option key of the second selector carries a trailing 2.
y_t = load_columns(
    "$clf_metrics.infile1", "header1",
    "column_selector_options_1", "selected_column_selector_option", "col1"
)
y_p = load_columns(
    "$clf_metrics.infile2", "header2",
    "column_selector_options_2", "selected_column_selector_option2", "col2"
)

# Everything in the options section is forwarded as keyword arguments.
options = params["clf_metrics"].get("options", {})
# An average value of the text 'None' is turned into a real None first.
if options and options.get('average', '') == 'None':
    options['average'] = None
metric = params["clf_metrics"]["selected_metric"]
# The selected metric value doubles as the sklearn.metrics function name.
metric_function = getattr(metrics, metric)
res = metric_function(y_t, y_p, **options)
# Report layout: the metric name and a colon on one line, then the result.
with open("$outfile", 'w') as out_file:
    out_file.write(metric + ' : ' + '\n' + str(res) + '\n')

            ]]>
        </configfile>
    </configfiles>
    <inputs>
        <!-- The selected metric decides which inputs and advanced options are offered.
             Parameters inside an "options" section are forwarded by the script to the
             metric function as keyword arguments. -->
        <conditional name="clf_metrics">
            <!-- Each option value is looked up by the script as a function name in sklearn.metrics. -->
            <param name="selected_metric" type="select" label="Metrics">
                <option value="accuracy_score" selected="true">Accuracy classification score</option>
                <option value="classification_report">Text report of the main classification metrics</option>
                <option value="f1_score">F1 score (aka balanced F-score or F-measure)</option>
                <option value="fbeta_score">F-beta score</option>
                <option value="hamming_loss">Average Hamming loss</option>
                <option value="jaccard_similarity_score">Jaccard similarity coefficient score</option>
                <option value="precision_recall_fscore_support">Compute precision, recall, F-measure and support for each class</option>
                <option value="precision_score">Precision</option>
                <option value="recall_score">Recall</option>
                <option value="zero_one_loss">Zero-one classification loss</option>
                <option value="auc">Area under the curve (AUC)</option>
                <option value="brier_score_loss">Brier score</option>
                <option value="matthews_corrcoef">Matthews correlation coefficient (MCC) for binary classes</option>
                <option value="confusion_matrix">Confusion matrix (evaluate the accuracy of a classification)</option>
                <option value="precision_recall_curve">Precision-recall pairs for different probability thresholds</option>
                <option value="roc_curve">Receiver operating characteristic (ROC)</option>
                <option value="hinge_loss">Average hinge loss (non-regularized)</option>
                <option value="log_loss">Log loss (aka logistic loss or cross-entropy loss)</option>
                <option value="average_precision_score">Average precision (AP) from prediction scores</option>
                <option value="roc_auc_score">Area Under the Curve (AUC) from prediction scores</option>
            </param>
            <when value="accuracy_score">
                <expand macro="clf_inputs"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <!--sample_weight-->
                    <param argument="normalize" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true"
                        label="Normalize" help="If false, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. "/>
                </section>
            </when>
            <when value="classification_report">
                <expand macro="clf_inputs"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <!--labels-->
                    <!--target_names-->
                    <!--sample_weight-->
                    <param argument="digits" type="integer" value="2" label="Number of digits for formatting output floating point values" help=""/>
                </section>
            </when>
            <when value="f1_score">
                <expand macro="clf_inputs"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <!--labels-->
                    <!--sample_weight-->
                    <expand macro="pos_label" default_value="1"/>
                    <expand macro="average">
                        <option value="binary" selected="true">Only report results for the class specified by pos_label. This is applicable only if targets (y_{true,pred}) are binary. (binary)</option>
                    </expand>
                </section>
            </when>
            <when value="fbeta_score">
                <expand macro="clf_inputs"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <expand macro="beta"/>
                    <!--labels-->
                    <!--sample_weight-->
                    <expand macro="pos_label" default_value="1"/>
                    <expand macro="average">
                        <option value="binary" selected="true">Only report results for the class specified by pos_label. This is applicable only if targets (y_{true,pred}) are binary. (binary)</option>
                    </expand>
                </section>
            </when>
            <when value="hamming_loss">
                <expand macro="clf_inputs"/>
                <!--section name="options" title="Advanced Options" expanded="False">
                    <!- -classes- ->
                </section-->
            </when>
            <when value="jaccard_similarity_score">
                <expand macro="clf_inputs"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <param argument="normalize" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true"
                        label="Normalize" help="If false, returns the sum of the Jaccard similarity coefficient over the sample set. Otherwise, returns the average of Jaccard similarity coefficient. "/>
                </section>
            </when>
            <when value="precision_recall_fscore_support">
                <expand macro="clf_inputs"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <expand macro="beta"/>
                    <!--labels-->
                    <!--sample_weight-->
                    <!--warn_for-->
                    <expand macro="pos_label" default_value="1"/>
                    <expand macro="average">
                        <option value="binary" selected="true">Only report results for the class specified by pos_label. Applicable only on binary classification. (binary)</option>
                    </expand>
                </section>
            </when>
            <when value="precision_score">
                <expand macro="clf_inputs"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <!--labels-->
                    <!--sample_weight-->
                    <expand macro="pos_label" default_value="1"/>
                    <expand macro="average">
                        <option value="binary" selected="true">Only report results for the class specified by pos_label. This is applicable only if targets (y_{true,pred}) are binary. (binary)</option>
                    </expand>
                </section>
            </when>
            <when value="recall_score">
                <expand macro="clf_inputs"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <!--labels-->
                    <!--sample_weight-->
                    <expand macro="pos_label" default_value="1"/>
                    <expand macro="average">
                        <option value="binary" selected="true">Only report results for the class specified by pos_label. This is applicable only if targets (y_{true,pred}) are binary. (binary)</option>
                    </expand>
                </section>
            </when>
            <when value="zero_one_loss">
                <expand macro="clf_inputs"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <!--sample_weight-->
                    <param argument="normalize" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true"
                        label="Normalize" help="If false, returns the number of misclassifications. Otherwise, returns the fraction of misclassifications. "/>
                </section>
            </when>
            <when value="auc">
                <expand macro="clf_inputs"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <param argument="reorder" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false"
                        label="Assume an ascending curve in the case of ties" help="If the curve is non-ascending, the result will be wrong. "/>
                </section>
            </when>
            <when value="brier_score_loss">
                <expand macro="clf_inputs"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <!--weights-->
                    <expand macro="pos_label"/>
                </section>
            </when>
            <when value="matthews_corrcoef">
                <expand macro="clf_inputs"/>
                <!--section name="options" title="Advanced Options" expanded="False">
                </section-->
            </when>
            <when value="confusion_matrix">
                <expand macro="clf_inputs"/>
                <!--section name="options" title="Advanced Options" expanded="False">
                    <!- -labels- ->
                </section-->
            </when>
            <when value="precision_recall_curve">
                <expand macro="clf_inputs"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <expand macro="pos_label"/>
                    <!--sample_weight-->
                </section>
            </when>
            <when value="roc_curve">
                <expand macro="clf_inputs"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <!--sample_weight-->
                    <expand macro="pos_label"/>
                    <param argument="drop_intermediate" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true"
                        label="Drop suboptimal thresholds" help="This is useful in order to create lighter ROC curves. "/>
                </section>
            </when>
            <when value="hinge_loss">
                <expand macro="clf_inputs" multiple="true"/>
                <!--section name="options" title="Advanced Options" expanded="False">
                    <!- -labels- ->
                    <!- -sample_weight- ->
                </section-->
            </when>
            <when value="log_loss">
                <expand macro="clf_inputs" multiple="true"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <!--sample_weight-->
                    <param argument="eps" type="float" value="1e-15" label="Clipping factor"
                        help="Log loss is undefined for p=0 or p=1, so probabilities are clipped to max(eps, min(1 - eps, p)). "/>
                    <param argument="normalize" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true"
                        label="Normalize" help="If true, returns the mean loss per sample. Otherwise, returns the sum of the per-sample losses. "/>
                </section>
            </when>
            <when value="average_precision_score">
                <expand macro="clf_inputs" multiple1="true" multiple="true"/>
                <!-- section name="options" title="Advanced Options" expanded="False">
                    <!- -average='macro', sample_weight=None- ->
                </section-->
            </when>
            <when value="roc_auc_score">
                <expand macro="clf_inputs" multiple1="true" multiple="true"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <expand macro="average">
                        <option value="macro" selected="true">Calculate metrics for each label, and find their unweighted mean. (macro)</option>
                    </expand>
                </section>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Plain-text report written by the script: the metric name, then the string form of its result. -->
        <data name="outfile" format="txt"/>
    </outputs>
    <!-- One test per selectable metric. Every test reads both inputs from y.tabular,
         picks the columns to compare, and checks the report against a stored file
         named after the metric. -->
    <tests>
        <test>
            <param name="selected_metric" value="accuracy_score"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="1"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="2"/>
            <output name="outfile" file="accuracy_score.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="classification_report"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="1"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="2"/>
            <output name="outfile" file="classification_report.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="f1_score"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="1"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="2"/>
            <param name="average" value="micro"/>
            <output name="outfile" file="f1_score.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="fbeta_score"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="1"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="2"/>
            <param name="average" value="micro"/>
            <output name="outfile" file="fbeta_score.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="hamming_loss"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="1"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="2"/>
            <output name="outfile" file="hamming_loss.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="jaccard_similarity_score"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="1"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="2"/>
            <output name="outfile" file="jaccard_similarity_score.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="precision_recall_fscore_support"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="1"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="2"/>
            <param name="average" value="micro"/>
            <output name="outfile" file="precision_recall_fscore_support.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="precision_score"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="1"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="2"/>
            <param name="average" value="micro"/>
            <output name="outfile" file="precision_score.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="recall_score"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="1"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="2"/>
            <param name="average" value="micro"/>
            <output name="outfile" file="recall_score.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="zero_one_loss"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="1"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="2"/>
            <output name="outfile" file="zero_one_loss.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="auc"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="1"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="2"/>
            <param name="reorder" value="true"/>
            <output name="outfile" file="auc.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="brier_score_loss"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="1"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="3"/>
            <output name="outfile" file="brier_score_loss.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="matthews_corrcoef"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="6"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="7"/>
            <output name="outfile" file="matthews_corrcoef.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="confusion_matrix"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="1"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="2"/>
            <output name="outfile" file="confusion_matrix.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="precision_recall_curve"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="6"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="7"/>
            <output name="outfile" file="precision_recall_curve.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="roc_curve"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="6"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="7"/>
            <output name="outfile" file="roc_curve.txt"/>
        </test>
        <!-- The next two tests select several columns for the second input. -->
        <test>
            <param name="selected_metric" value="hinge_loss"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="1"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="8,9,10"/>
            <output name="outfile" file="hinge_loss.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="log_loss"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="2"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="3,4,5"/>
            <output name="outfile" file="log_loss.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="average_precision_score"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="6"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="7"/>
            <output name="outfile" file="average_precision_score.txt"/>
        </test>
        <test>
            <param name="selected_metric" value="roc_auc_score"/>
            <param name="infile1" value="y.tabular" ftype="tabular"/>
            <param name="col1" value="6"/>
            <param name="infile2" value="y.tabular" ftype="tabular"/>
            <param name="col2" value="7"/>
            <output name="outfile" file="roc_auc_score.txt"/>
        </test>
    </tests>
    <help>
        <![CDATA[
**What it does**

This tool provides several loss, score, and utility functions to measure classification performance. Some metrics might require probability estimates of the positive class, confidence values, or binary decision values. This tool is based on
the sklearn.metrics package.
For information about classification metric functions and their parameter settings, please refer to `Scikit-learn classification metrics`_.

.. _`Scikit-learn classification metrics`: http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
        ]]>
    </help>
    <expand macro="sklearn_citation"/>
</tool>
author	bgruening
date	Fri, 17 Aug 2018 12:29:12 -0400
parents	8da7dc3f4e66
children	ce75dea7d3f0