changeset 6:222c02df5d55 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9e28f4466084464d38d3f8db2aff07974be4ba69"
author bgruening
date Wed, 11 Mar 2020 13:59:57 -0400
parents c7655b5a94af
children 05143043ca13
files main_macros.xml ml_visualization_ex.py ml_visualization_ex.xml test-data/ml_confusion_predicted.tabular test-data/ml_confusion_true.tabular test-data/ml_confusion_viz.png test-data/predicted_header.tabular test-data/true_header.tabular
diffstat 8 files changed, 259 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/main_macros.xml	Wed Jan 22 08:03:54 2020 -0500
+++ b/main_macros.xml	Wed Mar 11 13:59:57 2020 -0400
@@ -1,5 +1,5 @@
 <macros>
-  <token name="@VERSION@">1.0.8.1</token>
+  <token name="@VERSION@">1.0.8.2</token>
 
   <xml name="python_requirements">
       <requirements>
--- a/ml_visualization_ex.py	Wed Jan 22 08:03:54 2020 -0500
+++ b/ml_visualization_ex.py	Wed Mar 11 13:59:57 2020 -0400
@@ -13,7 +13,7 @@
 from keras.utils import plot_model
 from sklearn.feature_selection.base import SelectorMixin
 from sklearn.metrics import precision_recall_curve, average_precision_score
-from sklearn.metrics import roc_curve, auc
+from sklearn.metrics import roc_curve, auc, confusion_matrix
 from sklearn.pipeline import Pipeline
 from galaxy_ml.utils import load_model, read_columns, SafeEval
 
@@ -266,12 +266,29 @@
               os.path.join(folder, "output"))
 
 
+def get_dataframe(file_path, plot_selection, header_name, column_name):  # Read the selected columns of a tab-separated file via read_columns.
+    header = 'infer' if plot_selection[header_name] else None  # infer a header row only when the header flag is truthy
+    column_option = plot_selection[column_name]["selected_column_selector_option"]  # column selection mode for this input
+    if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
+        col = plot_selection[column_name]["col1"]  # these four modes carry an explicit column list under "col1"
+    else:
+        col = None  # any other mode (e.g. "all_columns") passes no column list
+    _, input_df = read_columns(file_path, c=col,  # first element of the returned pair is discarded
+                                   c_option=column_option,
+                                   return_df=True,
+                                   sep='\t', header=header,
+                                   parse_dates=True)
+    return input_df  # presumably a pandas DataFrame (return_df=True) - confirm against galaxy_ml.utils.read_columns
+
+
 def main(inputs, infile_estimator=None, infile1=None,
          infile2=None, outfile_result=None,
          outfile_object=None, groups=None,
          ref_seq=None, intervals=None,
          targets=None, fasta_path=None,
-         model_config=None):
+         model_config=None, true_labels=None,
+         predicted_labels=None, plot_color=None,
+         title=None):
     """
     Parameter
     ---------
@@ -311,6 +328,18 @@
 
     model_config : str, default is None
         File path to dataset containing JSON config for neural networks
+
+    true_labels : str, default is None
+        File path to dataset containing true labels
+
+    predicted_labels : str, default is None
+        File path to dataset containing predicted labels
+
+    plot_color : str, default is None
+        Color of the confusion matrix heatmap
+
+    title : str, default is None
+        Title of the confusion matrix heatmap
     """
     warnings.simplefilter('ignore')
 
@@ -543,6 +572,32 @@
 
         return 0
 
+    elif plot_type == 'classification_confusion_matrix':
+        plot_selection = params["plotting_selection"]
+        input_true = get_dataframe(true_labels, plot_selection, "header_true", "column_selector_options_true")
+        header_predicted = 'infer' if plot_selection["header_predicted"] else None
+        input_predicted = pd.read_csv(predicted_labels, sep='\t', parse_dates=True, header=header_predicted)
+        true_classes = input_true.iloc[:, -1].copy()
+        predicted_classes = input_predicted.iloc[:, -1].copy()
+        axis_labels = list(set(true_classes))
+        c_matrix = confusion_matrix(true_classes, predicted_classes)
+        fig, ax = plt.subplots(figsize=(7, 7))
+        im = plt.imshow(c_matrix, cmap=plot_color)
+        for i in range(len(c_matrix)):
+            for j in range(len(c_matrix)):
+                ax.text(j, i, c_matrix[i, j], ha="center", va="center", color="k")
+        ax.set_ylabel('True class labels')
+        ax.set_xlabel('Predicted class labels')
+        ax.set_title(title)
+        ax.set_xticks(axis_labels)
+        ax.set_yticks(axis_labels)
+        fig.colorbar(im, ax=ax)
+        fig.tight_layout()
+        plt.savefig("output.png", dpi=125)
+        os.rename('output.png', 'output')
+
+        return 0
+
     # save pdf file to disk
     # fig.write_image("image.pdf", format='pdf')
     # fig.write_image("image.pdf", format='pdf', width=340*2, height=226*2)
@@ -562,10 +617,17 @@
     aparser.add_argument("-t", "--targets", dest="targets")
     aparser.add_argument("-f", "--fasta_path", dest="fasta_path")
     aparser.add_argument("-c", "--model_config", dest="model_config")
+    aparser.add_argument("-tl", "--true_labels", dest="true_labels")
+    aparser.add_argument("-pl", "--predicted_labels", dest="predicted_labels")
+    aparser.add_argument("-pc", "--plot_color", dest="plot_color")
+    aparser.add_argument("-pt", "--title", dest="title")
     args = aparser.parse_args()
 
     main(args.inputs, args.infile_estimator, args.infile1, args.infile2,
          args.outfile_result, outfile_object=args.outfile_object,
          groups=args.groups, ref_seq=args.ref_seq, intervals=args.intervals,
          targets=args.targets, fasta_path=args.fasta_path,
-         model_config=args.model_config)
+         model_config=args.model_config, true_labels=args.true_labels,
+         predicted_labels=args.predicted_labels,
+         plot_color=args.plot_color,
+         title=args.title)
--- a/ml_visualization_ex.xml	Wed Jan 22 08:03:54 2020 -0500
+++ b/ml_visualization_ex.xml	Wed Mar 11 13:59:57 2020 -0400
@@ -21,6 +21,11 @@
             --infile1 '$plotting_selection.infile1'
             #elif $plotting_selection.plot_type == 'keras_plot_model'
             --model_config '$plotting_selection.infile_model_config'
+            #elif $plotting_selection.plot_type == 'classification_confusion_matrix'
+            --true_labels '$plotting_selection.infile_true'
+            --predicted_labels '$plotting_selection.infile_predicted'
+            --plot_color '$plotting_selection.plot_color'
+            --title '$plotting_selection.title'
             #end if
         ]]>
     </command>
@@ -36,6 +41,7 @@
                 <option value="rfecv_gridscores">Number of features vs. Recursive Feature Elimination gridscores with corss-validation</option>
                 <option value="feature_importances">Feature Importances plot</option>
                 <option value="keras_plot_model">keras plot model - plot configuration of a neural network model</option>
+                <option value="classification_confusion_matrix">Confusion matrix for classes</option>
             </param>
             <when value="learning_curve">
                 <param name="infile1" type="data" format="tabular" label="Select the dataset containing values for plotting learning curve." help="This dataset should be the output of tool model_validation->learning_curve."/>
@@ -96,6 +102,33 @@
                 <param name="title" type="hidden" value="" optional="true" label="Plot title" help="Optional. If change is desired."/>
                 <param name="plot_format" type="hidden" value="png" label="The output format and library"/>
             </when>
+            
+            <when value="classification_confusion_matrix">
+                <param name="infile_true" type="data" format="tabular" label="Select dataset containing true labels"/>
+                <param name="header_true" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Does the dataset contain header:" />
+                <conditional name="column_selector_options_true">
+                    <expand macro="samples_column_selector_options" multiple="true" column_option="selected_column_selector_option"
+                        col_name="col1" infile="infile_true"/>
+                </conditional>
+                
+                <param name="infile_predicted" type="data" format="tabular" label="Select dataset containing predicted labels"/>
+                <param name="header_predicted" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Does the dataset contain header:" /> 
+                <param name="title" type="text" value="Confusion matrix between true and predicted labels" label="Plot title"/>
+                <param name="plot_format" type="hidden" value="png" label="The output format and library"/>
+                <param name="plot_color" type="select" label="Choose plot color"> <!-- each option value is passed unchanged as the matplotlib cmap name, which is case-sensitive -->
+                    <option value="Greys">Greys</option>
+                    <option value="Purples">Purples</option>
+                    <option value="Blues">Blues</option>
+                    <option value="Greens" selected="true">Greens</option>
+                    <option value="Oranges">Oranges</option>
+                    <option value="Reds">Reds</option>
+                    <option value="summer">Summer</option>
+                    <option value="autumn">Autumn</option>
+                    <option value="RdYlGn">RdYlGn</option>
+                    <option value="Spectral">Spectral</option>
+                    <option value="winter">winter</option>
+                </param>
+            </when>
         </conditional>
     </inputs>
     <outputs>
@@ -140,6 +173,28 @@
             <param name="infile_model_config" value="deepsear_1feature.json" ftype="json"/>
             <output name="output" file="ml_vis05.png" compare="sim_size" delta="20000"/>
         </test>
+        <test>
+            <param name="plot_type" value="classification_confusion_matrix"/>
+            <param name="infile_true" value="ml_confusion_true.tabular" ftype="tabular"/>
+            <param name="header_true" value="False"/>
+            <param name="selected_column_selector_option" value="all_columns"/>
+            <param name="infile_predicted" value="ml_confusion_predicted.tabular" ftype="tabular"/>
+            <param name="header_predicted" value="False"/>
+            <param name="title" value="Confusion matrix"/>
+            <param name="plot_color" value="winter" />
+            <output name="output" file="ml_confusion_viz.png" compare="sim_size"/>
+        </test>
+        <test>
+            <param name="plot_type" value="classification_confusion_matrix"/>
+            <param name="infile_true" value="true_header.tabular" ftype="tabular"/>
+            <param name="header_true" value="True"/>
+            <param name="selected_column_selector_option" value="all_columns"/>
+            <param name="infile_predicted" value="predicted_header.tabular" ftype="tabular"/>
+            <param name="header_predicted" value="True"/>
+            <param name="title" value="Confusion matrix"/>
+            <param name="plot_color" value="winter" />
+            <output name="output" file="ml_confusion_viz.png" compare="sim_size"/>
+        </test>
     </tests>
     <help>
         <![CDATA[
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ml_confusion_predicted.tabular	Wed Mar 11 13:59:57 2020 -0400
@@ -0,0 +1,34 @@
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+1
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ml_confusion_true.tabular	Wed Mar 11 13:59:57 2020 -0400
@@ -0,0 +1,34 @@
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+1
+1
+1
+1
+1
+0
+0
+1
+1
+0
+1
+1
+1
+1
+1
+1
+1
+0
+0
+0
+0
+0
+0
Binary file test-data/ml_confusion_viz.png has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/predicted_header.tabular	Wed Mar 11 13:59:57 2020 -0400
@@ -0,0 +1,35 @@
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+1
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/true_header.tabular	Wed Mar 11 13:59:57 2020 -0400
@@ -0,0 +1,35 @@
+cancer
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+1
+1
+1
+1
+1
+0
+0
+1
+1
+0
+1
+1
+1
+1
+1
+1
+1
+0
+0
+0
+0
+0
+0