Mercurial > repos > bgruening > plotly_parallel_coordinates_plot

--- a/paracords_plot.py	Wed Oct 10 02:29:28 2018 -0400
+++ b/paracords_plot.py	Mon Nov 04 12:20:51 2019 -0500
@@ -3,42 +3,62 @@
 import plotly
 import plotly.graph_objs as go
 import pandas as pd
+import re

-def main(infile, col_dimensions, categorized, col_color):
+
+def main(infile, col_dimensions, categorized, col_color,
+         dimension_mode='by_index'):
     """
     Produce an interactive paracords plotting html
     Args:
         infile: str, tabular file
         col_dimensions: str, comma separated index numbers. For example: "3,4,5"
         col_color: str, index number
+        dimension_mode: str, one of ['by_index', 'by_name']
     """
     df = pd.read_csv(infile, sep='\t', parse_dates=True)

     dimensions = []
-    col_dimensions = [int(x)-1 for x in col_dimensions.split(',')]
-    for col in col_dimensions:
-        values = df[df.columns[col]]
+
+    if dimension_mode not in ['by_index', 'by_name']:
+        raise ValueError("Select dimensions `{}` is not supported!"\
+                         .format(dimension_mode))
+    if dimension_mode == 'by_index':
+        col_dimensions = [int(x)-1 for x in col_dimensions.split(',')]
+        col_dimensions = df.columns[col_dimensions]
+    else:
+        if '*' not in col_dimensions:
+            col_dimensions = [header.strip() for header in col_dimensions.split(',')]
+        else:
+            pattern = col_dimensions.strip()
+            col_dimensions = [header for header in df.columns
+                              if re.search(pattern, header)]
+
+    for col_name in col_dimensions:
+        values = df[col_name]
         if categorized == 'boolfalse' and all(type(e) is int for e in values ):
             dimensions.append(
                 dict(   values = values,
                         tickformat = ",.2r",
-                        label = df.columns[col])
+                        label = col_name)
             )
         elif categorized == 'boolfalse' and all(type(e) is float for e in values ):
             dimensions.append(
                 dict(   values = values,
                         tickformat = "g",
-                        label = df.columns[col])
+                        label = col_name)
             )
         else:
             unique_values = list(set(values))
             unique_values.sort()
+            # cast to str, fix object indexing
+            unique_values = [repr(e) for e in unique_values]
             dimensions.append(
                 dict(   range = [0, len(unique_values)-1],
                         tickvals = list(range(len(unique_values))),
-                        ticktext = [str(e) for e in unique_values],
-                        values = list(map(lambda e: unique_values.index(e), values )),
-                        label = df.columns[col])
+                        ticktext = unique_values,
+                        values = list(map(lambda e: unique_values.index(repr(e)), values )),
+                        label = col_name)
             )

     col_color = int(col_color) - 1
@@ -80,6 +100,8 @@
     aparser.add_argument( "-d", "--col_dimensions", dest="col_dimensions")
     aparser.add_argument( "-t", "--categorized_datatype", dest="categorized")
     aparser.add_argument( "-c", "--col_color", dest="col_color")
+    aparser.add_argument( "-m", "--dimension_mode", dest="dimension_mode")
     args = aparser.parse_args()

-    main(args.infile, args.col_dimensions, args.categorized, args.col_color)
\ No newline at end of file
+    main(args.infile, args.col_dimensions, args.categorized, args.col_color,
+         args.dimension_mode)
--- a/paracords_plot.xml	Wed Oct 10 02:29:28 2018 -0400
+++ b/paracords_plot.xml	Mon Nov 04 12:20:51 2019 -0500
@@ -1,4 +1,4 @@
-<tool id="plotly_parallel_coordinates_plot" name="Parallel Coordinates Plot" version="0.1">
+<tool id="plotly_parallel_coordinates_plot" name="Parallel Coordinates Plot" version="0.2">
     <description>of tabular data</description>
     <requirements>
         <requirement type="package" version="3.6">python</requirement>
@@ -9,15 +9,27 @@
     <command detect_errors="aggressive"><![CDATA[
     python '$__tool_directory__/paracords_plot.py'
         -i '$infile'
-        -d '$col_dimensions'
+        -m '$dimension_selections.selected_mode'
+        -d '$dimension_selections.col_dimensions'
         -t '$categorized_datatype'
         -c '$col_color'
 ]]>
     </command>
     <inputs>
-        <param name="infile" type="data" format="tabular" label="Select data file :"/>
-        <param name="col_dimensions" multiple="true" type="data_column" data_ref="infile" use_header_names="true" display="checkboxes" label="Select the columns for dimentions:"/>
-        <param name="categorized_datatype" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="All the dimensions in categorized datatype:"/>
+        <param name="infile" type="data" format="tabular" label="Select table data file "/>
+        <conditional name="dimension_selections">
+            <param name="selected_mode" type="select" label="The mode of column selection">
+                <option value="by_index" selected="true">By index</option>
+                <option value="by_name">By column name</option>
+            </param>
+            <when value="by_index">
+                <param name="col_dimensions" multiple="true" type="data_column" data_ref="infile" use_header_names="true" display="checkboxes" label="Select the columns for dimensions:"/>
+            </when>
+            <when value="by_name">
+                <param name="col_dimensions" type="text" value="^param_.*" label="Type the column names" help="Two modes: 1) multiple names separated by comma, e.g. header1,header2, or 2) include `*` to initiate regular expression searches for every column name, for example, ^param_.* will retrieve all the headers starting with param_. Note: no mix of the two is supported. Uses the default sanitizer (string.ascii_letters + string.digits + &quot; -=_.()/+*^,:?!&quot;)."/>
+            </when>
+        </conditional>
+        <param name="categorized_datatype" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Cast all the dimensions into categorical datatype?"/>
         <param name="col_color" type="data_column" data_ref="infile" use_header_names="true" label="Select a column containg the values for coloring:" help="e.g. mean_test_score"/>
     </inputs>

@@ -39,6 +51,14 @@
             <param name="col_color" value="3"/>
             <output name="output" file="parcoords_plot02.html" compare="sim_size"/>
         </test>
+        <test>
+            <param name="infile" value="parcoords02.tabular" ftype="tabular"/>
+            <param name="selected_mode" value="by_name"/>
+            <param name="col_dimensions" value="param_*"/>
+            <param name="categorized_datatype" value="false"/>
+            <param name="col_color" value="3"/>
+            <output name="output" file="parcoords_plot02.html" compare="sim_size"/>
+        </test>
     </tests>
     <help><![CDATA[
 **What it does**
@@ -51,4 +71,14 @@

     ]]>
     </help>
+    <citations>
+         <citation type="bibtex">
+          @online{plotly,
+          author = {Plotly Technologies Inc.},
+          title = {Collaborative data science},
+          publisher = {Plotly Technologies Inc.},
+          address = {Montreal, QC}, year = {2015},
+          url = {https://plot.ly} }
+        </citation>
+    </citations>
 </tool>