diff feature_selection/featureSelection.xml @ 0:76a728a52df6 draft default tip

planemo upload for repository https://github.com/jaidevjoshi83/MicroBiomML commit 5ef78d4decc95ac107c468499328e7f086289ff9-dirty
author jay
date Tue, 17 Feb 2026 10:52:45 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/feature_selection/featureSelection.xml	Tue Feb 17 10:52:45 2026 +0000
@@ -0,0 +1,170 @@
+<tool id="feature_selector" name="Feature Selector" version="1.0.0">
+    <description>Perform feature selection using SequentialFeatureSelector for microbiome data analysis.</description>
+
+    <!-- Packages resolved for the job environment; every version is pinned. -->
+    <requirements>
+        <requirement type="package" version="2.1.4">pandas</requirement>
+        <requirement type="package" version="1.3.2">scikit-learn</requirement>
+        <requirement type="package" version="0.1.18">hdlib</requirement>
+    </requirements>
+
+    <!-- Command template: runs the featureSelection.py script shipped in the tool
+         directory with the selected datasets and parameters. With
+         detect_errors="exit_code" a non-zero exit status marks the job as failed. -->
+    <command detect_errors="exit_code">
+        <![CDATA[
+        python3 '$__tool_directory__/featureSelection.py'
+            --input '$input'
+            --metadata '$metadata_file'
+            --threads '$threads'
+            --classifier '$classifier'
+            --label '$label_clm'
+            --tol '$tol'
+            ## Forward the "Drop Columns" selection, which was previously declared in
+            ## <inputs> but never passed to the script. The flag name is taken from the
+            ## param's argument attribute; a multiple data_column renders as a
+            ## comma-separated list of column indices.
+            ## NOTE(review): confirm featureSelection.py accepts --dp_columns in this form.
+            #if str($drop_columns.advanced_setup) == 'settings'
+                --dp_columns '$drop_columns.columns_to_drop'
+            #end if
+            --log '$log_file'
+            --feature_out '$selected_features'
+        ]]>
+    </command>
+
+    <inputs>
+        <!-- Input datasets: the count matrix and the sample metadata, both tabular. -->
+        <param name="input" type="data" format="tabular"
+               label="Count Matrix File"
+               help="A TSV file containing the count matrix with a header row." />
+        <param name="metadata_file" type="data" format="tabular"
+               label="Metadata File"
+               help="A TSV file containing the metadata with a header row." />
+
+        <!-- Optional column exclusion: the column picker is only offered when
+             "Drop Columns" is selected. Columns are chosen from the count matrix. -->
+        <conditional name="drop_columns">
+            <param name="advanced_setup" type="select" label="Drop Columns from Training Data.">
+                <option value="default" selected="true">Do Not Drop Columns</option>
+                <option value="settings">Drop Columns</option>
+            </param>
+            <when value="default" />
+            <when value="settings">
+                <param name="columns_to_drop" argument="--dp_columns" type="data_column" data_ref="input"
+                       multiple="true" use_header_names="true"
+                       label="Columns to Drop from Training Data"
+                       help="Select the columns to drop from the training data." />
+            </when>
+        </conditional>
+
+        <!-- Degree of parallelism handed to the script. -->
+        <param name="threads" type="integer" value="4"
+               label="Number of Threads"
+               help="The number of threads to use for SequentialFeatureSelector." />
+
+        <!-- Estimator choice; the option values are the short codes given to the script. -->
+        <param name="classifier" type="select"
+               label="Classifier"
+               help="The classifier to use for feature selection.">
+            <option value="lr">Logistic Regression</option>
+            <option value="dt">Decision Tree</option>
+            <option value="sv">Support Vector Classifier</option>
+            <option value="rf">Random Forest</option>
+            <option value="hdc">HDC Classifier</option>
+        </param>
+
+        <!-- Single column of the metadata file that holds the class labels. -->
+        <param name="label_clm" type="data_column" data_ref="metadata_file"
+               multiple="false" use_header_names="true"
+               label="Class Label Column"
+               help="Select the column in the metadata file that contains the class labels for feature selection." />
+
+        <!-- Convergence tolerance. -->
+        <param name="tol" type="float" value="0.00001"
+               label="Tolerance"
+               help="The tolerance for SequentialFeatureSelector convergence. Lower values mean stricter convergence (default: 0.00001)." />
+
+    </inputs>
+
+    <outputs>
+        <!-- Text log; its path is handed to the script through the log option of the command. -->
+        <data format="txt" name="log_file" label="Feature Selection Log." />
+        <!-- Selected features; its path is handed to the script through the feature_out option. -->
+        <data format="tsv" name="selected_features" label="Selected Features." />
+    </outputs>
+
+    <tests>
+        <!-- One test per classifier option. All tests share the same inputs, label
+             column, thread count and tolerance; only the classifier differs. -->
+        <!-- NOTE(review): every test compares against the same out.log and out.tsv,
+             and the help text says the log contains timing information. Confirm
+             that an exact file comparison is stable across classifiers and runs. -->
+
+        <!-- Logistic Regression -->
+        <test>
+            <param name="input" value="test_count.tsv" />
+            <param name="metadata_file" value="test_metadata.tsv" />
+            <param name="threads" value="4" />
+            <param name="classifier" value="lr" />
+            <param name="label_clm" value="2" />
+            <param name="tol" value="1e-05" />
+            <output name="log_file" file="out.log" />
+            <output name="selected_features" file="out.tsv" />
+        </test>
+
+        <!-- Decision Tree -->
+        <test>
+            <param name="input" value="test_count.tsv" />
+            <param name="metadata_file" value="test_metadata.tsv" />
+            <param name="threads" value="4" />
+            <param name="classifier" value="dt" />
+            <param name="label_clm" value="2" />
+            <param name="tol" value="1e-05" />
+            <output name="log_file" file="out.log" />
+            <output name="selected_features" file="out.tsv" />
+        </test>
+
+        <!-- Support Vector Classifier -->
+        <test>
+            <param name="input" value="test_count.tsv" />
+            <param name="metadata_file" value="test_metadata.tsv" />
+            <param name="threads" value="4" />
+            <param name="classifier" value="sv" />
+            <param name="label_clm" value="2" />
+            <param name="tol" value="1e-05" />
+            <output name="log_file" file="out.log" />
+            <output name="selected_features" file="out.tsv" />
+        </test>
+
+        <!-- Random Forest -->
+        <test>
+            <param name="input" value="test_count.tsv" />
+            <param name="metadata_file" value="test_metadata.tsv" />
+            <param name="threads" value="4" />
+            <param name="classifier" value="rf" />
+            <param name="label_clm" value="2" />
+            <param name="tol" value="1e-05" />
+            <output name="log_file" file="out.log" />
+            <output name="selected_features" file="out.tsv" />
+        </test>
+
+        <!-- HDC Classifier -->
+        <test>
+            <param name="input" value="test_count.tsv" />
+            <param name="metadata_file" value="test_metadata.tsv" />
+            <param name="threads" value="4" />
+            <param name="classifier" value="hdc" />
+            <param name="label_clm" value="2" />
+            <param name="tol" value="1e-05" />
+            <output name="log_file" file="out.log" />
+            <output name="selected_features" file="out.tsv" />
+        </test>
+
+    </tests>
+
+    <help><![CDATA[
+**Feature Selector**
+
+This tool performs feature selection on a count matrix, using class labels taken from a separate metadata file, with scikit-learn's `SequentialFeatureSelector`.
+You can choose from multiple classifiers and configure parameters such as tolerance and the number of threads.
+
+**Inputs**
+
+- **Count Matrix File**: A TSV file containing the features (columns) and samples (rows).
+- **Metadata File**: A TSV file containing the sample metadata.
+- **Class Label Column**: The column in the metadata file that contains the class labels for feature selection.
+- **Classifier**: The classifier type to use for feature selection.
+- **Number of Threads**: The number of threads to use for computation.
+- **Tolerance**: The convergence tolerance for the SequentialFeatureSelector (default: 0.00001).
+
+**Outputs**
+
+- **Feature Selection Log**: A text file containing the run details and timing information.
+- **Selected Features**: A TSV file listing the selected feature names.
+]]></help>
+     <!-- References shown to users of this tool; both entries are BibTeX records. -->
+     <citations>
+          <!-- Reference for the hdlib library, which is listed in this tool's requirements. -->
+          <citation type="bibtex">
+               @article{cumbo2023hdlib,
+                    title={hdlib: A Python library for designing Vector-Symbolic Architectures},
+                    author={Cumbo, Fabio and Weitschek, Emanuel and Blankenberg, Daniel},
+                    journal={Journal of Open Source Software},
+                    volume={8},
+                    number={89},
+                    pages={5704},
+                    year={2023}
+               }
+          </citation>
+          <!-- Case study on feature selection with vector-symbolic architectures. -->
+          <citation type="bibtex">
+               @article{cumbo2025feature,
+                    title={Feature selection with vector-symbolic architectures: a case study on microbial profiles of shotgun metagenomic samples of colorectal cancer},
+                    author={Cumbo, Fabio and Truglia, Simone and Weitschek, Emanuel and Blankenberg, Daniel},
+                    journal={Briefings in Bioinformatics},
+                    volume={26},
+                    number={2},
+                    pages={bbaf177},
+                    year={2025},
+                    publisher={Oxford University Press}
+               }
+          </citation>
+     </citations>
+</tool>