view feature_selection/featureSelection.xml @ 0:76a728a52df6 draft default tip

planemo upload for repository https://github.com/jaidevjoshi83/MicroBiomML commit 5ef78d4decc95ac107c468499328e7f086289ff9-dirty
author jay
date Tue, 17 Feb 2026 10:52:45 +0000
parents
children
line wrap: on
line source

<tool id="feature_selector" name="Feature Selector" version="1.0.0">
    <description>Perform feature selection using SequentialFeatureSelector for microbiome data analysis.</description>

    <!-- Pinned package dependencies declared for the tool's job environment. -->
    <requirements>
        <requirement version="2.1.4" type="package">pandas</requirement>
        <requirement version="1.3.2" type="package">scikit-learn</requirement>
        <requirement version="0.1.18" type="package">hdlib</requirement>
    </requirements>

    <!--
        Builds the featureSelection.py invocation from the tool parameters.
        Each value is single-quoted so the shell receives it as one argument.
        A non-zero exit code marks the job as failed (detect_errors).

        The column-drop selection is forwarded only when the user chooses
        "Drop Columns" and actually picks at least one column. Previously the
        selection was collected in the form but never reached the script.
        NOTE(review): the flag name is taken from the argument attribute
        declared on the columns_to_drop parameter; confirm featureSelection.py
        accepts it and expects a comma-separated list of column indices.
    -->
    <command detect_errors="exit_code">
        <![CDATA[
        python3 '$__tool_directory__/featureSelection.py'
            --input '$input'
            --metadata '$metadata_file'
            ## Forward the optional column-drop list only when it was requested.
            #if str($drop_columns.advanced_setup) == 'settings' and str($drop_columns.columns_to_drop) not in ('', 'None')
                --dp_columns '$drop_columns.columns_to_drop'
            #end if
            --threads '$threads'
            --classifier '$classifier'
            --label '$label_clm'
            --tol '$tol'
            --log '$log_file'
            --feature_out '$selected_features'
        ]]>
    </command>

    <inputs>
        <!-- Primary tables: the count matrix and the sample metadata. -->
        <param name="input" type="data" format="tabular"
               label="Count Matrix File"
               help="A TSV file containing the count matrix with a header row." />
        <param name="metadata_file" type="data" format="tabular"
               label="Metadata File"
               help="A TSV file containing the metadata with a header row." />

        <!-- Optional selection of count-matrix columns to exclude from training. -->
        <conditional name="drop_columns">
            <param name="advanced_setup" type="select" label="Drop Columns from Training Data.">
                <option value="default" selected="true">Do Not Drop Columns</option>
                <option value="settings">Drop Columns</option>
            </param>
            <when value="default"/>
            <when value="settings">
                <param name="columns_to_drop" argument="--dp_columns" type="data_column" data_ref="input"
                       multiple="true" use_header_names="true"
                       label="Columns to Drop from Training Data"
                       help="Select the columns to drop from the training data." />
            </when>
        </conditional>

        <!-- Worker count passed to the script as its threads option. -->
        <param name="threads" type="integer" value="4"
               label="Number of Threads"
               help="The number of threads to use for SequentialFeatureSelector." />

        <!-- Short codes below are the values handed to the script's classifier option. -->
        <param name="classifier" type="select"
               label="Classifier"
               help="The classifier to use for feature selection.">
            <option value="lr">Logistic Regression</option>
            <option value="dt">Decision Tree</option>
            <option value="sv">Support Vector Classifier</option>
            <option value="rf">Random Forest</option>
            <option value="hdc">HDC Classifier</option>
        </param>

        <!-- Single column of the metadata file holding the class labels. -->
        <param name="label_clm" type="data_column" data_ref="metadata_file"
               multiple="false" use_header_names="true"
               label="Class Label Column"
               help="Select the column in the metadata file that contains the class labels for feature selection." />

        <param name="tol" type="float" value="0.00001"
               label="Tolerance"
               help="The tolerance for SequentialFeatureSelector convergence. Lower values mean stricter convergence (default: 0.00001)." />
    </inputs>

    <!-- Two datasets are produced: the run log and the table of selected features. -->
    <outputs>
        <data format="txt" name="log_file" label="Feature Selection Log."/>
        <data format="tsv" name="selected_features" label="Selected Features."/>
    </outputs>

    <!--
        One test per classifier option, all run on the same count matrix and
        metadata fixtures with the class label in metadata column 2.

        The log is described in this tool's help as containing run details and
        timing information, so it cannot match a stored file byte for byte.
        It is therefore compared by size (sim_size) within a byte tolerance,
        while the selected-features table is still compared exactly.
        NOTE(review): all five classifiers are checked against the same
        out.tsv; confirm that every classifier really selects the same
        features on this fixture.
    -->
    <tests>
        <!-- Logistic regression -->
        <test>
            <param name="input" value="test_count.tsv"/>
            <param name="metadata_file" value="test_metadata.tsv"/>
            <param name="threads" value="4"/>
            <param name="classifier" value="lr"/>
            <param name="label_clm" value="2"/>
            <param name="tol" value="1e-05"/>
            <output name="log_file" file="out.log" compare="sim_size" delta="1000"/>
            <output name="selected_features" file="out.tsv"/>
        </test>

        <!-- Decision tree -->
        <test>
            <param name="input" value="test_count.tsv"/>
            <param name="metadata_file" value="test_metadata.tsv"/>
            <param name="threads" value="4"/>
            <param name="classifier" value="dt"/>
            <param name="label_clm" value="2"/>
            <param name="tol" value="1e-05"/>
            <output name="log_file" file="out.log" compare="sim_size" delta="1000"/>
            <output name="selected_features" file="out.tsv"/>
        </test>

        <!-- Support vector classifier -->
        <test>
            <param name="input" value="test_count.tsv"/>
            <param name="metadata_file" value="test_metadata.tsv"/>
            <param name="threads" value="4"/>
            <param name="classifier" value="sv"/>
            <param name="label_clm" value="2"/>
            <param name="tol" value="1e-05"/>
            <output name="log_file" file="out.log" compare="sim_size" delta="1000"/>
            <output name="selected_features" file="out.tsv"/>
        </test>

        <!-- Random forest -->
        <test>
            <param name="input" value="test_count.tsv"/>
            <param name="metadata_file" value="test_metadata.tsv"/>
            <param name="threads" value="4"/>
            <param name="classifier" value="rf"/>
            <param name="label_clm" value="2"/>
            <param name="tol" value="1e-05"/>
            <output name="log_file" file="out.log" compare="sim_size" delta="1000"/>
            <output name="selected_features" file="out.tsv"/>
        </test>

        <!-- HDC classifier -->
        <test>
            <param name="input" value="test_count.tsv"/>
            <param name="metadata_file" value="test_metadata.tsv"/>
            <param name="threads" value="4"/>
            <param name="classifier" value="hdc"/>
            <param name="label_clm" value="2"/>
            <param name="tol" value="1e-05"/>
            <output name="log_file" file="out.log" compare="sim_size" delta="1000"/>
            <output name="selected_features" file="out.tsv"/>
        </test>
    </tests>

    <help><![CDATA[
**Feature Selector**

This tool performs feature selection on a count matrix, using class labels taken from a separate metadata file, with scikit-learn's ``SequentialFeatureSelector``.
You can choose from multiple classifiers and configure parameters such as tolerance and the number of threads.

**Inputs**

- **Count Matrix File**: A TSV file containing the features (columns) and samples (rows).
- **Metadata File**: A TSV file containing the sample metadata.
- **Class Label Column**: The column in the metadata file that contains the class labels for feature selection.
- **Classifier**: The classifier type to use for feature selection.
- **Number of Threads**: The number of threads to use for computation.
- **Tolerance**: The convergence tolerance for the SequentialFeatureSelector (default: 0.00001).

**Outputs**

- **Feature Selection Log**: A text file containing the run details and timing information.
- **Selected Features**: A TSV file listing the selected feature names.
]]></help>
     <!-- BibTeX references attached to the tool: the hdlib library paper and the
          vector-symbolic-architecture feature selection case study. -->
     <citations>
          <citation type="bibtex">
               @article{cumbo2023hdlib,
                    title={hdlib: A Python library for designing Vector-Symbolic Architectures},
                    author={Cumbo, Fabio and Weitschek, Emanuel and Blankenberg, Daniel},
                    journal={Journal of Open Source Software},
                    volume={8},
                    number={89},
                    pages={5704},
                    year={2023}
               }
          </citation>
          <citation type="bibtex">
               @article{cumbo2025feature,
                    title={Feature selection with vector-symbolic architectures: a case study on microbial profiles of shotgun metagenomic samples of colorectal cancer},
                    author={Cumbo, Fabio and Truglia, Simone and Weitschek, Emanuel and Blankenberg, Daniel},
                    journal={Briefings in Bioinformatics},
                    volume={26},
                    number={2},
                    pages={bbaf177},
                    year={2025},
                    publisher={Oxford University Press}
               }
          </citation>
     </citations>
</tool>