Mercurial > repos > jay > feature_selector
diff feature_selection/featureSelection.xml @ 0:76a728a52df6 draft default tip
planemo upload for repository https://github.com/jaidevjoshi83/MicroBiomML commit 5ef78d4decc95ac107c468499328e7f086289ff9-dirty
| author | jay |
|---|---|
| date | Tue, 17 Feb 2026 10:52:45 +0000 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/feature_selection/featureSelection.xml Tue Feb 17 10:52:45 2026 +0000
@@ -0,0 +1,177 @@
+<tool id="feature_selector" name="Feature Selector" version="1.0.0">
+    <description>Perform feature selection using SequentialFeatureSelector for microbiome data analysis.</description>
+
+    <!-- Packages, pinned by version, that the wrapped script requires. -->
+    <requirements>
+        <requirement type="package" version="2.1.4">pandas</requirement>
+        <requirement type="package" version="1.3.2">scikit-learn</requirement>
+        <requirement type="package" version="0.1.18">hdlib</requirement>
+    </requirements>
+
+    <!-- Runs featureSelection.py from the tool directory; job errors are detected from the exit code. -->
+    <!-- NOTE(review): the dp_columns option receives the data_column value (presumably comma-separated
+         column indices when several are selected); confirm featureSelection.py parses it that way. -->
+    <command detect_errors="exit_code">
+        <![CDATA[
+            python3 '$__tool_directory__/featureSelection.py'
+                --input '$input'
+                --metadata '$metadata_file'
+                --threads '$threads'
+                --classifier '$classifier'
+                --label '$label_clm'
+                --tol '$tol'
+                --log '$log_file'
+                --feature_out '$selected_features'
+                #if str($drop_columns.advanced_setup) == 'settings'
+                    --dp_columns '$drop_columns.columns_to_drop'
+                #end if
+        ]]>
+    </command>
+
+    <inputs>
+        <param name="input" type="data" format="tabular" label="Count Matrix File" help="A TSV file containing the count matrix with a header row." />
+        <param name="metadata_file" type="data" format="tabular" label="Metadata File" help="A TSV file containing the metadata with a header row." />
+
+        <!-- Optional column dropping; the chosen columns are forwarded to the script only in the "settings" branch. -->
+        <conditional name="drop_columns">
+            <param name="advanced_setup" type="select" label="Drop Columns from Training Data.">
+                <option value="default" selected="true">Do Not Drop Columns</option>
+                <option value="settings">Drop Columns</option>
+            </param>
+
+            <when value="default"/>
+
+            <when value="settings">
+                <param name="columns_to_drop" type="data_column" data_ref="input" label="Columns to Drop from Training Data" argument="--dp_columns" multiple="true" use_header_names="true" help="Select the columns to drop from the training data." />
+            </when>
+        </conditional>
+
+        <param name="threads" type="integer" value="4" label="Number of Threads"
+               help="The number of threads to use for SequentialFeatureSelector." />
+
+        <param name="classifier" type="select" label="Classifier"
+               help="The classifier to use for feature selection.">
+            <option value="lr">Logistic Regression</option>
+            <option value="dt">Decision Tree</option>
+            <option value="sv">Support Vector Classifier</option>
+            <option value="rf">Random Forest</option>
+            <option value="hdc">HDC Classifier</option>
+        </param>
+
+        <param name="label_clm" type="data_column" data_ref="metadata_file" multiple="false" use_header_names="true" label="Class Label Column" help="Select the column in the metadata file that contains the class labels for feature selection." />
+
+        <param name="tol" type="float" value="0.00001" label="Tolerance" help="The tolerance for SequentialFeatureSelector convergence. Lower values mean stricter convergence (default: 0.00001)." />
+    </inputs>
+
+    <outputs>
+        <data name="log_file" format="txt" label="Feature Selection Log."/>
+        <data name="selected_features" format="tsv" label="Selected Features."/>
+    </outputs>
+
+    <!-- One test per classifier option; every test is compared against the same expected files.
+         NOTE(review): the help describes the log as containing timing information, so an exact
+         match against out.log may be fragile; confirm the expected files. -->
+    <tests>
+        <test>
+            <param name="input" value="test_count.tsv"/>
+            <param name="metadata_file" value="test_metadata.tsv"/>
+            <param name="threads" value="4"/>
+            <param name="classifier" value="lr"/>
+            <param name="label_clm" value="2"/>
+            <param name="tol" value="1e-05"/>
+            <output name="log_file" file="out.log" />
+            <output name="selected_features" file="out.tsv"/>
+        </test>
+
+        <test>
+            <param name="input" value="test_count.tsv"/>
+            <param name="metadata_file" value="test_metadata.tsv"/>
+            <param name="threads" value="4"/>
+            <param name="classifier" value="dt"/>
+            <param name="label_clm" value="2"/>
+            <param name="tol" value="1e-05"/>
+            <output name="log_file" file="out.log" />
+            <output name="selected_features" file="out.tsv"/>
+        </test>
+
+        <test>
+            <param name="input" value="test_count.tsv"/>
+            <param name="metadata_file" value="test_metadata.tsv"/>
+            <param name="threads" value="4"/>
+            <param name="classifier" value="sv"/>
+            <param name="label_clm" value="2"/>
+            <param name="tol" value="1e-05"/>
+            <output name="log_file" file="out.log" />
+            <output name="selected_features" file="out.tsv"/>
+        </test>
+
+        <test>
+            <param name="input" value="test_count.tsv"/>
+            <param name="metadata_file" value="test_metadata.tsv"/>
+            <param name="threads" value="4"/>
+            <param name="classifier" value="rf"/>
+            <param name="label_clm" value="2"/>
+            <param name="tol" value="1e-05"/>
+            <output name="log_file" file="out.log" />
+            <output name="selected_features" file="out.tsv"/>
+        </test>
+
+        <test>
+            <param name="input" value="test_count.tsv"/>
+            <param name="metadata_file" value="test_metadata.tsv"/>
+            <param name="threads" value="4"/>
+            <param name="classifier" value="hdc"/>
+            <param name="label_clm" value="2"/>
+            <param name="tol" value="1e-05"/>
+            <output name="log_file" file="out.log" />
+            <output name="selected_features" file="out.tsv"/>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+**Feature Selector**
+
+This tool performs feature selection on a single TSV file using scikit-learn's `SequentialFeatureSelector`.
+You can choose from multiple classifiers and configure parameters such as tolerance and the number of threads.
+
+**Inputs**
+
+- **Count Matrix File**: A TSV file containing the features (columns) and samples (rows).
+- **Metadata File**: A TSV file containing the sample metadata.
+- **Drop Columns from Training Data**: Optionally select columns of the count matrix to drop from the training data.
+- **Class Label Column**: The column in the metadata file that contains the class labels for feature selection.
+- **Classifier**: The classifier type to use for feature selection.
+- **Number of Threads**: The number of threads to use for computation.
+- **Tolerance**: The convergence tolerance for the SequentialFeatureSelector (optional).
+
+**Outputs**
+
+- **Feature Selection Log**: A text file containing the run details and timing information.
+- **Selected Features**: A TSV file listing the selected feature names.
+]]></help>
+    <citations>
+        <citation type="bibtex">
+            @article{cumbo2023hdlib,
+                title={hdlib: A Python library for designing Vector-Symbolic Architectures},
+                author={Cumbo, Fabio and Weitschek, Emanuel and Blankenberg, Daniel},
+                journal={Journal of Open Source Software},
+                volume={8},
+                number={89},
+                pages={5704},
+                year={2023}
+            }
+        </citation>
+        <citation type="bibtex">
+            @article{cumbo2025feature,
+                title={Feature selection with vector-symbolic architectures: a case study on microbial profiles of shotgun metagenomic samples of colorectal cancer},
+                author={Cumbo, Fabio and Truglia, Simone and Weitschek, Emanuel and Blankenberg, Daniel},
+                journal={Briefings in Bioinformatics},
+                volume={26},
+                number={2},
+                pages={bbaf177},
+                year={2025},
+                publisher={Oxford University Press}
+            }
+        </citation>
+    </citations>
+</tool>
