<!-- Mercurial > repos > bgruening > model_prediction -->

<tool id="model_prediction" name="Model Prediction" version="@VERSION@" profile="@PROFILE@">
    <!-- One-line summary of the tool. -->
    <description>predicts on new data using a pre-fitted model</description>
    <!-- Shared macro libraries. The <expand> elements and @TOKEN@ placeholders used in
         this wrapper are presumably defined there (the macro files are not shown here). -->
    <macros>
        <import>main_macros.xml</import>
        <import>keras_macros.xml</import>
    </macros>
    <!-- Requirements and stdio handling come from the imported macros. -->
    <expand macro="python_requirements" />
    <expand macro="macro_stdio" />
    <!-- Reports the wrapper's own version token instead of querying the script. -->
    <version_command>echo "@VERSION@"</version_command>
    <!-- Runs model_prediction.py. The tool parameters are also written to the `inputs`
         configfile declared below, and the explicit flags here carry only file paths. -->
    <command>
        <![CDATA[
        ## Chained with && so the script is not started if the environment setup fails.
        export HDF5_USE_FILE_LOCKING='FALSE' &&
        python '$__tool_directory__/model_prediction.py'
            --inputs '$inputs'
            --infile_estimator '$infile_estimator'
            --outfile_predict '$outfile_predict'
            ## The dataset flags depend on the selected input type.
            #if $input_options.selected_input == 'seq_fasta'
            --fasta_path '$input_options.fasta_path'
            #elif $input_options.selected_input == 'variant_effect'
            --ref_seq '$input_options.ref_genome_file'
            --vcf_path '$input_options.vcf_file'
            #else
            ## Both 'tabular' and 'sparse' supply their dataset as infile1.
            --infile1 '$input_options.infile1'
            #end if
        ]]>
    </command>
    <configfiles>
        <inputs name="inputs" />
    </configfiles>
    <inputs>
        <!-- Fitted pipeline/estimator used to produce the predictions. -->
        <param name="infile_estimator" type="data" format="h5mlm" label="Choose the dataset containing pipeline/estimator object" />
        <param argument="method" type="select" label="Select invocation method">
            <option value="predict" selected="true">predict</option>
            <option value="predict_proba">predict_proba</option>
        </param>
        <!-- The selected input type decides which dataset parameters are shown and
             which dataset flags the command section passes to the script. -->
        <conditional name="input_options">
            <param name="selected_input" type="select" label="Select input data type for prediction">
                <option value="tabular" selected="true">tabular data</option>
                <option value="sparse">sparse matrix</option>
                <option value="seq_fasta">sequences in a fasta file</option>
                <option value="variant_effect">reference genome and variant call file</option>
            </param>
            <when value="tabular">
                <param name="infile1" type="data" format="tabular" label="Samples dataset to predict on:" />
                <param name="header1" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Does the dataset contain header:" />
                <conditional name="column_selector_options_1">
                    <expand macro="samples_column_selector_options" multiple="true" />
                </conditional>
            </when>
            <when value="sparse">
                <param name="infile1" type="data" format="txt" label="Select a sparse matrix" help="" />
            </when>
            <when value="seq_fasta">
                <param name="fasta_path" type="data" format="fasta" label="Dataset containing fasta genomic/protein sequences" help="Sequences will be one-hot encoded to arrays." />
                <param name="seq_type" type="select" label="Sequence type">
                    <option value="FastaDNABatchGenerator">DNA</option>
                    <option value="FastaRNABatchGenerator">RNA</option>
                    <option value="FastaProteinBatchGenerator">Protein</option>
                </param>
            </when>
            <when value="variant_effect">
                <param name="ref_genome_file" type="data" format="fasta" label="Dataset containing reference genomic sequence" help="fasta" />
                <param name="blacklist_regions" type="select" label="Blacklist regions" help="A pre-loaded list of blacklisted intervals. Refer to `selene` for details.">
                    <option value="none" selected="true">None</option>
                    <option value="hg38">hg38</option>
                    <option value="hg19">hg19</option>
                </param>
                <param name="vcf_file" type="data" format="vcf" label="Dataset containing sequence variations" help="vcf" />
                <param name="seq_length" type="integer" value="1000" label="Encoding sequence length" help="A stretch of sequence surrounding the variation position on the reference genome." />
                <param name="output_reference" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Predict the reference sequence?" help="If False, predict on the variant sequence." />
            </when>
        </conditional>
    </inputs>
    <!-- Single tabular dataset; its path is handed to the script via the outfile_predict flag. -->
    <outputs>
        <data name="outfile_predict" format="tabular" />
    </outputs>
    <tests>
        <!-- Test 1: tabular input; the output is compared against a stored reference file. -->
        <test>
            <param name="infile_estimator" ftype="h5mlm" value="best_estimator_.h5mlm" />
            <param name="method" value="predict" />
            <param name="infile1" ftype="tabular" value="regression_X.tabular" />
            <param name="header1" value="true" />
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17" />
            <output name="outfile_predict" file="model_pred01.tabular" />
        </test>
        <!-- Test 2: same tabular input with a different estimator; the output is checked
             by content assertions (column count and selected values) instead of a file. -->
        <test>
            <param name="infile_estimator" ftype="h5mlm" value="train_test_eval_model01" />
            <param name="method" value="predict" />
            <param name="infile1" ftype="tabular" value="regression_X.tabular" />
            <param name="header1" value="true" />
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17" />
            <output name="outfile_predict">
                <assert_contents>
                    <has_n_columns n="1" />
                    <has_text text="71.0" />
                    <has_text text="61.3" />
                    <has_text text="83.7" />
                    <has_text text="69.2" />
                    <has_text text="51.8" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
        <![CDATA[
**What it does**

Given a fitted estimator and new data sets, this tool outputs the prediction results on the data sets by invoking the estimator's `predict` or `predict_proba` method.

The estimator may be a fitted sklearn estimator or a trained deep learning model. The tool predicts on four different types of input data:

- tabular

- sparse

- bio-sequences in a fasta file

- reference genome and variant call file

        ]]>
    </help>
    <!-- Citations. The keras and selene citation macros are nested inside the sklearn
         citation expansion; presumably they are injected where that macro yields
         (the macro definitions are not visible in this file, so confirm there). -->
    <expand macro="sklearn_citation">
        <expand macro="keras_citation" />
        <expand macro="selene_citation" />
    </expand>
</tool>
<!--
author	bgruening
date	Fri, 03 Nov 2023 23:18:23 +0000
parents	3bb1b688b0e4
children
-->