Mercurial > repos > iuc > nanocompore_sampcomp

<?xml version="1.0"?>
<tool id="nanocompore_sampcomp" name="SampComp" version="@TOOL_VERSION@+galaxy0">
    <description>to compare Nanopolished datasets</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <version_command><![CDATA[nanocompore --version]]></version_command>
    <command detect_errors="exit_code"><![CDATA[
        ## ---- setup ----
        ## thread count handed to nanocompore: GALAXY_SLOTS, but never fewer than 3
        threads=\$((3 > \${GALAXY_SLOTS:-3} ? 3 : \${GALAXY_SLOTS:-3})) &&

        ## Link every replicate into the working directory as sample_1_N.tsv or
        ## sample_2_N.tsv, with its index next to it as the same name plus .idx
        ## (same name pattern required). The link names are collected on the way
        ## and joined into the comma separated file lists used further down.
        #set $links1 = []
        #for $n, $rep in enumerate($file1_rep)
            #set $link = 'sample_1_%d.tsv' % $n
            ln -s '$rep.file' '$link' &&
            ln -s '$rep.index' '${link}.idx' &&
            #silent $links1.append($link)
        #end for
        #set $links2 = []
        #for $n, $rep in enumerate($file2_rep)
            #set $link = 'sample_2_%d.tsv' % $n
            ln -s '$rep.file' '$link' &&
            ln -s '$rep.index' '${link}.idx' &&
            #silent $links2.append($link)
        #end for
        #set $files1 = ','.join($links1)
        #set $files2 = ','.join($links2)

        ## ---- comparison ----
        nanocompore sampcomp
        ## mandatory arguments
        --label1 '$label1'
        --file_list1 '$files1'
        --label2 '$label2'
        --file_list2 '$files2'
        --fasta '$fasta'
        ## advanced parameters; the BED file is only passed when one was selected
        #if $ap.bed
            --bed '$ap.bed'
        #end if
        --max_invalid_kmers_freq $ap.max_invalid_kmers_freq
        --min_coverage $ap.min_coverage
        --min_ref_length $ap.min_ref_length
        --comparison_methods '$ap.comparison_methods'
        --sequence_context $ap.sequence_context
        --sequence_context_weights '$ap.sequence_context_weights'
        --pvalue_thr $ap.pvalue_thr
        ## boolean switches expand to the flag itself or to nothing
        $ap.logit
        $ap.allow_warnings
        --outpath 'results'
        --nthreads \$threads
        --log_level debug

        ## ---- packaging ----
        ## bundle the three database files into the single archive picked up as the db output
        && tar -cf 'results/db.tar' 'results/out_SampComp.db.bak' 'results/out_SampComp.db.dir' 'results/out_SampComp.db.dat'
        ]]></command>
    <inputs>
        <!-- First condition: a label plus one or more NanopolishComp files, each paired with its index file -->
        <param argument="--label1" type="text" value="Condition 1" label="Set label of first condition"/>
        <repeat name="file1_rep" min="1" title="First condition files">
            <param name="file" type="data" format="tabular" label="Select NanopolishComp file" help="(--file_list1)"/>
            <param name="index" type="data" format="tabular" label="Select index file"/>
        </repeat>
        <!-- Second condition: same layout as the first -->
        <param argument="--label2" type="text" value="Condition 2" label="Set label of second condition"/>
        <repeat name="file2_rep" min="1" title="Second condition files">
            <param name="file" type="data" format="tabular" label="Select NanopolishComp file" help="(--file_list2)"/>
            <param name="index" type="data" format="tabular" label="Select index file"/>
        </repeat>
        <param argument="--fasta" type="data" format="fasta" label="Select reference mapping fasta"/>

        <!-- Everything below is referenced in the command template and output filters through the 'ap' prefix -->
        <section name="ap" title="Advanced parameters">
            <!-- optional: the command template only emits the bed argument when a dataset is selected -->
            <param argument="--bed" type="data" format="bed" optional="true" label="Select mapping file with annotation of transcriptome"/>
            <param argument="--max_invalid_kmers_freq" type="float" value="0.1" min="0.0" max="1.0" label="Set max frequency of invalid kmers"/>
            <param argument="--min_coverage" type="integer" value="30" min="0" label="Set minimum coverage required in each condition to do the comparison"/>
            <param argument="--min_ref_length" type="integer" value="100" min="0" label="Set minimum length of a reference transcript to include it in the analysis"/>
            <!-- optional="false": a multiple select may otherwise be left empty, which would hand the tool no usable method list -->
            <param argument="--comparison_methods" type="select" multiple="true" optional="false" label="Select comparison methods">
                <option value="GMM" selected="true">GMM</option>
                <option value="KS" selected="true">KS</option>
                <option value="TT">TT</option>
                <option value="MW">MW</option>
            </param>
            <param argument="--sequence_context" type="integer" value="0" min="0" max="4" label="Set sequence context for combining p-values"/>
            <param argument="--sequence_context_weights" type="select" label="Select type of weights to use for combining p-values">
                <option value="uniform" selected="true">Uniform</option>
                <option value="harmonic">Harmonic</option>
            </param>
            <param argument="--pvalue_thr" type="float" value="0.05" min="0.0" max="1.0" label="Set adjusted p-value threshold for reporting significant sites"/>
            <param argument="--logit" type="boolean" truevalue="--logit" falsevalue="" label="Use logistic regression testing also when all conditions have replicates?"/>
            <!-- Enabling this passes the flag, which tolerates warnings; the label is phrased so that 'Yes' means 'do not raise an error' -->
            <param argument="--allow_warnings" type="boolean" truevalue="--allow_warnings" falsevalue="" label="Allow runtime warnings during the ANOVA tests without raising an error?"/>
            <!-- Not a nanocompore argument: selects which result files become Galaxy datasets (see the output filters) -->
            <param name="out" type="select" multiple="true" optional="false" label="Select output file(s)">
                <option value="results" selected="true">Results</option>
                <option value="shift" selected="true">Shift stats</option>
                <option value="db" selected="true">Database (*.db.dir, *.db.bak, *.db.dat)</option>
                <option value="log">Log</option>
            </param>
        </section>
    </inputs>
    <outputs>
        <!-- Each dataset is picked up from the 'results' directory and is only created when its key is selected in the 'out' parameter of the 'ap' section -->
        <data name="out_results" format="tabular" label="${tool.name} on ${on_string}: Results" from_work_dir="results/out_nanocompore_results.tsv">
            <filter>'results' in ap['out']</filter>
        </data>
        <data name="out_shift" format="tabular" label="${tool.name} on ${on_string}: Shift stats" from_work_dir="results/out_nanocompore_shift_stats.tsv">
            <filter>'shift' in ap['out']</filter>
        </data>
        <!-- archive assembled by the tar step at the end of the command -->
        <data name="out_db" format="tar" label="${tool.name} on ${on_string}: Database" from_work_dir="results/db.tar">
            <filter>'db' in ap['out']</filter>
        </data>
        <data name="out_log" format="txt" label="${tool.name} on ${on_string}: log" from_work_dir="results/out_SampComp.log">
            <filter>'log' in ap['out']</filter>
        </data>
    </outputs>
    <tests>
        <!-- #1: required inputs only, one replicate per condition. Labels and all advanced
             parameters keep their defaults, so the three outputs selected by default
             (results, shift stats, database archive) are expected. -->
        <test expect_num_outputs="3">
            <repeat name="file1_rep">
                <param name="file" value="sample1.tsv"/>
                <param name="index" value="sample1.tsv.idx"/>
            </repeat>
            <repeat name="file2_rep">
                <param name="file" value="sample2.tsv"/>
                <param name="index" value="sample2.tsv.idx"/>
            </repeat>
            <param name="fasta" value="reference.fa"/>
            <output name="out_results">
                <assert_contents>
                    <has_n_lines n="3"/>
                    <has_text_matching expression="pos&#09;chr.+"/>
                    <has_text_matching expression="22102&#09;NA.+"/>
                </assert_contents>
            </output>
            <output name="out_shift">
                <assert_contents>
                    <has_n_lines n="3"/>
                    <has_text_matching expression="ref\_id&#09;pos.+"/>
                    <has_text_matching expression="chr&#09;22102.+"/>
                </assert_contents>
            </output>
            <!-- only the size of the archive is checked, within a tolerance of 10000 bytes -->
            <output name="out_db">
                <assert_contents>
                    <has_size value="5416960" delta="10000"/>
                </assert_contents>
            </output>
        </test>
        <!-- #2: custom labels and every advanced parameter except the optional BED file set
             explicitly (all four comparison methods, both boolean switches enabled), with all
             four outputs requested, including the log. -->
        <test expect_num_outputs="4">
            <param name="label1" value="C1"/>
            <repeat name="file1_rep">
                <param name="file" value="sample1.tsv"/>
                <param name="index" value="sample1.tsv.idx"/>
            </repeat>
            <param name="label2" value="C2"/>
            <repeat name="file2_rep">
                <param name="file" value="sample2.tsv"/>
                <param name="index" value="sample2.tsv.idx"/>
            </repeat>
            <param name="fasta" value="reference.fa"/>
            <section name="ap">
                <param name="max_invalid_kmers_freq" value="0.2"/>
                <param name="min_coverage" value="31"/>
                <param name="min_ref_length" value="101"/>
                <param name="comparison_methods" value="GMM,KS,TT,MW"/>
                <param name="sequence_context" value="1"/>
                <param name="sequence_context_weights" value="harmonic"/>
                <param name="pvalue_thr" value="0.06"/>
                <param name="logit" value="true"/>
                <param name="allow_warnings" value="true"/>
                <param name="out" value="results,shift,db,log"/>
            </section>
            <output name="out_results">
                <assert_contents>
                    <has_n_lines n="3"/>
                    <has_text_matching expression="pos&#09;chr.+"/>
                    <has_text_matching expression="22102&#09;NA.+"/>
                </assert_contents>
            </output>
            <output name="out_shift">
                <assert_contents>
                    <has_n_lines n="3"/>
                    <has_text_matching expression="ref\_id&#09;pos.+"/>
                    <has_text_matching expression="chr&#09;22102.+"/>
                </assert_contents>
            </output>
            <!-- only the size of the archive is checked, within a tolerance of 10000 bytes -->
            <output name="out_db">
                <assert_contents>
                    <has_size value="5416960" delta="10000"/>
                </assert_contents>
            </output>
            <!-- the log is only produced as a dataset here because 'log' is part of the 'out' selection above -->
            <output name="out_log">
                <assert_contents>
                    <has_n_lines n="31"/>
                    <has_text_matching expression=".+package\_name.+"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**What it does**

@WID@

SampComp provides a very flexible analysis framework with a few mandatory options and many optional parameters.

First, SampComp parses the sample eventalign collapse files and piles up the observed results per reference at position level. In a second step, positions are compared using various statistical methods and the statistics are stored in a shelve DBM database containing the results for all positions with sufficient coverage.

**Input**

SampComp requires sample files obtained with NanopolishComp EventalignCollapse as explained before (see data preparation) for both the control and the experimental conditions. Two conditions are expected and at least two replicates per condition are highly recommended.

A transcriptome FASTA reference file is required to extract kmer sequences during the analyses. The reference has to be the same as the one used at the mapping step.

Optionally, a BED file containing the genome annotations corresponding to the transcriptome fasta file can be provided. In that case Nanocompore will also convert the transcript coordinates into the genome space.

**Output**

The database object returned by SampComp is a Python GDBM object database indexed by reference id and can be used with SampCompDB.

.. class:: infomark

**References**

@REFERENCES@
    ]]></help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Wed, 17 Jun 2020 13:28:27 -0400
parents	c43f4b80f5a9
children	c0fb573c1e8d