Mercurial > repos > bgruening > nanopolish_variants

<tool id="nanopolish_variants" name="Nanopolish variants" version="@VERSION@+galaxy0">
    <description>- Find SNPs in basecalled merged Nanopore reads and polish the consensus sequences</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
        ## Stage the basecalled reads under the fixed name used by the nanopolish calls below.
        ln -s '$input_merged' reads.fasta &&

        ## Put the raw signal files into fast5_files/. The tar flags are picked from the
        ## dataset extension; anything that is not fast5, fast5.tar or fast5.tar.bz2 is
        ## treated as a gzip-compressed tarball.
        ## NOTE(review): the input parameter does not list plain 'fast5' among its formats,
        ## so the first branch is only reachable if that format list is extended.
        #if $input_reads_raw.extension == 'fast5':
            mkdir fast5_files && ln -s '$input_reads_raw' fast5_files/read1.fast5 &&

        #else if $input_reads_raw.extension == 'fast5.tar':
            ln -s '$input_reads_raw' fast5_files.tar &&
            mkdir fast5_files && tar -xf fast5_files.tar -C fast5_files &&

        #else if $input_reads_raw.extension == 'fast5.tar.bz2':
            ln -s '$input_reads_raw' fast5_files.tar.bz2 &&
            mkdir fast5_files && tar -xjf fast5_files.tar.bz2 -C fast5_files &&

        #else:
            ln -s '$input_reads_raw' fast5_files.tar.gz &&
            mkdir fast5_files && tar -xzf fast5_files.tar.gz -C fast5_files &&

        #end if

        ## Step 1: index the reads against the raw signal files.
        nanopolish index
        -d fast5_files/
        #if $adv.input_seq_summary:
          -s '$adv.input_seq_summary'
        #end if
        reads.fasta &&

        ## Stage the alignment, its index and the reference genome under fixed names.
        ln -s '$b' reads.bam &&
        ln -s '${b.metadata.bam_index}' reads.bam.bai &&
        #if $reference_source.reference_source_selector == 'history':
            ln -f -s '$reference_source.ref_file' genome.fa &&
        #else:
            ln -f -s '$reference_source.ref_file.fields.path' genome.fa &&
        #end if

        ## Step 2: call variants.
        nanopolish variants
        -r reads.fasta
        -b reads.bam
        -g genome.fa
        -o variants.vcf

        ## Boolean flags expand to their truevalue when checked and to nothing otherwise.
        $consensus
        $snps
        $verbose
        $homopolymer
        $faster
        $all_bases

        ## All numeric parameters are optional: only emit a flag when a value is present,
        ## otherwise the bare flag would consume the following token as its argument.
        #if str($min_candidate_frequency):
            -m $min_candidate_frequency
        #end if
        #if str($min_candidate_depth):
            -d $min_candidate_depth
        #end if
        #if str($max_haplotypes):
            -x $max_haplotypes
        #end if
        #if str($max_rounds):
            --max-rounds $max_rounds
        #end if
        --threads "\${GALAXY_SLOTS:-4}"
        #if str($min_flanking_sequence):
            --min-flanking-sequence $min_flanking_sequence
        #end if
        ## -1 is the "disabled" sentinel for ploidy; an empty value is skipped as well.
        #if str($ploidy) and str($ploidy) != '-1':
            --ploidy $ploidy
        #end if

        #if $w and str($w).strip():
          -w '$w'
        #end if
        #if $methylation_aware and str($methylation_aware).strip():
          -q '$methylation_aware'
        #end if

        ## Optional datasets from the "adv" section.
        #if $adv.input_events_bam:
          -e '$adv.input_events_bam'
        #end if
        #if $adv.input_genotype:
          --genotype '$adv.input_genotype'
        #end if
        #if $adv.input_candidates:
          -c '$adv.input_candidates'
        #end if
        #if $adv.input_alt_bc_bam:
          -a '$adv.input_alt_bc_bam'
        #end if
        #if $adv.input_models_fofn:
          --models-fofn '$adv.input_models_fofn'
        #end if

        &&

        ## Step 3: apply the called variants to the reference to produce the polished sequence.
        nanopolish vcf2fasta --skip-checks -g genome.fa variants.vcf > polished.fa


    ]]></command>
    <inputs>
        <!-- index inputs: basecalled reads plus the archive of raw signal files -->
        <param type="data" name="input_merged" format="fasta,fastq" label="Basecalled merged reads.fa"/>
        <param type="data" name="input_reads_raw" format="fast5.tar.gz,fast5.tar.bz2,fast5.tar" label="Flat archive file of raw fast5 files"/>

        <!-- variants consensus inputs: alignment and reference genome (cached or from history) -->
        <param type="data" argument="-b" format="bam" label="Reads aligned to the reference genome" />
        <conditional name="reference_source">
          <param name="reference_source_selector" type="select" label="Load reference genome from">
            <option value="cached">Local cache</option>
            <option value="history">History</option>
          </param>
          <when value="cached">
            <param name="ref_file" type="select" label="Using reference genome" help="REFERENCE_SEQUENCE">
              <options from_data_table="all_fasta">
              </options>
              <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
            </param>
          </when>
          <when value="history">
            <param name="ref_file" type="data" format="fasta" label="Use the following dataset as the reference sequence" help="REFERENCE_SEQUENCE; You can upload a FASTA sequence to the history and use it as reference" />
          </when>
        </conditional>

        <section name="adv" title="Optional data inputs">
            <!-- optional datasets; the command template reads them through the adv section prefix -->
            <param type="data" name="input_seq_summary" format="txt" optional="true" label="Sequencing summary file from albacore" help="(-s)"/>
            <param type="data" name="input_events_bam" format="bam" optional="true" label="Events aligned to the reference genome" help="(-e)" />
            <param type="data" name="input_genotype" format="vcf" optional="true" label="Call genotypes for the variants in the vcf file" help="(--genotype)" />
            <param type="data" name="input_candidates" format="vcf" optional="true"
                    label="Use variant candidates, rather than discovering them from aligned reads" help="(-c)" />
            <param type="data" name="input_alt_bc_bam" format="bam" optional="true" label="Alternative basecaller used that does not output event annotations" help="(-a)" />
            <param type="data" name="input_models_fofn" format="txt" optional="true" label="Read alternative k-mer models" help="(--models-fofn)" />
        </section>

        <!-- optional params -->
        <param argument="-w" type="text" optional="true"
            label="find variants in window of region chromosome:start-end" />
        <param name="methylation_aware" type="text" optional="true" label="methylation aware polishing and test motifs given" help="(-q)"/>
        <param name="min_candidate_frequency" type="float" optional="true" value="0.2" label="Extract if the variant frequency is at least F" help="(-m)"/>
        <param name="min_candidate_depth" type="integer" optional="true" value="20" label="Extract if the depth is at least D" help="(-d)"/>
        <param name="max_haplotypes" type="integer" optional="true" value="1000" label="Consider at most N haplotype combinations" help="(-x)"/>
        <param name="max_rounds" type="integer" optional="true" value="50" label="Perform N rounds of consensus sequence improvement" help="(--max-rounds)"/>
        <!-- the value -1 is the sentinel the command template treats as "do not pass a ploidy" -->
        <param name="ploidy" type="integer" optional="true" value="-1" label="The ploidy level of the sequenced genome. -1:disabled" help="(-p)"/>
        <param name="min_flanking_sequence" type="integer" optional="true" value="" label="Distance from alignment end to calculate variants" help="(--min-flanking-sequence)"/>

        <!-- optional flags: each boolean expands to its truevalue on the command line when checked -->
        <param argument="--consensus" type="boolean" truevalue="--consensus" falsevalue="" checked="true" label="Consensus calling mode and output polished sequence" />
        <param argument="--snps" type="boolean" truevalue="--snps" falsevalue="" checked="false" label="only call SNPs"/>
        <param argument="--verbose" type="boolean" truevalue="--verbose" falsevalue="" checked="false" label="verbose output"/>
        <param name="homopolymer" type="boolean" argument="--fix-homopolymers" truevalue="--fix-homopolymers" falsevalue="" checked="false" label="homopolymer caller" />
        <param argument="--faster" type="boolean" truevalue="--faster" falsevalue="" checked="false"
                label="speedup while slightly reducing consensus accuracy"/>
        <param name="all_bases" type="boolean" argument="--calculate-all-support" truevalue="--calculate-all-support" falsevalue="" checked="false"
                label="calculate the support of the 3 other possible bases" />
    </inputs>

    <outputs>
        <!-- Both datasets are picked up from files the command writes into the job working directory -->
        <data name="output_polished"
              from_work_dir="polished.fa"
              format="fasta"
              label="polished sequence by consensus calling mode"/>
        <data name="output_variants"
              from_work_dir="variants.vcf"
              format="vcf"
              label="Computed variants"/>
    </outputs>
    <tests>
        <!-- Test 1: gzip-compressed fast5 archive, default options, window 200000-202000 -->
        <test>
            <param name="input_merged" value="reads.fasta" ftype="fasta"/>
            <param name="input_reads_raw" value="fast5_files.tar.gz" ftype="fast5.tar.gz"/>
            <param name="b" value="reads.sorted.bam"/>
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="draft_single_seq.fa"/>
            <param name="w" value="tig00000001:200000-202000"/>
            <output name="output_polished" file="polished.fa"/>
            <output name="output_variants">
                <assert_contents>
                    <has_text text="TotalReads"/>
                    <has_text text="AlleleCount"/>
                    <has_text text="SupportFraction"/>
                    <has_text text="200061"/>
                    <has_text text="200776"/>
                    <has_text text="201588"/>
                    <has_text text="tig00000001"/>
                    <has_n_lines n="27"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 2: same run with a bzip2-compressed fast5 archive -->
        <test>
            <param name="input_merged" value="reads.fasta" ftype="fasta"/>
            <param name="input_reads_raw" value="fast5_files.tar.bz2" ftype="fast5.tar.bz2"/>
            <param name="b" value="reads.sorted.bam"/>
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="draft_single_seq.fa"/>
            <param name="w" value="tig00000001:200000-202000"/>
            <output name="output_polished" file="t3_polished.fa"/>
            <output name="output_variants">
                <assert_contents>
                    <has_text text="TotalReads"/>
                    <has_text text="AlleleCount"/>
                    <has_text text="SupportFraction"/>
                    <has_text text="200061"/>
                    <has_text text="200776"/>
                    <has_text text="201588"/>
                    <has_text text="tig00000001"/>
                    <has_n_lines n="27"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 3: same run with an uncompressed fast5 tar archive -->
        <test>
            <param name="input_merged" value="reads.fasta" ftype="fasta"/>
            <param name="input_reads_raw" value="fast5_files.tar" ftype="fast5.tar"/>
            <param name="b" value="reads.sorted.bam"/>
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="draft_single_seq.fa"/>
            <param name="w" value="tig00000001:200000-202000"/>
            <output name="output_polished" file="t4_polished.fa"/>
            <output name="output_variants">
                <assert_contents>
                    <has_text text="TotalReads"/>
                    <has_text text="AlleleCount"/>
                    <has_text text="SupportFraction"/>
                    <has_text text="200061"/>
                    <has_text text="200776"/>
                    <has_text text="201588"/>
                    <has_text text="tig00000001"/>
                    <has_n_lines n="27"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 4: non-default options (ploidy, SNPs only, faster, all-base support,
             consensus off, flanking sequence) on a wider window -->
        <test>
            <param name="input_merged" value="reads.fasta" ftype="fasta"/>
            <param name="input_reads_raw" value="fast5_files.tar.gz" ftype="fast5.tar.gz"/>
            <param name="b" value="reads.sorted.bam"/>
            <param name="reference_source_selector" value="history"/>
            <param name="ref_file" value="draft_single_seq.fa"/>
            <param name="w" value="tig00000001:198000-202000"/>
            <param name="ploidy" value="2"/>
            <param name="snps" value="true"/>
            <param name="faster" value="true"/>
            <param name="all_bases" value="true"/>
            <param name="consensus" value="false"/>
            <param name="min_flanking_sequence" value="10"/>
            <output name="output_polished" file="t2-polished.fa"/>
            <output name="output_variants">
                <assert_contents>
                    <has_text text="TotalReads"/>
                    <has_text text="AlleleCount"/>
                    <has_text text="SupportFraction"/>
                    <has_text text="tig00000001"/>
                    <has_text text="198000-202000"/>
                    <has_n_lines n="15"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[

        Build an index that maps basecalled reads to the signals measured by the sequencer,
        then find SNPs using a signal-level HMM.

        Tutorial and manual available at:
        http://nanopolish.readthedocs.io/en/latest/quickstart_consensus.html


           ]]></help>
    <expand macro="citations" />
</tool>
author	bgruening
date	Fri, 29 May 2020 13:29:14 -0400
parents	de5b3d8f5b90
children	bec636361cfd