Mercurial > repos > nml > refseq_masher

<tool id="refseq_masher_matches" name="RefSeq Masher Matches" version="0.1.2">
  <!-- One-line summary of what the tool does. -->
  <description>
    Find closest matching NCBI RefSeq Genomes to your sequences
  </description>
  <!-- Package dependency for the refseq_masher executable called in the command section.
       NOTE(review): the pinned package version (0.1.1) differs from the tool's version
       attribute (0.1.2); confirm this is intentional. -->
  <requirements>
    <requirement type="package" version="0.1.1">refseq_masher</requirement>
  </requirements>
  <command detect_errors="exit_code">
<![CDATA[

#import re
#import os

## The input datasets are symlinked into the working directory under
## descriptive file names and those link names are passed to refseq_masher.
##
## Dataset and collection names are user-controlled and end up inside
## double-quoted shell words below. Every character that is not a word
## character, a dot or a hyphen is replaced with an underscore, so quotes,
## dollar signs, backticks, slashes or whitespace in a name can neither break
## nor inject into the generated command line.
#set $_unsafe = r'[^\w.-]'

#if $input.type == 'fasta'
## Genome FASTA: link as <dataset name without extension>.fasta
#set $_fasta_base = $re.sub($_unsafe, '_', os.path.splitext(str($input.fasta.name))[0])
#set $input_files = '"{}.fasta"'.format($_fasta_base)
  ln -s "$input.fasta" $input_files &&
#elif $input.type == 'paired'
## Paired FASTQs: strip a trailing _1.<ext> / _2.<ext> from each dataset name
## and link as <base>_1.fastq[.gz] and <base>_2.fastq[.gz].
#set $_forward_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.forward.name) else '.fastq'
#set $_forward_base = $re.sub($_unsafe, '_', $re.sub(r'_[12]\..+$', '', str($input.forward.name)))
#set $_forward = '"{}_1{}"'.format($_forward_base, $_forward_ext)
#set $_reverse_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.reverse.name) else '.fastq'
#set $_reverse_base = $re.sub($_unsafe, '_', $re.sub(r'_[12]\..+$', '', str($input.reverse.name)))
#set $_reverse = '"{}_2{}"'.format($_reverse_base, $_reverse_ext)
#set $input_files = '{} {}'.format($_forward, $_reverse)
  ln -s "$input.forward" $_forward &&
  ln -s "$input.reverse" $_reverse &&
#elif $input.type == 'single'
## Single-end FASTQ: link under the (sanitized) dataset name.
#set $input_files = '"{}"'.format($re.sub($_unsafe, '_', str($input.single.name)))
  ln -s "$input.single" $input_files &&
#elif $input.type == 'paired_collection'
## Paired collection: link as <collection name>_1.fastq[.gz] / _2.fastq[.gz].
## Compression is detected from the element's datatype extension; the string
## form of a dataset is its file path, which normally does not end in .gz, so
## the path test is only kept as a fallback.
#set $_collection_base = $re.sub($_unsafe, '_', str($input.paired_collection.name))
#set $_forward_ext = '.fastq.gz' if (str($input.paired_collection.forward.ext).endswith('.gz') or $re.match(r'.*\.gz$', str($input.paired_collection.forward))) else '.fastq'
#set $_forward = '"{}_1{}"'.format($_collection_base, $_forward_ext)
#set $_reverse_ext = '.fastq.gz' if (str($input.paired_collection.reverse.ext).endswith('.gz') or $re.match(r'.*\.gz$', str($input.paired_collection.reverse))) else '.fastq'
#set $_reverse = '"{}_2{}"'.format($_collection_base, $_reverse_ext)
#set $input_files = '{} {}'.format($_forward, $_reverse)
  ln -s "$input.paired_collection.forward" $_forward &&
  ln -s "$input.paired_collection.reverse" $_reverse &&
#end if

refseq_masher
  $adv.verbosity
  matches
  --output refseq_masher-matches.${adv.output_type}
  --output-type $adv.output_type
## top_n_results is optional: only pass the flag when a value was supplied,
## otherwise the option would be emitted without an argument.
#if str($top_n_results).strip() not in ('', 'None')
  --top-n-results $top_n_results
#end if
#if $adv.min_kmer_threshold
  --min-kmer-threshold $adv.min_kmer_threshold
#end if
  -T "\${TMPDIR:-/tmp}"
  $input_files
]]>
  </command>
  <inputs>
    <!-- Sequence input: the selected type determines which dataset parameter(s) apply. -->
    <conditional name="input">
      <param name="type" type="select" label="Sequence input type">
        <option value="fasta">Genome FASTA</option>
        <option value="paired">Paired-end FASTQs</option>
        <option value="single">Single-end FASTQ</option>
        <option value="paired_collection">Paired-end FASTQ collection</option>
      </param>
      <!-- Assembled genome in FASTA format -->
      <when value="fasta">
        <param name="fasta" type="data" format="fasta" optional="false" label="Genome FASTA file"/>
      </when>
      <!-- Forward and reverse reads supplied as two separate datasets -->
      <when value="paired">
        <param name="forward" type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" optional="false" label="Forward FASTQ file"/>
        <param name="reverse" type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" optional="false" label="Reverse FASTQ file" help="File format must match the Forward FASTQ file"/>
      </when>
      <!-- A single reads dataset -->
      <when value="single">
        <param name="single" type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" optional="false" label="Single-end FASTQ file"/>
      </when>
      <!-- Forward and reverse reads supplied as one paired dataset collection -->
      <when value="paired_collection">
        <param name="paired_collection" type="data_collection" collection_type="paired" format="fastq,fastqsanger,fastqillumina,fastqsolexa,fastq.gz,txt" optional="false" label="Paired-end FASTQ collection"/>
      </when>
    </conditional>
    <!-- Number of matches to report; the label documents 0 as "report all" -->
    <param name="top_n_results" type="integer" min="0" value="20" optional="true" label="Top N matches to report (set to 0 to report all)"/>
    <section name="adv" title="Advanced Options" expanded="false">
      <!-- Noise filter threshold for read sketches -->
      <param name="min_kmer_threshold" type="integer" min="1" value="8" optional="true" label="Mash sketch of reads: Minimum copies of each k-mer required to pass noise filter for reads (default=8)"/>
      <!-- Selects both the output file extension and the output-type argument of the command -->
      <param name="output_type" type="select" multiple="false" label="Output type">
        <option value="tab" selected="true">
          Tabular (tab-delimited values)
        </option>
        <option value="csv">
          CSV (Comma Separated Values)
        </option>
      </param>
      <!-- The option value is inserted verbatim into the command line as the verbosity flag -->
      <param name="verbosity" type="select" label="Logging verbosity">
        <option value="">Error messages only</option>
        <option value="-v">Show warning messages</option>
        <option value="-vv" selected="true">Show info messages</option>
        <option value="-vvv">Show debug messages</option>
      </param>
    </section>
  </inputs>
  <outputs>
    <!-- Exactly one of the two outputs is produced, selected by the adv.output_type parameter.
         Each is collected from the file the command writes into the working directory. -->
    <data name="output_path_csv" format="csv" from_work_dir="refseq_masher-matches.csv" label="RefSeq Masher matches table">
      <filter>adv['output_type'] == 'csv'</filter>
    </data>
    <data name="output_path_tab" format="tabular" from_work_dir="refseq_masher-matches.tab" label="RefSeq Masher matches table">
      <filter>adv['output_type'] == 'tab'</filter>
    </data>
  </outputs>
  <tests>
    <!-- Genome FASTA input, top match only, tabular output -->
    <test>
      <conditional name="input">
        <param name="type" value="fasta"/>
        <param name="fasta" value="Se-Enteritidis.fasta"/>
      </conditional>
      <param name="top_n_results" value="1"/>
      <section name="adv">
        <param name="output_type" value="tab"/>
      </section>
      <output name="output_path_tab" ftype="tabular" value="Se-Enteritidis-refseq_masher-matches.tab" lines_diff="0"/>
    </test>
    <!-- Single-end FASTQ input with a lowered k-mer noise threshold, top match only, tabular output -->
    <test>
      <conditional name="input">
        <param name="type" value="single"/>
        <param name="single" value="SRR1203042_1-head4000.fastq"/>
      </conditional>
      <param name="top_n_results" value="1"/>
      <section name="adv">
        <param name="output_type" value="tab"/>
        <param name="min_kmer_threshold" value="2"/>
      </section>
      <output name="output_path_tab" ftype="tabular" value="SRR1203042_1-head4000-refseq_masher-matches-m2.tab" lines_diff="0"/>
    </test>
  </tests>
  <help>
<![CDATA[
RefSeq Masher - Genomic Distance
================================

Find what NCBI RefSeq genomes most closely match your sequence data using Mash_ with a Mash sketch database of 54,925 NCBI RefSeq Genomes.


Source code is available on GitHub at https://github.com/phac-nml/refseq_masher


`matches` - find the closest matching NCBI RefSeq Genomes to your input sequences
---------------------------------------------------------------------------------

Command-line usage information::

    Usage: refseq_masher matches [OPTIONS] INPUT...

      Find NCBI RefSeq genome matches for an input genome fasta file

      Input is expected to be one or more FASTA/FASTQ files or one or more
      directories containing FASTA/FASTQ files. Files can be Gzipped.

    Options:
      --mash-bin TEXT                 Mash binary path (default="mash")
      -o, --output PATH               Output file path (default="-"/stdout)
      --output-type [tab|csv]         Output file type (tab|csv)
      -n, --top-n-results INTEGER     Output top N results sorted by distance in
                                      ascending order (default=5)
      -m, --min-kmer-threshold INTEGER
                                      Mash sketch of reads: "Minimum copies of
                                      each k-mer required to pass noise filter for
                                      reads" (default=8)
      -h, --help                      Show this message and exit.


Example
~~~~~~~

With the FNA.GZ_ file for *Salmonella enterica* subsp. enterica serovar Enteritidis str. CHS44_::


    # download sequence file
    wget ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/GCF_000329025.1_ASM32902v1_genomic.fna.gz

    # find RefSeq matches
    refseq_masher -vv matches GCF_000329025.1_ASM32902v1_genomic.fna.gz


**Log**::


    2018-01-29 11:02:13,786 INFO: Collected 1 FASTA inputs and 0 read sets [in ...refseq_masher/refseq_masher/utils.py:185]
    2018-01-29 11:02:13,786 INFO: Creating Mash sketch file for ...refseq_masher/GCF_000329025.1_ASM32902v1_genomic.fna.gz [in ...refseq_masher/refseq_masher/mash/sketch.py:24]
    2018-01-29 11:02:14,055 INFO: Created Mash sketch file at "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" [in ...refseq_masher/refseq_masher/mash/sketch.py:40]
    2018-01-29 11:02:14,613 INFO: Ran Mash dist successfully (output length=11647035). Parsing Mash dist output [in ...refseq_masher/refseq_masher/mash/dist.py:64]
    2018-01-29 11:02:15,320 INFO: Parsed Mash dist output into Pandas DataFrame with 54924 rows [in ...refseq_masher/refseq_masher/mash/dist.py:67]
    2018-01-29 11:02:15,321 INFO: Deleting temporary sketch file "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" [in ...refseq_masher/refseq_masher/mash/dist.py:72]
    2018-01-29 11:02:15,321 INFO: Sketch file "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" deleted! [in ...refseq_masher/refseq_masher/mash/dist.py:74]
    2018-01-29 11:02:15,322 INFO: Ran Mash dist on all input. Merging NCBI taxonomic information into results output. [in ...refseq_masher/refseq_masher/cli.py:88]
    2018-01-29 11:02:15,323 INFO: Fetching all taxonomy info for 5 unique NCBI Taxonomy UIDs [in ...refseq_masher/refseq_masher/taxonomy.py:35]
    2018-01-29 11:02:15,325 INFO: Dropping columns with all NA values (ncol=32) [in ...refseq_masher/refseq_masher/taxonomy.py:38]
    2018-01-29 11:02:15,327 INFO: Columns with all NA values dropped (ncol=11) [in ...refseq_masher/refseq_masher/taxonomy.py:40]
    2018-01-29 11:02:15,327 INFO: Merging Mash results with relevant taxonomic information [in ...refseq_masher/refseq_masher/taxonomy.py:41]
    2018-01-29 11:02:15,329 INFO: Merged Mash results with taxonomy info [in ...refseq_masher/refseq_masher/taxonomy.py:43]
    2018-01-29 11:02:15,329 INFO: Merged taxonomic info into results output [in ...refseq_masher/refseq_masher/cli.py:90]
    2018-01-29 11:02:15,329 INFO: Reordering output columns [in ...refseq_masher/refseq_masher/cli.py:91]
    2018-01-29 11:02:15,331 INFO: Writing output to stdout [in ...refseq_masher/refseq_masher/writers.py:16]


**Output**

+---------------------------------------+--------------------------------------------------------------------+----------+--------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------+---------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+-------------+---------+-------------+--------------+--------+---------------------+------------------------------------------------------------------------------------------------------------------------------------------+
|  sample                               | top_taxonomy_name                                                  | distance | pvalue | matching | full_taxonomy                                                                                                                                               | taxonomic_subspecies                | taxonomic_species   | taxonomic_genus  | taxonomic_family   | taxonomic_order  | taxonomic_class     | taxonomic_phylum  | taxonomic_superkingdom  | subspecies | serovar     | plasmid | bioproject  | biosample    | taxid  | assembly_accession  | match_id                                                                                                                                 |
+=======================================+====================================================================+==========+========+==========+=============================================================================================================================================================+=====================================+=====================+==================+====================+==================+=====================+===================+=========================+============+=============+=========+=============+==============+========+=====================+==========================================================================================================================================+
| GCF_000329025.1_ASM32902v1_genomic    | Salmonella enterica subsp. enterica serovar Enteritidis str. CHS44 | 0.0      | 0.0    | 400/400  | Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae; Salmonella; enterica; subsp. enterica; serovar Enteritidis; str. CHS44 | Salmonella enterica subsp. enterica | Salmonella enterica | Salmonella       | Enterobacteriaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria    | Bacteria                | enterica   | Enteritidis |         | PRJNA185053 | SAMN01041154 | 702979 | NZ_ALFF             | ./rcn/refseq-NZ-702979-PRJNA185053-SAMN01041154-NZ_ALFF-.-Salmonella_enterica_subsp._enterica_serovar_Enteritidis_str._CHS44.fna         |
+---------------------------------------+--------------------------------------------------------------------+----------+--------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------+---------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+-------------+---------+-------------+--------------+--------+---------------------+------------------------------------------------------------------------------------------------------------------------------------------+


The top match is *Salmonella enterica* subsp. enterica serovar Enteritidis str. CHS44_ with a distance of 0.0 and 400/400 sketches matching, which is what we expected. The results table also contains additional taxonomic information that may be useful.


Legal
-----

Copyright Government of Canada 2017

Written by: National Microbiology Laboratory, Public Health Agency of Canada

Licensed under the Apache License, Version 2.0 (the "License"); you may not use
this work except in compliance with the License. You may obtain a copy of the
License at:

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed
under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

Contact
-------

**Gary van Domselaar**: gary.vandomselaar@phac-aspc.gc.ca

.. _Mash: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x
.. _FNA.GZ: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/GCF_000329025.1_ASM32902v1_genomic.fna.gz
.. _CHS44: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/


]]>
  </help>
  <citations>
    <!-- DOI of the Mash paper; the same article is linked from the help text -->
    <citation type="doi">10.1186/s13059-016-0997-x</citation>
  </citations>
</tool>
author	nml
date	Wed, 08 May 2019 09:15:56 -0400
parents	26df66c32861
children	1ec42f033bb4