view barcode_splitter_multi/barcode_splitter.xml @ 3:d521806e1b60 draft default tip

Deleted selected files
author hepcat72
date Wed, 07 Sep 2016 13:48:58 -0400
parents 5e0fd61660b7
children
line wrap: on
line source

<tool id="cshl_princeton_fastx_barcode_splitter" version="0.13" name="Barcode Splitter">
  <description></description>
  <command interpreter="bash" detect_errors="aggressive"><![CDATA[
barcode_splitter_galaxy_wrapper.sh split
## Emit the datatype extension of the first index file right after the mode
## word. There must be at least one index file for this to produce a value.
#for $indf in $indexfiles
    ${indf.input2.extension}
    #break
#end for
## Dataset paths are single-quoted so that a path containing spaces or shell
## metacharacters still reaches the wrapper as a single argument.
## $barcodes_at_end is left unquoted on purpose: it expands to either the
## flag or nothing, and quoting it would pass an empty argument.
--bcfile '$bcfile' --mismatches $mismatches --galaxy $barcodes_at_end
## Positional inputs: every index file first, then every read file.
#for $indf in $indexfiles
    '${indf.input2}'
#end for
#if self.varExists('seqfiles'):
    #for $sf in $seqfiles
        '${sf.input}'
    #end for
#end if
## The index files occupy positions 1..N of the positional inputs above, so
## list exactly those positions after --idxread.
--idxread
#for $n in range(1, len($indexfiles) + 1)
    ${n}
#end for
> '$summary'
]]>
  </command>

  <inputs>
    <!-- Sample-to-barcode table handed to the wrapper as the barcode file. -->
    <param name="bcfile" type="data" format="txt"
           label="Barcode File"
           help="Tab-delimited text file where the first column is a sample ID and subsequent columns are barcodes." />

    <!-- Read files that do not themselves contain barcodes; may be left empty. -->
    <repeat name="seqfiles" title="Read Files" min="0" default="1">
      <param name="input" type="data" format="fasta,fastq,fastqsanger,fastqsolexa,fastqillumina"
             label="Typically 'Read 1'"
             help="If your reads file has barcodes embedded in it, do not enter it here - use the 'Index Files' field below." />
    </repeat>

    <!-- Barcode-bearing files, one per barcode column. The command template
         takes the datatype extension from the first entry, so at least one
         index file is required (min="1"). -->
    <repeat name="indexfiles" title="Index Files" min="1" default="1">
      <param name="input2" type="data" format="fasta,fastq,fastqsanger,fastqsolexa,fastqillumina"
             label="Typically 'Read 2 (Index Read)'"
             help="If there are multiple barcode columns in the barcode file, the files must be supplied in the same order as the barcode columns (from left to right).  There must be as many index files as there are barcode columns in the barcodes file.  If your reads have barcodes embedded in them (at the beginning or end), that file must be submitted here as an index file." />
    </repeat>

    <!-- Mismatch tolerance, restricted to 0..2 by min/max. -->
    <param name="mismatches" type="integer" size="3" value="0" min="0" max="2"
           label="Number of allowed mismatches"
           help="An integer between 0 and 2 (inclusive).  Warning: Make sure your barcodes all differ from one another by at least this many nucleotides plus 1, otherwise sequences that match both barcodes equally well will be thrown out as 'multimatched' reads." />

    <!-- Expands to the barcodes_at_end command-line flag when checked, and to nothing otherwise. -->
    <param name="barcodes_at_end" type="boolean" truevalue="--barcodes_at_end" falsevalue="" checked="false"
           label="Barcodes are at the end of all sequences"
           help="Default is the beginning of all sequences" />

  </inputs>

  <outputs>
    <!-- Summary table: the command redirects the wrapper's standard output into this dataset. -->
    <data name="summary"
          format="tabular"
          label="${tool.name} on ${on_string}: Summary"/>
    <!-- List collection filled from the files discovered in the job's "split" directory. -->
    <collection name="split_output"
                type="list"
                format_source="input"
                label="${tool.name} on ${on_string}">
      <discover_datasets directory="split"
                         pattern="__designation_and_ext__"
                         visible="false"
                         label="${designation}"/>
    </collection>
  </outputs>

  <tests>
    <!-- Barcodes embedded in the reads: the reads file itself is supplied as the single index file. -->
    <test>
      <param name="bcfile" value="barcode_splitter1.txt" />
      <repeat name="indexfiles">
        <param name="input2" value="barcode_splitter1.fastq" ftype="fastqsolexa" />
      </repeat>
      <!-- Boolean test values are given as true/false, not as the flag text. -->
      <param name="barcodes_at_end" value="false" />
      <param name="mismatches" value="2" />
      <!-- Output names must match the names declared under outputs: "summary" and "split_output". -->
      <output name="summary" file="barcode_splitter1.out" />
      <output_collection name="split_output" type="list">
        <element name="BC1" ftype="fastqsolexa" file="barcode_splitter1_BC1.out" />
        <element name="BC2" ftype="fastqsolexa" file="barcode_splitter1_BC2.out" />
        <element name="BC3" ftype="fastqsolexa" file="barcode_splitter1_BC3.out" />
        <element name="BC4" ftype="fastqsolexa" file="barcode_splitter1_BC4.out" />
        <element name="unmatched" ftype="fastqsolexa" file="barcode_splitter1_unmatched.out" />
      </output_collection>
    </test>

    <!-- Separate index read: barcodes come from the index file, and the read file is split alongside it. -->
    <test>
      <param name="bcfile" value="barcode_splitter1.txt" />
      <repeat name="indexfiles">
        <param name="input2" value="barcode_splitter_index.fastq" ftype="fastqsolexa" />
      </repeat>
      <repeat name="seqfiles">
        <param name="input" value="barcode_splitter1.fastq" ftype="fastqsolexa" />
      </repeat>
      <param name="barcodes_at_end" value="false" />
      <param name="mismatches" value="2" />
      <output name="summary" file="barcode_splitter1.out" />
      <!-- NOTE(review): element names are carried over from the original designations; confirm
           they match the identifiers discovered when both an index file and a read file are split. -->
      <output_collection name="split_output" type="list">
        <element name="BC1" ftype="fastqsolexa" file="barcode_splitter1_BC1.out" />
        <element name="BC2" ftype="fastqsolexa" file="barcode_splitter1_BC2.out" />
        <element name="BC3" ftype="fastqsolexa" file="barcode_splitter1_BC3.out" />
        <element name="BC4" ftype="fastqsolexa" file="barcode_splitter1_BC4.out" />
        <element name="unmatched" ftype="fastqsolexa" file="barcode_splitter1_unmatched.out" />
      </output_collection>
    </test>
  </tests>

    <help><![CDATA[
**What it does**

This tool splits a FASTQ file into several files, using barcodes as the split criteria.  Barcodes in one file can be used to split multiple sorted files.  Multiple sets of barcodes, each located in a different file, can be used.

--------

**Barcode file Format**

Barcode files are simple text files.
Each line should contain an identifier (descriptive name for the barcode), and at least 1 barcode, separated by TAB characters. Multiple columns of barcodes are supported (each corresponding to a separate barcoded read file), though there's usually just 1.  For example, with two sets of barcodes, the first set could denote the user and the second set could denote each of that user's samples.
Example::

    #This line is a comment (starts with a 'number' sign)
    BC1	GATCT	TTGCAT
    BC2	ATCGT	GCGCAT
    BC3	GTGAT	AGGTCA
    BC4	TGTCT	CTTTGG

For each barcode, a new FASTQ file will be created (with the barcodes' identifier as part of the file name).
Sequences matching the barcodes in a row will be stored in the appropriate file.

The first index file submitted must contain sequences with the barcodes in the first column of the barcode file.  The second index file must contain sequences with the barcodes in the second column, and so on.  The number of index files supplied must match the number of barcode columns in the barcode file, and the files must be supplied in the same order as the barcode columns.

As many as 2 additional FASTQ output files will be created for each read/index file: the 'unmatched' file and the 'multimatched' file, where sequences not matching any barcode or matching more than 1 barcode (when mismatches are taken into account) will be stored.

The output of this tool is a summary table displaying the split counts for each barcode identifier and the percentage of the total reads those represent.
In addition, each FASTQ file produced will be loaded into the Galaxy history as an element of a list collection.
]]>
  </help>

  <!-- Barcode-Splitter belongs to the paired_sequence_utils package, written by L.Parsons (lparsons@princeton.edu) and R.Leach (rleach@princeton.edu) -->
  <citations>
    <!-- BibTeX entry pointing at the package's source repository. -->
    <citation type="bibtex">
      @misc{paired_sequence_utils,
        title = {{Barcode}-{Splitter}},
        url = {https://bitbucket.org/hepcat72/paired_sequence_utils},
        author = "Parsons, Lance and Leach, Robert"
      }
    </citation>
  </citations>

</tool>