view barcode_splitter_multi/barcode_splitter.xml @ 0:50df2d629d51 draft

Uploaded
author hepcat72
date Fri, 26 Aug 2016 16:30:56 -0400
parents
children 5e0fd61660b7
line wrap: on
line source

<tool id="cshl_princeton_fastx_barcode_splitter" version="0.4" name="Barcode Splitter">
  <description></description>
  <command interpreter="bash" detect_errors="aggressive"><![CDATA[
## The wrapper script is called with "split" as its first argument.
barcode_splitter_galaxy_wrapper.sh split
## Emit the datatype extension of the first read file only; the loop exits after one pass.
#for $sf in $seqfiles
    ${sf.input.extension}
    #break
#end for
## Dataset paths are single-quoted so the shell always passes each one as a single argument.
--bcfile '$bcfile' --mismatches $mismatches --galaxy $zip $barcodes_at_end
## Every read file, in the order given in the Read Files repeat.
#for $sf in $seqfiles
    '${sf.input}'
#end for
## The idxread option is followed by the integers 1 through the number of barcode columns.
--idxread
#set $bound = $num_barcode_columns.value + 1
#for $n in range( 1, $bound )
    ${n}
#end for
## Standard output is captured as the summary dataset.
> '$summary'
]]>
  </command>

  <inputs>
    <!-- Barcode table: sample ID in the first column, barcodes in the following columns. -->
    <param format="txt" name="bcfile" type="data" label="Barcode File" help="Tab-delimited text file where the first column is a sample ID and subsequent columns are barcodes." />

    <!-- min="1": the command template emits the values 1..N after the idxread option, so N must be at least 1. -->
    <param name="num_barcode_columns" type="integer" size="2" value="1" min="1" label="Number of barcode columns" help="The number of columns in the barcode file containing barcode sequences.  Note that you must submit at least this many read files." />

    <!-- One entry per read file; at least one is required and two are shown by default. -->
    <repeat name="seqfiles" title="Read Files" min="1" default="2">
        <param format="fasta,fastq,fastqsanger,fastqsolexa,fastqillumina" name="input" type="data" label="Library to split" help="Barcoded reads files must be first. If there are multiple barcode columns in the barcode file, the files must be supplied in the same order as the barcode columns (from left to right)." />
    </repeat>

    <!-- min="0": a negative mismatch count is meaningless. -->
    <param name="mismatches" type="integer" size="3" value="0" min="0" label="Number of allowed mismatches" />

    <!-- When checked, the barcodes_at_end flag is added to the command line; otherwise nothing is added. -->
    <param name="barcodes_at_end" type="boolean" truevalue="--barcodes_at_end" falsevalue="" checked="false"
        label="Barcodes are at the end of all sequences" help="Default is the beginning of all sequences" />

    <!-- When checked, the gzip flag is added to the command line; otherwise nothing is added. -->
    <param name="zip" type="boolean" truevalue="--gzip" falsevalue="" checked="false"
        label="Compress/zip the output" help="This generates reads files with a .gz extension.  Default is based on the file extension of the first input file." />

  </inputs>

  <outputs>
    <!-- Summary table: receives the command's redirected standard output. -->
    <data name="summary" format="tabular" label="${tool.name} on ${on_string}: Summary" />
    <!-- List collection populated from files discovered in the "split" directory. -->
    <collection name="split_output"
                type="list"
                format_source="input"
                label="${tool.name} on ${on_string}">
      <discover_datasets directory="split"
                         pattern="__designation_and_ext__"
                         label="${designation}"
                         visible="false" />
    </collection>
  </outputs>

  <tests>
    <test>
      <!-- Split a FASTQ file whose reads carry the barcodes themselves. -->
      <param name="bcfile" value="barcode_splitter1.txt" />
      <param name="num_barcode_columns" value="1" />
      <repeat name="seqfiles">
        <param name="input" value="barcode_splitter1.fastq" ftype="fastqsolexa" />
      </repeat>
      <param name="barcodes_at_end" value="" />
      <param name="mismatches" value="2" />
      <output name="summary" file="barcode_splitter1.out" />
      <!-- The name must match the collection declared in <outputs> (split_output). -->
      <collection name="split_output" type="list">
        <discovered_dataset designation="BC1" ftype="fastqsolexa" file="barcode_splitter1_BC1.out" />
        <discovered_dataset designation="BC2" ftype="fastqsolexa" file="barcode_splitter1_BC2.out" />
        <discovered_dataset designation="BC3" ftype="fastqsolexa" file="barcode_splitter1_BC3.out" />
        <discovered_dataset designation="BC4" ftype="fastqsolexa" file="barcode_splitter1_BC4.out" />
        <discovered_dataset designation="unmatched" ftype="fastqsolexa" file="barcode_splitter1_unmatched.out" />
      </collection>
    </test>

    <test>
      <!-- Split a FASTQ file, using a separate index read supplied as the first read file. -->
      <param name="bcfile" value="barcode_splitter1.txt" />
      <param name="num_barcode_columns" value="1" />
      <repeat name="seqfiles">
        <param name="input" value="barcode_splitter_index.fastq" ftype="fastqsolexa" />
      </repeat>
      <repeat name="seqfiles">
        <param name="input" value="barcode_splitter1.fastq" ftype="fastqsolexa" />
      </repeat>
      <param name="barcodes_at_end" value="" />
      <param name="mismatches" value="2" />
      <!-- The name must match the data output declared in <outputs> (summary). -->
      <output name="summary" file="barcode_splitter1.out" />
      <collection name="split_output" type="list">
        <discovered_dataset designation="BC1" ftype="fastqsolexa" file="barcode_splitter1_BC1.out" />
        <discovered_dataset designation="BC2" ftype="fastqsolexa" file="barcode_splitter1_BC2.out" />
        <discovered_dataset designation="BC3" ftype="fastqsolexa" file="barcode_splitter1_BC3.out" />
        <discovered_dataset designation="BC4" ftype="fastqsolexa" file="barcode_splitter1_BC4.out" />
        <discovered_dataset designation="unmatched" ftype="fastqsolexa" file="barcode_splitter1_unmatched.out" />
      </collection>
    </test>
  </tests>

    <help><![CDATA[
**What it does**

This tool splits a FASTQ file into several files, using barcodes as the split criteria.  Barcodes in one file can be used to split multiple sorted files.  Multiple sets of barcodes, each located in a different file, can be used.

--------

**Barcode file Format**

Barcode files are simple text files.
Each line should contain an identifier (a descriptive name for the barcode) and at least one barcode, separated by TAB characters. Multiple columns of barcodes are supported (each corresponding to a separate barcoded read file), though there is usually just one.  For example, with two sets of barcodes, the first set could identify the user and the second set could identify each of that user's samples.
Example::

    #This line is a comment (starts with a 'number' sign)
    BC1	GATCT	TTGCAT
    BC2	ATCGT	GCGCAT
    BC3	GTGAT	AGGTCA
    BC4	TGTCT	CTTTGG

For each barcode, a new FASTQ file will be created (with the barcodes' identifier as part of the file name).
Sequences matching the barcodes in a row will be stored in the appropriate file.

The first sequence file submitted must contain sequences with the barcodes in the first column of the barcode file.  The second sequence file must contain sequences with the barcodes in the second column, and so on.  The 'Number of barcode columns' specified must match the number of actual columns in the barcode file.

One (possibly two) additional FASTQ files will be created: the 'unmatched' file (and the 'multimatched' file), where sequences not matching any barcode (or matching more than 1 barcode when mismatches are taken into account) will be stored.

The output of this tool is a summary table displaying the split counts for each barcode identifier and the percentage of the total reads those represent.
In addition, each fastq file produced will be loaded into the galaxy history as part of a collection list.
]]>
  </help>

  <!-- Barcode-Splitter belongs to the paired_sequence_utils package by L.Parsons (lparsons@princeton.edu) and R.Leach (rleach@princeton.edu). -->
  <citations>
    <citation type="bibtex">
      @misc{paired_sequence_utils,
        title = {{Barcode}-{Splitter}},
        url = {https://bitbucket.org/hepcat72/paired_sequence_utils},
        author = "Parsons, Lance and Leach, Robert"
      }
    </citation>
  </citations>

</tool>