je_demultiplex: je-demultiplex.xml comparison

comparison je-demultiplex.xml @ 0:424f44e2124e draft

Initial upload

author	gbcs-embl-heidelberg
date	Wed, 25 Nov 2015 12:37:28 -0500
parents
children	8930b411a9d7

comparison

equal deleted inserted replaced

--1:000000000000
+:424f44e2124e
+<tool id="je_demultiplex" name="Je-Demultiplex" version="1.0">
+<!-- One-line summary of the tool. -->
+<description>demultiplexes fastq files</description>
+<!-- The @...@ tokens and <expand> macros used in this file are not defined here; -->
+<!-- presumably they come from macros.xml, the only import visible (to be confirmed). -->
+<macros>
+<import>macros.xml</import>
+</macros>
+<!-- Exit codes in the range "1:" are reported at level "fatal" with the description "Tool exception". -->
+<stdio>
+<exit_code range="1:" level="fatal" description="Tool exception" />
+</stdio>
+<!-- NOTE(review): the version command echoes a hard-coded '1.0' (same value as the tool's -->
+<!-- version attribute) instead of asking the je executable; confirm this is intended. -->
+<version_command>echo '1.0'</version_command>
+<!-- Command template for "je demultiplex". Each @...@ token is a placeholder that is not -->
+<!-- defined in this file (presumably supplied by macros.xml; to be confirmed). -->
+<!-- The paired-end option token is emitted only when $library.type is not "single". -->
+<!-- C= is given the value of $CLIP_BARCODE (C is the short name of CLIP_BARCODE in the help below). -->
+<!-- NOTE(review): the interpreter attribute is marked deprecated in the Galaxy tool XML -->
+<!-- schema documentation; confirm whether it is still required for this wrapper. -->
+<command interpreter="bash">
+<![CDATA[
+je demultiplex
+## Fastq inputs
+@single_or_paired_cmd@
+#if str( $library.type ) != "single":
+@demultiplex_paired_end_cmd_options@
+#end if
+@barcode_option_cmd@
+@barcode_len_cmd@
+C=$CLIP_BARCODE
+@demultiplexer_common_options_cmd@
+@common_options_cmd@
+@demultiplexer_common_output_options_cmd@
+@demultiplexer_common_outputs_cmd@
+]]>
+</command>
+<!-- The config file definition is expanded from the barcode_config_file macro. -->
+<configfiles>
+<expand macro="barcode_config_file"></expand>
+</configfiles>
+<!-- All input parameters are produced by macro expansion; none is declared inline here. -->
+<inputs>
+<!-- single/paired - similar to macro 'single_or_paired_general' -->
+<!-- The paired-end demultiplexing options are passed as nested content of the single/paired macro. -->
+<expand macro="single_or_paired_general">
+<expand macro="demultiplex_paired_end_options"/>
+</expand>
+<expand macro="barcode_option"/>
+<expand macro="barcode_len_option"/>
+<expand macro="clip_barcode"/>
+<expand macro="demultiplexer_common_options"/>
+<expand macro="common_options"/>
+<expand macro="demultiplexer_common_output_options"/>
+</inputs>
+<!-- Output datasets are likewise expanded from the demultiplexer_common_outputs macro. -->
+<outputs>
+<expand macro="demultiplexer_common_outputs"/>
+</outputs>
+<!-- Functional tests. Each test compares the METRICS_FILE_NAME output with an expected -->
+<!-- summary file using lines_diff="4", and checks discovered datasets by designation. -->
+<tests>
+<test>
+<!-- simple test on single end data -->
+<!-- Barcodes are read from a tabular file; only the unassigned_1 discovered dataset is checked. -->
+<param name="type" value="single"/>
+<param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/>
+<param name="BARCODE_FILE" value="barcodes_SE.txt" ftype="tabular"/>
+<output name="METRICS_FILE_NAME" file="summary_SE.txt" ftype="tabular" lines_diff="4">
+<discovered_dataset designation="unassigned_1" file="unassigned_1_SE.txt" />
+</output>
+</test>
+<test>
+<!-- more complex test on paired end data with different barcode for fwd/rev -->
+<!-- Barcodes are supplied inline as text: four sample lines separated by newline character -->
+<!-- references, each with a read_1:read_2 barcode pair (BPOS=BOTH, BM=BOTH, BRED=false). -->
+<param name="type" value="paired"/>
+<param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/>
+<param name="input_2" value="file_2_sequence.txt" ftype="fastqsanger"/>
+<param name="BPOS" value="BOTH"/>
+<param name="BM" value="BOTH"/>
+<param name="BRED" value="false"/>
+<param name="barcode_list_type_con" value="text"/>
+<param name="barcode_text"
+value="sample1 CACTGT:GTATAG&#10;sample2 ATTCCG:TCCGTC&#10;sample3 GCTACC:TGGTCA&#10;sample4 CGAAAC:CACTGT"/>
+<output name="METRICS_FILE_NAME" file="summary_PE.txt" ftype="tabular" lines_diff="4">
+<!-- Expected designations: unassigned_N, plus sample name, both barcodes concatenated, and read number. -->
+<discovered_dataset designation="unassigned_1" file="unassigned_1_PE.txt" />
+<discovered_dataset designation="unassigned_2" file="unassigned_2_PE.txt" />
+<discovered_dataset designation="sample4_CGAAACCACTGT_2" file="sample4_CGAAACCACTGT_2.txt"/>
+<discovered_dataset designation="sample4_CGAAACCACTGT_1" file="sample4_CGAAACCACTGT_1.txt"/>
+<discovered_dataset designation="sample3_GCTACCTGGTCA_2" file="sample3_GCTACCTGGTCA_2.txt"/>
+<discovered_dataset designation="sample3_GCTACCTGGTCA_1" file="sample3_GCTACCTGGTCA_1.txt"/>
+<discovered_dataset designation="sample2_ATTCCGTCCGTC_2" file="sample2_ATTCCGTCCGTC_2.txt"/>
+<discovered_dataset designation="sample2_ATTCCGTCCGTC_1" file="sample2_ATTCCGTCCGTC_1.txt"/>
+<discovered_dataset designation="sample1_CACTGTGTATAG_2" file="sample1_CACTGTGTATAG_2.txt"/>
+<discovered_dataset designation="sample1_CACTGTGTATAG_1" file="sample1_CACTGTGTATAG_1.txt"/>
+</output>
+</test>
+</tests>
+<help>
+<![CDATA[
+**What it does**
+Je demultiplex: A fastq file demultiplexer with optional handling of Unique Molecular Identifiers for further use
+in 'markdupes' module.
+Input files are fastq files, and can be in gzip compressed format.
+Author: Charles Girardot  (charles.girardot@embl.de).
+Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de).
+------
+**Know what you are doing**
+.. class:: warningmark
+You will want to read the `documentation`__.
+.. __: http://gbcs.embl.de/portal/Je
+------
+**Parameter list**
+This is an exhaustive list of options::
+FASTQ_FILE1=File
+F1=File
+Input fastq file (optionally gzipped) for single end data, or first read in paired end
+data.
+Required.
+FASTQ_FILE2=File
+F2=File
+Input fastq file (optionally gzipped) for the second read of paired end data.
+Default value: null.
+BARCODE_FILE=File
+BF=File
+Barcode file describing sequence list and sample names. Tab-delimited file with 2
+columns, with the sample in col1 and the corresponding barcode in col2.
+Simple barcode file format : 2 tab-delimited columns
+If multiple barcode map to the same sample, either line can be duplicated e.g.
+sample1  ATAT
+sample1  GAGG
+sample2  CCAA
+sample2  TGTG
+Or barcodes can be combined using the OR operator '|' i.e. the file above can be
+re-written like
+sample1  ATAT|GAGG
+sample2  CCAA|TGTG
+Finally, for the special situation of paired-end data in which barcodes differ at both
+ends (ie BPOS=BOTH BRED=false BM=BOTH , see BRED option description), barcodes for read_1
+and read_2 can be distinguished using a ':' separator i.e.
+sample1  ATAT:GAGG
+sample2  CCAA:TGTG
+This above syntax means that sample 1 is encoded with ATAT barcode at read_1 AND GAGG
+barcode at read_2. Note that you can still combine barcodes using | e.g.
+sample1  ATAT|GAGG:CCAA|TGTG
+would mean that sample 1 is mapped by the combination of barcode: ATAT OR GAGG at read_1
+AND CCAA OR TGTG at read_2.
+Extended barcode file format : 3 (single-end) or 4 (paired-end) tab-delimited columns
+same as the simple barcode file format but the extra columns contains the file name(s)
+to use to name output files. A unique extra column is expected for single-end while 2
+extra columns are expected for paired-end. In case lines are duplicated (multiple
+barcodes mapping the same sample), the same file name should be indicated in the third
+(and fourth) column(s).
+sample1  ATAT  spl1_1.txt.gz  spl1_2.txt.gz
+sample1  GAGG  spl1_1.txt.gz  spl1_2.txt.gz
+sample2  CCAA  spl2_1.txt.gz  spl2_2.txt.gz
+Or
+sample1  ATAT|GAGG:CCAA|TGTG  spl1_1.txt.gz  spl1_2.txt.gz
+Ns in barcode sequence are allowed and are used to flag positions that should be ignored
+in sample matching
+i.e. they will be clipped off the read sequence (like in iCLIP protocol).
+Required.
+BARCODE_READ_POS=BarcodePosition
+BPOS=BarcodePosition
+For paired-end data, where to expect the barcode(s) :
+READ_1 (beginning of read from FASTQ_FILE1),
+READ_2 (beginning of read from FASTQ_FILE2),
+BOTH (beginning of both reads).
+Automatically set to READ_1 in single end mode.
+Default value: BOTH. This option can be set to 'null' to clear the default value.
+Possible values: {READ_1, READ_2, BOTH, NONE}
+BCLEN=String
+LEN=String
+Length of the barcode sequences, optional. Taken from barcode file when not given.
+In situations where BARCODE_READ_POS == BOTH AND REDUNDANT_BARCODES=false, two distinct
+length can be provided using the syntax LEN=X:Z where X and Z are 2 integers representing
+the barcode length for read_1 and read_2 respectively.
+Default value: null.
+BARCODE_FOR_SAMPLE_MATCHING=BarcodePosition
+BM=BarcodePosition
+Indicates which barcode(s) should be used for sample lookup
+Automatically set to READ_1 in single end mode.
+For paired-end data and when BARCODE_READ_POS == BOTH, which barcode should be used to
+resolve sample:
+use BM=READ_1 (beginning of read from FASTQ_FILE1) if only this read should be used
+for sample matching;
+use BM=READ_2 (beginning of read from FASTQ_FILE2) if only this read should be used
+for sample matching;
+use BM=BOTH (beginning of both reads) if both should be used.
+When BM=BOTH, the behaviour is different based on the value of REDUNDANT_BARCODES :
+If REDUNDANT_BARCODES=true, the two barcodes are considered to map to the same sample
+and 'Je demultiplex' uses the two barcodes according to the STRICT value.
+If REDUNDANT_BARCODES=false, the barcode file should map a couple of barcode to each
+sample (e.g. sample1 => AGAGTG:TTGATA) and 'Je demultiplex' needs both barcodes to find
+the relevant sample. Note that this is the only situation in which all barcode matching
+options (MM, MMD, Q) accept different values for both barcodes in the form X:Z where X
+and Z are 2 integers.
+Default value: BOTH. This option can be set to 'null' to clear the default value.
+Possible values: {READ_1, READ_2, BOTH, NONE}
+REDUNDANT_BARCODES=Boolean
+BRED=Boolean
+This option only applies for paired-end data with BARCODE_READ_POS set to 'BOTH'
+Indicates if both read's barcodes encode redundant information or if barcodes are
+supposed to be identical at both ends (or to resolve to the same sample when a pool of
+barcodes is used per sample).
+When REDUNDANT_BARCODES=false, the 2 barcodes potentially encode
+different information. For example, only one of the barcodes encodes the sample identity
+while
+the second barcode might be a random barcode (UMI) to tell apart PCR artefacts from real
+duplicates.
+Another example is when both barcodes should be used in a combined fashion to resolve the
+sample.
+In the first example, you should use BPOS=BOTH BRED=false BM=READ_1.
+In the second example, you should have BPOS=BOTH BRED=false BM=BOTH.
+Note that with BPOS=BOTH BRED=true BM=BOTH, the behavior would be different as
+'demultiplex' would then check the STRICT option to perform sample resolution.
+Importantly, when BARCODE_READ_POS (BPOS) == BOTH AND REDUNDANT_BARCODES=false, BCLEN,
+barcode matching options (MM, MMD, Q) and read trimming/clipping options (XT, ZT) accept
+different values for both barcodes in the form X:Z where X and Z are 2 integers.
+Default value: true. This option can be set to 'null' to clear the default value.
+Possible values: {true, false}
+STRICT=Boolean
+S=Boolean
+For paired-end data and when two distinct barcodes/indices are used to encode samples,
+this option tells if both barcodes should resolve to the same sample.
+When true and if only one of the two reads has a barcode match, the read pair is
+'unassigned'.
+When false and if only one of the two reads has a barcode match, the read pair is
+assigned to the
+corresponding sample
+When reads resolve to different samples, the read pair is always 'unassigned'.
+Default value: false. This option can be set to 'null' to clear the default value.
+Possible values: {true, false}
+MAX_MISMATCHES=String
+MM=String
+Maximum mismatches for a barcode to be considered a match. In situations where both
+barcodes are used for sample matching i.e. BPOS=BOTH BM=BOTH (or 2 INDEX_FILE given), two
+distinct
+values can be given here using the syntax MM=X:Z where X and Z are 2 integers to use for
+read_1 and read_2 respectively.
+MM=null is like MM=0
+Default value: 1. This option can be set to 'null' to clear the default value.
+MIN_MISMATCH_DELTA=String
+MMD=String
+Minimum difference between the number of mismatches against the best and the second best
+barcode. When MMD is not respected, the read remains unassigned.
+When two distinct barcodes are used for sample matching (dual encoding), two distinct
+values can be given using the syntax MMD=X:Z where X and Z are 2 integers to use for
+first (e.g. from read_1 or index_1) and second barcode (e.g. from read_2 or index_2)
+respectively.
+MMD=null is like MMD=0
+Default value: 1. This option can be set to 'null' to clear the default value.
+MIN_BASE_QUALITY=String
+Q=String
+Minimum base quality during barcode matching: bases whose quality is less than this
+cutoff are always considered as a mismatch. When two distinct barcodes are used for sample
+matching (dual encoding), two distinct values can be given using the syntax Q=X:Z where X
+and Z are 2 integers to use for first (e.g. from read_1 or index_1) and second barcode
+(e.g. from read_2 or index_2) respectively.
+Q=null is like Q=0.
+Default value: 10. This option can be set to 'null' to clear the default value.
+XTRIMLEN=String
+XT=String
+Optional extra number of base to be trimmed right after the barcode (only used if
+CLIP_BARCODE=true).
+When running paired-end, two distinct values can be given using the syntax XT=X:Z where X
+and Z are 2 integers to use for read_1 and read_2 respectively. Note that even when
+BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode so as to
+end up with reads of the same length (note that this can also be operated using ZT). If a
+unique value is given, e.g. XT=1, while running paired-end the following rule applies:
+(1) BPOS=READ_1 or BPOS=READ_2, no trim is applied at the read w/o barcode
+(2) BPOS=BOTH, the value is used for both reads.
+Note that XT=null is like XT=0.
+Default value: 0. This option can be set to 'null' to clear the default value.
+ZTRIMLEN=String
+ZT=String
+Optional extra number of bases to be trimmed from the read end i.e. 3' end.
+When running paired-end, two distinct values can be given here using the syntax ZT=X:Z
+where X and Z are 2 integers to use for read_1 and read_2 respectively. Note that even
+when BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode
+so as to end up with reads of the same length (note that this can also be operated using
+XT). Note that if a single value is passed, the value always applies to both reads in
+paired-end mode without further consideration.
+ZT=null is like ZT=0.
+Default value: 0. This option can be set to 'null' to clear the default value.
+CLIP_BARCODE=Boolean
+C=Boolean
+Clip barcode sequence from read sequence, as well as XTRIMLEN (and ZTRIMLEN) bases if
+applicable, before writing to output file.
+If false, reads are written without modification to output file.
+Apply to both barcodes when BPOS=BOTH.
+Default value: true. This option can be set to 'null' to clear the default value.
+Possible values: {true, false}
+ADD_BARCODE_TO_HEADER=Boolean
+ADD=Boolean
+Add barcode at the end of the read header. Apply to both barcodes when BPOS=BOTH.
+If true, the string ':barcode' is added at the end of the read header with a ':' added
+only if current read header does not end with ':'.
+If both reads of the pair have a barcode (i.e. BARCODE_READ_POS == BOTH), then the second
+read also has its own matched barcode written. Else, the read without a barcode receives
+the barcode from the barcoded read.
+For example:
+@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:
+becomes:
+@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:BARCODE
+When barcodes contain random positions, i.e. 'N' (for example as in the iCLIP
+protocol), or are UMIs, the added sequence is the sequence clipped from the read and NOT
+the matched barcode.
+Default value: true. This option can be set to 'null' to clear the default value.
+Possible values: {true, false}
+ENSURE_IDENTICAL_HEADER_NAMES=Boolean
+SAME_HEADERS=Boolean
+Makes sure that headers of both reads of a pair are identical, using the following read
+header pattern (for both reads of a pair):
+@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 SAMPLEBARCODE_READ1:SAMPLEBARCODE_READ2(:CLIPPED_SEQ_FROMREAD1:CLIPPED_SEQ_FROMREAD2)
+This option only makes sense in
+paired end mode and ADD=true. Some (if not all) mappers will indeed complain when the
+read headers are not identical. When molecular barcodes are present in reads (either as
+additional barcodes or as degenerate barcodes ie with 'N') and the RCHAR is used, you
+will end up with (problematic) read headers like this:
+HISEQ:44:C6KC0ANXX:5:1101:1491:1994:1:N:0:TAGAACAC:TGGAGTAG
+HISEQ:44:C6KC0ANXX:5:1101:1491:1994:3:N:0:TAGAACAC:CGTTGTAT
+SAME_HEADERS=true will instead generate the following identical header for both reads:
+HISEQ:44:C6KC0ANXX:5:1101:1491:1994:TAGAACAC:TGGAGTAG:CGTTGTAT
+Note that we also clipped the useless '1:N:0' and '3:N:0' as they will also result in
+generating different headers.
+Important: this option will force RCHAR=: UNLESS you specify RCHAR=null ; in which
+case a space will be preserved ie:
+HISEQ:44:C6KC0ANXX:5:1101:1491:1994 TAGAACAC:TGGAGTAG:CGTTGTAT
+Default value: true. This option can be set to 'null' to clear the default value.
+Possible values: {true, false}
+READ_NAME_REPLACE_CHAR=String
+RCHAR=String
+Replace spaces in read name/header using provided character. This is particularly handy
+when you need to retain ADDed barcode in read name/header during mapping (everything
+after space in read name is usually clipped in BAM files). For example, with RCHAR=':':
+@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:
+becomes
+@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965:2:N:0:BARCODE
+Default value: null.
+QUALITY_FORMAT=FastqQualityFormat
+V=FastqQualityFormat
+A value describing how the quality values are encoded in the fastq.  Either 'Solexa' for
+pre-pipeline 1.3 style scores (solexa scaling + 66), 'Illumina' for pipeline 1.3 and
+above (phred scaling + 64) or 'Standard' for phred scaled scores with a character shift
+of 33.  If this value is not specified (or 'null' is given), the quality format will be
+detected.
+Default value: Standard. This option can be set to 'null' to clear the default value.
+Possible values: {Solexa, Illumina, Standard}
+KEEP_UNASSIGNED_READ=Boolean
+UN=Boolean
+Should un-assigned reads be saved in files or simply ignored. File names are
+automatically created or can be given using UF1 & UF2 options.
+Default value: true. This option can be set to 'null' to clear the default value.
+Possible values: {true, false}
+BARCODE_DIAG_FILE=String
+DIAG=String
+Name for a barcode match reporting file (not generated by default). Either a name (in
+which case the file will be created in the output dir) or full path. This file will
+contain a line per read pair with the barcode best matching the read subsequence or
+'null' when no match is found according to matching parameters ; and the final selected
+sample. This file is useful for debugging or further processing in case both ends are
+barcoded.
+N.B: this file will have a size of about one of the fastq input files.
+Default value: null.
+]]>
+</help>
+</tool>

Mercurial > repos > gbcs-embl-heidelberg > je_demultiplex

comparison je-demultiplex.xml @ 0:424f44e2124e draft