Mercurial > repos > gbcs-embl-heidelberg > je_demultiplex
diff je-demultiplex.xml @ 0:424f44e2124e draft
Initial upload
author | gbcs-embl-heidelberg |
---|---|
date | Wed, 25 Nov 2015 12:37:28 -0500 |
parents | |
children | 8930b411a9d7 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/je-demultiplex.xml Wed Nov 25 12:37:28 2015 -0500 @@ -0,0 +1,439 @@ +<tool id="je_demultiplex" name="Je-Demultiplex" version="1.0"> + <description>demultiplexes fastq files</description> + <!-- The expand macro elements and the @..._cmd@ tokens used below are not defined in this file; presumably they all come from the imported macros.xml (TODO confirm). --> <macros> + <import>macros.xml</import> + </macros> + <!-- Any exit code of 1 or above is reported as a fatal "Tool exception". --> <stdio> + <exit_code range="1:" level="fatal" description="Tool exception" /> + </stdio> + <!-- The reported version is a hard-coded string; it is not queried from the je executable. --> <version_command>echo '1.0'</version_command> + <!-- Command template: 'je demultiplex' followed by option fragments; the paired-end fragment is only emitted when library.type is not "single". NOTE(review): the interpreter attribute is deprecated in Galaxy and resolves the first command token relative to the tool directory; confirm a 'je' launcher ships next to this wrapper. --> <command interpreter="bash"> +<![CDATA[ + je demultiplex + + ## Fastq inputs + @single_or_paired_cmd@ + #if str( $library.type ) != "single": + @demultiplex_paired_end_cmd_options@ + #end if + + @barcode_option_cmd@ + @barcode_len_cmd@ + C=$CLIP_BARCODE + + @demultiplexer_common_options_cmd@ + @common_options_cmd@ + + @demultiplexer_common_output_options_cmd@ + @demultiplexer_common_outputs_cmd@ + +]]> + </command> + <!-- The barcode config file definition is delegated to the barcode_config_file macro. --> <configfiles> + <expand macro="barcode_config_file"></expand> + </configfiles> + <inputs> + <!-- single/paired input selection: reuses the 'single_or_paired_general' macro and nests the paired-end demultiplexing options inside it --> + <expand macro="single_or_paired_general"> + <expand macro="demultiplex_paired_end_options"/> + </expand> + + <expand macro="barcode_option"/> + <expand macro="barcode_len_option"/> + <expand macro="clip_barcode"/> + + <expand macro="demultiplexer_common_options"/> + + <expand macro="common_options"/> + + <expand macro="demultiplexer_common_output_options"/> + + </inputs> + <outputs> + <expand macro="demultiplexer_common_outputs"/> + </outputs> + + <!-- Functional tests: each one compares the metrics file with a reference summary (lines_diff="4", presumably tolerating up to 4 differing lines; confirm against the Galaxy test documentation) and checks the discovered demultiplexed datasets. --> <tests> + <test> + <!-- simple test on single-end data, barcodes supplied as a tabular file --> + <param name="type" value="single"/> + <param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/> + <param name="BARCODE_FILE" value="barcodes_SE.txt" ftype="tabular"/> + <output name="METRICS_FILE_NAME" file="summary_SE.txt" ftype="tabular" lines_diff="4"> + <discovered_dataset designation="unassigned_1" file="unassigned_1_SE.txt" /> + </output> + </test> + <test> + <!-- more complex test on paired-end data with different barcodes for the fwd/rev reads --> + 
<param name="type" value="paired"/> + <param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/> + <param name="input_2" value="file_2_sequence.txt" ftype="fastqsanger"/> + + <!-- Barcodes are expected on both reads, are non-redundant, and both are used for sample lookup, so each sample in barcode_text is keyed by a read_1:read_2 barcode pair. --> <param name="BPOS" value="BOTH"/> + <param name="BM" value="BOTH"/> + <param name="BRED" value="false"/> + + <param name="barcode_list_type_con" value="text"/> + <param name="barcode_text" + value="sample1 CACTGT:GTATAG sample2 ATTCCG:TCCGTC sample3 GCTACC:TGGTCA sample4 CGAAAC:CACTGT"/> + <output name="METRICS_FILE_NAME" file="summary_PE.txt" ftype="tabular" lines_diff="4"> + <!-- Expected designations: unassigned_N for unmatched reads, otherwise sample name, then both barcodes concatenated, then the read number. --> <discovered_dataset designation="unassigned_1" file="unassigned_1_PE.txt" /> + <discovered_dataset designation="unassigned_2" file="unassigned_2_PE.txt" /> + <discovered_dataset designation="sample4_CGAAACCACTGT_2" file="sample4_CGAAACCACTGT_2.txt"/> + <discovered_dataset designation="sample4_CGAAACCACTGT_1" file="sample4_CGAAACCACTGT_1.txt"/> + <discovered_dataset designation="sample3_GCTACCTGGTCA_2" file="sample3_GCTACCTGGTCA_2.txt"/> + <discovered_dataset designation="sample3_GCTACCTGGTCA_1" file="sample3_GCTACCTGGTCA_1.txt"/> + <discovered_dataset designation="sample2_ATTCCGTCCGTC_2" file="sample2_ATTCCGTCCGTC_2.txt"/> + <discovered_dataset designation="sample2_ATTCCGTCCGTC_1" file="sample2_ATTCCGTCCGTC_1.txt"/> + <discovered_dataset designation="sample1_CACTGTGTATAG_2" file="sample1_CACTGTGTATAG_2.txt"/> + <discovered_dataset designation="sample1_CACTGTGTATAG_1" file="sample1_CACTGTGTATAG_1.txt"/> + </output> + </test> + </tests> + + + <help> +<![CDATA[ +**What it does** + +Je demultiplex: A fastq file demultiplexer with optional handling of Unique Molecular Identifiers for further use +in 'markdupes' module. +Input files are fastq files, and can be in gzip compressed format. + +Author: Charles Girardot (charles.girardot@embl.de). + +Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de). + +------ + +**Know what you are doing** + +.. 
class:: warningmark + + You will want to read the `documentation`__. + + .. __: http://gbcs.embl.de/portal/Je + +------ + +**Parameter list** + +This is an exhaustive list of options:: + + FASTQ_FILE1=File + F1=File + + Input fastq file (optionally gzipped) for single end data, or first read in paired end + data. + + Required. + + FASTQ_FILE2=File + F2=File + + Input fastq file (optionally gzipped) for the second read of paired end data. + + Default value: null. + + BARCODE_FILE=File + BF=File + + Barcode file describing sequence list and sample names. Tab-delimited file with 2 + columns, with the sample in col1 and the corresponding barcode in col2. + Simple barcode file format : 2 tab-delimited columns + If multiple barcodes map to the same sample, either lines can be duplicated e.g. + sample1 ATAT + sample1 GAGG + sample2 CCAA + sample2 TGTG + Or barcodes can be combined using the OR operator '|' i.e. the file above can be + re-written like + sample1 ATAT|GAGG + sample2 CCAA|TGTG + Finally, for the special situation of paired-end data in which barcodes differ at both + ends (i.e. BPOS=BOTH BRED=false BM=BOTH, see BRED option description), barcodes for read_1 + and read_2 can be distinguished using a ':' separator i.e. + sample1 ATAT:GAGG + sample2 CCAA:TGTG + The above syntax means that sample 1 is encoded with ATAT barcode at read_1 AND GAGG + barcode at read_2. Note that you can still combine barcodes using | e.g. + sample1 ATAT|GAGG:CCAA|TGTG + would mean that sample 1 is mapped by the combination of barcodes: ATAT OR GAGG at read_1 + AND CCAA OR TGTG at read_2. + Extended barcode file format : 3 (single-end) or 4 (paired-end) tab-delimited columns + same as the simple barcode file format but the extra columns contain the file name(s) + to use to name output files. A unique extra column is expected for single-end while 2 + extra columns are expected for paired-end. 
In case lines are duplicated (multiple + barcodes mapping to the same sample), the same file name should be indicated in the third + (and fourth) column(s). + sample1 ATAT spl1_1.txt.gz spl1_2.txt.gz + sample1 GAGG spl1_1.txt.gz spl1_2.txt.gz + sample2 CCAA spl2_1.txt.gz spl2_2.txt.gz + Or + sample1 ATAT|GAGG:CCAA|TGTG spl1_1.txt.gz spl1_2.txt.gz + Ns in barcode sequence are allowed and are used to flag positions that should be ignored + in sample matching + i.e. they will be clipped off the read sequence (like in iCLIP protocol). + + Required. + + BARCODE_READ_POS=BarcodePosition + BPOS=BarcodePosition + + For paired-end data, where to expect the barcode(s) : + READ_1 (beginning of read from FASTQ_FILE1), + READ_2 (beginning of read from FASTQ_FILE2), + BOTH (beginning of both reads). + Automatically set to READ_1 in single end mode. + + Default value: BOTH. This option can be set to 'null' to clear the default value. + Possible values: {READ_1, READ_2, BOTH, NONE} + + BCLEN=String + LEN=String + + Length of the barcode sequences, optional. Taken from barcode file when not given. + In situations where BARCODE_READ_POS == BOTH AND REDUNDANT_BARCODES=false, two distinct + lengths can be provided using the syntax LEN=X:Z where X and Z are 2 integers representing + the barcode length for read_1 and read_2 respectively. + + Default value: null. + + BARCODE_FOR_SAMPLE_MATCHING=BarcodePosition + BM=BarcodePosition + + Indicates which barcode(s) should be used for sample lookup. + Automatically set to READ_1 in single end mode. + For paired-end data and when BARCODE_READ_POS == BOTH, which barcode should be used to + resolve sample: + use BM=READ_1 (beginning of read from FASTQ_FILE1) if only this read should be used + for sample matching; + use BM=READ_2 (beginning of read from FASTQ_FILE2) if only this read should be used + for sample matching; + use BM=BOTH (beginning of both reads) if both should be used. 
+ + When BM=BOTH, the behaviour is different based on the value of REDUNDANT_BARCODES : + If REDUNDANT_BARCODES=true, the two barcodes are considered to map to the same sample + and 'Je demultiplex' uses the two barcodes according to the STRICT value. + If REDUNDANT_BARCODES=false, the barcode file should map a pair of barcodes to each + sample (e.g. sample1 => AGAGTG:TTGATA) and 'Je demultiplex' needs both barcodes to find + the relevant sample. Note that this is the only situation in which all barcode matching + options (MM, MMD, Q) accept different values for both barcodes in the form X:Z where X + and Z are 2 integers. + + Default value: BOTH. This option can be set to 'null' to clear the default value. + Possible values: {READ_1, READ_2, BOTH, NONE} + + + REDUNDANT_BARCODES=Boolean + BRED=Boolean + + This option only applies for paired-end data with BARCODE_READ_POS set to 'BOTH'. + Indicates if both reads' barcodes encode redundant information i.e. if barcodes are + supposed to be identical at both ends (or to resolve to the same sample when a pool of + barcodes is used per sample). + When REDUNDANT_BARCODES=false, the 2 barcodes potentially encode + different information. For example, only one of the barcodes encodes the sample identity + while + the second barcode might be a random barcode (UMI) to tell apart PCR artefacts from real + duplicates. + Another example is when both barcodes should be used in a combined fashion to resolve the + sample. + In the first example, you should use BPOS=BOTH BRED=false BM=READ_1. + In the second example, you should have BPOS=BOTH BRED=false BM=BOTH. + Note that with BPOS=BOTH BRED=true BM=BOTH, the behavior would be different as + 'demultiplex' would then check the STRICT option to perform sample resolution. 
+ Importantly, when BARCODE_READ_POS (BPOS) == BOTH AND REDUNDANT_BARCODES=false, BCLEN, + barcode matching options (MM, MMD, Q) and read trimming/clipping options (XT, ZT) accept + different values for both barcodes in the form X:Z where X and Z are 2 integers. + + Default value: true. This option can be set to 'null' to clear the default value. + Possible values: {true, false} + + STRICT=Boolean + S=Boolean + + For paired-end data and when two distinct barcodes/indices are used to encode samples, + this option tells if both barcodes should resolve to the same sample. + When true and if only one of the two reads has a barcode match, the read pair is + 'unassigned'. + When false and if only one of the two reads has a barcode match, the read pair is + assigned to the + corresponding sample. + When reads resolve to different samples, the read pair is always 'unassigned'. + + Default value: false. This option can be set to 'null' to clear the default value. + Possible values: {true, false} + + MAX_MISMATCHES=String + MM=String + + Maximum mismatches for a barcode to be considered a match. In situations where both + barcodes are used for sample matching i.e. BPOS=BOTH BM=BOTH (or 2 INDEX_FILE given), two + distinct + values can be given here using the syntax MM=X:Z where X and Z are 2 integers to use for + read_1 and read_2 respectively. + MM=null is like MM=0. + + Default value: 1. This option can be set to 'null' to clear the default value. + + MIN_MISMATCH_DELTA=String + MMD=String + + Minimum difference between the number of mismatches against the best and the second best + barcode. When MMD is not respected, the read remains unassigned. + When two distinct barcodes are used for sample matching (dual encoding), two distinct + values can be given using the syntax MMD=X:Z where X and Z are 2 integers to use for + the first (e.g. from read_1 or index_1) and second barcode (e.g. from read_2 or index_2) + respectively. + MMD=null is like MMD=0. + + Default value: 1. This option can be set to 'null' to clear the default value. 
+ + MIN_BASE_QUALITY=String + Q=String + + Minimum base quality during barcode matching: bases whose quality is less than this + cutoff are always considered as a mismatch. When two distinct barcodes are used for sample + matching (dual encoding), two distinct values can be given using the syntax Q=X:Z where X + and Z are 2 integers to use for the first (e.g. from read_1 or index_1) and second barcode + (e.g. from read_2 or index_2) respectively. + Q=null is like Q=0. + + Default value: 10. This option can be set to 'null' to clear the default value. + + XTRIMLEN=String + XT=String + + Optional extra number of bases to be trimmed right after the barcode (only used if + CLIP_BARCODE=true). + When running paired-end, two distinct values can be given using the syntax XT=X:Z where X + and Z are 2 integers to use for read_1 and read_2 respectively. Note that even when + BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode so as to + end up with reads of the same length (note that this can also be achieved using ZT). If a + unique value is given, e.g. XT=1, while running paired-end the following rules apply: + (1) BPOS=READ_1 or BPOS=READ_2, no trim is applied at the read w/o barcode + (2) BPOS=BOTH, the value is used for both reads. + + Note that XT=null is like XT=0. + Default value: 0. This option can be set to 'null' to clear the default value. + + ZTRIMLEN=String + ZT=String + + Optional extra number of bases to be trimmed from the read end i.e. 3' end. + When running paired-end, two distinct values can be given here using the syntax ZT=X:Z + where X and Z are 2 integers to use for read_1 and read_2 respectively. Note that even + when BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode + so as to end up with reads of the same length (note that this can also be achieved using + XT). Note that if a single value is passed, the value always applies to both reads in + paired-end mode without further consideration. 
+ ZT=null is like ZT=0. + + Default value: 0. This option can be set to 'null' to clear the default value. + + CLIP_BARCODE=Boolean + C=Boolean + + Clip barcode sequence from read sequence, as well as XTRIMLEN (and ZTRIMLEN) bases if + applicable, before writing to output file. + If false, reads are written without modification to output file. + Applies to both barcodes when BPOS=BOTH. + + Default value: true. This option can be set to 'null' to clear the default value. + Possible values: {true, false} + + ADD_BARCODE_TO_HEADER=Boolean + ADD=Boolean + + Add barcode at the end of the read header. Applies to both barcodes when BPOS=BOTH. + If true, the string ':barcode' is added at the end of the read header with a ':' added + only if current read header does not end with ':'. + If both reads of the pair have a barcode (i.e. BARCODE_READ_POS == BOTH), then the second + read also has its own matched barcode written. Else, the read without a barcode receives + the barcode from the barcoded read. + For example: + @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0: + becomes: + @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:BARCODE + + When barcodes contain random positions, i.e. 'N', (for example like in the iCLIP + protocol) or are UMIs, the added sequence is the sequence clipped from the read and NOT + the matched barcode. + + Default value: true. This option can be set to 'null' to clear the default value. + Possible values: {true, false} + + + ENSURE_IDENTICAL_HEADER_NAMES=Boolean + SAME_HEADERS=Boolean + + Makes sure that headers of both reads of a pair are identical, using the following read + header pattern (for both reads of a pair): + @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 SAMPLEBARCODE_READ1:SAMPLEBARCODE_READ2(:CLIPPED_SEQ_FROMREAD1:CLIPPED_SEQ_FROMREAD2) + This option only makes sense in + paired end mode and ADD=true. Some (if not all) mappers will indeed complain when the + read headers are not identical. 
When molecular barcodes are present in reads (either as + additional barcodes or as degenerate barcodes i.e. with 'N') and the RCHAR is used, you + will end up with (problematic) read headers like this: + HISEQ:44:C6KC0ANXX:5:1101:1491:1994:1:N:0:TAGAACAC:TGGAGTAG + HISEQ:44:C6KC0ANXX:5:1101:1491:1994:3:N:0:TAGAACAC:CGTTGTAT + SAME_HEADERS=true will instead generate the following identical header for both reads: + HISEQ:44:C6KC0ANXX:5:1101:1491:1994:TAGAACAC:TGGAGTAG:CGTTGTAT + Note that we also clipped the useless '1:N:0' and '3:N:0' as they would also result in + generating different headers. + Important: this option will force RCHAR=: UNLESS you specify RCHAR=null, in which + case a space will be preserved i.e.: + HISEQ:44:C6KC0ANXX:5:1101:1491:1994 TAGAACAC:TGGAGTAG:CGTTGTAT + + Default value: true. This option can be set to 'null' to clear the default value. + Possible values: {true, false} + + + READ_NAME_REPLACE_CHAR=String + RCHAR=String + + Replace spaces in read name/header using provided character. This is particularly handy + when you need to retain ADDed barcode in read name/header during mapping (everything + after space in read name is usually clipped in BAM files). For example, with RCHAR=':': + @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0: + becomes + @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965:2:N:0:BARCODE + Default value: null. + + QUALITY_FORMAT=FastqQualityFormat + V=FastqQualityFormat + + A value describing how the quality values are encoded in the fastq. Either 'Solexa' for + pre-pipeline 1.3 style scores (solexa scaling + 66), 'Illumina' for pipeline 1.3 and + above (phred scaling + 64) or 'Standard' for phred scaled scores with a character shift + of 33. If this value is not specified (or 'null' is given), the quality format will be + detected. + + Default value: Standard. This option can be set to 'null' to clear the default value. 
+ Possible values: {Solexa, Illumina, Standard} + + KEEP_UNASSIGNED_READ=Boolean + UN=Boolean + + Whether unassigned reads should be saved in files or simply ignored. File names are + automatically created or can be given using UF1 & UF2 options. + + Default value: true. This option can be set to 'null' to clear the default value. + Possible values: {true, false} + + BARCODE_DIAG_FILE=String + DIAG=String + + Name for a barcode match reporting file (not generated by default). Either a name (in + which case the file will be created in the output dir) or full path. This file will + contain a line per read pair with the barcode best matching the read subsequence or + 'null' when no match is found according to matching parameters, and the final selected + sample. This file is useful for debugging or further processing in case both ends are + barcoded. + N.B.: this file will be about the size of one of the fastq input files. + + Default value: null. +]]> + </help> + +</tool>