Mercurial > repos > petr-novak > re_utils
diff paired_fastq_filtering.xml @ 0:a4cd8608ef6b draft
Uploaded
author | petr-novak |
---|---|
date | Mon, 01 Apr 2019 07:56:36 -0400 |
parents | |
children | 378565f5a875 |
line wrap: on
line diff
<!--
  Galaxy tool wrapper: paired_fastq_filtering.xml (changeset 0:a4cd8608ef6b, new file).
  Re-flowed from a collapsed diff view; diff markers removed.

  The tool hands two FASTQ datasets to paired_fastq_filtering_wrapper.sh and
  collects one interlaced FASTA dataset plus one PNG dataset.
-->
<tool id="paired_fastq_filtering" name="Preprocessing of fastq paired-reads">
    <description>
        Preprocessing of paired reads fastq files
        including trimming, quality filtering, cutadapt filtering and interlacing. Broken
        pairs are discarded.
    </description>

    <requirements>
        <requirement type="package">blast</requirement>
        <requirement type="package">cutadapt</requirement>
        <requirement type="package">bioconductor-shortread</requirement>
        <requirement type="package">r-optparse</requirement>
    </requirements>

    <!--
      Command line passed to the wrapper script:
        -a / -b   the two input datasets (params A and B)
        -o        the "paired" output dataset
        -c        cut_off, -p percent_above, -N max_n
        $rename   expands to -R when checked, otherwise to an empty string
        -G        the "png_output" dataset
      Optional flags are appended only when the matching conditional is true:
        -n (sampling), -e and -s (trimming), -C (custom cutadapt options),
        -F (similarity filter database).
      Do not place XML comments inside this element; they would split the
      command text.
    -->
    <command interpreter="bash">
        paired_fastq_filtering_wrapper.sh -a ${A} -b ${B} -o ${paired} -c ${cut_off} -p ${percent_above} -N ${max_n} $rename -G ${png_output}

        #if $sampling.sequence_sampling :
        -n $sampling.sample_size
        #end if

        #if $trimming.sequence_trimming :
        -e $trimming.trim_end -s $trimming.trim_start
        #end if

        #if $cutadapt.use_custom :
        -C "${cutadapt.custom_options}"
        #end if

        #if $similarity_filtering.include :
        -F "${similarity_filtering.filter_database}"
        #end if

    </command>

    <inputs>
        <!-- The two mates of the paired-end run. -->
        <param format="fastq,fastq.gz" type="data" name="A" label="Left-hand reads" />
        <param format="fastq,fastq.gz" type="data" name="B" label="Right-hand reads" />

        <!-- Optional down-sampling; sample_size is only offered when enabled. -->
        <conditional name="sampling">
            <param name="sequence_sampling" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Sequence sampling"/>
            <when value="false">
                <!-- do nothing here -->
            </when>
            <when value="true">
                <param name="sample_size" type="integer" label="Sample size(number of pairs)" help="How many sequence pairs should be in resulting dataset" value="500000" min="0"/>
            </when>
        </conditional>

        <!-- Quality filter thresholds (always passed to the wrapper). -->
        <param type="integer" name="cut_off" label="Quality cut-off" value="10" min="0" help="see below how to correctly set quality cut-off" />
        <param type="integer" name="percent_above" label="percent above cutoff" value="95" min="0"
               help="Percent of bases in sequence that must have quality equal to / higher than cut-off value" />

        <!-- Optional fixed-window trimming; positions are only offered when enabled. -->
        <conditional name="trimming">
            <param name="sequence_trimming" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Trim sequences"/>
            <when value="false">
                <!-- do nothing here -->
            </when>
            <when value="true">
                <param type="integer" name="trim_start" label="trimming - start position" value="1" min="1"
                       help="sequences are trimmed at specified start" />
                <param type="integer" name="trim_end" label="trimming - end position" value="100" min="1"
                       help="sequences are trimmed to specified end position, shorter sequences are discarded" />
            </when>
        </conditional>

        <param name="max_n" type="integer" label="maximum Ns" help="Maximum number of Ns in sequence" value="0" min="0" max="10"/>

        <conditional name="cutadapt">
            <param name="use_custom" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Do you want to use custom cutadapt options"/>
            <when value="false">
                <!-- do nothing here -->
            </when>
            <when value="true">
                <!--
                  NOTE(review): sanitizing is switched off and the value is
                  interpolated inside double quotes on the shell command line,
                  so user text reaches the shell unfiltered. Left unchanged to
                  keep existing behaviour; confirm this is acceptable for the
                  deployment or replace with a restrictive sanitizer.
                -->
                <param name="custom_options" type="text" area="True" size="8x30" label="Cutadapt custom options" help="Consult cutadapt for usage" value="">
                    <sanitizer sanitize="False"/>
                </param>
            </when>
        </conditional>

        <conditional name="similarity_filtering">
            <param name="include" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use similarity search filtering"/>
            <when value="false">
                <!-- do nothing here -->
            </when>
            <when value="true">
                <param name="filter_database" format="fasta" type="data" label="Sequence filter database" help="Provide DNA sequences in fasta format. Sequence reads which have at least 90% similarity over 90% of their length to a sequence in the filter database will be removed. This is a suitable option if you want to remove organelle DNA or contamination"/>
            </when>
        </conditional>

        <!-- Expands to the literal flag -R on the command line when checked. -->
        <param name="rename" type="boolean" truevalue="-R" falsevalue="" checked="False" label="Rename sequences" help="By default, original sequence IDs are used. If your sequences do not follow a proper naming scheme for labelling paired-end read mates, use this option. All read pairs must be complete!"/>
    </inputs>

    <outputs>
        <data format="fasta" name="paired" label="Interlaced paired reads from datasets ${A.hid} and ${B.hid} "/>
        <data format="png" name="png_output" label="nucleotide composition after filtering of ${A.hid} and ${B.hid} "/>
    </outputs>

    <tests>
        <!-- Output names must match the <data> names declared in <outputs>. -->
        <test>
            <param name="A" value="ERR215189_1_part.fastq.gz" />
            <param name="B" value="ERR215189_2_part.fastq.gz" />
            <param name="max_n" value="0"/>
            <param name="cut_off" value="10" />
            <param name="percent_above" value="95" />
            <output name="paired" file="paired_output.fasta" />
            <output name="png_output" file="paired_output.png" />
        </test>
    </tests>

    <help><![CDATA[
**What it does**

This tool is designed to perform memory-efficient preprocessing of two
fastq files. Output of this tool can be used as input of RepeatExplorer clustering.
Input files can be in GNU zipped archive (.gz extension).
Reads are filtered based on the quality, presence of N bases and
adapters. Two input fastq files are processed in parallel. Only complete pairs
are kept. As the input files are processed in chunks, it is required that
paired reads are complete and in the same order in both input files. All
reads which pass the quality filter will be written into output files.
If sampling is specified, only a sample of sequences will be
returned.

Cutadapt is run with these options::

    --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
    --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
    --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
    --anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
    --anywhere='CAAGCAGAAGACGGCATACGAGAT'
    --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
    --error-rate=0.05
    --times=1 --overlap=15 --discard


**Order of fastq files processing**

1. Trimming (optional)
#. Filter by quality
#. Discard single reads, keep complete pairs
#. Cutadapt filtering
#. Discard single reads, keep complete pairs
#. Sampling (optional)
#. Interlacing two fasta files

**Quality setting cut-off**

To correctly set quality cut-off, you need to know how the quality is encoded in your fastq file, default
filtering which is suitable for Sanger and Illumina 1.8 encoding is shown below::


    Default filtering cut-off
              |
              |
              V
    SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
    ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
    ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
    .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
    LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
    !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
    |                         |    |        |                              |                     |
   33                        59   64       73                            104                   126
    0........................26...31.......40
                             -5....0........9.............................40
                                   0........9.............................40
                                      3.....9.............................40
    0.2......................26...31........41

    S - Sanger        Phred+33,  raw reads typically (0, 40)
    X - Solexa        Solexa+64, raw reads typically (-5, 40)
    I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
    J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40)
        with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
        (Note: See discussion above).
    L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)

    ]]></help>
</tool>