re_utils: paired_fastq_filtering.xml comparison

comparison paired_fastq_filtering.xml @ 22:58807b35777a draft

planemo upload commit 20bdf879b52796d3fb251a20807191ff02084d3c-dirty

author	petr-novak
date	Wed, 02 Aug 2023 11:31:12 +0000
parents	768883847008
children	36c418bca8b2

comparison

equal deleted inserted replaced

-:f4ed6a65a2ff
+:58807b35777a
 <tool id="paired_fastq_filtering" name="Preprocessing of FASTQ paired-end reads">
 <stdio>
-<exit_code range="1:" level="fatal" description="Error" />
+<exit_code range="1:" level="fatal" description="Error" version="1.0.0.3"/>
 </stdio>
 <description>
 Preprocessing of paired-end reads in FASTQ format
 including trimming, quality filtering, cutadapt filtering and interlacing. Broken
 pairs are discarded.
 </description>
 <requirements>
 <requirement type="package">blast</requirement>
 <requirement type="package">cutadapt</requirement>
 <requirement type="package">bioconductor-shortread</requirement>
 <requirement type="package">r-optparse</requirement>
 </requirements>
-<command interpreter="bash">
+<required_files>
-paired_fastq_filtering_wrapper.sh -a ${A} -b ${B}  -o ${paired} -c ${cut_off} -p ${percent_above} -N ${max_n} $rename -G ${png_output}
+<include type="literal" path="paired_fastq_filtering_wrapper.sh"/>
+<include type="literal" path="paired_fastq_filtering.R"/>
-#if $sampling.sequence_sampling :
+<include type="literal" path="fasta_interlacer.py"/>
--n $sampling.sample_size
+</required_files>
-#end if
+<command>
+bash '$__tool_directory__'/paired_fastq_filtering_wrapper.sh -a ${A} -b ${B} -o
-#if $trimming.sequence_trimming :
+${paired} -c ${cut_off} -p ${percent_above} -N ${max_n} $rename -G ${png_output}
--e $trimming.trim_end -s $trimming.trim_start
-#end if
+#if $sampling.sequence_sampling :
+-n $sampling.sample_size
-#if $cutadapt.use_custom :
+#end if
--C "${cutadapt.custom_options}"
-#end if
+#if $trimming.sequence_trimming :
+-e $trimming.trim_end -s $trimming.trim_start
-#if $similarity_filtering.include :
+#end if
--F "${similarity_filtering.filter_database}"
-#end if
+#if $cutadapt.use_custom :
+-C "${cutadapt.custom_options}"
-</command>
+#end if
-<inputs>
+#if $similarity_filtering.include :
-<param format="fastq,fastq.gz" type="data" name="A" label="Left-hand reads" />
+-F "${similarity_filtering.filter_database}"
+#end if
-<param format="fastq,fastq.gz" type="data" name="B" label="Right-hand reads" />
+</command>
-<conditional name="sampling">
-<param name="sequence_sampling" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Read sampling"/>
+<inputs>
-	    <when value="false">
+<param format="fastq,fastq.gz" type="data" name="A" label="Left-hand reads"/>
-<!-- do nothing here -->
-</when>
+<param format="fastq,fastq.gz" type="data" name="B" label="Right-hand reads"/>
-<when value="true">
-		  <param name="sample_size" type="integer" label="Sample size (number of pairs)" help="How many read pairs should be sampled" value="500000" min="0"/>
+<conditional name="sampling">
-</when>
+<param name="sequence_sampling" type="boolean" truevalue="true"
-</conditional>
+falsevalue="false" checked="False" label="Read sampling"/>
+<when value="false">
-<param type="integer" name="cut_off" label="Quality cutoff" value="10" min="0" help="See below how to correctly set the quality cutoff" />
+<!-- do nothing here -->
-<param type="integer" name="percent_above" label="Percent above cutoff" value="95" min="0"
+</when>
-help="Percentage of bases in the read that must have quality equal to or higher than the cutoff value" />
+<when value="true">
+<param name="sample_size" type="integer"
-<conditional name="trimming">
+label="Sample size (number of pairs)"
-<param name="sequence_trimming" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Trim reads"/>
+help="How many read pairs should be sampled" value="500000"
-<when value="false">
+min="0"/>
-<!-- do nothing here -->
+</when>
-</when>
+</conditional>
-<when value="true">
-<param type="integer" name="trim_start" label="Start position" value="1" min="1"
+<param type="integer" name="cut_off" label="Quality cutoff" value="10" min="0"
-help="Reads are trimmed at the specified start" />
+help="See below how to correctly set the quality cutoff"/>
-<param type="integer" name="trim_end" label="End position" value="100" min="1"
+<param type="integer" name="percent_above" label="Percent above cutoff" value="95"
-help="Reads are trimmed to the specified end position, shorted sequences are discarded" />
+min="0"
-</when>
+help="Percentage of bases in the read that must have quality equal to or higher than the cutoff value"/>
-</conditional>
+<conditional name="trimming">
-	<param name="max_n" type="integer" label="Maximum Ns" help="Maximal number of Ns allowed in reads" value="0" min="0" max="10"/>
+<param name="sequence_trimming" type="boolean" truevalue="true"
+falsevalue="false" checked="False" label="Trim reads"/>
-<conditional name="cutadapt">
+<when value="false">
-<param name="use_custom" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Custom cutadapt options"/>
+<!-- do nothing here -->
-	    <when value="false">
+</when>
-<!-- do nothing here -->
+<when value="true">
-</when>
+<param type="integer" name="trim_start" label="Start position" value="1"
-<when value="true">
+min="1"
-		  <param name="custom_options" type="text" area="True" size="8x30"  label="Custom options" help="Consult cutadapt for usage" value="">
+help="Reads are trimmed at the specified start"/>
-<sanitizer sanitize="False"/>
+<param type="integer" name="trim_end" label="End position" value="100"
-</param>>
+min="1"
-</when>
+help="Reads are trimmed to the specified end position; shorter sequences are discarded"/>
-</conditional>
+</when>
-<conditional name="similarity_filtering">
+</conditional>
-	    <param name="include" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use similarity search filtering"/>
+<param name="max_n" type="integer" label="Maximum Ns"
-	    <when value="false">
+help="Maximal number of Ns allowed in reads" value="0" min="0" max="10"/>
-<!-- do nothing here -->
-</when>
+<conditional name="cutadapt">
-<when value="true">
+<param name="use_custom" type="boolean" truevalue="true" falsevalue="false"
+checked="False" label="Custom cutadapt options"/>
-		  <param name="filter_database" format="fasta" type="data" label="Sequence filter database" help="Provide DNA sequences in FASTA format. Reads that have at least 90% similarity over 90% of their length to sequence in the filter database will be removed. This option is suitable for removing organellar or other contaminating sequences."/>
+<when value="false">
-</when>
+<!-- do nothing here -->
-</conditional>
+</when>
+<when value="true">
-<param name="rename" type="boolean" truevalue="-R" falsevalue="" checked="True" label="Rename reads" help="By default, original read names are used. In case your reads do not follow proper naming scheme to label paired-end mates, use this option. All read pairs must be complete!"/>
+<param name="custom_options" type="text" area="True" size="8x30"
-</inputs>
+label="Custom options" help="Consult cutadapt for usage" value="">
+<sanitizer sanitize="False"/>
+</param>
-<outputs>
+
-<data format="fasta" name="paired" label="Interlaced paired reads from datasets ${A.hid} and ${B.hid} "/>
+</when>
-<data format="png" name="png_output" label="Nucleotide composition after filtering of ${A.hid} and ${B.hid} "/>"
+</conditional>
-</outputs>
+<conditional name="similarity_filtering">
+<param name="include" type="boolean" truevalue="true" falsevalue="false"
-<tests>
+checked="False" label="Use similarity search filtering"/>
-<test>
+<when value="false">
-<param name="A" value="ERR215189_1_part.fastq.gz" />
+<!-- do nothing here -->
-<param name="B" value="ERR215189_2_part.fastq.gz" />
+</when>
-<param name="max_n" value="0"/>
+<when value="true">
-<param name="cut_off" value="10" />
-<param name="percent_above" value="95" />
+<param name="filter_database" format="fasta" type="data"
-<output name="output" value="paired_output.fasta" />
+label="Sequence filter database"
-<output name="png_output" value="paired_output.png" />
+help="Provide DNA sequences in FASTA format. Reads that have at least 90% similarity over 90% of their length to sequence in the filter database will be removed. This option is suitable for removing organellar or other contaminating sequences."/>
-</test>
+</when>
-</tests>
+</conditional>
-<help>
+<param name="rename" type="boolean" truevalue="-R" falsevalue="" checked="True"
-**What it does**
+label="Rename reads"
+help="By default, original read names are used. In case your reads do not follow a proper naming scheme to label paired-end mates, use this option. All read pairs must be complete!"/>
-This tool is designed to make memory efficient preprocessing of two
+</inputs>
-fastq files. Output of this file can be used as input of RepeatExplorer clustering.
-Input files can be in GNU zipped archive (.gz extension).
-Reads are filtered based on the quality, presence of N bases and
+<outputs>
-adapters. Two input fastq files are procesed in parallel. Only complete pair
+<data format="fasta" name="paired"
-are kept. As the input files are process in chunks, it is required that
+label="Interlaced paired reads from datasets ${A.hid} and ${B.hid} "/>
-pair reads are complete and in the same order in both input files. All
+<data format="png" name="png_output"
-reads which pass the quality filter fill be writen into output files.
+label="Nucleotide composition after filtering of ${A.hid} and ${B.hid} "/>
-If sampling is specified, only sample of sequences will be
+</outputs>
-returned. Cutadapt us run with this options::
---anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
+<tests>
---anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
+<test>
---anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
+<param name="A" value="ERR215189_1_part.fastq.gz"/>
---anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
+<param name="B" value="ERR215189_2_part.fastq.gz"/>
---anywhere='CAAGCAGAAGACGGCATACGAGAT'
+<param name="max_n" value="0"/>
---anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
+<param name="cut_off" value="10"/>
---error-rate=0.05
+<param name="percent_above" value="95"/>
---times=1 --overlap=15 --discard
+<output name="output" value="paired_output.fasta"/>
+<output name="png_output" value="paired_output.png"/>
+</test>
-**Order of fastq files processing**
+</tests>
-1. Trimming (optional)
+<help>
-#. Filter by quality
+**What it does**
-#. Discard single reads, keep complete pairs
-#. Cutadapt filtering
+This tool is designed to make memory efficient preprocessing of two
-#. Discard single reads, keep complete pairs
+fastq files. Output of this tool can be used as input of RepeatExplorer
-#. Sampling (optional)
+clustering.
-#. Interlacing two fasta files
+Input files can be in GNU zipped archive (.gz extension).
+Reads are filtered based on the quality, presence of N bases and
-**Quality setting cutoff**
+adapters. Two input fastq files are processed in parallel. Only complete pairs
+are kept. As the input files are processed in chunks, it is required that
-To correctly set quality cutoff, you need to know how the quality is encoded in your fastq file, default
+paired reads are complete and in the same order in both input files. All
-filtering which is suitable for Sanger and Illumina 1.8 encoding is shown below::
+reads which pass the quality filter will be written into output files.
+If sampling is specified, only a sample of sequences will be
+returned. Cutadapt is run with these options::
-Default filtering cutoff
-|
+--anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
-|
+--anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
-V
+--anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
-SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
+--anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
-..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
+--anywhere='CAAGCAGAAGACGGCATACGAGAT'
-...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
+--anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
-.................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
+--error-rate=0.05
-LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
+--times=1 --overlap=15 --discard
-!"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
-|                         |    |        |                              |                     |
-33                        59   64       73                            104                   126
+**Order of fastq files processing**
-0........................26...31.......40
--5....0........9.............................40
+1. Trimming (optional)
-0........9.............................40
+#. Filter by quality
-3.....9.............................40
+#. Discard single reads, keep complete pairs
-0.2......................26...31........41
+#. Cutadapt filtering
+#. Discard single reads, keep complete pairs
-S - Sanger        Phred+33,  raw reads typically (0, 40)
+#. Sampling (optional)
-X - Solexa        Solexa+64, raw reads typically (-5, 40)
+#. Interlacing two fasta files
-I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
-J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40)
+**Quality setting cutoff**
-with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
-(Note: See discussion above).
+To correctly set quality cutoff, you need to know how the quality is encoded in
-L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)
+your fastq file. The default
+filtering, which is suitable for Sanger and Illumina 1.8 encoding, is shown below::
-</help>
+Default filtering cutoff
+|
+|
+V
+SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
+..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
+...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
+.................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
+LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
+!"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+|                         |    |        |                              |                     |
+33                        59   64       73                            104                   126
+0........................26...31.......40
+-5....0........9.............................40
+0........9.............................40
+3.....9.............................40
+0.2......................26...31........41
+S - Sanger        Phred+33,  raw reads typically (0, 40)
+X - Solexa        Solexa+64, raw reads typically (-5, 40)
+I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
+J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40)
+with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
+(Note: See discussion above).
+L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)
+</help>
 </tool>

Mercurial > repos > petr-novak > re_utils

comparison paired_fastq_filtering.xml @ 22:58807b35777a draft