Mercurial > repos > petr-novak > re_utils

diff paired_fastq_filtering.xml @ 22:58807b35777a draft
planemo upload commit 20bdf879b52796d3fb251a20807191ff02084d3c-dirty
author: petr-novak
date: Wed, 02 Aug 2023 11:31:12 +0000
parents: 768883847008
children: 36c418bca8b2
--- a/paired_fastq_filtering.xml	Thu Jul 27 09:46:13 2023 +0000
+++ b/paired_fastq_filtering.xml	Wed Aug 02 11:31:12 2023 +0000
@@ -1,184 +1,212 @@
 <tool id="paired_fastq_filtering" name="Preprocessing of FASTQ paired-end reads">
-  <stdio>
-     <exit_code range="1:" level="fatal" description="Error" />
-  </stdio>
-  <description>
-    Preprocessing of paired-end reads in FASTQ format
-    including trimming, quality filtering, cutadapt filtering and interlacing. Broken
-    pairs are discarded.
-  </description>
-  <requirements>
-    <requirement type="package">blast</requirement>
-    <requirement type="package">cutadapt</requirement>
-    <requirement type="package">bioconductor-shortread</requirement>
-    <requirement type="package">r-optparse</requirement>
-  </requirements>
-  <command interpreter="bash">
-    paired_fastq_filtering_wrapper.sh -a ${A} -b ${B}  -o ${paired} -c ${cut_off} -p ${percent_above} -N ${max_n} $rename -G ${png_output}
+    <stdio>
+        <exit_code range="1:" level="fatal" description="Error" version="1.0.0.3"/>
+    </stdio>
+    <description>
+        Preprocessing of paired-end reads in FASTQ format
+        including trimming, quality filtering, cutadapt filtering and interlacing. Broken
+        pairs are discarded.
+    </description>
+    <requirements>
+        <requirement type="package">blast</requirement>
+        <requirement type="package">cutadapt</requirement>
+        <requirement type="package">bioconductor-shortread</requirement>
+        <requirement type="package">r-optparse</requirement>
+    </requirements>
+    <required_files>
+        <include type="literal" path="paired_fastq_filtering_wrapper.sh"/>
+        <include type="literal" path="paired_fastq_filtering.R"/>
+        <include type="literal" path="fasta_interlacer.py"/>
+    </required_files>
+    <command>
+        bash '$__tool_directory__'/paired_fastq_filtering_wrapper.sh -a ${A} -b ${B} -o
+        ${paired} -c ${cut_off} -p ${percent_above} -N ${max_n} $rename -G ${png_output}
 
-    #if $sampling.sequence_sampling :
-    -n $sampling.sample_size
-    #end if
+        #if $sampling.sequence_sampling :
+        -n $sampling.sample_size
+        #end if
 
-    #if $trimming.sequence_trimming :
-    -e $trimming.trim_end -s $trimming.trim_start
-    #end if
+        #if $trimming.sequence_trimming :
+        -e $trimming.trim_end -s $trimming.trim_start
+        #end if
 
-    #if $cutadapt.use_custom :
-    -C "${cutadapt.custom_options}"
-    #end if
+        #if $cutadapt.use_custom :
+        -C "${cutadapt.custom_options}"
+        #end if
 
-    #if $similarity_filtering.include :
-    -F "${similarity_filtering.filter_database}"
-    #end if
+        #if $similarity_filtering.include :
+        -F "${similarity_filtering.filter_database}"
+        #end if
 
-  </command>
+    </command>
 
-  <inputs>
-    <param format="fastq,fastq.gz" type="data" name="A" label="Left-hand reads" />
+    <inputs>
+        <param format="fastq,fastq.gz" type="data" name="A" label="Left-hand reads"/>
 
-    <param format="fastq,fastq.gz" type="data" name="B" label="Right-hand reads" />
+        <param format="fastq,fastq.gz" type="data" name="B" label="Right-hand reads"/>
 
-    <conditional name="sampling">
-      <param name="sequence_sampling" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Read sampling"/>
-	    <when value="false">
-        <!-- do nothing here -->
-      </when>
-      <when value="true">
-   		  <param name="sample_size" type="integer" label="Sample size (number of pairs)" help="How many read pairs should be sampled" value="500000" min="0"/>
-      </when>
-    </conditional>
+        <conditional name="sampling">
+            <param name="sequence_sampling" type="boolean" truevalue="true"
+                   falsevalue="false" checked="False" label="Read sampling"/>
+            <when value="false">
+                <!-- do nothing here -->
+            </when>
+            <when value="true">
+                <param name="sample_size" type="integer"
+                       label="Sample size (number of pairs)"
+                       help="How many read pairs should be sampled" value="500000"
+                       min="0"/>
+            </when>
+        </conditional>
 
-    <param type="integer" name="cut_off" label="Quality cutoff" value="10" min="0" help="See below how to correctly set the quality cutoff" />
-    <param type="integer" name="percent_above" label="Percent above cutoff" value="95" min="0"
-           help="Percentage of bases in the read that must have quality equal to or higher than the cutoff value" />
+        <param type="integer" name="cut_off" label="Quality cutoff" value="10" min="0"
+               help="See below how to correctly set the quality cutoff"/>
+        <param type="integer" name="percent_above" label="Percent above cutoff" value="95"
+               min="0"
+               help="Percentage of bases in the read that must have quality equal to or higher than the cutoff value"/>
 
-    <conditional name="trimming">
-      <param name="sequence_trimming" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Trim reads"/>
-      <when value="false">
-        <!-- do nothing here -->
-      </when>      
-      <when value="true">
-        <param type="integer" name="trim_start" label="Start position" value="1" min="1"
-               help="Reads are trimmed at the specified start" />
-        <param type="integer" name="trim_end" label="End position" value="100" min="1"
-               help="Reads are trimmed to the specified end position, shorted sequences are discarded" />
-      </when>      
+        <conditional name="trimming">
+            <param name="sequence_trimming" type="boolean" truevalue="true"
+                   falsevalue="false" checked="False" label="Trim reads"/>
+            <when value="false">
+                <!-- do nothing here -->
+            </when>
+            <when value="true">
+                <param type="integer" name="trim_start" label="Start position" value="1"
+                       min="1"
+                       help="Reads are trimmed at the specified start"/>
+                <param type="integer" name="trim_end" label="End position" value="100"
+                       min="1"
+                       help="Reads are trimmed to the specified end position, shorter sequences are discarded"/>
+            </when>
 
-    </conditional>
-   	<param name="max_n" type="integer" label="Maximum Ns" help="Maximal number of Ns allowed in reads" value="0" min="0" max="10"/>
+        </conditional>
+        <param name="max_n" type="integer" label="Maximum Ns"
+               help="Maximal number of Ns allowed in reads" value="0" min="0" max="10"/>
+
+        <conditional name="cutadapt">
+            <param name="use_custom" type="boolean" truevalue="true" falsevalue="false"
+                   checked="False" label="Custom cutadapt options"/>
+            <when value="false">
+                <!-- do nothing here -->
+            </when>
+            <when value="true">
+                <!-- Forwarded to the wrapper as -C "..."; the sanitizer is disabled for this field. -->
+                <param name="custom_options" type="text" area="True" size="8x30"
+                       label="Custom options" help="Consult cutadapt for usage" value="">
+                    <sanitizer sanitize="False"/>
+                </param>
+            </when>
+        </conditional>
 
-    <conditional name="cutadapt">
-      <param name="use_custom" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Custom cutadapt options"/>
-	    <when value="false">
-        <!-- do nothing here -->
-      </when>
-      <when value="true">
-   		  <param name="custom_options" type="text" area="True" size="8x30"  label="Custom options" help="Consult cutadapt for usage" value="">
-          <sanitizer sanitize="False"/>
-          </param>>
-      </when>
-    </conditional>
+        <conditional name="similarity_filtering">
+            <param name="include" type="boolean" truevalue="true" falsevalue="false"
+                   checked="False" label="Use similarity search filtering"/>
+            <when value="false">
+                <!-- do nothing here -->
+            </when>
+            <when value="true">
 
-    <conditional name="similarity_filtering">
-	    <param name="include" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use similarity search filtering"/>
-	    <when value="false">
-        <!-- do nothing here -->
-      </when>
-      <when value="true">
-        
-   		  <param name="filter_database" format="fasta" type="data" label="Sequence filter database" help="Provide DNA sequences in FASTA format. Reads that have at least 90% similarity over 90% of their length to sequence in the filter database will be removed. This option is suitable for removing organellar or other contaminating sequences."/>
-      </when>
-    </conditional>
+                <param name="filter_database" format="fasta" type="data"
+                       label="Sequence filter database"
+                       help="Provide DNA sequences in FASTA format. Reads that have at least 90% similarity over 90% of their length to sequence in the filter database will be removed. This option is suitable for removing organellar or other contaminating sequences."/>
+            </when>
+        </conditional>
 
-    <param name="rename" type="boolean" truevalue="-R" falsevalue="" checked="True" label="Rename reads" help="By default, original read names are used. In case your reads do not follow proper naming scheme to label paired-end mates, use this option. All read pairs must be complete!"/>
-  </inputs>
+        <param name="rename" type="boolean" truevalue="-R" falsevalue="" checked="True"
+               label="Rename reads"
+               help="By default, original read names are used. In case your reads do not follow proper naming scheme to label paired-end mates, use this option. All read pairs must be complete!"/>
+    </inputs>
 
 
-  <outputs>
-    <data format="fasta" name="paired" label="Interlaced paired reads from datasets ${A.hid} and ${B.hid} "/>
-    <data format="png" name="png_output" label="Nucleotide composition after filtering of ${A.hid} and ${B.hid} "/>"
-  </outputs>
+    <outputs>
+        <!-- Written by the wrapper: -o receives "paired", -G receives "png_output". -->
+        <data format="fasta" name="paired" label="Interlaced paired reads from datasets ${A.hid} and ${B.hid} "/>
+        <data format="png" name="png_output"
+              label="Nucleotide composition after filtering of ${A.hid} and ${B.hid} "/>
+    </outputs>
 
 
-  <tests>
-    <test>
-      <param name="A" value="ERR215189_1_part.fastq.gz" />
-      <param name="B" value="ERR215189_2_part.fastq.gz" />
-      <param name="max_n" value="0"/>
-      <param name="cut_off" value="10" />
-      <param name="percent_above" value="95" />
-      <output name="output" value="paired_output.fasta" />
-      <output name="png_output" value="paired_output.png" />
-    </test>
-  </tests>
+    <tests>
+        <test>
+            <param name="A" value="ERR215189_1_part.fastq.gz"/>
+            <param name="B" value="ERR215189_2_part.fastq.gz"/>
+            <param name="max_n" value="0"/>
+            <param name="cut_off" value="10"/>
+            <param name="percent_above" value="95"/>
+            <output name="paired" value="paired_output.fasta"/> <!-- matches the "paired" output dataset -->
+            <output name="png_output" value="paired_output.png"/>
+        </test>
+    </tests>
 
-  <help>
-**What it does**
+    <help>
+        **What it does**
 
-This tool is designed to make memory efficient preprocessing of two
-fastq files. Output of this file can be used as input of RepeatExplorer clustering.
-Input files can be in GNU zipped archive (.gz extension).
-Reads are filtered based on the quality, presence of N bases and
-adapters. Two input fastq files are procesed in parallel. Only complete pair
-are kept. As the input files are process in chunks, it is required that
-pair reads are complete and in the same order in both input files. All
-reads which pass the quality filter fill be writen into output files.
-If sampling is specified, only sample of sequences will be
-returned. Cutadapt us run with this options::
+        This tool is designed for memory-efficient preprocessing of two
+        fastq files. Output of this tool can be used as input of RepeatExplorer
+        clustering.
+        Input files can be gzip-compressed (.gz extension).
+        Reads are filtered based on the quality, presence of N bases and
+        adapters. Two input fastq files are processed in parallel. Only complete pairs
+        are kept. As the input files are processed in chunks, it is required that
+        paired reads are complete and in the same order in both input files. All
+        reads which pass the quality filter will be written into output files.
+        If sampling is specified, only a sample of sequences will be
+        returned. Cutadapt is run with these options::
 
-    --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
-    --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
-    --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
-    --anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
-    --anywhere='CAAGCAGAAGACGGCATACGAGAT'
-    --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
-    --error-rate=0.05
-    --times=1 --overlap=15 --discard
+            --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
+            --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
+            --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
+            --anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
+            --anywhere='CAAGCAGAAGACGGCATACGAGAT'
+            --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
+            --error-rate=0.05
+            --times=1 --overlap=15 --discard
 
 
-**Order of fastq files processing**
+        **Order of fastq files processing**
 
-1. Trimming (optional)       
-#. Filter by quality      
-#. Discard single reads, keep complete pairs
-#. Cutadapt filtering 
-#. Discard single reads, keep complete pairs    
-#. Sampling (optional)         
-#. Interlacing two fasta files
+        1. Trimming (optional)
+        #. Filter by quality
+        #. Discard single reads, keep complete pairs
+        #. Cutadapt filtering
+        #. Discard single reads, keep complete pairs
+        #. Sampling (optional)
+        #. Interlacing two fasta files
 
-**Quality setting cutoff**
+        **Quality setting cutoff**
 
-To correctly set quality cutoff, you need to know how the quality is encoded in your fastq file, default
-filtering which is suitable for Sanger and Illumina 1.8 encoding is shown below::
+        To correctly set quality cutoff, you need to know how the quality is encoded in
+        your fastq file, default
+        filtering which is suitable for Sanger and Illumina 1.8 encoding is shown below::
 
 
-      Default filtering cutoff
-                |   
-                |
-                V
-      SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
-      ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
-      ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
-      .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
-      LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
-      !"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
-      |                         |    |        |                              |                     |
-     33                        59   64       73                            104                   126
-      0........................26...31.......40                                
-                               -5....0........9.............................40 
-                                     0........9.............................40 
-                                        3.....9.............................40 
-      0.2......................26...31........41                              
-    
-     S - Sanger        Phred+33,  raw reads typically (0, 40)
-     X - Solexa        Solexa+64, raw reads typically (-5, 40)
-     I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
-     J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40)
-         with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold) 
-         (Note: See discussion above).
-     L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)
-    
-  </help>    
+              Default filtering cutoff
+                        |
+                        |
+                        V
+              SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
+              ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
+              ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
+              .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
+              LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
+              !"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+              |                         |    |        |                              |                     |
+             33                        59   64       73                            104                   126
+              0........................26...31.......40
+                                       -5....0........9.............................40
+                                             0........9.............................40
+                                                3.....9.............................40
+              0.2......................26...31........41
+
+             S - Sanger        Phred+33,  raw reads typically (0, 40)
+             X - Solexa        Solexa+64, raw reads typically (-5, 40)
+             I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
+             J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40)
+                 with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
+                 (Note: See discussion above).
+             L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)
+
+    </help>
 </tool>
author	petr-novak
date	Wed, 02 Aug 2023 11:31:12 +0000
parents	768883847008
children	36c418bca8b2