changeset 3:94d67b195acd draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/yac_clipper commit 6884c90d521932ae0981532929db9f5f44c8b4a2
author artbio
date Mon, 21 Jan 2019 18:46:04 -0500
parents da08e89abd18
children f7947c5a18b8
files test-data/out.fasta test-data/out.fasta.gz test-data/out.fastqsanger test-data/out.fastqsanger.gz test-data/yac.fasta test-data/yac.fasta.gz test-data/yac.fastqsanger.gz test-data/yac.out test-data/yac_fastq.out yac.py yac.xml
diffstat 11 files changed, 177 insertions(+), 60 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out.fasta	Mon Jan 21 18:46:04 2019 -0500
@@ -0,0 +1,12 @@
+>1
+TGTAAACATCCCCGACTGGCAGC
+>2
+AAAGTGCTACTACTTTTGAGTCT
+>3
+ACTGGACTTGGAGTCCGAAGGC
+>4
+AAGTGCCGCCAGGTTTTGAGTGG
+>5
+TATTGCACTTGTCCCGGCCTGAATCNCGT
+>6
+TAGCTTATCAGACTGATGTTGAC
Binary file test-data/out.fasta.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out.fastqsanger	Mon Jan 21 18:46:04 2019 -0500
@@ -0,0 +1,24 @@
+@HWI-1
+TGTAAACATCCCCGACTGGCAGC
++
+B@BBCBCCBCBCCC8A<@#####
+@HWI-2
+AAAGTGCTACTACTTTTGAGTCT
++
+BAA@7?A@@A@@B<'25?6>59:
+@HWI-3
+ACTGGACTTGGAGTCCGAAGGC
++
+BBB@@ABAAB?9B42&9;####
+@HWI-4
+AAGTGCCGCCAGGTTTTGAGTGG
++
+AB?5;3>/=?>=;416481####
+@HWI-5
+TATTGCACTTGTCCCGGCCTGAATCNCGT
++
+BCB=:ACCBB=>BB8<-############
+@HWI-6
+TAGCTTATCAGACTGATGTTGAC
++
+BBBBBCBBCB;>AA',9=18?1:
Binary file test-data/out.fastqsanger.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/yac.fasta	Mon Jan 21 18:46:04 2019 -0500
@@ -0,0 +1,20 @@
+>1
+TGTAAACATCCCCGACTGGCAGCATNTCGTATGCCG
+>2
+AAAGTGCTACTACTTTTGAGTCTATNTCGTACGCCG
+>3
+TAGCTTATCAGACTGATGTTGACACNTCGTATGCCG
+>4
+ACTGGACTTGGAGTCCGAAGGCATCNCGTATTCCGT
+>5
+AAGTGCCGCCAGGTTTTGAGTGGATNTCGTATGGCG
+>6
+TATTGCACTTGTCCCGGCCTGAATCNCGTATCCCGT
+>7
+TGGTAGACTATGGAACGTAGGATCTNGCATGCCGCC
+>8
+AGTGGTAGAGCATTTGAATCTCGTANGCCGTCTTCT
+>9
+TAGCTTATCAGACTGATGTTGACATNTCGTACGCCG
+>10
+TTTGGCAATGGTAGAACTCCCACACNTCGTAGGCCG
Binary file test-data/yac.fasta.gz has changed
Binary file test-data/yac.fastqsanger.gz has changed
--- a/test-data/yac.out	Sat Oct 13 17:09:16 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
->1
-TGTAAACATCCCCGACTGGCAGC
->2
-AAAGTGCTACTACTTTTGAGTCT
->3
-ACTGGACTTGGAGTCCGAAGGC
->4
-AAGTGCCGCCAGGTTTTGAGTGG
->5
-TATTGCACTTGTCCCGGCCTGAATCNCGT
->6
-TAGCTTATCAGACTGATGTTGAC
--- a/test-data/yac_fastq.out	Sat Oct 13 17:09:16 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-@HWI-1
-TGTAAACATCCCCGACTGGCAGC
-+
-B@BBCBCCBCBCCC8A<@#####
-@HWI-2
-AAAGTGCTACTACTTTTGAGTCT
-+
-BAA@7?A@@A@@B<'25?6>59:
-@HWI-3
-ACTGGACTTGGAGTCCGAAGGC
-+
-BBB@@ABAAB?9B42&9;####
-@HWI-4
-AAGTGCCGCCAGGTTTTGAGTGG
-+
-AB?5;3>/=?>=;416481####
-@HWI-5
-TATTGCACTTGTCCCGGCCTGAATCNCGT
-+
-BCB=:ACCBB=>BB8<-############
-@HWI-6
-TAGCTTATCAGACTGATGTTGAC
-+
-BBBBBCBBCB;>AA',9=18?1:
--- a/yac.py	Sat Oct 13 17:09:16 2018 -0400
+++ b/yac.py	Mon Jan 21 18:46:04 2019 -0500
@@ -46,6 +46,12 @@
         self.minsize = int(minsize)
         self.maxsize = int(maxsize)
         self.Nmode = Nmode
+        for line in open(inputfile):
+            if line[0] == "@":
+                self.inputformat = "fastq"
+                break
+            elif line[0] == ">":
+                self.inputformat = "fasta"
 
         def motives(sequence):
             '''
@@ -65,13 +71,22 @@
     def scanadapt(self, adaptmotives=[], sequence="", qscore=""):
         '''scans sequence for adapter motives'''
         match_position = sequence.rfind(adaptmotives[0])
-        if match_position != -1:
-            return sequence[:match_position], qscore[:match_position]
-        for motif in adaptmotives[1:]:
-            match_position = sequence.rfind(motif)
+        if qscore:
             if match_position != -1:
                 return sequence[:match_position], qscore[:match_position]
-        return sequence, qscore
+            for motif in adaptmotives[1:]:
+                match_position = sequence.rfind(motif)
+                if match_position != -1:
+                    return sequence[:match_position], qscore[:match_position]
+            return sequence, qscore
+        else:
+            if match_position != -1:
+                return sequence[:match_position]
+            for motif in adaptmotives[1:]:
+                match_position = sequence.rfind(motif)
+                if match_position != -1:
+                    return sequence[:match_position]
+            return sequence
 
     def write_output(self, id, read, qscore, output):
         if self.output_format == "fasta":
@@ -80,9 +95,12 @@
             block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore)
         output.write(block)
 
-    def handle_io(self):
-        '''Open input file, pass read sequence and read qscore to clipping function.
-        Pass clipped read and qscore to output function.'''
+    def fasta_in_write_output(self, id, read, output):
+        output.write(">{0}\n{1}\n".format(id, read))
+
+    def handle_io_fastq(self):
+        '''Open input fastq file, pass read sequence and read qscore to
+        scanadapt function. Pass clipped read and qscore to output function.'''
         id = 0
         output = open(self.outputfile, "a")
         with open(self.inputfile, "r") as input:
@@ -100,12 +118,32 @@
                         continue
                     id += 1
                     self.write_output(id, trimmed_read, trimmed_qscore, output)
-            output.close()
+        output.close()
+
+    def handle_io_fasta(self):
+        '''Open input fasta file, pass each read sequence to the scanadapt
+        function. Pass the clipped read to the fasta output function.'''
+        id = 0
+        output = open(self.outputfile, "a")
+        with open(self.inputfile, "r") as input:
+            block_gen = islice(input, 1, None, 2)
+            for i, line in enumerate(block_gen):
+                read = line.rstrip()
+                trimmed_read = self.scanadapt(self.adaptmotifs, read)
+                if self.minsize <= len(trimmed_read) <= self.maxsize:
+                    if (self.Nmode == "reject") and ("N" in trimmed_read):
+                        continue
+                    id += 1
+                    self.fasta_in_write_output(id, trimmed_read, output)
+        output.close()
 
 
 def main(*argv):
     instanceClip = Clip(*argv)
-    instanceClip.handle_io()
+    if instanceClip.inputformat == "fasta":
+        instanceClip.handle_io_fasta()
+    else:
+        instanceClip.handle_io_fastq()
 
 
 if __name__ == "__main__":
--- a/yac.xml	Sat Oct 13 17:09:16 2018 -0400
+++ b/yac.xml	Mon Jan 21 18:46:04 2019 -0500
@@ -1,22 +1,36 @@
-<tool id="yac" name="Clip adapter" version="2.1.1">
+<tool id="yac" name="Clip adapter" version="2.2.0">
     <description />
     <command detect_errors="exit_code"><![CDATA[
         python $__tool_directory__/yac.py
             --input $input
-            --output $output
-            --output_format "$out_format"
+            --output 'clip.tmp'
+            --output_format
+            #if $out_format == 'fasta' or $out_format == 'fastagz':
+                'fasta'
+            #else
+                'fastq'
+            #end if
             --adapter_to_clip $clip_source.clip_sequence
             --min $min
             --max $max
-             --Nmode $Nmode
+            --Nmode $Nmode &&
+            #if ($out_format == 'fastagz') or ($out_format == 'fastqgz'):
+                gzip -c 'clip.tmp' >  $output
+            #else
+                mv clip.tmp $output
+            #end if
+            
     ]]></command>
     <inputs>
-        <param format="fastq" label="Source file" name="input" type="data" />
+        <param format="fasta,fastq" label="Source file" name="input" type="data" />
         <param label="min size" name="min" size="4" type="integer" value="15" />
         <param label="max size" name="max" size="4" type="integer" value="36" />
-        <param label="Select output format" name="out_format" type="select">
-            <option selected="true" value="fasta">Fasta format</option>
-            <option value="fastq">Fastq (Sanger) format</option>
+        <param label="Select output format" name="out_format" type="select"
+               help="be careful not to select a fastq format for your output if your input has a fasta format">
+            <option value="fasta">Fasta</option>
+            <option value="fastq" selected="true" >Fastq (Sanger)</option>
+            <option value="fastagz">gzipped Fasta</option>
+            <option value="fastqgz">gzipped Fastq (Sanger)</option>
         </param>
         <param label="Accept reads containing N?" name="Nmode" type="select">
             <option selected="True" value="accept">accept</option>
@@ -41,9 +55,12 @@
         </conditional>
     </inputs>
     <outputs>
-        <data format_source="input" metadata_source="input" name="output" label="Clipping of ${input.name}">
+        <data format_source="input" metadata_source="input" name="output" label="Clipped ${input.name}-then-${out_format}">
           <change_format>
               <when input="out_format" value="fasta" format="fasta" />
+              <when input="out_format" value="fastq" format="fastqsanger" />
+              <when input="out_format" value="fastagz" format="fasta.gz" />
+              <when input="out_format" value="fastqgz" format="fastqsanger.gz" />
           </change_format>
         </data>
     </outputs>
@@ -55,7 +72,8 @@
             <param name="clip_source_list" value="prebuilt" />
             <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
             <param name="Nmode" value="accept" />
-            <output file="yac.out" name="output" />
+            <param name="out_format" value="fastq" />
+            <output file="out.fastqsanger" name="output" />
         </test>
         <test>
             <param ftype="fastqsanger" name="input" value="yac.fastq" />
@@ -64,8 +82,48 @@
             <param name="clip_source_list" value="prebuilt" />
             <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
             <param name="Nmode" value="accept" />
-            <param name="out_format" value="fastq" />
-            <output file="yac_fastq.out" name="output" />
+            <param name="out_format" value="fasta" />
+            <output file="out.fasta" name="output" />
+        </test>
+        <test>
+            <param ftype="fastqsanger.gz" name="input" value="yac.fastqsanger.gz" />
+            <param name="min" value="18" />
+            <param name="max" value="29" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
+            <param name="Nmode" value="accept" />
+            <param name="out_format" value="fastqgz" />
+            <output file="out.fastqsanger.gz" name="output" decompress="True" />
+        </test>
+        <test>
+            <param ftype="fastqsanger.gz" name="input" value="yac.fastqsanger.gz" />
+            <param name="min" value="18" />
+            <param name="max" value="29" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
+            <param name="Nmode" value="accept" />
+            <param name="out_format" value="fastagz" />
+            <output file="out.fasta.gz" name="output" decompress="True" />
+        </test>
+        <test>
+            <param ftype="fasta.gz" name="input" value="yac.fasta.gz" />
+            <param name="min" value="18" />
+            <param name="max" value="29" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
+            <param name="out_format" value="fasta" />
+            <param name="Nmode" value="accept" />
+            <output file="out.fasta" name="output" />
+        </test>
+        <test>
+            <param ftype="fasta.gz" name="input" value="yac.fasta.gz" />
+            <param name="min" value="18" />
+            <param name="max" value="29" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
+            <param name="Nmode" value="accept" />
+            <param name="out_format" value="fastagz" />
+            <output file="out.fasta.gz" name="output" decompress="True" />
         </test>
     </tests>
     <help>
@@ -81,9 +139,10 @@
 
 **Inputs**
 
-1. A fastq file of reads to be clipped
+1. A fastq or fasta file of reads to be clipped
 2. Select the size of the reads to be kept
-3. Select an output format when input is a fastq file (this may be fastq or fastq)
+3. Select an output format. When input is a fastq file, this may be fastq or fasta, whereas
+   when input is a fasta file, this may only be fasta.
 4. Select whether you wish or do not wish to keep clipped sequences with unknown nucleotides (N)
 5. Select a pre-built adapter sequence or enter your own sequence (at least 7 nucleotides long)