Mercurial > repos > artbio > yac_clipper
changeset 3:94d67b195acd draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/yac_clipper commit 6884c90d521932ae0981532929db9f5f44c8b4a2
author | artbio |
---|---|
date | Mon, 21 Jan 2019 18:46:04 -0500 |
parents | da08e89abd18 |
children | f7947c5a18b8 |
files | test-data/out.fasta test-data/out.fasta.gz test-data/out.fastqsanger test-data/out.fastqsanger.gz test-data/yac.fasta test-data/yac.fasta.gz test-data/yac.fastqsanger.gz test-data/yac.out test-data/yac_fastq.out yac.py yac.xml |
diffstat | 11 files changed, 177 insertions(+), 60 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out.fasta Mon Jan 21 18:46:04 2019 -0500 @@ -0,0 +1,12 @@ +>1 +TGTAAACATCCCCGACTGGCAGC +>2 +AAAGTGCTACTACTTTTGAGTCT +>3 +ACTGGACTTGGAGTCCGAAGGC +>4 +AAGTGCCGCCAGGTTTTGAGTGG +>5 +TATTGCACTTGTCCCGGCCTGAATCNCGT +>6 +TAGCTTATCAGACTGATGTTGAC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out.fastqsanger Mon Jan 21 18:46:04 2019 -0500 @@ -0,0 +1,24 @@ +@HWI-1 +TGTAAACATCCCCGACTGGCAGC ++ +B@BBCBCCBCBCCC8A<@##### +@HWI-2 +AAAGTGCTACTACTTTTGAGTCT ++ +BAA@7?A@@A@@B<'25?6>59: +@HWI-3 +ACTGGACTTGGAGTCCGAAGGC ++ +BBB@@ABAAB?9B42&9;#### +@HWI-4 +AAGTGCCGCCAGGTTTTGAGTGG ++ +AB?5;3>/=?>=;416481#### +@HWI-5 +TATTGCACTTGTCCCGGCCTGAATCNCGT ++ +BCB=:ACCBB=>BB8<-############ +@HWI-6 +TAGCTTATCAGACTGATGTTGAC ++ +BBBBBCBBCB;>AA',9=18?1:
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/yac.fasta Mon Jan 21 18:46:04 2019 -0500 @@ -0,0 +1,20 @@ +>1 +TGTAAACATCCCCGACTGGCAGCATNTCGTATGCCG +>2 +AAAGTGCTACTACTTTTGAGTCTATNTCGTACGCCG +>3 +TAGCTTATCAGACTGATGTTGACACNTCGTATGCCG +>4 +ACTGGACTTGGAGTCCGAAGGCATCNCGTATTCCGT +>5 +AAGTGCCGCCAGGTTTTGAGTGGATNTCGTATGGCG +>6 +TATTGCACTTGTCCCGGCCTGAATCNCGTATCCCGT +>7 +TGGTAGACTATGGAACGTAGGATCTNGCATGCCGCC +>8 +AGTGGTAGAGCATTTGAATCTCGTANGCCGTCTTCT +>9 +TAGCTTATCAGACTGATGTTGACATNTCGTACGCCG +>10 +TTTGGCAATGGTAGAACTCCCACACNTCGTAGGCCG
--- a/test-data/yac.out Sat Oct 13 17:09:16 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,12 +0,0 @@ ->1 -TGTAAACATCCCCGACTGGCAGC ->2 -AAAGTGCTACTACTTTTGAGTCT ->3 -ACTGGACTTGGAGTCCGAAGGC ->4 -AAGTGCCGCCAGGTTTTGAGTGG ->5 -TATTGCACTTGTCCCGGCCTGAATCNCGT ->6 -TAGCTTATCAGACTGATGTTGAC
--- a/test-data/yac_fastq.out Sat Oct 13 17:09:16 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ -@HWI-1 -TGTAAACATCCCCGACTGGCAGC -+ -B@BBCBCCBCBCCC8A<@##### -@HWI-2 -AAAGTGCTACTACTTTTGAGTCT -+ -BAA@7?A@@A@@B<'25?6>59: -@HWI-3 -ACTGGACTTGGAGTCCGAAGGC -+ -BBB@@ABAAB?9B42&9;#### -@HWI-4 -AAGTGCCGCCAGGTTTTGAGTGG -+ -AB?5;3>/=?>=;416481#### -@HWI-5 -TATTGCACTTGTCCCGGCCTGAATCNCGT -+ -BCB=:ACCBB=>BB8<-############ -@HWI-6 -TAGCTTATCAGACTGATGTTGAC -+ -BBBBBCBBCB;>AA',9=18?1:
--- a/yac.py Sat Oct 13 17:09:16 2018 -0400 +++ b/yac.py Mon Jan 21 18:46:04 2019 -0500 @@ -46,6 +46,12 @@ self.minsize = int(minsize) self.maxsize = int(maxsize) self.Nmode = Nmode + for line in open(inputfile): + if line[0] == "@": + self.inputformat = "fastq" + break + elif line[0] == ">": + self.inputformat = "fasta" def motives(sequence): ''' @@ -65,13 +71,22 @@ def scanadapt(self, adaptmotives=[], sequence="", qscore=""): '''scans sequence for adapter motives''' match_position = sequence.rfind(adaptmotives[0]) - if match_position != -1: - return sequence[:match_position], qscore[:match_position] - for motif in adaptmotives[1:]: - match_position = sequence.rfind(motif) + if qscore: if match_position != -1: return sequence[:match_position], qscore[:match_position] - return sequence, qscore + for motif in adaptmotives[1:]: + match_position = sequence.rfind(motif) + if match_position != -1: + return sequence[:match_position], qscore[:match_position] + return sequence, qscore + else: + if match_position != -1: + return sequence[:match_position] + for motif in adaptmotives[1:]: + match_position = sequence.rfind(motif) + if match_position != -1: + return sequence[:match_position] + return sequence def write_output(self, id, read, qscore, output): if self.output_format == "fasta": @@ -80,9 +95,12 @@ block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore) output.write(block) - def handle_io(self): - '''Open input file, pass read sequence and read qscore to clipping function. - Pass clipped read and qscore to output function.''' + def fasta_in_write_output(self, id, read, output): + output.write(">{0}\n{1}\n".format(id, read)) + + def handle_io_fastq(self): + '''Open input fastq file, pass read sequence and read qscore to + scanadapt function. Pass clipped read and qscore to output function.''' id = 0 output = open(self.outputfile, "a") with open(self.inputfile, "r") as input: @@ -100,12 +118,32 @@ continue id += 1 self.write_output(id, trimmed_read, trimmed_qscore, output) - output.close() + output.close() + + def handle_io_fasta(self): + '''Open input fasta file, pass header and read sequence to scanadapt + function. Pass clipped read and qscore to output function.''' + id = 0 + output = open(self.outputfile, "a") + with open(self.inputfile, "r") as input: + block_gen = islice(input, 1, None, 2) + for i, line in enumerate(block_gen): + read = line.rstrip() + trimmed_read = self.scanadapt(self.adaptmotifs, read) + if self.minsize <= len(trimmed_read) <= self.maxsize: + if (self.Nmode == "reject") and ("N" in trimmed_read): + continue + id += 1 + self.fasta_in_write_output(id, trimmed_read, output) + output.close() def main(*argv): instanceClip = Clip(*argv) - instanceClip.handle_io() + if instanceClip.inputformat == "fasta": + instanceClip.handle_io_fasta() + else: + instanceClip.handle_io_fastq() if __name__ == "__main__":
--- a/yac.xml Sat Oct 13 17:09:16 2018 -0400 +++ b/yac.xml Mon Jan 21 18:46:04 2019 -0500 @@ -1,22 +1,36 @@ -<tool id="yac" name="Clip adapter" version="2.1.1"> +<tool id="yac" name="Clip adapter" version="2.2.0"> <description /> <command detect_errors="exit_code"><![CDATA[ python $__tool_directory__/yac.py --input $input - --output $output - --output_format "$out_format" + --output 'clip.tmp' + --output_format + #if $out_format == 'fasta' or $out_format == 'fastagz': + 'fasta' + #else + 'fastq' + #end if --adapter_to_clip $clip_source.clip_sequence --min $min --max $max - --Nmode $Nmode + --Nmode $Nmode && + #if ($out_format == 'fastagz') or ($out_format == 'fastqgz'): + gzip -c 'clip.tmp' > $output + #else + mv clip.tmp $output + #end if + ]]></command> <inputs> - <param format="fastq" label="Source file" name="input" type="data" /> + <param format="fasta,fastq" label="Source file" name="input" type="data" /> <param label="min size" name="min" size="4" type="integer" value="15" /> <param label="max size" name="max" size="4" type="integer" value="36" /> - <param label="Select output format" name="out_format" type="select"> - <option selected="true" value="fasta">Fasta format</option> - <option value="fastq">Fastq (Sanger) format</option> + <param label="Select output format" name="out_format" type="select" + help="be careful not to select a fastq format for your output if your input has a fasta format"> + <option value="fasta">Fasta</option> + <option value="fastq" selected="true" >Fastq (Sanger)</option> + <option value="fastagz">gzipped Fasta</option> + <option value="fastqgz">gzipped Fastq (Sanger)</option> </param> <param label="Accept reads containing N?" name="Nmode" type="select"> <option selected="True" value="accept">accept</option> @@ -41,9 +55,12 @@ </conditional> </inputs> <outputs> - <data format_source="input" metadata_source="input" name="output" label="Clipping of ${input.name}"> + <data format_source="input" metadata_source="input" name="output" label="Clipped ${input.name}-then-${out_format}"> <change_format> <when input="out_format" value="fasta" format="fasta" /> + <when input="out_format" value="fastq" format="fastqsanger" /> + <when input="out_format" value="fastagz" format="fasta.gz" /> + <when input="out_format" value="fastqgz" format="fastqsanger.gz" /> </change_format> </data> </outputs> @@ -55,7 +72,8 @@ <param name="clip_source_list" value="prebuilt" /> <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" /> <param name="Nmode" value="accept" /> - <output file="yac.out" name="output" /> + <param name="out_format" value="fastq" /> + <output file="out.fastqsanger" name="output" /> </test> <test> <param ftype="fastqsanger" name="input" value="yac.fastq" /> @@ -64,8 +82,48 @@ <param name="clip_source_list" value="prebuilt" /> <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" /> <param name="Nmode" value="accept" /> - <param name="out_format" value="fastq" /> - <output file="yac_fastq.out" name="output" /> + <param name="out_format" value="fasta" /> + <output file="out.fasta" name="output" /> + </test> + <test> + <param ftype="fastqsanger.gz" name="input" value="yac.fastqsanger.gz" /> + <param name="min" value="18" /> + <param name="max" value="29" /> + <param name="clip_source_list" value="prebuilt" /> + <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" /> + <param name="Nmode" value="accept" /> + <param name="out_format" value="fastqgz" /> + <output file="out.fastqsanger.gz" name="output" decompress="True" /> + </test> + <test> + <param ftype="fastqsanger.gz" name="input" value="yac.fastqsanger.gz" /> + <param name="min" value="18" /> + <param name="max" value="29" /> + <param name="clip_source_list" value="prebuilt" /> + <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" /> + <param name="Nmode" value="accept" /> + <param name="out_format" value="fastagz" /> + <output file="out.fasta.gz" name="output" decompress="True" /> + </test> + <test> + <param ftype="fasta.gz" name="input" value="yac.fasta.gz" /> + <param name="min" value="18" /> + <param name="max" value="29" /> + <param name="clip_source_list" value="prebuilt" /> + <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" /> + <param name="out_format" value="fasta" /> + <param name="Nmode" value="accept" /> + <output file="out.fasta" name="output" /> + </test> + <test> + <param ftype="fasta.gz" name="input" value="yac.fasta.gz" /> + <param name="min" value="18" /> + <param name="max" value="29" /> + <param name="clip_source_list" value="prebuilt" /> + <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" /> + <param name="Nmode" value="accept" /> + <param name="out_format" value="fastagz" /> + <output file="out.fasta.gz" name="output" decompress="True" /> </test> </tests> <help> @@ -81,9 +139,10 @@ **Inputs** -1. A fastq file of reads to be clipped +1. A fastq or fasta file of reads to be clipped 2. Select the size of the reads to be kept -3. Select an output format when input is a fastq file (this may be fastq or fastq) +3. Select an output format. When input is a fastq file, this may be fastq or fasta, whereas +when input is a fasta file, this only may be a fasta. 4. Select whether you wish or do not wish to keep clipped sequences with unknown nucleotides (N) 5. Select a pre-built adapter sequence or enter your own sequence (at least 7 nucleotides long)