Previous changeset 1:8c02a91a8680 (2013-04-30) Next changeset 3:708ce1e5ac94 (2013-11-21) |
Commit message:
Uploaded v0.0.10, README now using RST, MIT licence, automatic Biopython dependency |
added:
tools/seq_primer_clip/README.rst tools/seq_primer_clip/repository_dependencies.xml tools/seq_primer_clip/seq_primer_clip.py tools/seq_primer_clip/seq_primer_clip.xml |
removed:
tools/primers/seq_primer_clip.py tools/primers/seq_primer_clip.py.orig tools/primers/seq_primer_clip.txt tools/primers/seq_primer_clip.txt.orig tools/primers/seq_primer_clip.xml tools/primers/seq_primer_clip.xml.orig |
b |
diff -r 8c02a91a8680 -r ee5acea162a7 tools/primers/seq_primer_clip.py --- a/tools/primers/seq_primer_clip.py Tue Apr 30 11:04:43 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,358 +0,0 @@\n-#!/usr/bin/env python\n-"""Looks for the given primer sequences and clips matching SFF reads.\n-\n-Takes eight command line options, input read filename, input read format,\n-input primer FASTA filename, type of primers (forward, reverse or reverse-\n-complement), number of mismatches (currently only 0, 1 and 2 are supported),\n-minimum length to keep a read (after primer trimming), should primer-less\n-reads be kept (boolean), and finally the output sequence filename.\n-\n-Both the primer and read sequences can contain IUPAC ambiguity codes like N.\n-\n-This supports FASTA, FASTQ and SFF sequence files. Colorspace reads are not\n-supported.\n-\n-The mismatch parameter does not consider gapped alignemnts, however the\n-special case of missing bases at the very start or end of the read is handled.\n-e.g. a primer sequence CCGACTCGAG will match a read starting CGACTCGAG...\n-if one or more mismatches are allowed.\n-\n-This can also be used for stripping off (and optionally filtering on) barcodes.\n-\n-Note that only the trim/clip values in the SFF file are changed, not the flow\n-information of the full read sequence.\n-\n-This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute\n-(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.\n-See accompanying text file for licence details (MIT/BSD style).\n-\n-This is version 0.0.8 of the script. 
Currently it uses Python\'s regular\n-expression engine for finding the primers, which for my needs is fast enough.\n-"""\n-import sys\n-import re\n-from galaxy_utils.sequence.fasta import fastaReader, fastaWriter\n-from galaxy_utils.sequence.fastq import fastqReader, fastqWriter\n-\n-if "-v" in sys.argv or "--version" in sys.argv:\n- print "v0.0.5"\n- sys.exit(0)\n-\n-def stop_err(msg, err=1):\n- sys.stderr.write(msg)\n- sys.exit(err)\n-\n-try:\n- from Bio.Seq import reverse_complement\n- from Bio.SeqIO.SffIO import SffIterator, SffWriter\n-except ImportError:\n- stop_err("Requires Biopython 1.54 or later")\n-try:\n- from Bio.SeqIO.SffIO import ReadRocheXmlManifest\n-except ImportError:\n- #Prior to Biopython 1.56 this was a private function\n- from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest\n-\n-#Parse Command Line\n-try:\n- in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:]\n-except ValueError:\n- stop_err("Expected 8 arguments, got %i:\\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))\n-\n-if in_file == primer_fasta:\n- stop_err("Same file given as both primer sequences and sequences to clip!")\n-if in_file == out_file:\n- stop_err("Same file given as both sequences to clip and output!")\n-if primer_fasta == out_file:\n- stop_err("Same file given as both primer sequences and output!")\n-\n-try:\n- mm = int(mm)\n-except ValueError:\n- stop_err("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm)\n-if mm < 0:\n- stop_err("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm)\n-if mm not in [0,1,2]:\n- raise NotImplementedError\n-\n-try:\n- min_len = int(min_len)\n-except ValueError:\n- stop_err("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len)\n-if min_len < 0:\n- stop_err("Expected non-negtive integer min_len (e.g. 
0 or 1), not %r" % min_len)\n-\n-\n-if keep_negatives.lower() in ["true", "yes", "on"]:\n- keep_negatives = True\n-elif keep_negatives.lower() in ["false", "no", "off"]:\n- keep_negatives = False\n-else:\n- stop_err("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives)\n-\n-\n-if primer_type.lower() == "forward":\n- forward = True\n- rc = False\n-elif primer_type.lower() == "reverse":\n- forward = False\n- rc = False\n-elif primer_type.lower() == "reverse-complement":\n- forward = False\n- rc = True\n-else:\n- stop_err("Expected foward, reverse or reverse-complement not %r" % primer_type)\n-\n-\n-ambiguous_dna_values = {\n- "A": "A",\n- "C": '..b'e = open(in_file, "rb")\n- try:\n- manifest = ReadRocheXmlManifest(in_handle)\n- except ValueError:\n- manifest = None\n- in_handle.seek(0)\n- out_handle = open(out_file, "wb")\n- writer = SffWriter(out_handle, xml=manifest)\n- writer.write_file(process(SffIterator(in_handle)))\n- #End of SFF code\n-elif seq_format.lower().startswith("fastq"):\n- in_handle = open(in_file, "rU")\n- out_handle = open(out_file, "w")\n- reader = fastqReader(in_handle)\n- writer = fastqWriter(out_handle)\n- if forward:\n- for record in reader:\n- seq = record.sequence.upper()\n- result = primer.search(seq)\n- if result:\n- #Forward primer, take everything after it\n- cut = result.end()\n- record.sequence = seq[cut:]\n- if len(record.sequence) >= min_len:\n- record.quality = record.quality[cut:]\n- clipped += 1\n- writer.write(record)\n- else:\n- short_clipped += 1\n- elif keep_negatives:\n- if len(record) >= min_len:\n- negs += 1\n- writer.write(record)\n- else:\n- short_negs += 1\n- else:\n- for record in reader:\n- seq = record.sequence.upper()\n- result = primer.search(seq)\n- if result:\n- #Reverse primer, take everything before it\n- cut = result.start()\n- record.sequence = seq[:cut]\n- if len(record.sequence) >= min_len:\n- record.quality = record.quality[:cut]\n- clipped += 1\n- writer.write(record)\n- 
else:\n- short_clipped += 1\n- elif keep_negatives:\n- if len(record) >= min_len:\n- negs += 1\n- writer.write(record)\n- else:\n- short_negs += 1\n-elif seq_format.lower()=="fasta":\n- in_handle = open(in_file, "rU")\n- out_handle = open(out_file, "w")\n- reader = fastaReader(in_handle)\n- writer = fastaWriter(out_handle)\n- #Following code is identical to that for FASTQ but without editing qualities\n- if forward:\n- for record in reader:\n- seq = record.sequence.upper()\n- result = primer.search(seq)\n- if result:\n- #Forward primer, take everything after it\n- cut = result.end()\n- record.sequence = seq[cut:]\n- if len(record.sequence) >= min_len:\n- clipped += 1\n- writer.write(record)\n- else:\n- short_clipped += 1\n- elif keep_negatives:\n- if len(record) >= min_len:\n- negs += 1\n- writer.write(record)\n- else:\n- short_negs += 1\n- else:\n- for record in reader:\n- seq = record.sequence.upper()\n- result = primer.search(seq)\n- if result:\n- #Reverse primer, take everything before it\n- cut = result.start()\n- record.sequence = seq[:cut]\n- if len(record.sequence) >= min_len:\n- clipped += 1\n- writer.write(record)\n- else:\n- short_clipped += 1\n- elif keep_negatives:\n- if len(record) >= min_len:\n- negs += 1\n- writer.write(record)\n- else:\n- short_negs += 1\n-else:\n- stop_err("Unsupported file type %r" % seq_format)\n-in_handle.close()\n-out_handle.close()\n-\n-print "Kept %i clipped reads," % clipped\n-print "discarded %i short." % short_clipped\n-if keep_negatives:\n- print "Kept %i non-matching reads," % negs\n- print "discarded %i short." % short_neg\n' |
b |
diff -r 8c02a91a8680 -r ee5acea162a7 tools/primers/seq_primer_clip.py.orig --- a/tools/primers/seq_primer_clip.py.orig Tue Apr 30 11:04:43 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,358 +0,0 @@\n-#!/usr/bin/env python\n-"""Looks for the given primer sequences and clips matching SFF reads.\n-\n-Takes eight command line options, input read filename, input read format,\n-input primer FASTA filename, type of primers (forward, reverse or reverse-\n-complement), number of mismatches (currently only 0, 1 and 2 are supported),\n-minimum length to keep a read (after primer trimming), should primer-less\n-reads be kept (boolean), and finally the output sequence filename.\n-\n-Both the primer and read sequences can contain IUPAC ambiguity codes like N.\n-\n-This supports FASTA, FASTQ and SFF sequence files. Colorspace reads are not\n-supported.\n-\n-The mismatch parameter does not consider gapped alignemnts, however the\n-special case of missing bases at the very start or end of the read is handled.\n-e.g. a primer sequence CCGACTCGAG will match a read starting CGACTCGAG...\n-if one or more mismatches are allowed.\n-\n-This can also be used for stripping off (and optionally filtering on) barcodes.\n-\n-Note that only the trim/clip values in the SFF file are changed, not the flow\n-information of the full read sequence.\n-\n-This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute\n-(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.\n-See accompanying text file for licence details (MIT/BSD style).\n-\n-This is version 0.0.8 of the script. 
Currently it uses Python\'s regular\n-expression engine for finding the primers, which for my needs is fast enough.\n-"""\n-import sys\n-import re\n-from galaxy_utils.sequence.fasta import fastaReader, fastaWriter\n-from galaxy_utils.sequence.fastq import fastqReader, fastqWriter\n-\n-if "-v" in sys.argv or "--version" in sys.argv:\n- print "v0.0.5"\n- sys.exit(0)\n-\n-def stop_err(msg, err=1):\n- sys.stderr.write(msg)\n- sys.exit(err)\n-\n-try:\n- from Bio.Seq import reverse_complement\n- from Bio.SeqIO.SffIO import SffIterator, SffWriter\n-except ImportError:\n- stop_err("Requires Biopython 1.54 or later")\n-try:\n- from Bio.SeqIO.SffIO import ReadRocheXmlManifest\n-except ImportError:\n- #Prior to Biopython 1.56 this was a private function\n- from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest\n-\n-#Parse Command Line\n-try:\n- in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:]\n-except ValueError:\n- stop_err("Expected 8 arguments, got %i:\\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))\n-\n-if in_file == primer_fasta:\n- stop_err("Same file given as both primer sequences and sequences to clip!")\n-if in_file == out_file:\n- stop_err("Same file given as both sequences to clip and output!")\n-if primer_fasta == out_file:\n- stop_err("Same file given as both primer sequences and output!")\n-\n-try:\n- mm = int(mm)\n-except ValueError:\n- stop_err("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm)\n-if mm < 0:\n- stop_err("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm)\n-if mm not in [0,1,2]:\n- raise NotImplementedError\n-\n-try:\n- min_len = int(min_len)\n-except ValueError:\n- stop_err("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len)\n-if min_len < 0:\n- stop_err("Expected non-negtive integer min_len (e.g. 
0 or 1), not %r" % min_len)\n-\n-\n-if keep_negatives.lower() in ["true", "yes", "on"]:\n- keep_negatives = True\n-elif keep_negatives.lower() in ["false", "no", "off"]:\n- keep_negatives = False\n-else:\n- stop_err("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives)\n-\n-\n-if primer_type.lower() == "forward":\n- forward = True\n- rc = False\n-elif primer_type.lower() == "reverse":\n- forward = False\n- rc = False\n-elif primer_type.lower() == "reverse-complement":\n- forward = False\n- rc = True\n-else:\n- stop_err("Expected foward, reverse or reverse-complement not %r" % primer_type)\n-\n-\n-ambiguous_dna_values = {\n- "A": "A",\n- "C": '..b'e = open(in_file, "rb")\n- try:\n- manifest = ReadRocheXmlManifest(in_handle)\n- except ValueError:\n- manifest = None\n- in_handle.seek(0)\n- out_handle = open(out_file, "wb")\n- writer = SffWriter(out_handle, xml=manifest)\n- writer.write_file(process(SffIterator(in_handle)))\n- #End of SFF code\n-elif seq_format.lower().startswith("fastq"):\n- in_handle = open(in_file, "rU")\n- out_handle = open(out_file, "w")\n- reader = fastqReader(in_handle)\n- writer = fastqWriter(out_handle)\n- if forward:\n- for record in reader:\n- seq = record.sequence.upper()\n- result = primer.search(seq)\n- if result:\n- #Forward primer, take everything after it\n- cut = result.end()\n- record.sequence = seq[cut:]\n- if len(record.sequence) >= min_len:\n- record.quality = record.quality[cut:]\n- clipped += 1\n- writer.write(record)\n- else:\n- short_clipped += 1\n- elif keep_negatives:\n- if len(record) >= min_len:\n- negs += 1\n- writer.write(record)\n- else:\n- short_negs += 1\n- else:\n- for record in reader:\n- seq = record.sequence.upper()\n- result = primer.search(seq)\n- if result:\n- #Reverse primer, take everything before it\n- cut = result.start()\n- record.sequence = seq[:cut]\n- if len(record.sequence) >= min_len:\n- record.quality = record.quality[:cut]\n- clipped += 1\n- writer.write(record)\n- 
else:\n- short_clipped += 1\n- elif keep_negatives:\n- if len(record) >= min_len:\n- negs += 1\n- writer.write(record)\n- else:\n- short_negs += 1\n-elif seq_format.lower()=="fasta":\n- in_handle = open(in_file, "rU")\n- out_handle = open(out_file, "w")\n- reader = fastaReader(in_handle)\n- writer = fastaWriter(out_handle)\n- #Following code is identical to that for FASTQ but without editing qualities\n- if forward:\n- for record in reader:\n- seq = record.sequence.upper()\n- result = primer.search(seq)\n- if result:\n- #Forward primer, take everything after it\n- cut = result.end()\n- record.sequence = seq[cut:]\n- if len(record.sequence) >= min_len:\n- clipped += 1\n- writer.write(record)\n- else:\n- short_clipped += 1\n- elif keep_negatives:\n- if len(record) >= min_len:\n- negs += 1\n- writer.write(record)\n- else:\n- short_negs += 1\n- else:\n- for record in reader:\n- seq = record.sequence.upper()\n- result = primer.search(seq)\n- if result:\n- #Reverse primer, take everything before it\n- cut = result.start()\n- record.sequence = seq[:cut]\n- if len(record.sequence) >= min_len:\n- clipped += 1\n- writer.write(record)\n- else:\n- short_clipped += 1\n- elif keep_negatives:\n- if len(record) >= min_len:\n- negs += 1\n- writer.write(record)\n- else:\n- short_negs += 1\n-else:\n- stop_err("Unsupported file type %r" % seq_format)\n-in_handle.close()\n-out_handle.close()\n-\n-print "Kept %i clipped reads," % clipped\n-print "discarded %i short." % short_clipped\n-if keep_negatives:\n- print "Kept %i non-matching reads," % negs\n- print "discarded %i short." % short_neg\n' |
b |
diff -r 8c02a91a8680 -r ee5acea162a7 tools/primers/seq_primer_clip.txt --- a/tools/primers/seq_primer_clip.txt Tue Apr 30 11:04:43 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,93 +0,0 @@ -Galaxy tool to primer clip (trim) FASTA, FASTQ or SFF reads -=========================================================== - -This tool is copyright 2011 by Peter Cock, The James Hutton Institute -(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. -See the licence text below. - -This tool is a short Python script (using the Galaxy library functions and -Biopython). There are just two files to install: - -* seq_primer_clip.py (the Python script) -* seq_primer_clip.xml (the Galaxy tool definition) - -The suggested location is a new tools/primers folder. You will also need to -modify the tools_conf.xml file to tell Galaxy to offer the tool: - -<tool file="primers/seq_primer_clip.xml" /> - -If you wish to run the unit tests, also add this to tools_conf.xml.sample -and move/copy the test-data files under Galaxy's test-data folder. Then: - -$ ./run_functional_tests.sh -id seq_primer_clip - -You will also need to install Biopython 1.54 or later. That's it. - - -History -======= - -v0.0.1 - Initial version (not publicly released) -v0.0.2 - Sort primers by length (longest and therefore most specific first) -v0.0.3 - Consider missing bases at start/end of read as mismatches -v0.0.4 - Apply minimum length to sequences with no match too -v0.0.5 - Count clipped & non-matched short reads separately, length bug fixes -v0.0.6 - Added some functional tests -v0.0.7 - Added error check for bad filename arguments -v0.0.8 - Record version of Python script when run from Galaxy. - - Check for errors using Python script's return code. -v0.0.9 - Moved test data to work around Galaxy Tool Shed limitation. 
- - -Developers -========== - -This script and related tools are being developed on the following hg branch: -http://bitbucket.org/peterjc/galaxy-central/src/tools - -This incorporates the previously used hg branch: -http://bitbucket.org/peterjc/galaxy-central/src/fasta_filter - -For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use -the following command from the Galaxy root folder: - -$ tar -czf seq_primer_clip.tar.gz tools/primers/seq_primer_clip.* test-data/dop_primers.fasta test-data/MID4_GLZRM4E04_rnd30* - -Check this worked: - -$ tar -tzf seq_primer_clip.tar.gz -tools/primers/seq_primer_clip.py -tools/primers/seq_primer_clip.txt -tools/primers/seq_primer_clip.xml -test-data/dop_primers.fasta -test-data/MID4_GLZRM4E04_rnd30.fasta -test-data/MID4_GLZRM4E04_rnd30.fastqsanger -test-data/MID4_GLZRM4E04_rnd30_fclip.fasta -test-data/MID4_GLZRM4E04_rnd30_fclip.fastqsanger -test-data/MID4_GLZRM4E04_rnd30_fclip.sff -test-data/MID4_GLZRM4E04_rnd30_frclip.fasta -test-data/MID4_GLZRM4E04_rnd30_frclip.fastqsanger -test-data/MID4_GLZRM4E04_rnd30_frclip.sff -test-data/MID4_GLZRM4E04_rnd30.sff - - -Licence (MIT/BSD style) -======================= - -Permission to use, copy, modify, and distribute this software and its -documentation with or without modifications and for any purpose and -without fee is hereby granted, provided that any copyright notices -appear in all copies and that both those copyright notices and this -permission notice appear in supporting documentation, and that the -names of the contributors or copyright holders not be used in -advertising or publicity pertaining to distribution of the software -without specific prior permission. 
- -THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL -WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE -CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT -OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -OR PERFORMANCE OF THIS SOFTWARE. |
b |
diff -r 8c02a91a8680 -r ee5acea162a7 tools/primers/seq_primer_clip.txt.orig --- a/tools/primers/seq_primer_clip.txt.orig Tue Apr 30 11:04:43 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,103 +0,0 @@ -Galaxy tool to primer clip (trim) FASTA, FASTQ or SFF reads -=========================================================== - -This tool is copyright 2011 by Peter Cock, The James Hutton Institute -(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. -See the licence text below. - -This tool is a short Python script (using the Galaxy library functions and -Biopython). There are just two files to install: - -* seq_primer_clip.py (the Python script) -* seq_primer_clip.xml (the Galaxy tool definition) - -The suggested location is a new tools/primers folder. You will also need to -modify the tools_conf.xml file to tell Galaxy to offer the tool and also do -this to tools_conf.xml.sample in order to run the tests: - -<tool file="primers/seq_primer_clip.xml" /> - -There are optionally some example files required to run the functional tests, -put these in the test-data/primers folder. - -test-data/primers/dop_primers.fasta -test-data/primers/MID4_GLZRM4E04_rnd30.fasta -test-data/primers/MID4_GLZRM4E04_rnd30.fastqsanger -test-data/primers/MID4_GLZRM4E04_rnd30.sff -test-data/primers/MID4_GLZRM4E04_rnd30_fclip.fasta -test-data/primers/MID4_GLZRM4E04_rnd30_fclip.fastqsanger -test-data/primers/MID4_GLZRM4E04_rnd30_fclip.sff - -You should then be able to run the tests with: - -sh run_functional_tests.sh -id seq_primer_clip - -You will also need to install Biopython 1.54 or later. That's it. - - -History -======= - -v0.0.1 - Initial version (not publicly released) -v0.0.2 - Sort primers by length (longest and therefore most specific first) -v0.0.3 - Consider missing bases at start/end of read as mismatches -v0.0.4 - Apply minimum length to sequences with no match too -v0.0.5 - Count clipped & non-matched short reads separately, length bug fixes -v0.0.6 - Added some functional tests -v0.0.7 - Added error check for bad filename arguments -v0.0.8 - Record version of Python script when run from Galaxy. 
- - Check for errors using Python script's return code. - - -Developers -========== - -This script and related tools are being developed on the following hg branch: -http://bitbucket.org/peterjc/galaxy-central/src/tools - -This incorporates the previously used hg branch: -http://bitbucket.org/peterjc/galaxy-central/src/fasta_filter - -For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use -the following command from the Galaxy root folder: - -$ tar -czf seq_primer_clip.tar.gz tools/primers/seq_primer_clip.* test-data/primers/* - -Check this worked: - -$ tar -tzf seq_primer_clip.tar.gz -tools/primers/seq_primer_clip.py -tools/primers/seq_primer_clip.txt -tools/primers/seq_primer_clip.xml -test-data/primers/dop_primers.fasta -test-data/primers/MID4_GLZRM4E04_rnd30.fasta -test-data/primers/MID4_GLZRM4E04_rnd30.fastqsanger -test-data/primers/MID4_GLZRM4E04_rnd30_fclip.fasta -test-data/primers/MID4_GLZRM4E04_rnd30_fclip.fastqsanger -test-data/primers/MID4_GLZRM4E04_rnd30_fclip.sff -test-data/primers/MID4_GLZRM4E04_rnd30_frclip.fasta -test-data/primers/MID4_GLZRM4E04_rnd30_frclip.fastqsanger -test-data/primers/MID4_GLZRM4E04_rnd30_frclip.sff -test-data/primers/MID4_GLZRM4E04_rnd30.sff - - -Licence (MIT/BSD style) -======================= - -Permission to use, copy, modify, and distribute this software and its -documentation with or without modifications and for any purpose and -without fee is hereby granted, provided that any copyright notices -appear in all copies and that both those copyright notices and this -permission notice appear in supporting documentation, and that the -names of the contributors or copyright holders not be used in -advertising or publicity pertaining to distribution of the software -without specific prior permission. 
- -THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL -WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE -CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT -OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -OR PERFORMANCE OF THIS SOFTWARE. |
b |
diff -r 8c02a91a8680 -r ee5acea162a7 tools/primers/seq_primer_clip.xml --- a/tools/primers/seq_primer_clip.xml Tue Apr 30 11:04:43 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,137 +0,0 @@ -<tool id="seq_primer_clip" name="Primer clip sequences" version="0.0.9"> - <description>Trim off 5' or 3' primers</description> - <version_command interpreter="python">seq_primer_clip.py --version</version_command> - <command interpreter="python"> -seq_primer_clip.py $input_file $input_file.ext $primer_fasta $primer_type $mm $min_len $keep_negatives $output_file - </command> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range="1:" /> - <exit_code range=":-1" /> - </stdio> - <inputs> - <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to clip" description="FASTA, FASTQ, or SFF format."/> - <param name="primer_fasta" type="data" format="fasta" label="FASTA file containing primer(s)"/> - <param name="primer_type" type="select" label="Type of primers"> - <option value="Forward">Forward (5') primers</option> - <option value="Reverse">Reverse (3') primers (given with respect to the forward strand)</option> - <option value="Reverse-complement">Reverse (3') primers (given with respect to the reverse strand)</option> - </param> - <param name="mm" type="integer" value="0" label="How many mismatches to allow? 
(0, 1 or 2)"> - <validator type="in_range" min="0" max="2" /> - </param> - <param name="keep_negatives" type="boolean" value="false" label="Keep reads with no matched primer"/> - <param name="min_len" type="integer" label="Minimum length for (clipped) sequences " value="1"/> - </inputs> - <outputs> - <data name="output_file" format="data" label="$primer_type primer clipped"> - <!-- TODO - Replace this with format="input:input_fastq" if/when that works --> - <change_format> - <when input_dataset="input_file" attribute="extension" value="sff" format="sff" /> - <when input_dataset="input_file" attribute="extension" value="fasta" format="fasta" /> - <when input_dataset="input_file" attribute="extension" value="fastq" format="fastq" /> - <when input_dataset="input_file" attribute="extension" value="fastqsanger" format="fastqsanger" /> - <when input_dataset="input_file" attribute="extension" value="fastqsolexa" format="fastqsolexa" /> - <when input_dataset="input_file" attribute="extension" value="fastqillumina" format="fastqillumina" /> - <when input_dataset="input_file" attribute="extension" value="fastqcssanger" format="fastqcssanger" /> - </change_format> - </data> - </outputs> - <tests> - <test> - <param name="input_file" value="MID4_GLZRM4E04_rnd30.fasta" ftype="fasta" /> - <param name="primer_fasta" value="dop_primers.fasta" /> - <param name="primer_type" value="Forward" /> - <param name="mm" value="2" /> - <param name="keep_negatives" value="false" /> - <param name="min_len" value="35" /> - <output name="output_file" file="MID4_GLZRM4E04_rnd30_fclip.fasta" ftype="fasta" /> - </test> - <test> - <param name="input_file" value="MID4_GLZRM4E04_rnd30.fastqsanger" ftype="fastqsanger" /> - <param name="primer_fasta" value="dop_primers.fasta" /> - <param name="primer_type" value="Forward" /> - <param name="mm" value="2" /> - <param name="keep_negatives" value="false" /> - <param name="min_len" value="35" /> - <output name="output_file" 
file="MID4_GLZRM4E04_rnd30_fclip.fastqsanger" ftype="fastqsanger" /> - </test> - <test> - <param name="input_file" value="MID4_GLZRM4E04_rnd30.sff" ftype="sff" /> - <param name="primer_fasta" value="dop_primers.fasta" /> - <param name="primer_type" value="Forward" /> - <param name="mm" value="2" /> - <param name="keep_negatives" value="false" /> - <param name="min_len" value="35" /> - <output name="output_file" file="MID4_GLZRM4E04_rnd30_fclip.sff" ftype="sff" /> - </test> - <test> - <param name="input_file" value="MID4_GLZRM4E04_rnd30_fclip.fasta" ftype="fasta" /> - <param name="primer_fasta" value="dop_primers.fasta" /> - <param name="primer_type" value="Reverse" /> - <param name="mm" value="2" /> - <param name="keep_negatives" value="true" /> - <param name="min_len" value="35" /> - <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.fasta" ftype="fasta" /> - </test> - <test> - <param name="input_file" value="MID4_GLZRM4E04_rnd30_fclip.fastqsanger" ftype="fastqsanger" /> - <param name="primer_fasta" value="dop_primers.fasta" /> - <param name="primer_type" value="Reverse" /> - <param name="mm" value="2" /> - <param name="keep_negatives" value="true" /> - <param name="min_len" value="35" /> - <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.fastqsanger" ftype="fastqsanger" /> - </test> - <test> - <param name="input_file" value="MID4_GLZRM4E04_rnd30_fclip.sff" ftype="sff" /> - <param name="primer_fasta" value="dop_primers.fasta" /> - <param name="primer_type" value="Reverse" /> - <param name="mm" value="2" /> - <param name="keep_negatives" value="true" /> - <param name="min_len" value="35" /> - <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> - </test> - </tests> - <requirements> - <requirement type="python-module">Bio</requirement> - </requirements> - <help> - -**What it does** - -Looks for the given primer sequences (within the existing clipped sequence) and -further clips the reads to remove the primers and any 
preceding/trailing sequence. - -Reads containing a forward primer are reduced to just the sequence after (and -excluding) the forward primer. - -Reads containing a reverse primer are reduced to just the sequence before (and -excluding) the reverse primer. - -Degenerate primers can be specified using the standard IUPAC ambiguity codes, -thus a primer with an N would match A, C, T or G (or any of the IUPAC ambiguity -codes) and so on. - -Note that for SFF files only the clip/trim positions are edited - you will still -be able to extract the original full read (with any adapter sequence and poor -quality sequence) if you need to. - -.. class:: warningmark - -**Note**. This tool was initially written for Roche 454 data, and should also -work fine on Sanger or Ion Torrent as well. However, it is probably too slow -for use on large Illumina datasets. - - -**Citation** - -This tool uses Biopython. If you use this tool in scientific work leading to a -publication, please cite: - -Cock et al 2009. Biopython: freely available Python tools for computational -molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. - - </help> -</tool> |
b |
diff -r 8c02a91a8680 -r ee5acea162a7 tools/primers/seq_primer_clip.xml.orig --- a/tools/primers/seq_primer_clip.xml.orig Tue Apr 30 11:04:43 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,137 +0,0 @@ -<tool id="seq_primer_clip" name="Primer clip sequences" version="0.0.8"> - <description>Trim off 5' or 3' primers</description> - <version_command interpreter="python">seq_primer_clip.py --version</version_command> - <command interpreter="python"> -seq_primer_clip.py $input_file $input_file.ext $primer_fasta $primer_type $mm $min_len $keep_negatives $output_file - </command> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range="1:" /> - <exit_code range=":-1" /> - </stdio> - <inputs> - <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to clip" description="FASTA, FASTQ, or SFF format."/> - <param name="primer_fasta" type="data" format="fasta" label="FASTA file containing primer(s)"/> - <param name="primer_type" type="select" label="Type of primers"> - <option value="Forward">Forward (5') primers</option> - <option value="Reverse">Reverse (3') primers (given with respect to the forward strand)</option> - <option value="Reverse-complement">Reverse (3') primers (given with respect to the reverse strand)</option> - </param> - <param name="mm" type="integer" value="0" label="How many mismatches to allow? 
(0, 1 or 2)"> - <validator type="in_range" min="0" max="2" /> - </param> - <param name="keep_negatives" type="boolean" value="false" label="Keep reads with no matched primer"/> - <param name="min_len" type="integer" label="Minimum length for (clipped) sequences " value="1"/> - </inputs> - <outputs> - <data name="output_file" format="data" label="$primer_type primer clipped"> - <!-- TODO - Replace this with format="input:input_fastq" if/when that works --> - <change_format> - <when input_dataset="input_file" attribute="extension" value="sff" format="sff" /> - <when input_dataset="input_file" attribute="extension" value="fasta" format="fasta" /> - <when input_dataset="input_file" attribute="extension" value="fastq" format="fastq" /> - <when input_dataset="input_file" attribute="extension" value="fastqsanger" format="fastqsanger" /> - <when input_dataset="input_file" attribute="extension" value="fastqsolexa" format="fastqsolexa" /> - <when input_dataset="input_file" attribute="extension" value="fastqillumina" format="fastqillumina" /> - <when input_dataset="input_file" attribute="extension" value="fastqcssanger" format="fastqcssanger" /> - </change_format> - </data> - </outputs> - <tests> - <test> - <param name="input_file" value="primers/MID4_GLZRM4E04_rnd30.fasta" ftype="fasta" /> - <param name="primer_fasta" value="primers/dop_primers.fasta" /> - <param name="primer_type" value="Forward" /> - <param name="mm" value="2" /> - <param name="keep_negatives" value="false" /> - <param name="min_len" value="35" /> - <output name="output_file" file="primers/MID4_GLZRM4E04_rnd30_fclip.fasta" ftype="fasta" /> - </test> - <test> - <param name="input_file" value="primers/MID4_GLZRM4E04_rnd30.fastqsanger" ftype="fastqsanger" /> - <param name="primer_fasta" value="primers/dop_primers.fasta" /> - <param name="primer_type" value="Forward" /> - <param name="mm" value="2" /> - <param name="keep_negatives" value="false" /> - <param name="min_len" value="35" /> - <output 
name="output_file" file="primers/MID4_GLZRM4E04_rnd30_fclip.fastqsanger" ftype="fastqsanger" /> - </test> - <test> - <param name="input_file" value="primers/MID4_GLZRM4E04_rnd30.sff" ftype="sff" /> - <param name="primer_fasta" value="primers/dop_primers.fasta" /> - <param name="primer_type" value="Forward" /> - <param name="mm" value="2" /> - <param name="keep_negatives" value="false" /> - <param name="min_len" value="35" /> - <output name="output_file" file="primers/MID4_GLZRM4E04_rnd30_fclip.sff" ftype="sff" /> - </test> - <test> - <param name="input_file" value="primers/MID4_GLZRM4E04_rnd30_fclip.fasta" ftype="fasta" /> - <param name="primer_fasta" value="primers/dop_primers.fasta" /> - <param name="primer_type" value="Reverse" /> - <param name="mm" value="2" /> - <param name="keep_negatives" value="true" /> - <param name="min_len" value="35" /> - <output name="output_file" file="primers/MID4_GLZRM4E04_rnd30_frclip.fasta" ftype="fasta" /> - </test> - <test> - <param name="input_file" value="primers/MID4_GLZRM4E04_rnd30_fclip.fastqsanger" ftype="fastqsanger" /> - <param name="primer_fasta" value="primers/dop_primers.fasta" /> - <param name="primer_type" value="Reverse" /> - <param name="mm" value="2" /> - <param name="keep_negatives" value="true" /> - <param name="min_len" value="35" /> - <output name="output_file" file="primers/MID4_GLZRM4E04_rnd30_frclip.fastqsanger" ftype="fastqsanger" /> - </test> - <test> - <param name="input_file" value="primers/MID4_GLZRM4E04_rnd30_fclip.sff" ftype="sff" /> - <param name="primer_fasta" value="primers/dop_primers.fasta" /> - <param name="primer_type" value="Reverse" /> - <param name="mm" value="2" /> - <param name="keep_negatives" value="true" /> - <param name="min_len" value="35" /> - <output name="output_file" file="primers/MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> - </test> - </tests> - <requirements> - <requirement type="python-module">Bio</requirement> - </requirements> - <help> - -**What it does** - -Looks for 
the given primer sequences (within the existing clipped sequence) and -further clips the reads to remove the primers and any preceding/trailing sequence. - -Reads containing a forward primer are reduced to just the sequence after (and -excluding) the forward primer. - -Reads containing a reverse primer are reduced to just the sequence before (and -excluding) the reverse primer. - -Degenerate primers can be specified using the standard IUPAC ambiguity codes, -thus a primer with an N would match A, C, T or G (or any of the IUPAC ambiguity -codes) and so on. - -Note that for SFF files only the clip/trim positions are edited - you will still -be able to extract the original full read (with any adapter sequence and poor -quality sequence) if you need to. - -.. class:: warningmark - -**Note**. This tool was initially written for Roche 454 data, and should also -work fine on Sanger or Ion Torrent as well. However, it is probably too slow -for use on large Illumina datasets. - - -**Citation** - -This tool uses Biopython. If you use this tool in scientific work leading to a -publication, please cite: - -Cock et al 2009. Biopython: freely available Python tools for computational -molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. - - </help> -</tool> |
b |
diff -r 8c02a91a8680 -r ee5acea162a7 tools/seq_primer_clip/README.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_primer_clip/README.rst Thu Oct 24 09:37:25 2013 -0400 |
b |
@@ -0,0 +1,120 @@ +Galaxy tool to primer clip (trim) FASTA, FASTQ or SFF reads +=========================================================== + +This tool is copyright 2011-2013 by Peter Cock, The James Hutton Institute +(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +See the licence text below (MIT licence). + +This tool is a short Python script (using the Galaxy library functions and +Biopython). It is available from the Galaxy Tool Shed here: +http://toolshed.g2.bx.psu.edu/view/peterjc/seq_primer_clip + + +Automated Installation +====================== + +This should be straightforward using the Galaxy Tool Shed, which should be +able to automatically install the dependency on Biopython, and then install +this tool and run its unit tests. + + +Manual Installation +=================== + +There are just two files to install: + +* seq_primer_clip.py (the Python script) +* seq_primer_clip.xml (the Galaxy tool definition) + +The suggested location is a new tools/seq_primer_clip folder. You will also +need to modify the tools_conf.xml file to tell Galaxy to offer the tool:: + + <tool file="seq_primer_clip/seq_primer_clip.xml" /> + +If you wish to run the unit tests, also add this to tools_conf.xml.sample +and move/copy the test-data files under Galaxy's test-data folder. Then:: + + $ ./run_functional_tests.sh -id seq_primer_clip + +You will also need to install Biopython 1.54 or later. That's it. 
+ + +History +======= + +======= ====================================================================== +Version Changes +------- ---------------------------------------------------------------------- +v0.0.1 - Initial version (not publicly released) +v0.0.2 - Sort primers by length (longest and therefore most specific first) +v0.0.3 - Consider missing bases at start/end of read as mismatches +v0.0.4 - Apply minimum length to sequences with no match too +v0.0.5 - Count clipped & non-matched short reads separately, length bug fixes +v0.0.6 - Added some functional tests +v0.0.7 - Added error check for bad filename arguments +v0.0.8 - Record version of Python script when run from Galaxy. + - Check for errors using Python script's return code. +v0.0.9 - Moved test data to work around Galaxy Tool Shed limitation. +v0.0.10 - Include links to Tool Shed in help text and this README file. + - Use reStructuredText for this README file. + - Adopted standard MIT licence. + - Automated installation of Biopython dependency. + - Development moved to GitHub, https://github.com/peterjc/pico_galaxy + - Renamed folder and adopted README.rst naming. 
+======= ====================================================================== + + +Developers +========== + +This script and related tools were initially developed on the following hg branches: +http://bitbucket.org/peterjc/galaxy-central/src/fasta_filter +http://bitbucket.org/peterjc/galaxy-central/src/tools + +Development has now moved to a dedicated GitHub repository: +https://github.com/peterjc/pico_galaxy + +For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use +the following command from the Galaxy root folder:: + + $ tar -czf seq_primer_clip.tar.gz tools/seq_primer_clip/README.rst tools/seq_primer_clip/seq_primer_clip.* tools/seq_primer_clip/repository_dependencies.xml test-data/dop_primers.fasta test-data/MID4_GLZRM4E04_rnd30* + +Check this worked:: + + $ tar -tzf seq_primer_clip.tar.gz + tools/seq_primer_clip/README.rst + tools/seq_primer_clip/seq_primer_clip.xml + tools/seq_primer_clip/seq_primer_clip.py + tools/seq_primer_clip/repository_dependencies.xml + test-data/dop_primers.fasta + test-data/MID4_GLZRM4E04_rnd30.fasta + test-data/MID4_GLZRM4E04_rnd30.fastqsanger + test-data/MID4_GLZRM4E04_rnd30_fclip.fasta + test-data/MID4_GLZRM4E04_rnd30_fclip.fastqsanger + test-data/MID4_GLZRM4E04_rnd30_fclip.sff + test-data/MID4_GLZRM4E04_rnd30_frclip.fasta + test-data/MID4_GLZRM4E04_rnd30_frclip.fastqsanger + test-data/MID4_GLZRM4E04_rnd30_frclip.sff + test-data/MID4_GLZRM4E04_rnd30.sff + + +Licence (MIT) +============= + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all 
copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. |
b |
diff -r 8c02a91a8680 -r ee5acea162a7 tools/seq_primer_clip/repository_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_primer_clip/repository_dependencies.xml Thu Oct 24 09:37:25 2013 -0400 |
b |
@@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<repositories description="This requires Biopython as a dependency."> +<!-- Leave out the tool shed and revision to get the current + tool shed and latest revision at the time of upload --> +<repository changeset_revision="3e82cbc44886" name="package_biopython_1_62" owner="biopython" toolshed="http://toolshed.g2.bx.psu.edu" /> +</repositories> |
b |
diff -r 8c02a91a8680 -r ee5acea162a7 tools/seq_primer_clip/seq_primer_clip.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_primer_clip/seq_primer_clip.py Thu Oct 24 09:37:25 2013 -0400 |
[ |
b'@@ -0,0 +1,358 @@\n+#!/usr/bin/env python\n+"""Looks for the given primer sequences and clips matching SFF reads.\n+\n+Takes eight command line options, input read filename, input read format,\n+input primer FASTA filename, type of primers (forward, reverse or reverse-\n+complement), number of mismatches (currently only 0, 1 and 2 are supported),\n+minimum length to keep a read (after primer trimming), should primer-less\n+reads be kept (boolean), and finally the output sequence filename.\n+\n+Both the primer and read sequences can contain IUPAC ambiguity codes like N.\n+\n+This supports FASTA, FASTQ and SFF sequence files. Colorspace reads are not\n+supported.\n+\n+The mismatch parameter does not consider gapped alignemnts, however the\n+special case of missing bases at the very start or end of the read is handled.\n+e.g. a primer sequence CCGACTCGAG will match a read starting CGACTCGAG...\n+if one or more mismatches are allowed.\n+\n+This can also be used for stripping off (and optionally filtering on) barcodes.\n+\n+Note that only the trim/clip values in the SFF file are changed, not the flow\n+information of the full read sequence.\n+\n+This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute\n+(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.\n+See accompanying text file for licence details (MIT/BSD style).\n+\n+This is version 0.0.8 of the script. 
Currently it uses Python\'s regular\n+expression engine for finding the primers, which for my needs is fast enough.\n+"""\n+import sys\n+import re\n+from galaxy_utils.sequence.fasta import fastaReader, fastaWriter\n+from galaxy_utils.sequence.fastq import fastqReader, fastqWriter\n+\n+if "-v" in sys.argv or "--version" in sys.argv:\n+ print "v0.0.5"\n+ sys.exit(0)\n+\n+def stop_err(msg, err=1):\n+ sys.stderr.write(msg)\n+ sys.exit(err)\n+\n+try:\n+ from Bio.Seq import reverse_complement\n+ from Bio.SeqIO.SffIO import SffIterator, SffWriter\n+except ImportError:\n+ stop_err("Requires Biopython 1.54 or later")\n+try:\n+ from Bio.SeqIO.SffIO import ReadRocheXmlManifest\n+except ImportError:\n+ #Prior to Biopython 1.56 this was a private function\n+ from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest\n+\n+#Parse Command Line\n+try:\n+ in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:]\n+except ValueError:\n+ stop_err("Expected 8 arguments, got %i:\\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))\n+\n+if in_file == primer_fasta:\n+ stop_err("Same file given as both primer sequences and sequences to clip!")\n+if in_file == out_file:\n+ stop_err("Same file given as both sequences to clip and output!")\n+if primer_fasta == out_file:\n+ stop_err("Same file given as both primer sequences and output!")\n+\n+try:\n+ mm = int(mm)\n+except ValueError:\n+ stop_err("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm)\n+if mm < 0:\n+ stop_err("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm)\n+if mm not in [0,1,2]:\n+ raise NotImplementedError\n+\n+try:\n+ min_len = int(min_len)\n+except ValueError:\n+ stop_err("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len)\n+if min_len < 0:\n+ stop_err("Expected non-negtive integer min_len (e.g. 
0 or 1), not %r" % min_len)\n+\n+\n+if keep_negatives.lower() in ["true", "yes", "on"]:\n+ keep_negatives = True\n+elif keep_negatives.lower() in ["false", "no", "off"]:\n+ keep_negatives = False\n+else:\n+ stop_err("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives)\n+\n+\n+if primer_type.lower() == "forward":\n+ forward = True\n+ rc = False\n+elif primer_type.lower() == "reverse":\n+ forward = False\n+ rc = False\n+elif primer_type.lower() == "reverse-complement":\n+ forward = False\n+ rc = True\n+else:\n+ stop_err("Expected foward, reverse or reverse-complement not %r" % primer_type)\n+\n+\n+ambiguous_dna_values = {\n+ "A": "A",\n+ "C": '..b'e = open(in_file, "rb")\n+ try:\n+ manifest = ReadRocheXmlManifest(in_handle)\n+ except ValueError:\n+ manifest = None\n+ in_handle.seek(0)\n+ out_handle = open(out_file, "wb")\n+ writer = SffWriter(out_handle, xml=manifest)\n+ writer.write_file(process(SffIterator(in_handle)))\n+ #End of SFF code\n+elif seq_format.lower().startswith("fastq"):\n+ in_handle = open(in_file, "rU")\n+ out_handle = open(out_file, "w")\n+ reader = fastqReader(in_handle)\n+ writer = fastqWriter(out_handle)\n+ if forward:\n+ for record in reader:\n+ seq = record.sequence.upper()\n+ result = primer.search(seq)\n+ if result:\n+ #Forward primer, take everything after it\n+ cut = result.end()\n+ record.sequence = seq[cut:]\n+ if len(record.sequence) >= min_len:\n+ record.quality = record.quality[cut:]\n+ clipped += 1\n+ writer.write(record)\n+ else:\n+ short_clipped += 1\n+ elif keep_negatives:\n+ if len(record) >= min_len:\n+ negs += 1\n+ writer.write(record)\n+ else:\n+ short_negs += 1\n+ else:\n+ for record in reader:\n+ seq = record.sequence.upper()\n+ result = primer.search(seq)\n+ if result:\n+ #Reverse primer, take everything before it\n+ cut = result.start()\n+ record.sequence = seq[:cut]\n+ if len(record.sequence) >= min_len:\n+ record.quality = record.quality[:cut]\n+ clipped += 1\n+ writer.write(record)\n+ 
else:\n+ short_clipped += 1\n+ elif keep_negatives:\n+ if len(record) >= min_len:\n+ negs += 1\n+ writer.write(record)\n+ else:\n+ short_negs += 1\n+elif seq_format.lower()=="fasta":\n+ in_handle = open(in_file, "rU")\n+ out_handle = open(out_file, "w")\n+ reader = fastaReader(in_handle)\n+ writer = fastaWriter(out_handle)\n+ #Following code is identical to that for FASTQ but without editing qualities\n+ if forward:\n+ for record in reader:\n+ seq = record.sequence.upper()\n+ result = primer.search(seq)\n+ if result:\n+ #Forward primer, take everything after it\n+ cut = result.end()\n+ record.sequence = seq[cut:]\n+ if len(record.sequence) >= min_len:\n+ clipped += 1\n+ writer.write(record)\n+ else:\n+ short_clipped += 1\n+ elif keep_negatives:\n+ if len(record) >= min_len:\n+ negs += 1\n+ writer.write(record)\n+ else:\n+ short_negs += 1\n+ else:\n+ for record in reader:\n+ seq = record.sequence.upper()\n+ result = primer.search(seq)\n+ if result:\n+ #Reverse primer, take everything before it\n+ cut = result.start()\n+ record.sequence = seq[:cut]\n+ if len(record.sequence) >= min_len:\n+ clipped += 1\n+ writer.write(record)\n+ else:\n+ short_clipped += 1\n+ elif keep_negatives:\n+ if len(record) >= min_len:\n+ negs += 1\n+ writer.write(record)\n+ else:\n+ short_negs += 1\n+else:\n+ stop_err("Unsupported file type %r" % seq_format)\n+in_handle.close()\n+out_handle.close()\n+\n+print "Kept %i clipped reads," % clipped\n+print "discarded %i short." % short_clipped\n+if keep_negatives:\n+ print "Kept %i non-matching reads," % negs\n+ print "discarded %i short." % short_neg\n' |
b |
diff -r 8c02a91a8680 -r ee5acea162a7 tools/seq_primer_clip/seq_primer_clip.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_primer_clip/seq_primer_clip.xml Thu Oct 24 09:37:25 2013 -0400 |
b |
@@ -0,0 +1,143 @@ +<tool id="seq_primer_clip" name="Primer clip sequences" version="0.0.10"> + <description>Trim off 5' or 3' primers</description> + <requirements> + <requirement type="package" version="1.62">biopython</requirement> + <requirement type="python-module">Bio</requirement> + </requirements> + <version_command interpreter="python">seq_primer_clip.py --version</version_command> + <command interpreter="python"> +seq_primer_clip.py $input_file $input_file.ext $primer_fasta $primer_type $mm $min_len $keep_negatives $output_file + </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> + <inputs> + <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to clip" description="FASTA, FASTQ, or SFF format."/> + <param name="primer_fasta" type="data" format="fasta" label="FASTA file containing primer(s)"/> + <param name="primer_type" type="select" label="Type of primers"> + <option value="Forward">Forward (5') primers</option> + <option value="Reverse">Reverse (3') primers (given with respect to the forward strand)</option> + <option value="Reverse-complement">Reverse (3') primers (given with respect to the reverse strand)</option> + </param> + <param name="mm" type="integer" value="0" label="How many mismatches to allow? 
(0, 1 or 2)"> + <validator type="in_range" min="0" max="2" /> + </param> + <param name="keep_negatives" type="boolean" value="false" label="Keep reads with no matched primer"/> + <param name="min_len" type="integer" label="Minimum length for (clipped) sequences " value="1"/> + </inputs> + <outputs> + <data name="output_file" format="data" label="$primer_type primer clipped"> + <!-- TODO - Replace this with format="input:input_fastq" if/when that works --> + <change_format> + <when input_dataset="input_file" attribute="extension" value="sff" format="sff" /> + <when input_dataset="input_file" attribute="extension" value="fasta" format="fasta" /> + <when input_dataset="input_file" attribute="extension" value="fastq" format="fastq" /> + <when input_dataset="input_file" attribute="extension" value="fastqsanger" format="fastqsanger" /> + <when input_dataset="input_file" attribute="extension" value="fastqsolexa" format="fastqsolexa" /> + <when input_dataset="input_file" attribute="extension" value="fastqillumina" format="fastqillumina" /> + <when input_dataset="input_file" attribute="extension" value="fastqcssanger" format="fastqcssanger" /> + </change_format> + </data> + </outputs> + <tests> + <test> + <param name="input_file" value="MID4_GLZRM4E04_rnd30.fasta" ftype="fasta" /> + <param name="primer_fasta" value="dop_primers.fasta" /> + <param name="primer_type" value="Forward" /> + <param name="mm" value="2" /> + <param name="keep_negatives" value="false" /> + <param name="min_len" value="35" /> + <output name="output_file" file="MID4_GLZRM4E04_rnd30_fclip.fasta" ftype="fasta" /> + </test> + <test> + <param name="input_file" value="MID4_GLZRM4E04_rnd30.fastqsanger" ftype="fastqsanger" /> + <param name="primer_fasta" value="dop_primers.fasta" /> + <param name="primer_type" value="Forward" /> + <param name="mm" value="2" /> + <param name="keep_negatives" value="false" /> + <param name="min_len" value="35" /> + <output name="output_file" 
file="MID4_GLZRM4E04_rnd30_fclip.fastqsanger" ftype="fastqsanger" /> + </test> + <test> + <param name="input_file" value="MID4_GLZRM4E04_rnd30.sff" ftype="sff" /> + <param name="primer_fasta" value="dop_primers.fasta" /> + <param name="primer_type" value="Forward" /> + <param name="mm" value="2" /> + <param name="keep_negatives" value="false" /> + <param name="min_len" value="35" /> + <output name="output_file" file="MID4_GLZRM4E04_rnd30_fclip.sff" ftype="sff" /> + </test> + <test> + <param name="input_file" value="MID4_GLZRM4E04_rnd30_fclip.fasta" ftype="fasta" /> + <param name="primer_fasta" value="dop_primers.fasta" /> + <param name="primer_type" value="Reverse" /> + <param name="mm" value="2" /> + <param name="keep_negatives" value="true" /> + <param name="min_len" value="35" /> + <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.fasta" ftype="fasta" /> + </test> + <test> + <param name="input_file" value="MID4_GLZRM4E04_rnd30_fclip.fastqsanger" ftype="fastqsanger" /> + <param name="primer_fasta" value="dop_primers.fasta" /> + <param name="primer_type" value="Reverse" /> + <param name="mm" value="2" /> + <param name="keep_negatives" value="true" /> + <param name="min_len" value="35" /> + <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.fastqsanger" ftype="fastqsanger" /> + </test> + <test> + <param name="input_file" value="MID4_GLZRM4E04_rnd30_fclip.sff" ftype="sff" /> + <param name="primer_fasta" value="dop_primers.fasta" /> + <param name="primer_type" value="Reverse" /> + <param name="mm" value="2" /> + <param name="keep_negatives" value="true" /> + <param name="min_len" value="35" /> + <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> + </test> + </tests> + <requirements> + <requirement type="python-module">Bio</requirement> + </requirements> + <help> + +**What it does** + +Looks for the given primer sequences (within the existing clipped sequence) and +further clips the reads to remove the primers and any 
preceding/trailing sequence. + +Reads containing a forward primer are reduced to just the sequence after (and +excluding) the forward primer. + +Reads containing a reverse primer are reduced to just the sequence before (and +excluding) the reverse primer. + +Degenerate primers can be specified using the standard IUPAC ambiguity codes, +thus a primer with an N would match A, C, T or G (or any of the IUPAC ambiguity +codes) and so on. + +Note that for SFF files only the clip/trim positions are edited - you will still +be able to extract the original full read (with any adapter sequence and poor +quality sequence) if you need to. + +.. class:: warningmark + +**Note**. This tool was initially written for Roche 454 data, and should also +work fine on Sanger or Ion Torrent as well. However, it is probably too slow +for use on large Illumina datasets. + + +**Citation** + +This tool uses Biopython. If you use this tool in scientific work leading to a +publication, please cite: + +Cock et al 2009. Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + +This tool is available to install into other Galaxy Instances via the Galaxy +Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_primer_clip + </help> +</tool> |