# HG changeset patch
# User peterjc
# Date 1382621845 14400
# Node ID ee5acea162a7744cd8f452fa298466cf67a95ac4
# Parent 8c02a91a868019bbb9e57ea03a05a6f5b86d4df6
Uploaded v0.0.10, README now using RST, MIT licence, automatic Biopython dependency
diff -r 8c02a91a8680 -r ee5acea162a7 tools/primers/seq_primer_clip.py
--- a/tools/primers/seq_primer_clip.py Tue Apr 30 11:04:43 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,358 +0,0 @@
-#!/usr/bin/env python
-"""Looks for the given primer sequences and clips matching SFF reads.
-
-Takes eight command line options, input read filename, input read format,
-input primer FASTA filename, type of primers (forward, reverse or reverse-
-complement), number of mismatches (currently only 0, 1 and 2 are supported),
-minimum length to keep a read (after primer trimming), should primer-less
-reads be kept (boolean), and finally the output sequence filename.
-
-Both the primer and read sequences can contain IUPAC ambiguity codes like N.
-
-This supports FASTA, FASTQ and SFF sequence files. Colorspace reads are not
-supported.
-
-The mismatch parameter does not consider gapped alignemnts, however the
-special case of missing bases at the very start or end of the read is handled.
-e.g. a primer sequence CCGACTCGAG will match a read starting CGACTCGAG...
-if one or more mismatches are allowed.
-
-This can also be used for stripping off (and optionally filtering on) barcodes.
-
-Note that only the trim/clip values in the SFF file are changed, not the flow
-information of the full read sequence.
-
-This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
-(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
-See accompanying text file for licence details (MIT/BSD style).
-
-This is version 0.0.8 of the script. Currently it uses Python's regular
-expression engine for finding the primers, which for my needs is fast enough.
-"""
-import sys
-import re
-from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
-from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
-
-if "-v" in sys.argv or "--version" in sys.argv:
- print "v0.0.5"
- sys.exit(0)
-
-def stop_err(msg, err=1):
- sys.stderr.write(msg)
- sys.exit(err)
-
-try:
- from Bio.Seq import reverse_complement
- from Bio.SeqIO.SffIO import SffIterator, SffWriter
-except ImportError:
- stop_err("Requires Biopython 1.54 or later")
-try:
- from Bio.SeqIO.SffIO import ReadRocheXmlManifest
-except ImportError:
- #Prior to Biopython 1.56 this was a private function
- from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
-
-#Parse Command Line
-try:
- in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:]
-except ValueError:
- stop_err("Expected 8 arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
-
-if in_file == primer_fasta:
- stop_err("Same file given as both primer sequences and sequences to clip!")
-if in_file == out_file:
- stop_err("Same file given as both sequences to clip and output!")
-if primer_fasta == out_file:
- stop_err("Same file given as both primer sequences and output!")
-
-try:
- mm = int(mm)
-except ValueError:
- stop_err("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm)
-if mm < 0:
- stop_err("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm)
-if mm not in [0,1,2]:
- raise NotImplementedError
-
-try:
- min_len = int(min_len)
-except ValueError:
- stop_err("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len)
-if min_len < 0:
- stop_err("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len)
-
-
-if keep_negatives.lower() in ["true", "yes", "on"]:
- keep_negatives = True
-elif keep_negatives.lower() in ["false", "no", "off"]:
- keep_negatives = False
-else:
- stop_err("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives)
-
-
-if primer_type.lower() == "forward":
- forward = True
- rc = False
-elif primer_type.lower() == "reverse":
- forward = False
- rc = False
-elif primer_type.lower() == "reverse-complement":
- forward = False
- rc = True
-else:
- stop_err("Expected foward, reverse or reverse-complement not %r" % primer_type)
-
-
-ambiguous_dna_values = {
- "A": "A",
- "C": "C",
- "G": "G",
- "T": "T",
- "M": "ACM",
- "R": "AGR",
- "W": "ATW",
- "S": "CGS",
- "Y": "CTY",
- "K": "GTK",
- "V": "ACGMRSV",
- "H": "ACTMWYH",
- "D": "AGTRWKD",
- "B": "CGTSYKB",
- "X": ".", #faster than [GATCMRWSYKVVHDBXN] or even [GATC]
- "N": ".",
- }
-
-ambiguous_dna_re = {}
-for letter, values in ambiguous_dna_values.iteritems():
- if len(values) == 1:
- ambiguous_dna_re[letter] = values
- else:
- ambiguous_dna_re[letter] = "[%s]" % values
-
-
-def make_reg_ex(seq):
- return "".join(ambiguous_dna_re[letter] for letter in seq)
-
-def make_reg_ex_mm(seq, mm):
- if mm > 2:
- raise NotImplementedError("At most 2 mismatches allowed!")
- seq = seq.upper()
- yield make_reg_ex(seq)
- for i in range(1,mm+1):
- #Missing first/last i bases at very start/end of sequence
- for reg in make_reg_ex_mm(seq[i:], mm-i):
- yield "^" + reg
- for reg in make_reg_ex_mm(seq[:-i], mm-i):
- yield "$" + reg
- if mm >= 1:
- for i,letter in enumerate(seq):
- #We'll use a set to remove any duplicate patterns
- #if letter not in "NX":
- pattern = seq[:i] + "N" + seq[i+1:]
- assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \
- % (pattern, len(pattern), seq, len(seq))
- yield make_reg_ex(pattern)
- if mm >=2:
- for i,letter in enumerate(seq):
- #We'll use a set to remove any duplicate patterns
- #if letter not in "NX":
- for k,letter in enumerate(seq[i+1:]):
- #We'll use a set to remove any duplicate patterns
- #if letter not in "NX":
- pattern = seq[:i] + "N" + seq[i+1:i+1+k] + "N" + seq[i+k+2:]
- assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \
- % (pattern, len(pattern), seq, len(seq))
- yield make_reg_ex(pattern)
-
-def load_primers_as_re(primer_fasta, mm, rc=False):
- #Read primer file and record all specified sequences
- primers = set()
- in_handle = open(primer_fasta, "rU")
- reader = fastaReader(in_handle)
- count = 0
- for record in reader:
- if rc:
- seq = reverse_complement(record.sequence)
- else:
- seq = record.sequence
- #primers.add(re.compile(make_reg_ex(seq)))
- count += 1
- for pattern in make_reg_ex_mm(seq, mm):
- primers.add(pattern)
- in_handle.close()
- #Use set to avoid duplicates, sort to have longest first
- #(so more specific primers found before less specific ones)
- primers = sorted(set(primers), key=lambda p: -len(p))
- return count, re.compile("|".join(primers)) #make one monster re!
-
-
-
-#Read primer file and record all specified sequences
-count, primer = load_primers_as_re(primer_fasta, mm, rc)
-print "%i primer sequences" % count
-
-short_neg = 0
-short_clipped = 0
-clipped = 0
-negs = 0
-
-if seq_format.lower()=="sff":
- #SFF is different because we just change the trim points
- if forward:
- def process(records):
- global short_clipped, short_neg, clipped, negs
- for record in records:
- left_clip = record.annotations["clip_qual_left"]
- right_clip = record.annotations["clip_qual_right"]
- seq = str(record.seq)[left_clip:right_clip].upper()
- result = primer.search(seq)
- if result:
- #Forward primer, take everything after it
- #so move the left clip along
- if len(seq) - result.end() >= min_len:
- record.annotations["clip_qual_left"] = left_clip + result.end()
- clipped += 1
- yield record
- else:
- short_clipped += 1
- elif keep_negatives:
- if len(seq) >= min_len:
- negs += 1
- yield record
- else:
- short_neg += 1
- else:
- def process(records):
- global short_clipped, short_neg, clipped, negs
- for record in records:
- left_clip = record.annotations["clip_qual_left"]
- right_clip = record.annotations["clip_qual_right"]
- seq = str(record.seq)[left_clip:right_clip].upper()
- result = primer.search(seq)
- if result:
- #Reverse primer, take everything before it
- #so move the right clip back
- new_len = result.start()
- if new_len >= min_len:
- record.annotations["clip_qual_right"] = left_clip + new_len
- clipped += 1
- yield record
- else:
- short_clipped += 1
- elif keep_negatives:
- if len(seq) >= min_len:
- negs += 1
- yield record
- else:
- short_neg += 1
-
- in_handle = open(in_file, "rb")
- try:
- manifest = ReadRocheXmlManifest(in_handle)
- except ValueError:
- manifest = None
- in_handle.seek(0)
- out_handle = open(out_file, "wb")
- writer = SffWriter(out_handle, xml=manifest)
- writer.write_file(process(SffIterator(in_handle)))
- #End of SFF code
-elif seq_format.lower().startswith("fastq"):
- in_handle = open(in_file, "rU")
- out_handle = open(out_file, "w")
- reader = fastqReader(in_handle)
- writer = fastqWriter(out_handle)
- if forward:
- for record in reader:
- seq = record.sequence.upper()
- result = primer.search(seq)
- if result:
- #Forward primer, take everything after it
- cut = result.end()
- record.sequence = seq[cut:]
- if len(record.sequence) >= min_len:
- record.quality = record.quality[cut:]
- clipped += 1
- writer.write(record)
- else:
- short_clipped += 1
- elif keep_negatives:
- if len(record) >= min_len:
- negs += 1
- writer.write(record)
- else:
- short_negs += 1
- else:
- for record in reader:
- seq = record.sequence.upper()
- result = primer.search(seq)
- if result:
- #Reverse primer, take everything before it
- cut = result.start()
- record.sequence = seq[:cut]
- if len(record.sequence) >= min_len:
- record.quality = record.quality[:cut]
- clipped += 1
- writer.write(record)
- else:
- short_clipped += 1
- elif keep_negatives:
- if len(record) >= min_len:
- negs += 1
- writer.write(record)
- else:
- short_negs += 1
-elif seq_format.lower()=="fasta":
- in_handle = open(in_file, "rU")
- out_handle = open(out_file, "w")
- reader = fastaReader(in_handle)
- writer = fastaWriter(out_handle)
- #Following code is identical to that for FASTQ but without editing qualities
- if forward:
- for record in reader:
- seq = record.sequence.upper()
- result = primer.search(seq)
- if result:
- #Forward primer, take everything after it
- cut = result.end()
- record.sequence = seq[cut:]
- if len(record.sequence) >= min_len:
- clipped += 1
- writer.write(record)
- else:
- short_clipped += 1
- elif keep_negatives:
- if len(record) >= min_len:
- negs += 1
- writer.write(record)
- else:
- short_negs += 1
- else:
- for record in reader:
- seq = record.sequence.upper()
- result = primer.search(seq)
- if result:
- #Reverse primer, take everything before it
- cut = result.start()
- record.sequence = seq[:cut]
- if len(record.sequence) >= min_len:
- clipped += 1
- writer.write(record)
- else:
- short_clipped += 1
- elif keep_negatives:
- if len(record) >= min_len:
- negs += 1
- writer.write(record)
- else:
- short_negs += 1
-else:
- stop_err("Unsupported file type %r" % seq_format)
-in_handle.close()
-out_handle.close()
-
-print "Kept %i clipped reads," % clipped
-print "discarded %i short." % short_clipped
-if keep_negatives:
- print "Kept %i non-matching reads," % negs
- print "discarded %i short." % short_neg
diff -r 8c02a91a8680 -r ee5acea162a7 tools/primers/seq_primer_clip.py.orig
--- a/tools/primers/seq_primer_clip.py.orig Tue Apr 30 11:04:43 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,358 +0,0 @@
-#!/usr/bin/env python
-"""Looks for the given primer sequences and clips matching SFF reads.
-
-Takes eight command line options, input read filename, input read format,
-input primer FASTA filename, type of primers (forward, reverse or reverse-
-complement), number of mismatches (currently only 0, 1 and 2 are supported),
-minimum length to keep a read (after primer trimming), should primer-less
-reads be kept (boolean), and finally the output sequence filename.
-
-Both the primer and read sequences can contain IUPAC ambiguity codes like N.
-
-This supports FASTA, FASTQ and SFF sequence files. Colorspace reads are not
-supported.
-
-The mismatch parameter does not consider gapped alignemnts, however the
-special case of missing bases at the very start or end of the read is handled.
-e.g. a primer sequence CCGACTCGAG will match a read starting CGACTCGAG...
-if one or more mismatches are allowed.
-
-This can also be used for stripping off (and optionally filtering on) barcodes.
-
-Note that only the trim/clip values in the SFF file are changed, not the flow
-information of the full read sequence.
-
-This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
-(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
-See accompanying text file for licence details (MIT/BSD style).
-
-This is version 0.0.8 of the script. Currently it uses Python's regular
-expression engine for finding the primers, which for my needs is fast enough.
-"""
-import sys
-import re
-from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
-from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
-
-if "-v" in sys.argv or "--version" in sys.argv:
- print "v0.0.5"
- sys.exit(0)
-
-def stop_err(msg, err=1):
- sys.stderr.write(msg)
- sys.exit(err)
-
-try:
- from Bio.Seq import reverse_complement
- from Bio.SeqIO.SffIO import SffIterator, SffWriter
-except ImportError:
- stop_err("Requires Biopython 1.54 or later")
-try:
- from Bio.SeqIO.SffIO import ReadRocheXmlManifest
-except ImportError:
- #Prior to Biopython 1.56 this was a private function
- from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
-
-#Parse Command Line
-try:
- in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:]
-except ValueError:
- stop_err("Expected 8 arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
-
-if in_file == primer_fasta:
- stop_err("Same file given as both primer sequences and sequences to clip!")
-if in_file == out_file:
- stop_err("Same file given as both sequences to clip and output!")
-if primer_fasta == out_file:
- stop_err("Same file given as both primer sequences and output!")
-
-try:
- mm = int(mm)
-except ValueError:
- stop_err("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm)
-if mm < 0:
- stop_err("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm)
-if mm not in [0,1,2]:
- raise NotImplementedError
-
-try:
- min_len = int(min_len)
-except ValueError:
- stop_err("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len)
-if min_len < 0:
- stop_err("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len)
-
-
-if keep_negatives.lower() in ["true", "yes", "on"]:
- keep_negatives = True
-elif keep_negatives.lower() in ["false", "no", "off"]:
- keep_negatives = False
-else:
- stop_err("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives)
-
-
-if primer_type.lower() == "forward":
- forward = True
- rc = False
-elif primer_type.lower() == "reverse":
- forward = False
- rc = False
-elif primer_type.lower() == "reverse-complement":
- forward = False
- rc = True
-else:
- stop_err("Expected foward, reverse or reverse-complement not %r" % primer_type)
-
-
-ambiguous_dna_values = {
- "A": "A",
- "C": "C",
- "G": "G",
- "T": "T",
- "M": "ACM",
- "R": "AGR",
- "W": "ATW",
- "S": "CGS",
- "Y": "CTY",
- "K": "GTK",
- "V": "ACGMRSV",
- "H": "ACTMWYH",
- "D": "AGTRWKD",
- "B": "CGTSYKB",
- "X": ".", #faster than [GATCMRWSYKVVHDBXN] or even [GATC]
- "N": ".",
- }
-
-ambiguous_dna_re = {}
-for letter, values in ambiguous_dna_values.iteritems():
- if len(values) == 1:
- ambiguous_dna_re[letter] = values
- else:
- ambiguous_dna_re[letter] = "[%s]" % values
-
-
-def make_reg_ex(seq):
- return "".join(ambiguous_dna_re[letter] for letter in seq)
-
-def make_reg_ex_mm(seq, mm):
- if mm > 2:
- raise NotImplementedError("At most 2 mismatches allowed!")
- seq = seq.upper()
- yield make_reg_ex(seq)
- for i in range(1,mm+1):
- #Missing first/last i bases at very start/end of sequence
- for reg in make_reg_ex_mm(seq[i:], mm-i):
- yield "^" + reg
- for reg in make_reg_ex_mm(seq[:-i], mm-i):
- yield "$" + reg
- if mm >= 1:
- for i,letter in enumerate(seq):
- #We'll use a set to remove any duplicate patterns
- #if letter not in "NX":
- pattern = seq[:i] + "N" + seq[i+1:]
- assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \
- % (pattern, len(pattern), seq, len(seq))
- yield make_reg_ex(pattern)
- if mm >=2:
- for i,letter in enumerate(seq):
- #We'll use a set to remove any duplicate patterns
- #if letter not in "NX":
- for k,letter in enumerate(seq[i+1:]):
- #We'll use a set to remove any duplicate patterns
- #if letter not in "NX":
- pattern = seq[:i] + "N" + seq[i+1:i+1+k] + "N" + seq[i+k+2:]
- assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \
- % (pattern, len(pattern), seq, len(seq))
- yield make_reg_ex(pattern)
-
-def load_primers_as_re(primer_fasta, mm, rc=False):
- #Read primer file and record all specified sequences
- primers = set()
- in_handle = open(primer_fasta, "rU")
- reader = fastaReader(in_handle)
- count = 0
- for record in reader:
- if rc:
- seq = reverse_complement(record.sequence)
- else:
- seq = record.sequence
- #primers.add(re.compile(make_reg_ex(seq)))
- count += 1
- for pattern in make_reg_ex_mm(seq, mm):
- primers.add(pattern)
- in_handle.close()
- #Use set to avoid duplicates, sort to have longest first
- #(so more specific primers found before less specific ones)
- primers = sorted(set(primers), key=lambda p: -len(p))
- return count, re.compile("|".join(primers)) #make one monster re!
-
-
-
-#Read primer file and record all specified sequences
-count, primer = load_primers_as_re(primer_fasta, mm, rc)
-print "%i primer sequences" % count
-
-short_neg = 0
-short_clipped = 0
-clipped = 0
-negs = 0
-
-if seq_format.lower()=="sff":
- #SFF is different because we just change the trim points
- if forward:
- def process(records):
- global short_clipped, short_neg, clipped, negs
- for record in records:
- left_clip = record.annotations["clip_qual_left"]
- right_clip = record.annotations["clip_qual_right"]
- seq = str(record.seq)[left_clip:right_clip].upper()
- result = primer.search(seq)
- if result:
- #Forward primer, take everything after it
- #so move the left clip along
- if len(seq) - result.end() >= min_len:
- record.annotations["clip_qual_left"] = left_clip + result.end()
- clipped += 1
- yield record
- else:
- short_clipped += 1
- elif keep_negatives:
- if len(seq) >= min_len:
- negs += 1
- yield record
- else:
- short_neg += 1
- else:
- def process(records):
- global short_clipped, short_neg, clipped, negs
- for record in records:
- left_clip = record.annotations["clip_qual_left"]
- right_clip = record.annotations["clip_qual_right"]
- seq = str(record.seq)[left_clip:right_clip].upper()
- result = primer.search(seq)
- if result:
- #Reverse primer, take everything before it
- #so move the right clip back
- new_len = result.start()
- if new_len >= min_len:
- record.annotations["clip_qual_right"] = left_clip + new_len
- clipped += 1
- yield record
- else:
- short_clipped += 1
- elif keep_negatives:
- if len(seq) >= min_len:
- negs += 1
- yield record
- else:
- short_neg += 1
-
- in_handle = open(in_file, "rb")
- try:
- manifest = ReadRocheXmlManifest(in_handle)
- except ValueError:
- manifest = None
- in_handle.seek(0)
- out_handle = open(out_file, "wb")
- writer = SffWriter(out_handle, xml=manifest)
- writer.write_file(process(SffIterator(in_handle)))
- #End of SFF code
-elif seq_format.lower().startswith("fastq"):
- in_handle = open(in_file, "rU")
- out_handle = open(out_file, "w")
- reader = fastqReader(in_handle)
- writer = fastqWriter(out_handle)
- if forward:
- for record in reader:
- seq = record.sequence.upper()
- result = primer.search(seq)
- if result:
- #Forward primer, take everything after it
- cut = result.end()
- record.sequence = seq[cut:]
- if len(record.sequence) >= min_len:
- record.quality = record.quality[cut:]
- clipped += 1
- writer.write(record)
- else:
- short_clipped += 1
- elif keep_negatives:
- if len(record) >= min_len:
- negs += 1
- writer.write(record)
- else:
- short_negs += 1
- else:
- for record in reader:
- seq = record.sequence.upper()
- result = primer.search(seq)
- if result:
- #Reverse primer, take everything before it
- cut = result.start()
- record.sequence = seq[:cut]
- if len(record.sequence) >= min_len:
- record.quality = record.quality[:cut]
- clipped += 1
- writer.write(record)
- else:
- short_clipped += 1
- elif keep_negatives:
- if len(record) >= min_len:
- negs += 1
- writer.write(record)
- else:
- short_negs += 1
-elif seq_format.lower()=="fasta":
- in_handle = open(in_file, "rU")
- out_handle = open(out_file, "w")
- reader = fastaReader(in_handle)
- writer = fastaWriter(out_handle)
- #Following code is identical to that for FASTQ but without editing qualities
- if forward:
- for record in reader:
- seq = record.sequence.upper()
- result = primer.search(seq)
- if result:
- #Forward primer, take everything after it
- cut = result.end()
- record.sequence = seq[cut:]
- if len(record.sequence) >= min_len:
- clipped += 1
- writer.write(record)
- else:
- short_clipped += 1
- elif keep_negatives:
- if len(record) >= min_len:
- negs += 1
- writer.write(record)
- else:
- short_negs += 1
- else:
- for record in reader:
- seq = record.sequence.upper()
- result = primer.search(seq)
- if result:
- #Reverse primer, take everything before it
- cut = result.start()
- record.sequence = seq[:cut]
- if len(record.sequence) >= min_len:
- clipped += 1
- writer.write(record)
- else:
- short_clipped += 1
- elif keep_negatives:
- if len(record) >= min_len:
- negs += 1
- writer.write(record)
- else:
- short_negs += 1
-else:
- stop_err("Unsupported file type %r" % seq_format)
-in_handle.close()
-out_handle.close()
-
-print "Kept %i clipped reads," % clipped
-print "discarded %i short." % short_clipped
-if keep_negatives:
- print "Kept %i non-matching reads," % negs
- print "discarded %i short." % short_neg
diff -r 8c02a91a8680 -r ee5acea162a7 tools/primers/seq_primer_clip.txt
--- a/tools/primers/seq_primer_clip.txt Tue Apr 30 11:04:43 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,93 +0,0 @@
-Galaxy tool to primer clip (trim) FASTA, FASTQ or SFF reads
-===========================================================
-
-This tool is copyright 2011 by Peter Cock, The James Hutton Institute
-(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
-See the licence text below.
-
-This tool is a short Python script (using the Galaxy library functions and
-Biopython). There are just two files to install:
-
-* seq_primer_clip.py (the Python script)
-* seq_primer_clip.xml (the Galaxy tool definition)
-
-The suggested location is a new tools/primers folder. You will also need to
-modify the tools_conf.xml file to tell Galaxy to offer the tool:
-
-
-
-If you wish to run the unit tests, also add this to tools_conf.xml.sample
-and move/copy the test-data files under Galaxy's test-data folder. Then:
-
-$ ./run_functional_tests.sh -id seq_primer_clip
-
-You will also need to install Biopython 1.54 or later. That's it.
-
-
-History
-=======
-
-v0.0.1 - Initial version (not publicly released)
-v0.0.2 - Sort primers by length (longest and therefore most specific first)
-v0.0.3 - Consider missing bases at start/end of read as mismatches
-v0.0.4 - Apply minimum length to sequences with no match too
-v0.0.5 - Count clipped & non-matched short reads separately, length bug fixes
-v0.0.6 - Added some functional tests
-v0.0.7 - Added error check for bad filename arguments
-v0.0.8 - Record version of Python script when run from Galaxy.
- - Check for errors using Python script's return code.
-v0.0.9 - Moved test data to workaround Galaxy Tool Shed limititation.
-
-
-Developers
-==========
-
-This script and related tools are being developed on the following hg branch:
-http://bitbucket.org/peterjc/galaxy-central/src/tools
-
-This incorporates the previously used hg branch:
-http://bitbucket.org/peterjc/galaxy-central/src/fasta_filter
-
-For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
-the following command from the Galaxy root folder:
-
-$ tar -czf seq_primer_clip.tar.gz tools/primers/seq_primer_clip.* test-data/dop_primers.fasta test-data/MID4_GLZRM4E04_rnd30*
-
-Check this worked:
-
-$ tar -tzf seq_primer_clip.tar.gz
-tools/primers/seq_primer_clip.py
-tools/primers/seq_primer_clip.txt
-tools/primers/seq_primer_clip.xml
-test-data/dop_primers.fasta
-test-data/MID4_GLZRM4E04_rnd30.fasta
-test-data/MID4_GLZRM4E04_rnd30.fastqsanger
-test-data/MID4_GLZRM4E04_rnd30_fclip.fasta
-test-data/MID4_GLZRM4E04_rnd30_fclip.fastqsanger
-test-data/MID4_GLZRM4E04_rnd30_fclip.sff
-test-data/MID4_GLZRM4E04_rnd30_frclip.fasta
-test-data/MID4_GLZRM4E04_rnd30_frclip.fastqsanger
-test-data/MID4_GLZRM4E04_rnd30_frclip.sff
-test-data/MID4_GLZRM4E04_rnd30.sff
-
-
-Licence (MIT/BSD style)
-=======================
-
-Permission to use, copy, modify, and distribute this software and its
-documentation with or without modifications and for any purpose and
-without fee is hereby granted, provided that any copyright notices
-appear in all copies and that both those copyright notices and this
-permission notice appear in supporting documentation, and that the
-names of the contributors or copyright holders not be used in
-advertising or publicity pertaining to distribution of the software
-without specific prior permission.
-
-THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
-WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
-CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
-OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-OR PERFORMANCE OF THIS SOFTWARE.
diff -r 8c02a91a8680 -r ee5acea162a7 tools/primers/seq_primer_clip.txt.orig
--- a/tools/primers/seq_primer_clip.txt.orig Tue Apr 30 11:04:43 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,103 +0,0 @@
-Galaxy tool to primer clip (trim) FASTA, FASTQ or SFF reads
-===========================================================
-
-This tool is copyright 2011 by Peter Cock, The James Hutton Institute
-(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
-See the licence text below.
-
-This tool is a short Python script (using the Galaxy library functions and
-Biopython). There are just two files to install:
-
-* seq_primer_clip.py (the Python script)
-* seq_primer_clip.xml (the Galaxy tool definition)
-
-The suggested location is a new tools/primers folder. You will also need to
-modify the tools_conf.xml file to tell Galaxy to offer the tool and also do
-this to tools_conf.xml.sample in order to run the tests:
-
-
-
-There are optionally some example files required to run the functional tests,
-put these in the test-data/primers folder.
-
-test-data/primers/dop_primers.fasta
-test-data/primers/MID4_GLZRM4E04_rnd30.fasta
-test-data/primers/MID4_GLZRM4E04_rnd30.fastqsanger
-test-data/primers/MID4_GLZRM4E04_rnd30.sff
-test-data/primers/MID4_GLZRM4E04_rnd30_fclip.fasta
-test-data/primers/MID4_GLZRM4E04_rnd30_fclip.fastqsanger
-test-data/primers/MID4_GLZRM4E04_rnd30_fclip.sff
-
-You should then be able to run the tests with:
-
-sh run_functional_tests.sh -id seq_primer_clip
-
-You will also need to install Biopython 1.54 or later. That's it.
-
-
-History
-=======
-
-v0.0.1 - Initial version (not publicly released)
-v0.0.2 - Sort primers by length (longest and therefore most specific first)
-v0.0.3 - Consider missing bases at start/end of read as mismatches
-v0.0.4 - Apply minimum length to sequences with no match too
-v0.0.5 - Count clipped & non-matched short reads separately, length bug fixes
-v0.0.6 - Added some functional tests
-v0.0.7 - Added error check for bad filename arguments
-v0.0.8 - Record version of Python script when run from Galaxy.
- - Check for errors using Python script's return code.
-
-
-Developers
-==========
-
-This script and related tools are being developed on the following hg branch:
-http://bitbucket.org/peterjc/galaxy-central/src/tools
-
-This incorporates the previously used hg branch:
-http://bitbucket.org/peterjc/galaxy-central/src/fasta_filter
-
-For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
-the following command from the Galaxy root folder:
-
-$ tar -czf seq_primer_clip.tar.gz tools/primers/seq_primer_clip.* test-data/primers/*
-
-Check this worked:
-
-$ tar -tzf seq_primer_clip.tar.gz
-tools/primers/seq_primer_clip.py
-tools/primers/seq_primer_clip.txt
-tools/primers/seq_primer_clip.xml
-test-data/primers/dop_primers.fasta
-test-data/primers/MID4_GLZRM4E04_rnd30.fasta
-test-data/primers/MID4_GLZRM4E04_rnd30.fastqsanger
-test-data/primers/MID4_GLZRM4E04_rnd30_fclip.fasta
-test-data/primers/MID4_GLZRM4E04_rnd30_fclip.fastqsanger
-test-data/primers/MID4_GLZRM4E04_rnd30_fclip.sff
-test-data/primers/MID4_GLZRM4E04_rnd30_frclip.fasta
-test-data/primers/MID4_GLZRM4E04_rnd30_frclip.fastqsanger
-test-data/primers/MID4_GLZRM4E04_rnd30_frclip.sff
-test-data/primers/MID4_GLZRM4E04_rnd30.sff
-
-
-Licence (MIT/BSD style)
-=======================
-
-Permission to use, copy, modify, and distribute this software and its
-documentation with or without modifications and for any purpose and
-without fee is hereby granted, provided that any copyright notices
-appear in all copies and that both those copyright notices and this
-permission notice appear in supporting documentation, and that the
-names of the contributors or copyright holders not be used in
-advertising or publicity pertaining to distribution of the software
-without specific prior permission.
-
-THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
-WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
-CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
-OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-OR PERFORMANCE OF THIS SOFTWARE.
diff -r 8c02a91a8680 -r ee5acea162a7 tools/primers/seq_primer_clip.xml
--- a/tools/primers/seq_primer_clip.xml Tue Apr 30 11:04:43 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,137 +0,0 @@
-
- Trim off 5' or 3' primers
- seq_primer_clip.py --version
-
-seq_primer_clip.py $input_file $input_file.ext $primer_fasta $primer_type $mm $min_len $keep_negatives $output_file
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Bio
-
-
-
-**What it does**
-
-Looks for the given primer sequences (within the existing clipped sequence) and
-further clips the reads to remove the primers and any preceding/trailing sequence.
-
-Reads containing a forward primer are reduced to just the sequence after (and
-excluding) the forward primer.
-
-Reads containing a reverse primer are reduced to just the sequence before (and
-excluding) the reverse primer.
-
-Degenerate primers can be specified using the standard IUPAC ambiguity codes,
-thus a primer with an N would match A, C, T or G (or any of the IUPAC ambiguity
-codes) and so on.
-
-Note that for SFF files only the clip/trim positions are edited - you will still
-be able to extract the original full read (with any adapter sequence and poor
-quality sequence) if you need to.
-
-.. class:: warningmark
-
-**Note**. This tool was initially written for Roche 454 data, and should also
-work fine on Sanger or Ion Torrent as well. However, it is probably too slow
-for use on large Illumina datasets.
-
-
-**Citation**
-
-This tool uses Biopython. If you use this tool in scientific work leading to a
-publication, please cite:
-
-Cock et al 2009. Biopython: freely available Python tools for computational
-molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
-
-
-
diff -r 8c02a91a8680 -r ee5acea162a7 tools/primers/seq_primer_clip.xml.orig
--- a/tools/primers/seq_primer_clip.xml.orig Tue Apr 30 11:04:43 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,137 +0,0 @@
-
- Trim off 5' or 3' primers
- seq_primer_clip.py --version
-
-seq_primer_clip.py $input_file $input_file.ext $primer_fasta $primer_type $mm $min_len $keep_negatives $output_file
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Bio
-
-
-
-**What it does**
-
-Looks for the given primer sequences (within the existing clipped sequence) and
-further clips the reads to remove the primers and any preceding/trailing sequence.
-
-Reads containing a forward primer are reduced to just the sequence after (and
-excluding) the forward primer.
-
-Reads containing a reverse primer are reduced to just the sequence before (and
-excluding) the reverse primer.
-
-Degenerate primers can be specified using the standard IUPAC ambiguity codes,
-thus a primer with an N would match A, C, T or G (or any of the IUPAC ambiguity
-codes) and so on.
-
-Note that for SFF files only the clip/trim positions are edited - you will still
-be able to extract the original full read (with any adapter sequence and poor
-quality sequence) if you need to.
-
-.. class:: warningmark
-
-**Note**. This tool was initially written for Roche 454 data, and should also
-work fine on Sanger or Ion Torrent as well. However, it is probably too slow
-for use on large Illumina datasets.
-
-
-**Citation**
-
-This tool uses Biopython. If you use this tool in scientific work leading to a
-publication, please cite:
-
-Cock et al 2009. Biopython: freely available Python tools for computational
-molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
-
-
-
diff -r 8c02a91a8680 -r ee5acea162a7 tools/seq_primer_clip/README.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_primer_clip/README.rst Thu Oct 24 09:37:25 2013 -0400
@@ -0,0 +1,120 @@
+Galaxy tool to primer clip (trim) FASTA, FASTQ or SFF reads
+===========================================================
+
+This tool is copyright 2011-2013 by Peter Cock, The James Hutton Institute
+(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
+See the licence text below (MIT licence).
+
+This tool is a short Python script (using the Galaxy library functions and
+Biopython). It is available from the Galaxy Tool Shed here:
+http://toolshed.g2.bx.psu.edu/view/peterjc/seq_primer_clip
+
+
+Automated Installation
+======================
+
+This should be straightforward using the Galaxy Tool Shed, which should be
+able to automatically install the dependency on Biopython, and then install
+this tool and run its unit tests.
+
+
+Manual Installation
+===================
+
+There are just two files to install:
+
+* seq_primer_clip.py (the Python script)
+* seq_primer_clip.xml (the Galaxy tool definition)
+
+The suggested location is a new tools/seq_primer_clip folder. You will also
+need to modify the tools_conf.xml file to tell Galaxy to offer the tool::
+
+  <tool file="seq_primer_clip/seq_primer_clip.xml" />
+
+If you wish to run the unit tests, also add this to tools_conf.xml.sample
+and move/copy the test-data files under Galaxy's test-data folder. Then::
+
+ $ ./run_functional_tests.sh -id seq_primer_clip
+
+You will also need to install Biopython 1.54 or later. That's it.
+
+
+History
+=======
+
+======= ======================================================================
+Version Changes
+------- ----------------------------------------------------------------------
+v0.0.1  - Initial version (not publicly released)
+v0.0.2  - Sort primers by length (longest and therefore most specific first)
+v0.0.3  - Consider missing bases at start/end of read as mismatches
+v0.0.4  - Apply minimum length to sequences with no match too
+v0.0.5  - Count clipped & non-matched short reads separately, length bug fixes
+v0.0.6  - Added some functional tests
+v0.0.7  - Added error check for bad filename arguments
+v0.0.8  - Record version of Python script when run from Galaxy.
+        - Check for errors using Python script's return code.
+v0.0.9  - Moved test data to workaround Galaxy Tool Shed limitation.
+v0.0.10 - Include links to Tool Shed in help text and this README file.
+        - Use reStructuredText for this README file.
+        - Adopted standard MIT licence.
+        - Automated installation of Biopython dependency.
+        - Development moved to GitHub, https://github.com/peterjc/pico_galaxy
+        - Renamed folder and adopted README.rst naming.
+======= ======================================================================
+
+
+Developers
+==========
+
+This script and related tools were initially developed on the following hg branches:
+http://bitbucket.org/peterjc/galaxy-central/src/fasta_filter
+http://bitbucket.org/peterjc/galaxy-central/src/tools
+
+Development has now moved to a dedicated GitHub repository:
+https://github.com/peterjc/pico_galaxy
+
+For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
+the following command from the Galaxy root folder::
+
+ $ tar -czf seq_primer_clip.tar.gz tools/seq_primer_clip/README.rst tools/seq_primer_clip/seq_primer_clip.* tools/seq_primer_clip/repository_dependencies.xml test-data/dop_primers.fasta test-data/MID4_GLZRM4E04_rnd30*
+
+Check this worked::
+
+ $ tar -tzf seq_primer_clip.tar.gz
+ tools/seq_primer_clip/README.rst
+ tools/seq_primer_clip/seq_primer_clip.xml
+ tools/seq_primer_clip/seq_primer_clip.py
+ tools/seq_primer_clip/repository_dependencies.xml
+ test-data/dop_primers.fasta
+ test-data/MID4_GLZRM4E04_rnd30.fasta
+ test-data/MID4_GLZRM4E04_rnd30.fastqsanger
+ test-data/MID4_GLZRM4E04_rnd30_fclip.fasta
+ test-data/MID4_GLZRM4E04_rnd30_fclip.fastqsanger
+ test-data/MID4_GLZRM4E04_rnd30_fclip.sff
+ test-data/MID4_GLZRM4E04_rnd30_frclip.fasta
+ test-data/MID4_GLZRM4E04_rnd30_frclip.fastqsanger
+ test-data/MID4_GLZRM4E04_rnd30_frclip.sff
+ test-data/MID4_GLZRM4E04_rnd30.sff
+
+
+Licence (MIT)
+=============
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff -r 8c02a91a8680 -r ee5acea162a7 tools/seq_primer_clip/repository_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_primer_clip/repository_dependencies.xml Thu Oct 24 09:37:25 2013 -0400
@@ -0,0 +1,6 @@
+
+
+
+
+
diff -r 8c02a91a8680 -r ee5acea162a7 tools/seq_primer_clip/seq_primer_clip.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_primer_clip/seq_primer_clip.py Thu Oct 24 09:37:25 2013 -0400
@@ -0,0 +1,358 @@
+#!/usr/bin/env python
+"""Looks for the given primer sequences and clips matching SFF reads.
+
+Takes eight command line options, input read filename, input read format,
+input primer FASTA filename, type of primers (forward, reverse or reverse-
+complement), number of mismatches (currently only 0, 1 and 2 are supported),
+minimum length to keep a read (after primer trimming), should primer-less
+reads be kept (boolean), and finally the output sequence filename.
+
+Both the primer and read sequences can contain IUPAC ambiguity codes like N.
+
+This supports FASTA, FASTQ and SFF sequence files. Colorspace reads are not
+supported.
+
+The mismatch parameter does not consider gapped alignments, however the
+special case of missing bases at the very start or end of the read is handled.
+e.g. a primer sequence CCGACTCGAG will match a read starting CGACTCGAG...
+if one or more mismatches are allowed.
+
+This can also be used for stripping off (and optionally filtering on) barcodes.
+
+Note that only the trim/clip values in the SFF file are changed, not the flow
+information of the full read sequence.
+
+This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
+(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
+See accompanying text file for licence details (MIT/BSD style).
+
+This is version 0.0.8 of the script. Currently it uses Python's regular
+expression engine for finding the primers, which for my needs is fast enough.
+"""
+import sys
+import re
+from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
+from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
+
+if "-v" in sys.argv or "--version" in sys.argv:
+ print "v0.0.5"
+ sys.exit(0)
+
+def stop_err(msg, err=1):
+ sys.stderr.write(msg)
+ sys.exit(err)
+
+try:
+ from Bio.Seq import reverse_complement
+ from Bio.SeqIO.SffIO import SffIterator, SffWriter
+except ImportError:
+ stop_err("Requires Biopython 1.54 or later")
+try:
+ from Bio.SeqIO.SffIO import ReadRocheXmlManifest
+except ImportError:
+ #Prior to Biopython 1.56 this was a private function
+ from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
+
+#Parse Command Line
+try:
+ in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:]
+except ValueError:
+ stop_err("Expected 8 arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
+
+if in_file == primer_fasta:
+ stop_err("Same file given as both primer sequences and sequences to clip!")
+if in_file == out_file:
+ stop_err("Same file given as both sequences to clip and output!")
+if primer_fasta == out_file:
+ stop_err("Same file given as both primer sequences and output!")
+
+try: #Validate the mismatch count given on the command line
+    mm = int(mm)
+except ValueError:
+    stop_err("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm)
+if mm < 0:
+    stop_err("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm)
+if mm not in [0,1,2]: #make_reg_ex_mm only implements up to two mismatches
+    raise NotImplementedError
+#Reads shorter than min_len (after any clipping) are discarded
+try:
+    min_len = int(min_len)
+except ValueError:
+    stop_err("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len)
+if min_len < 0:
+    stop_err("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len)
+
+
+if keep_negatives.lower() in ["true", "yes", "on"]:
+ keep_negatives = True
+elif keep_negatives.lower() in ["false", "no", "off"]:
+ keep_negatives = False
+else:
+ stop_err("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives)
+
+
+if primer_type.lower() == "forward": #keep only what follows the primer
+    forward = True
+    rc = False
+elif primer_type.lower() == "reverse": #keep only what precedes the primer
+    forward = False
+    rc = False
+elif primer_type.lower() == "reverse-complement": #as reverse, but reverse complement the primers first
+    forward = False
+    rc = True
+else:
+    stop_err("Expected forward, reverse or reverse-complement not %r" % primer_type)
+
+
+ambiguous_dna_values = {
+ "A": "A",
+ "C": "C",
+ "G": "G",
+ "T": "T",
+ "M": "ACM",
+ "R": "AGR",
+ "W": "ATW",
+ "S": "CGS",
+ "Y": "CTY",
+ "K": "GTK",
+ "V": "ACGMRSV",
+ "H": "ACTMWYH",
+ "D": "AGTRWKD",
+ "B": "CGTSYKB",
+ "X": ".", #faster than [GATCMRWSYKVVHDBXN] or even [GATC]
+ "N": ".",
+ }
+
+ambiguous_dna_re = {}
+for letter, values in ambiguous_dna_values.iteritems():
+ if len(values) == 1:
+ ambiguous_dna_re[letter] = values
+ else:
+ ambiguous_dna_re[letter] = "[%s]" % values
+
+
+def make_reg_ex(seq):
+ return "".join(ambiguous_dna_re[letter] for letter in seq)
+
+def make_reg_ex_mm(seq, mm):
+    """Yield regex strings matching seq with up to mm (0, 1 or 2) mismatches.
+
+    Bases missing from the very start/end of the read count as mismatches."""
+    if mm > 2:
+        raise NotImplementedError("At most 2 mismatches allowed!")
+    seq = seq.upper()
+    yield make_reg_ex(seq)
+    for i in range(1,mm+1):
+        #Missing first/last i bases at very start/end of sequence
+        for reg in make_reg_ex_mm(seq[i:], mm-i):
+            yield "^" + reg
+        for reg in make_reg_ex_mm(seq[:-i], mm-i):
+            #Anchor goes after the pattern; "$" in front could never match
+            yield reg + "$"
+    if mm >= 1:
+        for i,letter in enumerate(seq):
+            #Duplicate patterns are fine, the caller collects them in a set
+            pattern = seq[:i] + "N" + seq[i+1:]
+            assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \
+                   % (pattern, len(pattern), seq, len(seq))
+            yield make_reg_ex(pattern)
+    if mm >=2:
+        for i,letter in enumerate(seq):
+            #Mask position i and each later position (i+1+k) with N
+            for k,letter in enumerate(seq[i+1:]):
+                pattern = seq[:i] + "N" + seq[i+1:i+1+k] + "N" + seq[i+k+2:]
+                assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \
+                       % (pattern, len(pattern), seq, len(seq))
+                yield make_reg_ex(pattern)
+
+def load_primers_as_re(primer_fasta, mm, rc=False):
+    """Return (primer count, one compiled regex matching any primer).
+
+    Each FASTA entry (reverse complemented if rc) is expanded for mm mismatches."""
+    primers = set()
+    count = 0
+    in_handle = open(primer_fasta, "rU")
+    try:
+        for record in fastaReader(in_handle):
+            if rc:
+                seq = reverse_complement(record.sequence)
+            else:
+                seq = record.sequence
+            count += 1
+            primers.update(make_reg_ex_mm(seq, mm))
+    finally:
+        in_handle.close() #release the handle even if parsing fails
+    #Longest first so more specific primers are tried before less specific
+    primers = sorted(primers, key=lambda p: -len(p))
+    return count, re.compile("|".join(primers)) #make one monster re!
+
+
+
+#Read primer file and record all specified sequences
+count, primer = load_primers_as_re(primer_fasta, mm, rc)
+print "%i primer sequences" % count
+
+short_neg = 0
+short_clipped = 0
+clipped = 0
+negs = 0
+
+if seq_format.lower()=="sff":
+ #SFF is different because we just change the trim points
+ if forward:
+ def process(records):
+ global short_clipped, short_neg, clipped, negs
+ for record in records:
+ left_clip = record.annotations["clip_qual_left"]
+ right_clip = record.annotations["clip_qual_right"]
+ seq = str(record.seq)[left_clip:right_clip].upper()
+ result = primer.search(seq)
+ if result:
+ #Forward primer, take everything after it
+ #so move the left clip along
+ if len(seq) - result.end() >= min_len:
+ record.annotations["clip_qual_left"] = left_clip + result.end()
+ clipped += 1
+ yield record
+ else:
+ short_clipped += 1
+ elif keep_negatives:
+ if len(seq) >= min_len:
+ negs += 1
+ yield record
+ else:
+ short_neg += 1
+ else:
+ def process(records):
+ global short_clipped, short_neg, clipped, negs
+ for record in records:
+ left_clip = record.annotations["clip_qual_left"]
+ right_clip = record.annotations["clip_qual_right"]
+ seq = str(record.seq)[left_clip:right_clip].upper()
+ result = primer.search(seq)
+ if result:
+ #Reverse primer, take everything before it
+ #so move the right clip back
+ new_len = result.start()
+ if new_len >= min_len:
+ record.annotations["clip_qual_right"] = left_clip + new_len
+ clipped += 1
+ yield record
+ else:
+ short_clipped += 1
+ elif keep_negatives:
+ if len(seq) >= min_len:
+ negs += 1
+ yield record
+ else:
+ short_neg += 1
+
+ in_handle = open(in_file, "rb")
+ try:
+ manifest = ReadRocheXmlManifest(in_handle)
+ except ValueError:
+ manifest = None
+ in_handle.seek(0)
+ out_handle = open(out_file, "wb")
+ writer = SffWriter(out_handle, xml=manifest)
+ writer.write_file(process(SffIterator(in_handle)))
+ #End of SFF code
+elif seq_format.lower().startswith("fastq"): #clip sequence and quality together
+    in_handle = open(in_file, "rU")
+    out_handle = open(out_file, "w")
+    reader = fastqReader(in_handle)
+    writer = fastqWriter(out_handle)
+    if forward:
+        for record in reader:
+            seq = record.sequence.upper()
+            result = primer.search(seq)
+            if result:
+                #Forward primer, take everything after it
+                cut = result.end()
+                record.sequence = seq[cut:]
+                if len(record.sequence) >= min_len:
+                    record.quality = record.quality[cut:]
+                    clipped += 1
+                    writer.write(record)
+                else:
+                    short_clipped += 1
+            elif keep_negatives:
+                if len(record) >= min_len:
+                    negs += 1
+                    writer.write(record)
+                else:
+                    short_neg += 1 #counter is initialised above as short_neg
+    else:
+        for record in reader:
+            seq = record.sequence.upper()
+            result = primer.search(seq)
+            if result:
+                #Reverse primer, take everything before it
+                cut = result.start()
+                record.sequence = seq[:cut]
+                if len(record.sequence) >= min_len:
+                    record.quality = record.quality[:cut]
+                    clipped += 1
+                    writer.write(record)
+                else:
+                    short_clipped += 1
+            elif keep_negatives:
+                if len(record) >= min_len:
+                    negs += 1
+                    writer.write(record)
+                else:
+                    short_neg += 1 #counter is initialised above as short_neg
+elif seq_format.lower()=="fasta":
+    in_handle = open(in_file, "rU")
+    out_handle = open(out_file, "w")
+    reader = fastaReader(in_handle)
+    writer = fastaWriter(out_handle)
+    #Following code is identical to that for FASTQ but without editing qualities
+    if forward:
+        for record in reader:
+            seq = record.sequence.upper()
+            result = primer.search(seq)
+            if result:
+                #Forward primer, take everything after it
+                cut = result.end()
+                record.sequence = seq[cut:]
+                if len(record.sequence) >= min_len:
+                    clipped += 1
+                    writer.write(record)
+                else:
+                    short_clipped += 1
+            elif keep_negatives:
+                if len(record) >= min_len:
+                    negs += 1
+                    writer.write(record)
+                else:
+                    short_neg += 1 #counter is initialised above as short_neg
+    else:
+        for record in reader:
+            seq = record.sequence.upper()
+            result = primer.search(seq)
+            if result:
+                #Reverse primer, take everything before it
+                cut = result.start()
+                record.sequence = seq[:cut]
+                if len(record.sequence) >= min_len:
+                    clipped += 1
+                    writer.write(record)
+                else:
+                    short_clipped += 1
+            elif keep_negatives:
+                if len(record) >= min_len:
+                    negs += 1
+                    writer.write(record)
+                else:
+                    short_neg += 1 #counter is initialised above as short_neg
+else:
+ stop_err("Unsupported file type %r" % seq_format)
+in_handle.close()
+out_handle.close()
+
+print "Kept %i clipped reads," % clipped
+print "discarded %i short." % short_clipped
+if keep_negatives:
+ print "Kept %i non-matching reads," % negs
+ print "discarded %i short." % short_neg
diff -r 8c02a91a8680 -r ee5acea162a7 tools/seq_primer_clip/seq_primer_clip.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/seq_primer_clip/seq_primer_clip.xml Thu Oct 24 09:37:25 2013 -0400
@@ -0,0 +1,143 @@
+
+ Trim off 5' or 3' primers
+
+ biopython
+ Bio
+
+ seq_primer_clip.py --version
+
+seq_primer_clip.py $input_file $input_file.ext $primer_fasta $primer_type $mm $min_len $keep_negatives $output_file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Bio
+
+
+
+**What it does**
+
+Looks for the given primer sequences (within the existing clipped sequence) and
+further clips the reads to remove the primers and any preceding/trailing sequence.
+
+Reads containing a forward primer are reduced to just the sequence after (and
+excluding) the forward primer.
+
+Reads containing a reverse primer are reduced to just the sequence before (and
+excluding) the reverse primer.
+
+Degenerate primers can be specified using the standard IUPAC ambiguity codes,
+thus a primer with an N would match A, C, T or G (or any of the IUPAC ambiguity
+codes) and so on.
+
+Note that for SFF files only the clip/trim positions are edited - you will still
+be able to extract the original full read (with any adapter sequence and poor
+quality sequence) if you need to.
+
+.. class:: warningmark
+
+**Note**. This tool was initially written for Roche 454 data, and should also
+work fine on Sanger or Ion Torrent as well. However, it is probably too slow
+for use on large Illumina datasets.
+
+
+**Citation**
+
+This tool uses Biopython. If you use this tool in scientific work leading to a
+publication, please cite:
+
+Cock et al 2009. Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+
+This tool is available to install into other Galaxy Instances via the Galaxy
+Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_primer_clip
+
+