# HG changeset patch # User peterjc # Date 1326986230 18000 # Node ID 9cff9a1176ea1c1c7a3dae35a3f96fb25796841a Uploaded v0.0.1 diff -r 000000000000 -r 9cff9a1176ea tools/filters/get_orfs_or_cdss.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/filters/get_orfs_or_cdss.py Thu Jan 19 10:17:10 2012 -0500 @@ -0,0 +1,219 @@ +#!/usr/bin/env python +"""Find ORFs in a nucleotide sequence file. + +get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file + +Takes ten command line options, input sequence filename, format, genetic +code, CDS vs ORF, end type (open, closed), selection mode (all, top, one), +minimum length (in amino acids), strand (both, forward, reverse), output +nucleotide filename, and output protein filename. + +This tool is a short Python script which requires Biopython. If you use +this tool in scientific work leading to a publication, please cite the +Biopython application note: + +Cock et al 2009. Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + +This script is copyright 2011 by Peter Cock, The James Hutton Institute +(formerly SCRI), Dundee, UK. All rights reserved. + +See accompanying text file for licence details (MIT/BSD style). + +This is version 0.0.1 of the script. +""" +import sys +import re + +def stop_err(msg, err=1): + sys.stderr.write(msg.rstrip() + "\n") + sys.exit(err) + +try: + from Bio.Seq import Seq, reverse_complement, translate + from Bio.SeqRecord import SeqRecord + from Bio import SeqIO + from Bio.Data import CodonTable +except ImportError: + stop_err("Missing Biopython library") + +#Parse Command Line +try: + input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:] +except ValueError: + stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) + +try: + table = int(table) +except ValueError: + stop_err("Expected integer for genetic code table, got %s" % table) + +try: + table_obj = CodonTable.ambiguous_generic_by_id[table] +except KeyError: + stop_err("Unknown codon table %i" % table) + +if ftype not in ["CDS", "ORF"]: + stop_err("Expected CDS or ORF, got %s" % ftype) + +if ends not in ["open", "closed"]: + stop_err("Expected open or closed for end treatment, got %s" % ends) + +try: + min_len = int(min_len) +except ValueError: + stop_err("Expected integer for min_len, got %s" % min_len) + +if seq_format.lower()=="sff": + seq_format = "sff-trim" +elif seq_format.lower()=="fasta": + seq_format = "fasta" +elif seq_format.lower().startswith("fastq"): + seq_format = "fastq" +else: + stop_err("Unsupported file type %r" % seq_format) + +print "Genetic code table %i" % table +print "Minimum length %i aa" % min_len +#print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) + +starts = sorted(table_obj.start_codons) +assert "NNN" not in starts +re_starts = re.compile("|".join(starts)) + +stops = sorted(table_obj.stop_codons) +assert "NNN" not in stops +re_stops = re.compile("|".join(stops)) + +def start_chop_and_trans(s, strict=True): + """Returns offset, trimmed nuc, protein.""" + if strict: + assert s[-3:] in stops, s + assert len(s) % 3 == 0 + for match in re_starts.finditer(s): + #Must check the start is in frame + start = match.start() + if start % 3 == 0: + n = s[start:] + assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) + if strict: + t = translate(n, table, cds=True) + else: + #Use when missing stop codon, + t = "M" + translate(n[3:], table, to_stop=True) + return start, n, t + return None, None, None + +def break_up_frame(s): + """Returns offset, nuc, protein.""" + start = 0 + for match in re_stops.finditer(s): + index = match.start() + 3 + if index % 3 != 0: + continue + n = s[start:index] + if ftype=="CDS": + offset, n, t = start_chop_and_trans(n) + else: + offset = 0 + t = translate(n, table, to_stop=True) + if n and len(t) >= min_len: + yield start + offset, n, t + start = index + if ends == "open": + #No stop codon, Biopython's strict CDS translate will fail + n = s[start:] + #Ensure we have whole codons + #TODO - Try appending N instead? + #TODO - Do the next four lines more elegantly + if len(n) % 3: + n = n[:-1] + if len(n) % 3: + n = n[:-1] + if ftype=="CDS": + offset, n, t = start_chop_and_trans(n, strict=False) + else: + offset = 0 + t = translate(n, table, to_stop=True) + if n and len(t) >= min_len: + yield start + offset, n, t + + +def get_all_peptides(nuc_seq): + """Returns start, end, strand, nucleotides, protein. + + Co-ordinates are Python style zero-based. + """ + #TODO - Refactor to use a generator function (in start order) + #rather than making a list and sorting? + answer = [] + full_len = len(nuc_seq) + if strand != "reverse": + for frame in range(0,3): + for offset, n, t in break_up_frame(nuc_seq[frame:]): + start = frame + offset #zero based + answer.append((start, start + len(n), +1, n, t)) + if strand != "forward": + rc = reverse_complement(nuc_seq) + for frame in range(0,3) : + for offset, n, t in break_up_frame(rc[frame:]): + start = full_len - frame - offset #zero based + answer.append((start, start + len(n), -1, n ,t)) + answer.sort() + return answer + +def get_top_peptides(nuc_seq): + """Returns all peptides of max length.""" + values = list(get_all_peptides(nuc_seq)) + if not values: + raise StopIteration + max_len = max(len(x[-1]) for x in values) + for x in values: + if len(x[-1]) == max_len: + yield x + +def get_one_peptide(nuc_seq): + """Returns first (left most) peptide with max length.""" + values = list(get_top_peptides(nuc_seq)) + if not values: + raise StopIteration + yield values[0] + +if mode == "all": + get_peptides = get_all_peptides +elif mode == "top": + get_peptides = get_top_peptides +elif mode == "one": + get_peptides = get_one_peptide + +in_count = 0 +out_count = 0 +if out_nuc_file == "-": + out_nuc = sys.stdout +else: + out_nuc = open(out_nuc_file, "w") +if out_prot_file == "-": + out_prot = sys.stdout +else: + out_prot = open(out_prot_file, "w") +for record in SeqIO.parse(input_file, seq_format): + for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): + out_count += 1 + if f_strand == +1: + loc = "%i..%i" % (f_start+1, f_end) + else: + loc = "complement(%i..%i)" % (f_start+1, f_end) + descr = "length %i aa, %i bp, from %s of %s" \ + % (len(t), len(n), loc, record.description) + r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) + t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) + SeqIO.write(r, out_nuc, "fasta") + SeqIO.write(t, out_prot, "fasta") + in_count += 1 +if out_nuc is not sys.stdout: + out_nuc.close() +if out_prot is not sys.stdout: + out_prot.close() + +print "Found %i %ss in %i sequences" % (out_count, ftype, in_count) diff -r 000000000000 -r 9cff9a1176ea tools/filters/get_orfs_or_cdss.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/filters/get_orfs_or_cdss.txt Thu Jan 19 10:17:10 2012 -0500 @@ -0,0 +1,75 @@ +Galaxy tool to find ORFs or simple CDSs +======================================= + +This tool is copyright 2011 by Peter Cock, The James Hutton Institute +(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +See the licence text below. + +This tool is a short Python script (using Biopython library functions) +to search nucleotide sequences for open reading frames (ORFs) or coding +sequences (CDSs) where the first potential start codon is used. See the +help text in the XML file for more information. + +There are just two files to install: + +* get_orfs_or_cdss.py (the Python script) +* get_orfs_or_cdss.xml (the Galaxy tool definition) + +The suggested location is in the Galaxy folder tools/filters next to the tool +for calling sff_extract.py for converting SFF to FASTQ or FASTA + QUAL. + +You will also need to modify the tools_conf.xml file to tell Galaxy to offer the +tool. One suggested location is in the filters section. Simply add the line: + + + +You will also need to install Biopython 1.54 or later. If you want to run +the unit tests, include this line in tools_conf.xml.sample and the sample +FASTA files under the test-data directory. That's it. + + +History +======= + +v0.0.1 - Initial version. + + +Developers +========== + +This script and related tools are being developed on the following hg branch: +http://bitbucket.org/peterjc/galaxy-central/src/tools + +For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use +the following command from the Galaxy root folder: + +tar -czf get_orfs_or_cdss.tar.gz tools/filters/get_orfs_or_cdss.* + +Check this worked: + +$ tar -tzf get_orfs_or_cdss.tar.gz +filter/get_orfs_or_cdss.py +filter/get_orfs_or_cdss.txt +filter/get_orfs_or_cdss.xml + + +Licence (MIT/BSD style) +======================= + +Permission to use, copy, modify, and distribute this software and its +documentation with or without modifications and for any purpose and +without fee is hereby granted, provided that any copyright notices +appear in all copies and that both those copyright notices and this +permission notice appear in supporting documentation, and that the +names of the contributors or copyright holders not be used in +advertising or publicity pertaining to distribution of the software +without specific prior permission. + +THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT +OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +OR PERFORMANCE OF THIS SOFTWARE. diff -r 000000000000 -r 9cff9a1176ea tools/filters/get_orfs_or_cdss.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/filters/get_orfs_or_cdss.xml Thu Jan 19 10:17:10 2012 -0500 @@ -0,0 +1,147 @@ + + e.g. to get peptides from ESTs + +get_orfs_or_cdss.py $input_file $input_file.ext $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Bio + + + +**What it does** + +Takes an input file of nucleotide sequences (typically FASTA, but also FASTQ +and Standard Flowgram Format (SFF) are supported), and searches each sequence +for open reading frames (ORFs) or potential coding sequences (CDSs) of the +given minimum length. These are returned as FASTA files of nucleotides and +protein sequences. + +You can choose to have all the ORFs/CDSs above the minimum length for each +sequence (similar to the EMBOSS getorf tool), those with the longest length +equal, or the first ORF/CDS with the longest length (in the special case +where a sequence encodes two or more long ORFs/CDSs of the same length). The +last option is a reasonable choice when the input sequences represent EST or +mRNA sequences, where only one ORF/CDS is expected. + +Note that if no ORFs/CDSs in a sequence match the criteria, there will be no +output for that sequence. + +Also note that the ORFs/CDSs are assigned modified identifiers to distinguish +them from the original full length sequences, by appending a suffix. + +The start and stop codons are taken from the `NCBI Genetic Codes +<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>`_. +When searching for ORFs, the sequences will run from stop codon to stop +codon, and any start codons are ignored. When searching for CDSs, the first +potential start codon will be used, giving the longest possible CDS within +each ORF, and thus the longest possible protein sequence. This is useful +for things like BLAST or domain searching, but since this may not be the +correct start codon may not be appropriate for signal peptide detection +etc. + +**Example Usage** + +Given some EST sequences (Sanger capillary reads) assembled into unigenes, +or a transcriptome assembly from some RNA-Seq, each of your nucleotide +sequences should (barring sequencing, assembly errors, frame-shifts etc) +encode one protein as a single ORF/CDS, which you wish to extract (and +perhaps translate into amino acids). + +If your RNS-Seq data was strand specific, and assembled taking this into +account, you should only search for ORFs/CDSs on the forward strand. + +**Citation** + +This tool uses Biopython. If you use this tool in scientific work leading +to a publication, please cite the Biopython application note (and Galaxy +too of course): + +Cock et al 2009. Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + + +