Mercurial > repos > peterjc > get_orfs_or_cdss
diff tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 7:705a2e2df7fb draft
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
author | peterjc |
---|---|
date | Thu, 30 Jul 2015 12:35:31 -0400 |
parents | 5208c15805ec |
children | 09a8be9247ca |
line wrap: on
line diff
--- a/tools/get_orfs_or_cdss/get_orfs_or_cdss.py Thu Nov 21 10:47:53 2013 -0500 +++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.py Thu Jul 30 12:35:31 2015 -0400 @@ -1,12 +1,8 @@ #!/usr/bin/env python """Find ORFs in a nucleotide sequence file. -get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file - -Takes ten command line options, input sequence filename, format, genetic -code, CDS vs ORF, end type (open, closed), selection mode (all, top, one), -minimum length (in amino acids), strand (both, forward, reverse), output -nucleotide filename, and output protein filename. +For more details, see the help text and argument descriptions in the +accompanying get_orfs_or_cdss.xml file which defines a Galaxy interface. This tool is a short Python script which requires Biopython. If you use this tool in scientific work leading to a publication, please cite the @@ -19,67 +15,90 @@ This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute (formerly SCRI), Dundee, UK. All rights reserved. -See accompanying text file for licence details (MIT/BSD style). +See accompanying text file for licence details (MIT licence). -This is version 0.0.3 of the script. +This is version 0.1.0 of the script. """ import sys import re +from optparse import OptionParser -if "-v" in sys.argv or "--version" in sys.argv: - print "v0.0.3" - sys.exit(0) - -def stop_err(msg, err=1): +def sys_exit(msg, err=1): sys.stderr.write(msg.rstrip() + "\n") sys.exit(err) +usage = """Use as follows: + +$ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed +""" + try: from Bio.Seq import Seq, reverse_complement, translate from Bio.SeqRecord import SeqRecord from Bio import SeqIO from Bio.Data import CodonTable except ImportError: - stop_err("Missing Biopython library") + sys_exit("Missing Biopython library") + -#Parse Command Line -try: - input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:] -except ValueError: - stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) +parser = OptionParser(usage=usage) +parser.add_option('-i', '--input', dest='input_file', + default=None, help='Input fasta file', + metavar='FILE') +parser.add_option('-f', '--format', dest='seq_format', + default='fasta', help='Sequence format (e.g. fasta, fastq, sff)') +parser.add_option('--table', dest='table', + default=1, help='NCBI Translation table', type='int') +parser.add_option('-t', '--ftype', dest='ftype', type='choice', + choices=['CDS', 'ORF'], default='ORF', + help='Find ORF or CDSs') +parser.add_option('-e', '--ends', dest='ends', type='choice', + choices=['open', 'closed'], default='closed', + help='Open or closed. Closed ensures start/stop codons are present') +parser.add_option('-m', '--mode', dest='mode', type='choice', + choices=['all', 'top', 'one'], default='all', + help='Output all ORFs/CDSs from sequence, all ORFs/CDSs ' + 'with max length, or first with maximum length') +parser.add_option('--min_len', dest='min_len', + default=10, help='Minimum ORF/CDS length', type='int') +parser.add_option('-s', '--strand', dest='strand', type='choice', + choices=['forward', 'reverse', 'both'], default='both', + help='Strand to search for features on') +parser.add_option('--on', dest='out_nuc_file', + default=None, help='Output nucleotide sequences, or - for STDOUT', + metavar='FILE') +parser.add_option('--op', dest='out_prot_file', + default=None, help='Output protein sequences, or - for STDOUT', + metavar='FILE') +parser.add_option('--ob', dest='out_bed_file', + default=None, help='Output BED file, or - for STDOUT', + metavar='FILE') +parser.add_option('-v', '--version', dest='version', + default=False, action='store_true', + help='Show version and quit') -try: - table = int(table) -except ValueError: - stop_err("Expected integer for genetic code table, got %s" % table) +options, args = parser.parse_args() + +if options.version: + print "v0.1.0" + sys.exit(0) try: - table_obj = CodonTable.ambiguous_generic_by_id[table] + table_obj = CodonTable.ambiguous_generic_by_id[options.table] except KeyError: - stop_err("Unknown codon table %i" % table) - -if ftype not in ["CDS", "ORF"]: - stop_err("Expected CDS or ORF, got %s" % ftype) - -if ends not in ["open", "closed"]: - stop_err("Expected open or closed for end treatment, got %s" % ends) + sys_exit("Unknown codon table %i" % options.table) -try: - min_len = int(min_len) -except ValueError: - stop_err("Expected integer for min_len, got %s" % min_len) - -if seq_format.lower()=="sff": +if options.seq_format.lower()=="sff": seq_format = "sff-trim" -elif seq_format.lower()=="fasta": +elif options.seq_format.lower()=="fasta": seq_format = "fasta" -elif seq_format.lower().startswith("fastq"): +elif options.seq_format.lower().startswith("fastq"): seq_format = "fastq" else: - stop_err("Unsupported file type %r" % seq_format) + sys_exit("Unsupported file type %r" % options.seq_format) -print "Genetic code table %i" % table -print "Minimum length %i aa" % min_len +print "Genetic code table %i" % options.table +print "Minimum length %i aa" % options.min_len #print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) starts = sorted(table_obj.start_codons) @@ -102,10 +121,10 @@ n = s[start:] assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) if strict: - t = translate(n, table, cds=True) + t = translate(n, options.table, cds=True) else: #Use when missing stop codon, - t = "M" + translate(n[3:], table, to_stop=True) + t = "M" + translate(n[3:], options.table, to_stop=True) return start, n, t return None, None, None @@ -117,15 +136,15 @@ if index % 3 != 0: continue n = s[start:index] - if ftype=="CDS": + if options.ftype=="CDS": offset, n, t = start_chop_and_trans(n) else: offset = 0 - t = translate(n, table, to_stop=True) - if n and len(t) >= min_len: + t = translate(n, options.table, to_stop=True) + if n and len(t) >= options.min_len: yield start + offset, n, t start = index - if ends == "open": + if options.ends == "open": #No stop codon, Biopython's strict CDS translate will fail n = s[start:] #Ensure we have whole codons @@ -135,14 +154,14 @@ n = n[:-1] if len(n) % 3: n = n[:-1] - if ftype=="CDS": + if options.ftype=="CDS": offset, n, t = start_chop_and_trans(n, strict=False) else: offset = 0 - t = translate(n, table, to_stop=True) - if n and len(t) >= min_len: + t = translate(n, options.table, to_stop=True) + if n and len(t) >= options.min_len: yield start + offset, n, t - + def get_all_peptides(nuc_seq): """Returns start, end, strand, nucleotides, protein. @@ -153,12 +172,12 @@ #rather than making a list and sorting? answer = [] full_len = len(nuc_seq) - if strand != "reverse": + if options.strand != "reverse": for frame in range(0,3): for offset, n, t in break_up_frame(nuc_seq[frame:]): start = frame + offset #zero based answer.append((start, start + len(n), +1, n, t)) - if strand != "forward": + if options.strand != "forward": rc = reverse_complement(nuc_seq) for frame in range(0,3) : for offset, n, t in break_up_frame(rc[frame:]): @@ -184,24 +203,31 @@ raise StopIteration yield values[0] -if mode == "all": +if options.mode == "all": get_peptides = get_all_peptides -elif mode == "top": +elif options.mode == "top": get_peptides = get_top_peptides -elif mode == "one": +elif options.mode == "one": get_peptides = get_one_peptide in_count = 0 out_count = 0 -if out_nuc_file == "-": +if options.out_nuc_file == "-": out_nuc = sys.stdout else: - out_nuc = open(out_nuc_file, "w") -if out_prot_file == "-": + out_nuc = open(options.out_nuc_file, "w") + +if options.out_prot_file == "-": out_prot = sys.stdout else: - out_prot = open(out_prot_file, "w") -for record in SeqIO.parse(input_file, seq_format): + out_prot = open(options.out_prot_file, "w") + +if options.out_bed_file == "-": + out_bed = sys.stdout +else: + out_bed = open(options.out_bed_file, "w") + +for record in SeqIO.parse(options.input_file, seq_format): for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): out_count += 1 if f_strand == +1: @@ -210,14 +236,18 @@ loc = "complement(%i..%i)" % (f_start+1, f_end) descr = "length %i aa, %i bp, from %s of %s" \ % (len(t), len(n), loc, record.description) - r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) - t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) + fid = record.id + "|%s%i" % (options.ftype, i+1) + r = SeqRecord(Seq(n), id = fid, name = "", description= descr) + t = SeqRecord(Seq(t), id = fid, name = "", description= descr) SeqIO.write(r, out_nuc, "fasta") SeqIO.write(t, out_prot, "fasta") + out_bed.write('\t'.join(map(str,[record.id, f_start, f_end, fid, 0, '+' if f_strand == +1 else '-'])) + '\n') in_count += 1 if out_nuc is not sys.stdout: out_nuc.close() if out_prot is not sys.stdout: out_prot.close() +if out_bed is not sys.stdout: + out_bed.close() -print "Found %i %ss in %i sequences" % (out_count, ftype, in_count) +print "Found %i %ss in %i sequences" % (out_count, options.ftype, in_count)