get_orfs_or_cdss: tools/get_orfs_or_cdss/get_orfs_or

comparison tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 8:09a8be9247ca draft

v0.2.0 with GFF3 output

author	peterjc
date	Sat, 09 Jan 2016 23:42:32 -0500
parents	705a2e2df7fb
children	a06ad07431ba

comparison

equal deleted inserted replaced

-:705a2e2df7fb
+:09a8be9247ca
 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
 (formerly SCRI), Dundee, UK. All rights reserved.
 See accompanying text file for licence details (MIT licence).
-This is version 0.1.0 of the script.
 """
 import sys
 import re
 from optparse import OptionParser
-def sys_exit(msg, err=1):
-sys.stderr.write(msg.rstrip() + "\n")
-sys.exit(err)
 usage = """Use as follows:
-$ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed
+$ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed --og cds.gff3
 """
 try:
 from Bio.Seq import Seq, reverse_complement, translate
 from Bio.SeqRecord import SeqRecord
 from Bio import SeqIO
 from Bio.Data import CodonTable
 except ImportError:
-sys_exit("Missing Biopython library")
+sys.exit("Missing Biopython library")
 parser = OptionParser(usage=usage)
 parser.add_option('-i', '--input', dest='input_file',
 default=None, help='Input fasta file',
 default=None, help='Output protein sequences, or - for STDOUT',
 metavar='FILE')
 parser.add_option('--ob', dest='out_bed_file',
 default=None, help='Output BED file, or - for STDOUT',
 metavar='FILE')
+parser.add_option('--og', dest='out_gff3_file',
+default=None, help='Output GFF3 file, or - for STDOUT',
+metavar='FILE')
 parser.add_option('-v', '--version', dest='version',
 default=False, action='store_true',
 help='Show version and quit')
 options, args = parser.parse_args()
 if options.version:
-print "v0.1.0"
+print("v0.2.0")
 sys.exit(0)
+if not options.input_file:
+sys.exit("Input file is required")
+if not any((options.out_nuc_file, options.out_prot_file, options.out_bed_file, options.out_gff3_file)):
+sys.exit("At least one output file is required")
 try:
 table_obj = CodonTable.ambiguous_generic_by_id[options.table]
 except KeyError:
-sys_exit("Unknown codon table %i" % options.table)
+sys.exit("Unknown codon table %i" % options.table)
-if options.seq_format.lower()=="sff":
+if options.seq_format.lower() == "sff":
 seq_format = "sff-trim"
-elif options.seq_format.lower()=="fasta":
+elif options.seq_format.lower() == "fasta":
 seq_format = "fasta"
 elif options.seq_format.lower().startswith("fastq"):
 seq_format = "fastq"
 else:
-sys_exit("Unsupported file type %r" % options.seq_format)
+sys.exit("Unsupported file type %r" % options.seq_format)
 print "Genetic code table %i" % options.table
 print "Minimum length %i aa" % options.min_len
-#print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)
+# print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)
 starts = sorted(table_obj.start_codons)
 assert "NNN" not in starts
 re_starts = re.compile("|".join(starts))
 stops = sorted(table_obj.stop_codons)
 assert "NNN" not in stops
 re_stops = re.compile("|".join(stops))
 def start_chop_and_trans(s, strict=True):
 """Returns offset, trimmed nuc, protein."""
 if strict:
 assert s[-3:] in stops, s
 assert len(s) % 3 == 0
 for match in re_starts.finditer(s):
-#Must check the start is in frame
+# Must check the start is in frame
 start = match.start()
 if start % 3 == 0:
 n = s[start:]
 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
 if strict:
 t = translate(n, options.table, cds=True)
 else:
-#Use when missing stop codon,
+# Use when missing stop codon,
 t = "M" + translate(n[3:], options.table, to_stop=True)
 return start, n, t
 return None, None, None
 def break_up_frame(s):
 """Returns offset, nuc, protein."""
 start = 0
 for match in re_stops.finditer(s):
 index = match.start() + 3
 if index % 3 != 0:
 continue
 n = s[start:index]
-if options.ftype=="CDS":
+if options.ftype == "CDS":
 offset, n, t = start_chop_and_trans(n)
 else:
 offset = 0
 t = translate(n, options.table, to_stop=True)
 if n and len(t) >= options.min_len:
 yield start + offset, n, t
 start = index
 if options.ends == "open":
-#No stop codon, Biopython's strict CDS translate will fail
+# No stop codon, Biopython's strict CDS translate will fail
 n = s[start:]
-#Ensure we have whole codons
+# Ensure we have whole codons
-#TODO - Try appending N instead?
+# TODO - Try appending N instead?
-#TODO - Do the next four lines more elegantly
+# TODO - Do the next four lines more elegantly
 if len(n) % 3:
 n = n[:-1]
 if len(n) % 3:
 n = n[:-1]
-if options.ftype=="CDS":
+if options.ftype == "CDS":
 offset, n, t = start_chop_and_trans(n, strict=False)
 else:
 offset = 0
 t = translate(n, options.table, to_stop=True)
 if n and len(t) >= options.min_len:
 def get_all_peptides(nuc_seq):
 """Returns start, end, strand, nucleotides, protein.
 Co-ordinates are Python style zero-based.
 """
-#TODO - Refactor to use a generator function (in start order)
+# TODO - Refactor to use a generator function (in start order)
-#rather than making a list and sorting?
+# rather than making a list and sorting?
 answer = []
 full_len = len(nuc_seq)
 if options.strand != "reverse":
-for frame in range(0,3):
+for frame in range(0, 3):
 for offset, n, t in break_up_frame(nuc_seq[frame:]):
-start = frame + offset #zero based
+start = frame + offset  # zero based
 answer.append((start, start + len(n), +1, n, t))
 if options.strand != "forward":
 rc = reverse_complement(nuc_seq)
-for frame in range(0,3) :
+for frame in range(0, 3):
 for offset, n, t in break_up_frame(rc[frame:]):
-start = full_len - frame - offset #zero based
+start = full_len - frame - offset  # zero based
-answer.append((start - len(n), start, -1, n ,t))
+answer.append((start - len(n), start, -1, n, t))
 answer.sort()
 return answer
 def get_top_peptides(nuc_seq):
 """Returns all peptides of max length."""
 values = list(get_all_peptides(nuc_seq))
 if not values:
 max_len = max(len(x[-1]) for x in values)
 for x in values:
 if len(x[-1]) == max_len:
 yield x
 def get_one_peptide(nuc_seq):
 """Returns first (left most) peptide with max length."""
 values = list(get_top_peptides(nuc_seq))
 if not values:
 raise StopIteration
 in_count = 0
 out_count = 0
 if options.out_nuc_file == "-":
 out_nuc = sys.stdout
-else:
+elif options.out_nuc_file:
 out_nuc = open(options.out_nuc_file, "w")
+else:
+out_nuc = None
 if options.out_prot_file == "-":
 out_prot = sys.stdout
-else:
+elif options.out_prot_file:
 out_prot = open(options.out_prot_file, "w")
+else:
+out_prot = None
 if options.out_bed_file == "-":
 out_bed = sys.stdout
-else:
+elif options.out_bed_file:
 out_bed = open(options.out_bed_file, "w")
+else:
+out_bed = None
+if options.out_gff3_file == "-":
+out_gff3 = sys.stdout
+elif options.out_gff3_file:
+out_gff3 = open(options.out_gff3_file, "w")
+else:
+out_gff3 = None
+if out_gff3:
+out_gff3.write('##gff-version 3\n')
 for record in SeqIO.parse(options.input_file, seq_format):
 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())):
 out_count += 1
 if f_strand == +1:
-loc = "%i..%i" % (f_start+1, f_end)
+loc = "%i..%i" % (f_start + 1, f_end)
 else:
-loc = "complement(%i..%i)" % (f_start+1, f_end)
+loc = "complement(%i..%i)" % (f_start + 1, f_end)
 descr = "length %i aa, %i bp, from %s of %s" \
 % (len(t), len(n), loc, record.description)
-fid = record.id + "|%s%i" % (options.ftype, i+1)
+fid = record.id + "|%s%i" % (options.ftype, i + 1)
-r = SeqRecord(Seq(n), id = fid, name = "", description= descr)
+r = SeqRecord(Seq(n), id=fid, name="", description=descr)
-t = SeqRecord(Seq(t), id = fid, name = "", description= descr)
+t = SeqRecord(Seq(t), id=fid, name="", description=descr)
-SeqIO.write(r, out_nuc, "fasta")
+if out_nuc:
-SeqIO.write(t, out_prot, "fasta")
+SeqIO.write(r, out_nuc, "fasta")
-out_bed.write('\t'.join(map(str,[record.id, f_start, f_end, fid, 0, '+' if f_strand == +1 else '-'])) + '\n')
+if out_prot:
+SeqIO.write(t, out_prot, "fasta")
+nice_strand = '+' if f_strand == +1 else '-'
+if out_bed:
+out_bed.write('\t'.join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + '\n')
+if out_gff3:
+out_gff3.write('\t'.join(map(str, [record.id, 'getOrfsOrCds', 'CDS', f_start + 1, f_end, '.',
+nice_strand, 0, 'ID=%s%s' % (options.ftype, i + 1)])) + '\n')
 in_count += 1
-if out_nuc is not sys.stdout:
+if out_nuc and out_nuc is not sys.stdout:
 out_nuc.close()
-if out_prot is not sys.stdout:
+if out_prot and out_prot is not sys.stdout:
 out_prot.close()
-if out_bed is not sys.stdout:
+if out_bed and out_bed is not sys.stdout:
 out_bed.close()
 print "Found %i %ss in %i sequences" % (out_count, options.ftype, in_count)

Mercurial > repos > peterjc > get_orfs_or_cdss

comparison tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 8:09a8be9247ca draft