comparison tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 7:705a2e2df7fb draft

v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
author peterjc
date Thu, 30 Jul 2015 12:35:31 -0400
parents 5208c15805ec
children 09a8be9247ca
comparison
equal deleted inserted replaced
6:64e67f172188 7:705a2e2df7fb
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """Find ORFs in a nucleotide sequence file. 2 """Find ORFs in a nucleotide sequence file.
3 3
4 get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file 4 For more details, see the help text and argument descriptions in the
5 5 accompanying get_orfs_or_cdss.xml file which defines a Galaxy interface.
6 Takes ten command line options, input sequence filename, format, genetic
7 code, CDS vs ORF, end type (open, closed), selection mode (all, top, one),
8 minimum length (in amino acids), strand (both, forward, reverse), output
9 nucleotide filename, and output protein filename.
10 6
11 This tool is a short Python script which requires Biopython. If you use 7 This tool is a short Python script which requires Biopython. If you use
12 this tool in scientific work leading to a publication, please cite the 8 this tool in scientific work leading to a publication, please cite the
13 Biopython application note: 9 Biopython application note:
14 10
17 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. 13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
18 14
19 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute 15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
20 (formerly SCRI), Dundee, UK. All rights reserved. 16 (formerly SCRI), Dundee, UK. All rights reserved.
21 17
22 See accompanying text file for licence details (MIT/BSD style). 18 See accompanying text file for licence details (MIT licence).
23 19
24 This is version 0.0.3 of the script. 20 This is version 0.1.0 of the script.
25 """ 21 """
26 import sys 22 import sys
27 import re 23 import re
28 24 from optparse import OptionParser
29 if "-v" in sys.argv or "--version" in sys.argv: 25
30 print "v0.0.3" 26 def sys_exit(msg, err=1):
31 sys.exit(0)
32
33 def stop_err(msg, err=1):
34 sys.stderr.write(msg.rstrip() + "\n") 27 sys.stderr.write(msg.rstrip() + "\n")
35 sys.exit(err) 28 sys.exit(err)
29
30 usage = """Use as follows:
31
32 $ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed
33 """
36 34
37 try: 35 try:
38 from Bio.Seq import Seq, reverse_complement, translate 36 from Bio.Seq import Seq, reverse_complement, translate
39 from Bio.SeqRecord import SeqRecord 37 from Bio.SeqRecord import SeqRecord
40 from Bio import SeqIO 38 from Bio import SeqIO
41 from Bio.Data import CodonTable 39 from Bio.Data import CodonTable
42 except ImportError: 40 except ImportError:
43 stop_err("Missing Biopython library") 41 sys_exit("Missing Biopython library")
44 42
45 #Parse Command Line 43
44 parser = OptionParser(usage=usage)
45 parser.add_option('-i', '--input', dest='input_file',
46 default=None, help='Input fasta file',
47 metavar='FILE')
48 parser.add_option('-f', '--format', dest='seq_format',
49 default='fasta', help='Sequence format (e.g. fasta, fastq, sff)')
50 parser.add_option('--table', dest='table',
51 default=1, help='NCBI Translation table', type='int')
52 parser.add_option('-t', '--ftype', dest='ftype', type='choice',
53 choices=['CDS', 'ORF'], default='ORF',
54 help='Find ORF or CDSs')
55 parser.add_option('-e', '--ends', dest='ends', type='choice',
56 choices=['open', 'closed'], default='closed',
57 help='Open or closed. Closed ensures start/stop codons are present')
58 parser.add_option('-m', '--mode', dest='mode', type='choice',
59 choices=['all', 'top', 'one'], default='all',
60 help='Output all ORFs/CDSs from sequence, all ORFs/CDSs '
61 'with max length, or first with maximum length')
62 parser.add_option('--min_len', dest='min_len',
63 default=10, help='Minimum ORF/CDS length', type='int')
64 parser.add_option('-s', '--strand', dest='strand', type='choice',
65 choices=['forward', 'reverse', 'both'], default='both',
66 help='Strand to search for features on')
67 parser.add_option('--on', dest='out_nuc_file',
68 default=None, help='Output nucleotide sequences, or - for STDOUT',
69 metavar='FILE')
70 parser.add_option('--op', dest='out_prot_file',
71 default=None, help='Output protein sequences, or - for STDOUT',
72 metavar='FILE')
73 parser.add_option('--ob', dest='out_bed_file',
74 default=None, help='Output BED file, or - for STDOUT',
75 metavar='FILE')
76 parser.add_option('-v', '--version', dest='version',
77 default=False, action='store_true',
78 help='Show version and quit')
79
80 options, args = parser.parse_args()
81
82 if options.version:
83 print "v0.1.0"
84 sys.exit(0)
85
46 try: 86 try:
47 input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:] 87 table_obj = CodonTable.ambiguous_generic_by_id[options.table]
48 except ValueError:
49 stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
50
51 try:
52 table = int(table)
53 except ValueError:
54 stop_err("Expected integer for genetic code table, got %s" % table)
55
56 try:
57 table_obj = CodonTable.ambiguous_generic_by_id[table]
58 except KeyError: 88 except KeyError:
59 stop_err("Unknown codon table %i" % table) 89 sys_exit("Unknown codon table %i" % options.table)
60 90
61 if ftype not in ["CDS", "ORF"]: 91 if options.seq_format.lower()=="sff":
62 stop_err("Expected CDS or ORF, got %s" % ftype)
63
64 if ends not in ["open", "closed"]:
65 stop_err("Expected open or closed for end treatment, got %s" % ends)
66
67 try:
68 min_len = int(min_len)
69 except ValueError:
70 stop_err("Expected integer for min_len, got %s" % min_len)
71
72 if seq_format.lower()=="sff":
73 seq_format = "sff-trim" 92 seq_format = "sff-trim"
74 elif seq_format.lower()=="fasta": 93 elif options.seq_format.lower()=="fasta":
75 seq_format = "fasta" 94 seq_format = "fasta"
76 elif seq_format.lower().startswith("fastq"): 95 elif options.seq_format.lower().startswith("fastq"):
77 seq_format = "fastq" 96 seq_format = "fastq"
78 else: 97 else:
79 stop_err("Unsupported file type %r" % seq_format) 98 sys_exit("Unsupported file type %r" % options.seq_format)
80 99
81 print "Genetic code table %i" % table 100 print "Genetic code table %i" % options.table
82 print "Minimum length %i aa" % min_len 101 print "Minimum length %i aa" % options.min_len
83 #print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) 102 #print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)
84 103
85 starts = sorted(table_obj.start_codons) 104 starts = sorted(table_obj.start_codons)
86 assert "NNN" not in starts 105 assert "NNN" not in starts
87 re_starts = re.compile("|".join(starts)) 106 re_starts = re.compile("|".join(starts))
100 start = match.start() 119 start = match.start()
101 if start % 3 == 0: 120 if start % 3 == 0:
102 n = s[start:] 121 n = s[start:]
103 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) 122 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
104 if strict: 123 if strict:
105 t = translate(n, table, cds=True) 124 t = translate(n, options.table, cds=True)
106 else: 125 else:
107 #Use when missing stop codon, 126 #Use when missing stop codon,
108 t = "M" + translate(n[3:], table, to_stop=True) 127 t = "M" + translate(n[3:], options.table, to_stop=True)
109 return start, n, t 128 return start, n, t
110 return None, None, None 129 return None, None, None
111 130
112 def break_up_frame(s): 131 def break_up_frame(s):
113 """Returns offset, nuc, protein.""" 132 """Returns offset, nuc, protein."""
115 for match in re_stops.finditer(s): 134 for match in re_stops.finditer(s):
116 index = match.start() + 3 135 index = match.start() + 3
117 if index % 3 != 0: 136 if index % 3 != 0:
118 continue 137 continue
119 n = s[start:index] 138 n = s[start:index]
120 if ftype=="CDS": 139 if options.ftype=="CDS":
121 offset, n, t = start_chop_and_trans(n) 140 offset, n, t = start_chop_and_trans(n)
122 else: 141 else:
123 offset = 0 142 offset = 0
124 t = translate(n, table, to_stop=True) 143 t = translate(n, options.table, to_stop=True)
125 if n and len(t) >= min_len: 144 if n and len(t) >= options.min_len:
126 yield start + offset, n, t 145 yield start + offset, n, t
127 start = index 146 start = index
128 if ends == "open": 147 if options.ends == "open":
129 #No stop codon, Biopython's strict CDS translate will fail 148 #No stop codon, Biopython's strict CDS translate will fail
130 n = s[start:] 149 n = s[start:]
131 #Ensure we have whole codons 150 #Ensure we have whole codons
132 #TODO - Try appending N instead? 151 #TODO - Try appending N instead?
133 #TODO - Do the next four lines more elegantly 152 #TODO - Do the next four lines more elegantly
134 if len(n) % 3: 153 if len(n) % 3:
135 n = n[:-1] 154 n = n[:-1]
136 if len(n) % 3: 155 if len(n) % 3:
137 n = n[:-1] 156 n = n[:-1]
138 if ftype=="CDS": 157 if options.ftype=="CDS":
139 offset, n, t = start_chop_and_trans(n, strict=False) 158 offset, n, t = start_chop_and_trans(n, strict=False)
140 else: 159 else:
141 offset = 0 160 offset = 0
142 t = translate(n, table, to_stop=True) 161 t = translate(n, options.table, to_stop=True)
143 if n and len(t) >= min_len: 162 if n and len(t) >= options.min_len:
144 yield start + offset, n, t 163 yield start + offset, n, t
145 164
146 165
147 def get_all_peptides(nuc_seq): 166 def get_all_peptides(nuc_seq):
148 """Returns start, end, strand, nucleotides, protein. 167 """Returns start, end, strand, nucleotides, protein.
149 168
150 Co-ordinates are Python style zero-based. 169 Co-ordinates are Python style zero-based.
151 """ 170 """
152 #TODO - Refactor to use a generator function (in start order) 171 #TODO - Refactor to use a generator function (in start order)
153 #rather than making a list and sorting? 172 #rather than making a list and sorting?
154 answer = [] 173 answer = []
155 full_len = len(nuc_seq) 174 full_len = len(nuc_seq)
156 if strand != "reverse": 175 if options.strand != "reverse":
157 for frame in range(0,3): 176 for frame in range(0,3):
158 for offset, n, t in break_up_frame(nuc_seq[frame:]): 177 for offset, n, t in break_up_frame(nuc_seq[frame:]):
159 start = frame + offset #zero based 178 start = frame + offset #zero based
160 answer.append((start, start + len(n), +1, n, t)) 179 answer.append((start, start + len(n), +1, n, t))
161 if strand != "forward": 180 if options.strand != "forward":
162 rc = reverse_complement(nuc_seq) 181 rc = reverse_complement(nuc_seq)
163 for frame in range(0,3) : 182 for frame in range(0,3) :
164 for offset, n, t in break_up_frame(rc[frame:]): 183 for offset, n, t in break_up_frame(rc[frame:]):
165 start = full_len - frame - offset #zero based 184 start = full_len - frame - offset #zero based
166 answer.append((start - len(n), start, -1, n ,t)) 185 answer.append((start - len(n), start, -1, n ,t))
182 values = list(get_top_peptides(nuc_seq)) 201 values = list(get_top_peptides(nuc_seq))
183 if not values: 202 if not values:
184 raise StopIteration 203 raise StopIteration
185 yield values[0] 204 yield values[0]
186 205
187 if mode == "all": 206 if options.mode == "all":
188 get_peptides = get_all_peptides 207 get_peptides = get_all_peptides
189 elif mode == "top": 208 elif options.mode == "top":
190 get_peptides = get_top_peptides 209 get_peptides = get_top_peptides
191 elif mode == "one": 210 elif options.mode == "one":
192 get_peptides = get_one_peptide 211 get_peptides = get_one_peptide
193 212
194 in_count = 0 213 in_count = 0
195 out_count = 0 214 out_count = 0
196 if out_nuc_file == "-": 215 if options.out_nuc_file == "-":
197 out_nuc = sys.stdout 216 out_nuc = sys.stdout
198 else: 217 else:
199 out_nuc = open(out_nuc_file, "w") 218 out_nuc = open(options.out_nuc_file, "w")
200 if out_prot_file == "-": 219
220 if options.out_prot_file == "-":
201 out_prot = sys.stdout 221 out_prot = sys.stdout
202 else: 222 else:
203 out_prot = open(out_prot_file, "w") 223 out_prot = open(options.out_prot_file, "w")
204 for record in SeqIO.parse(input_file, seq_format): 224
225 if options.out_bed_file == "-":
226 out_bed = sys.stdout
227 else:
228 out_bed = open(options.out_bed_file, "w")
229
230 for record in SeqIO.parse(options.input_file, seq_format):
205 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): 231 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())):
206 out_count += 1 232 out_count += 1
207 if f_strand == +1: 233 if f_strand == +1:
208 loc = "%i..%i" % (f_start+1, f_end) 234 loc = "%i..%i" % (f_start+1, f_end)
209 else: 235 else:
210 loc = "complement(%i..%i)" % (f_start+1, f_end) 236 loc = "complement(%i..%i)" % (f_start+1, f_end)
211 descr = "length %i aa, %i bp, from %s of %s" \ 237 descr = "length %i aa, %i bp, from %s of %s" \
212 % (len(t), len(n), loc, record.description) 238 % (len(t), len(n), loc, record.description)
213 r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) 239 fid = record.id + "|%s%i" % (options.ftype, i+1)
214 t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) 240 r = SeqRecord(Seq(n), id = fid, name = "", description= descr)
241 t = SeqRecord(Seq(t), id = fid, name = "", description= descr)
215 SeqIO.write(r, out_nuc, "fasta") 242 SeqIO.write(r, out_nuc, "fasta")
216 SeqIO.write(t, out_prot, "fasta") 243 SeqIO.write(t, out_prot, "fasta")
244 out_bed.write('\t'.join(map(str,[record.id, f_start, f_end, fid, 0, '+' if f_strand == +1 else '-'])) + '\n')
217 in_count += 1 245 in_count += 1
218 if out_nuc is not sys.stdout: 246 if out_nuc is not sys.stdout:
219 out_nuc.close() 247 out_nuc.close()
220 if out_prot is not sys.stdout: 248 if out_prot is not sys.stdout:
221 out_prot.close() 249 out_prot.close()
222 250 if out_bed is not sys.stdout:
223 print "Found %i %ss in %i sequences" % (out_count, ftype, in_count) 251 out_bed.close()
252
253 print "Found %i %ss in %i sequences" % (out_count, options.ftype, in_count)