comparison tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 12:71905a6d52a7 draft default tip

"Update all the pico_galaxy tools on main Tool Shed"
author peterjc
date Fri, 16 Apr 2021 22:37:04 +0000
parents d51db443aaa4
children
comparison
equal deleted inserted replaced
11:d51db443aaa4 12:71905a6d52a7
8 this tool in scientific work leading to a publication, please cite the 8 this tool in scientific work leading to a publication, please cite the
9 Biopython application note: 9 Biopython application note:
10 10
11 Cock et al 2009. Biopython: freely available Python tools for computational 11 Cock et al 2009. Biopython: freely available Python tools for computational
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. 12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. 13 https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
14 14
15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute 15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
16 (formerly SCRI), Dundee, UK. All rights reserved. 16 (formerly SCRI), Dundee, UK. All rights reserved.
17 17
18 See accompanying text file for licence details (MIT licence). 18 See accompanying text file for licence details (MIT licence).
40 except ImportError: 40 except ImportError:
41 sys.exit("Missing Biopython library") 41 sys.exit("Missing Biopython library")
42 42
43 43
44 parser = OptionParser(usage=usage) 44 parser = OptionParser(usage=usage)
45 parser.add_option('-i', '--input', dest='input_file', 45 parser.add_option(
46 default=None, help='Input fasta file', 46 "-i",
47 metavar='FILE') 47 "--input",
48 parser.add_option('-f', '--format', dest='seq_format', 48 dest="input_file",
49 default='fasta', help='Sequence format (e.g. fasta, fastq, sff)') 49 default=None,
50 parser.add_option('--table', dest='table', 50 help="Input fasta file",
51 default=1, help='NCBI Translation table', type='int') 51 metavar="FILE",
52 parser.add_option('-t', '--ftype', dest='ftype', type='choice', 52 )
53 choices=['CDS', 'ORF'], default='ORF', 53 parser.add_option(
54 help='Find ORF or CDSs') 54 "-f",
55 parser.add_option('-e', '--ends', dest='ends', type='choice', 55 "--format",
56 choices=['open', 'closed'], default='closed', 56 dest="seq_format",
57 help='Open or closed. Closed ensures start/stop codons are present') 57 default="fasta",
58 parser.add_option('-m', '--mode', dest='mode', type='choice', 58 help="Sequence format (e.g. fasta, fastq, sff)",
59 choices=['all', 'top', 'one'], default='all', 59 )
60 help='Output all ORFs/CDSs from sequence, all ORFs/CDSs ' 60 parser.add_option(
61 'with max length, or first with maximum length') 61 "--table", dest="table", default=1, help="NCBI Translation table", type="int"
62 parser.add_option('--min_len', dest='min_len', 62 )
63 default=10, help='Minimum ORF/CDS length', type='int') 63 parser.add_option(
64 parser.add_option('-s', '--strand', dest='strand', type='choice', 64 "-t",
65 choices=['forward', 'reverse', 'both'], default='both', 65 "--ftype",
66 help='Strand to search for features on') 66 dest="ftype",
67 parser.add_option('--on', dest='out_nuc_file', 67 type="choice",
68 default=None, help='Output nucleotide sequences, or - for STDOUT', 68 choices=["CDS", "ORF"],
69 metavar='FILE') 69 default="ORF",
70 parser.add_option('--op', dest='out_prot_file', 70 help="Find ORF or CDSs",
71 default=None, help='Output protein sequences, or - for STDOUT', 71 )
72 metavar='FILE') 72 parser.add_option(
73 parser.add_option('--ob', dest='out_bed_file', 73 "-e",
74 default=None, help='Output BED file, or - for STDOUT', 74 "--ends",
75 metavar='FILE') 75 dest="ends",
76 parser.add_option('--og', dest='out_gff3_file', 76 type="choice",
77 default=None, help='Output GFF3 file, or - for STDOUT', 77 choices=["open", "closed"],
78 metavar='FILE') 78 default="closed",
79 parser.add_option('-v', '--version', dest='version', 79 help="Open or closed. Closed ensures start/stop codons are present",
80 default=False, action='store_true', 80 )
81 help='Show version and quit') 81 parser.add_option(
82 "-m",
83 "--mode",
84 dest="mode",
85 type="choice",
86 choices=["all", "top", "one"],
87 default="all",
88 help="Output all ORFs/CDSs from sequence, all ORFs/CDSs "
89 "with max length, or first with maximum length",
90 )
91 parser.add_option(
92 "--min_len", dest="min_len", default=10, help="Minimum ORF/CDS length", type="int"
93 )
94 parser.add_option(
95 "-s",
96 "--strand",
97 dest="strand",
98 type="choice",
99 choices=["forward", "reverse", "both"],
100 default="both",
101 help="Strand to search for features on",
102 )
103 parser.add_option(
104 "--on",
105 dest="out_nuc_file",
106 default=None,
107 help="Output nucleotide sequences, or - for STDOUT",
108 metavar="FILE",
109 )
110 parser.add_option(
111 "--op",
112 dest="out_prot_file",
113 default=None,
114 help="Output protein sequences, or - for STDOUT",
115 metavar="FILE",
116 )
117 parser.add_option(
118 "--ob",
119 dest="out_bed_file",
120 default=None,
121 help="Output BED file, or - for STDOUT",
122 metavar="FILE",
123 )
124 parser.add_option(
125 "--og",
126 dest="out_gff3_file",
127 default=None,
128 help="Output GFF3 file, or - for STDOUT",
129 metavar="FILE",
130 )
131 parser.add_option(
132 "-v",
133 "--version",
134 dest="version",
135 default=False,
136 action="store_true",
137 help="Show version and quit",
138 )
82 139
83 options, args = parser.parse_args() 140 options, args = parser.parse_args()
84 141
85 if options.version: 142 if options.version:
86 print("v0.2.3") 143 print("v0.2.3")
87 sys.exit(0) 144 sys.exit(0)
88 145
89 if not options.input_file: 146 if not options.input_file:
90 sys.exit("Input file is required") 147 sys.exit("Input file is required")
91 148
92 if not any((options.out_nuc_file, options.out_prot_file, options.out_bed_file, options.out_gff3_file)): 149 if not any(
150 (
151 options.out_nuc_file,
152 options.out_prot_file,
153 options.out_bed_file,
154 options.out_gff3_file,
155 )
156 ):
93 sys.exit("At least one output file is required") 157 sys.exit("At least one output file is required")
94 158
95 try: 159 try:
96 table_obj = CodonTable.ambiguous_generic_by_id[options.table] 160 table_obj = CodonTable.ambiguous_generic_by_id[options.table]
97 except KeyError: 161 except KeyError:
118 assert "NNN" not in stops 182 assert "NNN" not in stops
119 re_stops = re.compile("|".join(stops)) 183 re_stops = re.compile("|".join(stops))
120 184
121 185
122 def start_chop_and_trans(s, strict=True): 186 def start_chop_and_trans(s, strict=True):
123 """Returns offset, trimmed nuc, protein.""" 187 """Return offset, trimmed nuc, protein."""
124 if strict: 188 if strict:
125 assert s[-3:] in stops, s 189 assert s[-3:] in stops, s
126 assert len(s) % 3 == 0 190 assert len(s) % 3 == 0
127 for match in re_starts.finditer(s): 191 for match in re_starts.finditer(s):
128 # Must check the start is in frame 192 # Must check the start is in frame
138 return start, n, t 202 return start, n, t
139 return None, None, None 203 return None, None, None
140 204
141 205
142 def break_up_frame(s): 206 def break_up_frame(s):
143 """Returns offset, nuc, protein.""" 207 """Return offset, nuc, protein."""
144 start = 0 208 start = 0
145 for match in re_stops.finditer(s): 209 for match in re_stops.finditer(s):
146 index = match.start() + 3 210 index = match.start() + 3
147 if index % 3 != 0: 211 if index % 3 != 0:
148 continue 212 continue
173 if n and len(t) >= options.min_len: 237 if n and len(t) >= options.min_len:
174 yield start + offset, n, t 238 yield start + offset, n, t
175 239
176 240
177 def get_all_peptides(nuc_seq): 241 def get_all_peptides(nuc_seq):
178 """Returns start, end, strand, nucleotides, protein. 242 """Return start, end, strand, nucleotides, protein.
179 243
180 Co-ordinates are Python style zero-based. 244 Co-ordinates are Python style zero-based.
181 """ 245 """
182 # TODO - Refactor to use a generator function (in start order) 246 # TODO - Refactor to use a generator function (in start order)
183 # rather than making a list and sorting? 247 # rather than making a list and sorting?
197 answer.sort() 261 answer.sort()
198 return answer 262 return answer
199 263
200 264
201 def get_top_peptides(nuc_seq): 265 def get_top_peptides(nuc_seq):
202 """Returns all peptides of max length.""" 266 """Return all peptides of max length."""
203 values = list(get_all_peptides(nuc_seq)) 267 values = list(get_all_peptides(nuc_seq))
204 if not values: 268 if not values:
205 raise StopIteration 269 raise StopIteration
206 max_len = max(len(x[-1]) for x in values) 270 max_len = max(len(x[-1]) for x in values)
207 for x in values: 271 for x in values:
208 if len(x[-1]) == max_len: 272 if len(x[-1]) == max_len:
209 yield x 273 yield x
210 274
211 275
212 def get_one_peptide(nuc_seq): 276 def get_one_peptide(nuc_seq):
213 """Returns first (left most) peptide with max length.""" 277 """Return first (left most) peptide with max length."""
214 values = list(get_top_peptides(nuc_seq)) 278 values = list(get_top_peptides(nuc_seq))
215 if not values: 279 if not values:
216 raise StopIteration 280 raise StopIteration
217 yield values[0] 281 yield values[0]
218 282
253 out_gff3 = open(options.out_gff3_file, "w") 317 out_gff3 = open(options.out_gff3_file, "w")
254 else: 318 else:
255 out_gff3 = None 319 out_gff3 = None
256 320
257 if out_gff3: 321 if out_gff3:
258 out_gff3.write('##gff-version 3\n') 322 out_gff3.write("##gff-version 3\n")
259 323
260 for record in SeqIO.parse(options.input_file, seq_format): 324 for record in SeqIO.parse(options.input_file, seq_format):
261 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): 325 for i, (f_start, f_end, f_strand, n, t) in enumerate(
326 get_peptides(str(record.seq).upper())
327 ):
262 out_count += 1 328 out_count += 1
263 if f_strand == +1: 329 if f_strand == +1:
264 loc = "%i..%i" % (f_start + 1, f_end) 330 loc = "%i..%i" % (f_start + 1, f_end)
265 else: 331 else:
266 loc = "complement(%i..%i)" % (f_start + 1, f_end) 332 loc = "complement(%i..%i)" % (f_start + 1, f_end)
267 descr = "length %i aa, %i bp, from %s of %s" \ 333 descr = "length %i aa, %i bp, from %s of %s" % (
268 % (len(t), len(n), loc, record.description) 334 len(t),
335 len(n),
336 loc,
337 record.description,
338 )
269 fid = record.id + "|%s%i" % (options.ftype, i + 1) 339 fid = record.id + "|%s%i" % (options.ftype, i + 1)
270 r = SeqRecord(Seq(n), id=fid, name="", description=descr) 340 r = SeqRecord(Seq(n), id=fid, name="", description=descr)
271 t = SeqRecord(Seq(t), id=fid, name="", description=descr) 341 t = SeqRecord(Seq(t), id=fid, name="", description=descr)
272 if out_nuc: 342 if out_nuc:
273 SeqIO.write(r, out_nuc, "fasta") 343 SeqIO.write(r, out_nuc, "fasta")
274 if out_prot: 344 if out_prot:
275 SeqIO.write(t, out_prot, "fasta") 345 SeqIO.write(t, out_prot, "fasta")
276 nice_strand = '+' if f_strand == +1 else '-' 346 nice_strand = "+" if f_strand == +1 else "-"
277 if out_bed: 347 if out_bed:
278 out_bed.write('\t'.join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + '\n') 348 out_bed.write(
349 "\t".join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand]))
350 + "\n"
351 )
279 if out_gff3: 352 if out_gff3:
280 out_gff3.write('\t'.join(map(str, [record.id, 'getOrfsOrCds', 'CDS', f_start + 1, f_end, '.', 353 out_gff3.write(
281 nice_strand, 0, 'ID=%s%s' % (options.ftype, i + 1)])) + '\n') 354 "\t".join(
355 map(
356 str,
357 [
358 record.id,
359 "getOrfsOrCds",
360 "CDS",
361 f_start + 1,
362 f_end,
363 ".",
364 nice_strand,
365 0,
366 "ID=%s%s" % (options.ftype, i + 1),
367 ],
368 )
369 )
370 + "\n"
371 )
282 in_count += 1 372 in_count += 1
283 if out_nuc and out_nuc is not sys.stdout: 373 if out_nuc and out_nuc is not sys.stdout:
284 out_nuc.close() 374 out_nuc.close()
285 if out_prot and out_prot is not sys.stdout: 375 if out_prot and out_prot is not sys.stdout:
286 out_prot.close() 376 out_prot.close()