Mercurial > repos > peterjc > get_orfs_or_cdss
comparison tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 12:71905a6d52a7 draft default tip
"Update all the pico_galaxy tools on main Tool Shed"
author | peterjc |
---|---|
date | Fri, 16 Apr 2021 22:37:04 +0000 |
parents | d51db443aaa4 |
children |
comparison
equal
deleted
inserted
replaced
11:d51db443aaa4 | 12:71905a6d52a7 |
---|---|
8 this tool in scientific work leading to a publication, please cite the | 8 this tool in scientific work leading to a publication, please cite the |
9 Biopython application note: | 9 Biopython application note: |
10 | 10 |
11 Cock et al 2009. Biopython: freely available Python tools for computational | 11 Cock et al 2009. Biopython: freely available Python tools for computational |
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | 12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. |
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | 13 https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878. |
14 | 14 |
15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute | 15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute |
16 (formerly SCRI), Dundee, UK. All rights reserved. | 16 (formerly SCRI), Dundee, UK. All rights reserved. |
17 | 17 |
18 See accompanying text file for licence details (MIT licence). | 18 See accompanying text file for licence details (MIT licence). |
40 except ImportError: | 40 except ImportError: |
41 sys.exit("Missing Biopython library") | 41 sys.exit("Missing Biopython library") |
42 | 42 |
43 | 43 |
44 parser = OptionParser(usage=usage) | 44 parser = OptionParser(usage=usage) |
45 parser.add_option('-i', '--input', dest='input_file', | 45 parser.add_option( |
46 default=None, help='Input fasta file', | 46 "-i", |
47 metavar='FILE') | 47 "--input", |
48 parser.add_option('-f', '--format', dest='seq_format', | 48 dest="input_file", |
49 default='fasta', help='Sequence format (e.g. fasta, fastq, sff)') | 49 default=None, |
50 parser.add_option('--table', dest='table', | 50 help="Input fasta file", |
51 default=1, help='NCBI Translation table', type='int') | 51 metavar="FILE", |
52 parser.add_option('-t', '--ftype', dest='ftype', type='choice', | 52 ) |
53 choices=['CDS', 'ORF'], default='ORF', | 53 parser.add_option( |
54 help='Find ORF or CDSs') | 54 "-f", |
55 parser.add_option('-e', '--ends', dest='ends', type='choice', | 55 "--format", |
56 choices=['open', 'closed'], default='closed', | 56 dest="seq_format", |
57 help='Open or closed. Closed ensures start/stop codons are present') | 57 default="fasta", |
58 parser.add_option('-m', '--mode', dest='mode', type='choice', | 58 help="Sequence format (e.g. fasta, fastq, sff)", |
59 choices=['all', 'top', 'one'], default='all', | 59 ) |
60 help='Output all ORFs/CDSs from sequence, all ORFs/CDSs ' | 60 parser.add_option( |
61 'with max length, or first with maximum length') | 61 "--table", dest="table", default=1, help="NCBI Translation table", type="int" |
62 parser.add_option('--min_len', dest='min_len', | 62 ) |
63 default=10, help='Minimum ORF/CDS length', type='int') | 63 parser.add_option( |
64 parser.add_option('-s', '--strand', dest='strand', type='choice', | 64 "-t", |
65 choices=['forward', 'reverse', 'both'], default='both', | 65 "--ftype", |
66 help='Strand to search for features on') | 66 dest="ftype", |
67 parser.add_option('--on', dest='out_nuc_file', | 67 type="choice", |
68 default=None, help='Output nucleotide sequences, or - for STDOUT', | 68 choices=["CDS", "ORF"], |
69 metavar='FILE') | 69 default="ORF", |
70 parser.add_option('--op', dest='out_prot_file', | 70 help="Find ORF or CDSs", |
71 default=None, help='Output protein sequences, or - for STDOUT', | 71 ) |
72 metavar='FILE') | 72 parser.add_option( |
73 parser.add_option('--ob', dest='out_bed_file', | 73 "-e", |
74 default=None, help='Output BED file, or - for STDOUT', | 74 "--ends", |
75 metavar='FILE') | 75 dest="ends", |
76 parser.add_option('--og', dest='out_gff3_file', | 76 type="choice", |
77 default=None, help='Output GFF3 file, or - for STDOUT', | 77 choices=["open", "closed"], |
78 metavar='FILE') | 78 default="closed", |
79 parser.add_option('-v', '--version', dest='version', | 79 help="Open or closed. Closed ensures start/stop codons are present", |
80 default=False, action='store_true', | 80 ) |
81 help='Show version and quit') | 81 parser.add_option( |
82 "-m", | |
83 "--mode", | |
84 dest="mode", | |
85 type="choice", | |
86 choices=["all", "top", "one"], | |
87 default="all", | |
88 help="Output all ORFs/CDSs from sequence, all ORFs/CDSs " | |
89 "with max length, or first with maximum length", | |
90 ) | |
91 parser.add_option( | |
92 "--min_len", dest="min_len", default=10, help="Minimum ORF/CDS length", type="int" | |
93 ) | |
94 parser.add_option( | |
95 "-s", | |
96 "--strand", | |
97 dest="strand", | |
98 type="choice", | |
99 choices=["forward", "reverse", "both"], | |
100 default="both", | |
101 help="Strand to search for features on", | |
102 ) | |
103 parser.add_option( | |
104 "--on", | |
105 dest="out_nuc_file", | |
106 default=None, | |
107 help="Output nucleotide sequences, or - for STDOUT", | |
108 metavar="FILE", | |
109 ) | |
110 parser.add_option( | |
111 "--op", | |
112 dest="out_prot_file", | |
113 default=None, | |
114 help="Output protein sequences, or - for STDOUT", | |
115 metavar="FILE", | |
116 ) | |
117 parser.add_option( | |
118 "--ob", | |
119 dest="out_bed_file", | |
120 default=None, | |
121 help="Output BED file, or - for STDOUT", | |
122 metavar="FILE", | |
123 ) | |
124 parser.add_option( | |
125 "--og", | |
126 dest="out_gff3_file", | |
127 default=None, | |
128 help="Output GFF3 file, or - for STDOUT", | |
129 metavar="FILE", | |
130 ) | |
131 parser.add_option( | |
132 "-v", | |
133 "--version", | |
134 dest="version", | |
135 default=False, | |
136 action="store_true", | |
137 help="Show version and quit", | |
138 ) | |
82 | 139 |
83 options, args = parser.parse_args() | 140 options, args = parser.parse_args() |
84 | 141 |
85 if options.version: | 142 if options.version: |
86 print("v0.2.3") | 143 print("v0.2.3") |
87 sys.exit(0) | 144 sys.exit(0) |
88 | 145 |
89 if not options.input_file: | 146 if not options.input_file: |
90 sys.exit("Input file is required") | 147 sys.exit("Input file is required") |
91 | 148 |
92 if not any((options.out_nuc_file, options.out_prot_file, options.out_bed_file, options.out_gff3_file)): | 149 if not any( |
150 ( | |
151 options.out_nuc_file, | |
152 options.out_prot_file, | |
153 options.out_bed_file, | |
154 options.out_gff3_file, | |
155 ) | |
156 ): | |
93 sys.exit("At least one output file is required") | 157 sys.exit("At least one output file is required") |
94 | 158 |
95 try: | 159 try: |
96 table_obj = CodonTable.ambiguous_generic_by_id[options.table] | 160 table_obj = CodonTable.ambiguous_generic_by_id[options.table] |
97 except KeyError: | 161 except KeyError: |
118 assert "NNN" not in stops | 182 assert "NNN" not in stops |
119 re_stops = re.compile("|".join(stops)) | 183 re_stops = re.compile("|".join(stops)) |
120 | 184 |
121 | 185 |
122 def start_chop_and_trans(s, strict=True): | 186 def start_chop_and_trans(s, strict=True): |
123 """Returns offset, trimmed nuc, protein.""" | 187 """Return offset, trimmed nuc, protein.""" |
124 if strict: | 188 if strict: |
125 assert s[-3:] in stops, s | 189 assert s[-3:] in stops, s |
126 assert len(s) % 3 == 0 | 190 assert len(s) % 3 == 0 |
127 for match in re_starts.finditer(s): | 191 for match in re_starts.finditer(s): |
128 # Must check the start is in frame | 192 # Must check the start is in frame |
138 return start, n, t | 202 return start, n, t |
139 return None, None, None | 203 return None, None, None |
140 | 204 |
141 | 205 |
142 def break_up_frame(s): | 206 def break_up_frame(s): |
143 """Returns offset, nuc, protein.""" | 207 """Return offset, nuc, protein.""" |
144 start = 0 | 208 start = 0 |
145 for match in re_stops.finditer(s): | 209 for match in re_stops.finditer(s): |
146 index = match.start() + 3 | 210 index = match.start() + 3 |
147 if index % 3 != 0: | 211 if index % 3 != 0: |
148 continue | 212 continue |
173 if n and len(t) >= options.min_len: | 237 if n and len(t) >= options.min_len: |
174 yield start + offset, n, t | 238 yield start + offset, n, t |
175 | 239 |
176 | 240 |
177 def get_all_peptides(nuc_seq): | 241 def get_all_peptides(nuc_seq): |
178 """Returns start, end, strand, nucleotides, protein. | 242 """Return start, end, strand, nucleotides, protein. |
179 | 243 |
180 Co-ordinates are Python style zero-based. | 244 Co-ordinates are Python style zero-based. |
181 """ | 245 """ |
182 # TODO - Refactor to use a generator function (in start order) | 246 # TODO - Refactor to use a generator function (in start order) |
183 # rather than making a list and sorting? | 247 # rather than making a list and sorting? |
197 answer.sort() | 261 answer.sort() |
198 return answer | 262 return answer |
199 | 263 |
200 | 264 |
201 def get_top_peptides(nuc_seq): | 265 def get_top_peptides(nuc_seq): |
202 """Returns all peptides of max length.""" | 266 """Return all peptides of max length.""" |
203 values = list(get_all_peptides(nuc_seq)) | 267 values = list(get_all_peptides(nuc_seq)) |
204 if not values: | 268 if not values: |
205 raise StopIteration | 269 raise StopIteration |
206 max_len = max(len(x[-1]) for x in values) | 270 max_len = max(len(x[-1]) for x in values) |
207 for x in values: | 271 for x in values: |
208 if len(x[-1]) == max_len: | 272 if len(x[-1]) == max_len: |
209 yield x | 273 yield x |
210 | 274 |
211 | 275 |
212 def get_one_peptide(nuc_seq): | 276 def get_one_peptide(nuc_seq): |
213 """Returns first (left most) peptide with max length.""" | 277 """Return first (left most) peptide with max length.""" |
214 values = list(get_top_peptides(nuc_seq)) | 278 values = list(get_top_peptides(nuc_seq)) |
215 if not values: | 279 if not values: |
216 raise StopIteration | 280 raise StopIteration |
217 yield values[0] | 281 yield values[0] |
218 | 282 |
253 out_gff3 = open(options.out_gff3_file, "w") | 317 out_gff3 = open(options.out_gff3_file, "w") |
254 else: | 318 else: |
255 out_gff3 = None | 319 out_gff3 = None |
256 | 320 |
257 if out_gff3: | 321 if out_gff3: |
258 out_gff3.write('##gff-version 3\n') | 322 out_gff3.write("##gff-version 3\n") |
259 | 323 |
260 for record in SeqIO.parse(options.input_file, seq_format): | 324 for record in SeqIO.parse(options.input_file, seq_format): |
261 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): | 325 for i, (f_start, f_end, f_strand, n, t) in enumerate( |
326 get_peptides(str(record.seq).upper()) | |
327 ): | |
262 out_count += 1 | 328 out_count += 1 |
263 if f_strand == +1: | 329 if f_strand == +1: |
264 loc = "%i..%i" % (f_start + 1, f_end) | 330 loc = "%i..%i" % (f_start + 1, f_end) |
265 else: | 331 else: |
266 loc = "complement(%i..%i)" % (f_start + 1, f_end) | 332 loc = "complement(%i..%i)" % (f_start + 1, f_end) |
267 descr = "length %i aa, %i bp, from %s of %s" \ | 333 descr = "length %i aa, %i bp, from %s of %s" % ( |
268 % (len(t), len(n), loc, record.description) | 334 len(t), |
335 len(n), | |
336 loc, | |
337 record.description, | |
338 ) | |
269 fid = record.id + "|%s%i" % (options.ftype, i + 1) | 339 fid = record.id + "|%s%i" % (options.ftype, i + 1) |
270 r = SeqRecord(Seq(n), id=fid, name="", description=descr) | 340 r = SeqRecord(Seq(n), id=fid, name="", description=descr) |
271 t = SeqRecord(Seq(t), id=fid, name="", description=descr) | 341 t = SeqRecord(Seq(t), id=fid, name="", description=descr) |
272 if out_nuc: | 342 if out_nuc: |
273 SeqIO.write(r, out_nuc, "fasta") | 343 SeqIO.write(r, out_nuc, "fasta") |
274 if out_prot: | 344 if out_prot: |
275 SeqIO.write(t, out_prot, "fasta") | 345 SeqIO.write(t, out_prot, "fasta") |
276 nice_strand = '+' if f_strand == +1 else '-' | 346 nice_strand = "+" if f_strand == +1 else "-" |
277 if out_bed: | 347 if out_bed: |
278 out_bed.write('\t'.join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + '\n') | 348 out_bed.write( |
349 "\t".join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) | |
350 + "\n" | |
351 ) | |
279 if out_gff3: | 352 if out_gff3: |
280 out_gff3.write('\t'.join(map(str, [record.id, 'getOrfsOrCds', 'CDS', f_start + 1, f_end, '.', | 353 out_gff3.write( |
281 nice_strand, 0, 'ID=%s%s' % (options.ftype, i + 1)])) + '\n') | 354 "\t".join( |
355 map( | |
356 str, | |
357 [ | |
358 record.id, | |
359 "getOrfsOrCds", | |
360 "CDS", | |
361 f_start + 1, | |
362 f_end, | |
363 ".", | |
364 nice_strand, | |
365 0, | |
366 "ID=%s%s" % (options.ftype, i + 1), | |
367 ], | |
368 ) | |
369 ) | |
370 + "\n" | |
371 ) | |
282 in_count += 1 | 372 in_count += 1 |
283 if out_nuc and out_nuc is not sys.stdout: | 373 if out_nuc and out_nuc is not sys.stdout: |
284 out_nuc.close() | 374 out_nuc.close() |
285 if out_prot and out_prot is not sys.stdout: | 375 if out_prot and out_prot is not sys.stdout: |
286 out_prot.close() | 376 out_prot.close() |