Mercurial > repos > peterjc > get_orfs_or_cdss
annotate tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 11:d51db443aaa4 draft
v0.2.3 Python 3 compatible print function.
author | peterjc |
---|---|
date | Wed, 30 May 2018 08:33:20 -0400 |
parents | a06ad07431ba |
children | 71905a6d52a7 |
rev | line source |
---|---|
5 | 1 #!/usr/bin/env python |
2 """Find ORFs in a nucleotide sequence file. | |
3 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
4 For more details, see the help text and argument descriptions in the |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
5 accompanying get_orfs_or_cdss.xml file which defines a Galaxy interface. |
5 | 6 |
7 This tool is a short Python script which requires Biopython. If you use | |
8 this tool in scientific work leading to a publication, please cite the | |
9 Biopython application note: | |
10 | |
11 Cock et al 2009. Biopython: freely available Python tools for computational | |
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
14 | |
15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute | |
16 (formerly SCRI), Dundee, UK. All rights reserved. | |
17 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
18 See accompanying text file for licence details (MIT licence). |
5 | 19 """ |
9
a06ad07431ba
v0.2.1 Adds table 24; Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
8
diff
changeset
|
20 |
11 | 21 from __future__ import print_function |
22 | |
9
a06ad07431ba
v0.2.1 Adds table 24; Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
8
diff
changeset
|
23 import re |
5 | 24 import sys |
9
a06ad07431ba
v0.2.1 Adds table 24; Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
8
diff
changeset
|
25 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
26 from optparse import OptionParser |
5 | 27 |
11 | 28 usage = r"""Use as follows: |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
29 |
11 | 30 $ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 \ |
31 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa \ | |
32 --ob cds.bed --og cds.gff3 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
33 """ |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
34 |
5 | 35 try: |
36 from Bio.Seq import Seq, reverse_complement, translate | |
37 from Bio.SeqRecord import SeqRecord | |
38 from Bio import SeqIO | |
39 from Bio.Data import CodonTable | |
40 except ImportError: | |
8 | 41 sys.exit("Missing Biopython library") |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
42 |
5 | 43 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
44 parser = OptionParser(usage=usage) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
45 parser.add_option('-i', '--input', dest='input_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
46 default=None, help='Input fasta file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
47 metavar='FILE') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
48 parser.add_option('-f', '--format', dest='seq_format', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
49 default='fasta', help='Sequence format (e.g. fasta, fastq, sff)') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
50 parser.add_option('--table', dest='table', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
51 default=1, help='NCBI Translation table', type='int') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
52 parser.add_option('-t', '--ftype', dest='ftype', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
53 choices=['CDS', 'ORF'], default='ORF', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
54 help='Find ORF or CDSs') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
55 parser.add_option('-e', '--ends', dest='ends', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
56 choices=['open', 'closed'], default='closed', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
57 help='Open or closed. Closed ensures start/stop codons are present') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
58 parser.add_option('-m', '--mode', dest='mode', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
59 choices=['all', 'top', 'one'], default='all', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
60 help='Output all ORFs/CDSs from sequence, all ORFs/CDSs ' |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
61 'with max length, or first with maximum length') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
62 parser.add_option('--min_len', dest='min_len', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
63 default=10, help='Minimum ORF/CDS length', type='int') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
64 parser.add_option('-s', '--strand', dest='strand', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
65 choices=['forward', 'reverse', 'both'], default='both', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
66 help='Strand to search for features on') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
67 parser.add_option('--on', dest='out_nuc_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
68 default=None, help='Output nucleotide sequences, or - for STDOUT', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
69 metavar='FILE') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
70 parser.add_option('--op', dest='out_prot_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
71 default=None, help='Output protein sequences, or - for STDOUT', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
72 metavar='FILE') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
73 parser.add_option('--ob', dest='out_bed_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
74 default=None, help='Output BED file, or - for STDOUT', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
75 metavar='FILE') |
8 | 76 parser.add_option('--og', dest='out_gff3_file', |
77 default=None, help='Output GFF3 file, or - for STDOUT', | |
78 metavar='FILE') | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
79 parser.add_option('-v', '--version', dest='version', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
80 default=False, action='store_true', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
81 help='Show version and quit') |
5 | 82 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
83 options, args = parser.parse_args() |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
84 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
85 if options.version: |
11 | 86 print("v0.2.3") |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
87 sys.exit(0) |
5 | 88 |
8 | 89 if not options.input_file: |
90 sys.exit("Input file is required") | |
91 | |
92 if not any((options.out_nuc_file, options.out_prot_file, options.out_bed_file, options.out_gff3_file)): | |
93 sys.exit("At least one output file is required") | |
94 | |
5 | 95 try: |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
96 table_obj = CodonTable.ambiguous_generic_by_id[options.table] |
5 | 97 except KeyError: |
8 | 98 sys.exit("Unknown codon table %i" % options.table) |
5 | 99 |
8 | 100 if options.seq_format.lower() == "sff": |
5 | 101 seq_format = "sff-trim" |
8 | 102 elif options.seq_format.lower() == "fasta": |
5 | 103 seq_format = "fasta" |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
104 elif options.seq_format.lower().startswith("fastq"): |
5 | 105 seq_format = "fastq" |
106 else: | |
8 | 107 sys.exit("Unsupported file type %r" % options.seq_format) |
5 | 108 |
11 | 109 print("Genetic code table %i" % options.table) |
110 print("Minimum length %i aa" % options.min_len) | |
8 | 111 # print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) |
5 | 112 |
113 starts = sorted(table_obj.start_codons) | |
114 assert "NNN" not in starts | |
115 re_starts = re.compile("|".join(starts)) | |
116 | |
117 stops = sorted(table_obj.stop_codons) | |
118 assert "NNN" not in stops | |
119 re_stops = re.compile("|".join(stops)) | |
120 | |
8 | 121 |
5 | 122 def start_chop_and_trans(s, strict=True): |
123 """Returns offset, trimmed nuc, protein.""" | |
124 if strict: | |
125 assert s[-3:] in stops, s | |
126 assert len(s) % 3 == 0 | |
127 for match in re_starts.finditer(s): | |
8 | 128 # Must check the start is in frame |
5 | 129 start = match.start() |
130 if start % 3 == 0: | |
131 n = s[start:] | |
132 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) | |
133 if strict: | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
134 t = translate(n, options.table, cds=True) |
5 | 135 else: |
8 | 136 # Use when missing stop codon, |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
137 t = "M" + translate(n[3:], options.table, to_stop=True) |
5 | 138 return start, n, t |
139 return None, None, None | |
140 | |
8 | 141 |
5 | 142 def break_up_frame(s): |
143 """Returns offset, nuc, protein.""" | |
144 start = 0 | |
145 for match in re_stops.finditer(s): | |
146 index = match.start() + 3 | |
147 if index % 3 != 0: | |
148 continue | |
149 n = s[start:index] | |
8 | 150 if options.ftype == "CDS": |
5 | 151 offset, n, t = start_chop_and_trans(n) |
152 else: | |
153 offset = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
154 t = translate(n, options.table, to_stop=True) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
155 if n and len(t) >= options.min_len: |
5 | 156 yield start + offset, n, t |
157 start = index | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
158 if options.ends == "open": |
8 | 159 # No stop codon, Biopython's strict CDS translate will fail |
5 | 160 n = s[start:] |
8 | 161 # Ensure we have whole codons |
162 # TODO - Try appending N instead? | |
163 # TODO - Do the next four lines more elegantly | |
5 | 164 if len(n) % 3: |
165 n = n[:-1] | |
166 if len(n) % 3: | |
167 n = n[:-1] | |
8 | 168 if options.ftype == "CDS": |
5 | 169 offset, n, t = start_chop_and_trans(n, strict=False) |
170 else: | |
171 offset = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
172 t = translate(n, options.table, to_stop=True) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
173 if n and len(t) >= options.min_len: |
5 | 174 yield start + offset, n, t |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
175 |
5 | 176 |
177 def get_all_peptides(nuc_seq): | |
178 """Returns start, end, strand, nucleotides, protein. | |
179 | |
180 Co-ordinates are Python style zero-based. | |
181 """ | |
8 | 182 # TODO - Refactor to use a generator function (in start order) |
183 # rather than making a list and sorting? | |
5 | 184 answer = [] |
185 full_len = len(nuc_seq) | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
186 if options.strand != "reverse": |
8 | 187 for frame in range(0, 3): |
5 | 188 for offset, n, t in break_up_frame(nuc_seq[frame:]): |
8 | 189 start = frame + offset # zero based |
5 | 190 answer.append((start, start + len(n), +1, n, t)) |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
191 if options.strand != "forward": |
5 | 192 rc = reverse_complement(nuc_seq) |
8 | 193 for frame in range(0, 3): |
5 | 194 for offset, n, t in break_up_frame(rc[frame:]): |
8 | 195 start = full_len - frame - offset # zero based |
196 answer.append((start - len(n), start, -1, n, t)) | |
5 | 197 answer.sort() |
198 return answer | |
199 | |
8 | 200 |
5 | 201 def get_top_peptides(nuc_seq): |
202 """Returns all peptides of max length.""" | |
203 values = list(get_all_peptides(nuc_seq)) | |
204 if not values: | |
205 raise StopIteration | |
206 max_len = max(len(x[-1]) for x in values) | |
207 for x in values: | |
208 if len(x[-1]) == max_len: | |
209 yield x | |
210 | |
8 | 211 |
5 | 212 def get_one_peptide(nuc_seq): |
213 """Returns first (left most) peptide with max length.""" | |
214 values = list(get_top_peptides(nuc_seq)) | |
215 if not values: | |
216 raise StopIteration | |
217 yield values[0] | |
218 | |
9
a06ad07431ba
v0.2.1 Adds table 24; Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
8
diff
changeset
|
219 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
220 if options.mode == "all": |
5 | 221 get_peptides = get_all_peptides |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
222 elif options.mode == "top": |
5 | 223 get_peptides = get_top_peptides |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
224 elif options.mode == "one": |
5 | 225 get_peptides = get_one_peptide |
226 | |
227 in_count = 0 | |
228 out_count = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
229 if options.out_nuc_file == "-": |
5 | 230 out_nuc = sys.stdout |
8 | 231 elif options.out_nuc_file: |
232 out_nuc = open(options.out_nuc_file, "w") | |
5 | 233 else: |
8 | 234 out_nuc = None |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
235 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
236 if options.out_prot_file == "-": |
5 | 237 out_prot = sys.stdout |
8 | 238 elif options.out_prot_file: |
239 out_prot = open(options.out_prot_file, "w") | |
5 | 240 else: |
8 | 241 out_prot = None |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
242 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
243 if options.out_bed_file == "-": |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
244 out_bed = sys.stdout |
8 | 245 elif options.out_bed_file: |
246 out_bed = open(options.out_bed_file, "w") | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
247 else: |
8 | 248 out_bed = None |
249 | |
250 if options.out_gff3_file == "-": | |
251 out_gff3 = sys.stdout | |
252 elif options.out_gff3_file: | |
253 out_gff3 = open(options.out_gff3_file, "w") | |
254 else: | |
255 out_gff3 = None | |
256 | |
257 if out_gff3: | |
258 out_gff3.write('##gff-version 3\n') | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
259 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
260 for record in SeqIO.parse(options.input_file, seq_format): |
5 | 261 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): |
262 out_count += 1 | |
263 if f_strand == +1: | |
8 | 264 loc = "%i..%i" % (f_start + 1, f_end) |
5 | 265 else: |
8 | 266 loc = "complement(%i..%i)" % (f_start + 1, f_end) |
5 | 267 descr = "length %i aa, %i bp, from %s of %s" \ |
268 % (len(t), len(n), loc, record.description) | |
8 | 269 fid = record.id + "|%s%i" % (options.ftype, i + 1) |
270 r = SeqRecord(Seq(n), id=fid, name="", description=descr) | |
271 t = SeqRecord(Seq(t), id=fid, name="", description=descr) | |
272 if out_nuc: | |
273 SeqIO.write(r, out_nuc, "fasta") | |
274 if out_prot: | |
275 SeqIO.write(t, out_prot, "fasta") | |
276 nice_strand = '+' if f_strand == +1 else '-' | |
277 if out_bed: | |
278 out_bed.write('\t'.join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + '\n') | |
279 if out_gff3: | |
280 out_gff3.write('\t'.join(map(str, [record.id, 'getOrfsOrCds', 'CDS', f_start + 1, f_end, '.', | |
281 nice_strand, 0, 'ID=%s%s' % (options.ftype, i + 1)])) + '\n') | |
5 | 282 in_count += 1 |
8 | 283 if out_nuc and out_nuc is not sys.stdout: |
5 | 284 out_nuc.close() |
8 | 285 if out_prot and out_prot is not sys.stdout: |
5 | 286 out_prot.close() |
8 | 287 if out_bed and out_bed is not sys.stdout: |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
288 out_bed.close() |
5 | 289 |
11 | 290 print("Found %i %ss in %i sequences" % (out_count, options.ftype, in_count)) |