Mercurial > repos > peterjc > get_orfs_or_cdss
annotate tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 9:a06ad07431ba draft
v0.2.1 Adds table 24; Depends on Biopython 1.67 via Tool Shed package or bioconda.
author | peterjc |
---|---|
date | Wed, 10 May 2017 13:24:46 -0400 |
parents | 09a8be9247ca |
children | d51db443aaa4 |
rev | line source |
---|---|
5 | 1 #!/usr/bin/env python |
2 """Find ORFs in a nucleotide sequence file. | |
3 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
4 For more details, see the help text and argument descriptions in the |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
5 accompanying get_orfs_or_cdss.xml file which defines a Galaxy interface. |
5 | 6 |
7 This tool is a short Python script which requires Biopython. If you use | |
8 this tool in scientific work leading to a publication, please cite the | |
9 Biopython application note: | |
10 | |
11 Cock et al 2009. Biopython: freely available Python tools for computational | |
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
14 | |
15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute | |
16 (formerly SCRI), Dundee, UK. All rights reserved. | |
17 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
18 See accompanying text file for licence details (MIT licence). |
5 | 19 """ |
9
a06ad07431ba
v0.2.1 Adds table 24; Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
8
diff
changeset
|
20 |
a06ad07431ba
v0.2.1 Adds table 24; Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
8
diff
changeset
|
21 import re |
5 | 22 import sys |
9
a06ad07431ba
v0.2.1 Adds table 24; Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
8
diff
changeset
|
23 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
24 from optparse import OptionParser |
5 | 25 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
26 usage = """Use as follows: |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
27 |
8 | 28 $ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed --og cds.gff3 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
29 """ |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
30 |
5 | 31 try: |
32 from Bio.Seq import Seq, reverse_complement, translate | |
33 from Bio.SeqRecord import SeqRecord | |
34 from Bio import SeqIO | |
35 from Bio.Data import CodonTable | |
36 except ImportError: | |
8 | 37 sys.exit("Missing Biopython library") |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
38 |
5 | 39 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
40 parser = OptionParser(usage=usage) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
41 parser.add_option('-i', '--input', dest='input_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
42 default=None, help='Input fasta file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
43 metavar='FILE') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
44 parser.add_option('-f', '--format', dest='seq_format', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
45 default='fasta', help='Sequence format (e.g. fasta, fastq, sff)') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
46 parser.add_option('--table', dest='table', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
47 default=1, help='NCBI Translation table', type='int') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
48 parser.add_option('-t', '--ftype', dest='ftype', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
49 choices=['CDS', 'ORF'], default='ORF', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
50 help='Find ORF or CDSs') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
51 parser.add_option('-e', '--ends', dest='ends', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
52 choices=['open', 'closed'], default='closed', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
53 help='Open or closed. Closed ensures start/stop codons are present') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
54 parser.add_option('-m', '--mode', dest='mode', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
55 choices=['all', 'top', 'one'], default='all', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
56 help='Output all ORFs/CDSs from sequence, all ORFs/CDSs ' |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
57 'with max length, or first with maximum length') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
58 parser.add_option('--min_len', dest='min_len', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
59 default=10, help='Minimum ORF/CDS length', type='int') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
60 parser.add_option('-s', '--strand', dest='strand', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
61 choices=['forward', 'reverse', 'both'], default='both', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
62 help='Strand to search for features on') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
63 parser.add_option('--on', dest='out_nuc_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
64 default=None, help='Output nucleotide sequences, or - for STDOUT', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
65 metavar='FILE') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
66 parser.add_option('--op', dest='out_prot_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
67 default=None, help='Output protein sequences, or - for STDOUT', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
68 metavar='FILE') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
69 parser.add_option('--ob', dest='out_bed_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
70 default=None, help='Output BED file, or - for STDOUT', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
71 metavar='FILE') |
8 | 72 parser.add_option('--og', dest='out_gff3_file', |
73 default=None, help='Output GFF3 file, or - for STDOUT', | |
74 metavar='FILE') | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
75 parser.add_option('-v', '--version', dest='version', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
76 default=False, action='store_true', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
77 help='Show version and quit') |
5 | 78 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
79 options, args = parser.parse_args() |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
80 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
81 if options.version: |
8 | 82 print("v0.2.0") |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
83 sys.exit(0) |
5 | 84 |
8 | 85 if not options.input_file: |
86 sys.exit("Input file is required") | |
87 | |
88 if not any((options.out_nuc_file, options.out_prot_file, options.out_bed_file, options.out_gff3_file)): | |
89 sys.exit("At least one output file is required") | |
90 | |
5 | 91 try: |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
92 table_obj = CodonTable.ambiguous_generic_by_id[options.table] |
5 | 93 except KeyError: |
8 | 94 sys.exit("Unknown codon table %i" % options.table) |
5 | 95 |
8 | 96 if options.seq_format.lower() == "sff": |
5 | 97 seq_format = "sff-trim" |
8 | 98 elif options.seq_format.lower() == "fasta": |
5 | 99 seq_format = "fasta" |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
100 elif options.seq_format.lower().startswith("fastq"): |
5 | 101 seq_format = "fastq" |
102 else: | |
8 | 103 sys.exit("Unsupported file type %r" % options.seq_format) |
5 | 104 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
105 print "Genetic code table %i" % options.table |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
106 print "Minimum length %i aa" % options.min_len |
8 | 107 # print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) |
5 | 108 |
109 starts = sorted(table_obj.start_codons) | |
110 assert "NNN" not in starts | |
111 re_starts = re.compile("|".join(starts)) | |
112 | |
113 stops = sorted(table_obj.stop_codons) | |
114 assert "NNN" not in stops | |
115 re_stops = re.compile("|".join(stops)) | |
116 | |
8 | 117 |
5 | 118 def start_chop_and_trans(s, strict=True): |
119 """Returns offset, trimmed nuc, protein.""" | |
120 if strict: | |
121 assert s[-3:] in stops, s | |
122 assert len(s) % 3 == 0 | |
123 for match in re_starts.finditer(s): | |
8 | 124 # Must check the start is in frame |
5 | 125 start = match.start() |
126 if start % 3 == 0: | |
127 n = s[start:] | |
128 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) | |
129 if strict: | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
130 t = translate(n, options.table, cds=True) |
5 | 131 else: |
8 | 132 # Use when missing stop codon, |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
133 t = "M" + translate(n[3:], options.table, to_stop=True) |
5 | 134 return start, n, t |
135 return None, None, None | |
136 | |
8 | 137 |
5 | 138 def break_up_frame(s): |
139 """Returns offset, nuc, protein.""" | |
140 start = 0 | |
141 for match in re_stops.finditer(s): | |
142 index = match.start() + 3 | |
143 if index % 3 != 0: | |
144 continue | |
145 n = s[start:index] | |
8 | 146 if options.ftype == "CDS": |
5 | 147 offset, n, t = start_chop_and_trans(n) |
148 else: | |
149 offset = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
150 t = translate(n, options.table, to_stop=True) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
151 if n and len(t) >= options.min_len: |
5 | 152 yield start + offset, n, t |
153 start = index | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
154 if options.ends == "open": |
8 | 155 # No stop codon, Biopython's strict CDS translate will fail |
5 | 156 n = s[start:] |
8 | 157 # Ensure we have whole codons |
158 # TODO - Try appending N instead? | |
159 # TODO - Do the next four lines more elegantly | |
5 | 160 if len(n) % 3: |
161 n = n[:-1] | |
162 if len(n) % 3: | |
163 n = n[:-1] | |
8 | 164 if options.ftype == "CDS": |
5 | 165 offset, n, t = start_chop_and_trans(n, strict=False) |
166 else: | |
167 offset = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
168 t = translate(n, options.table, to_stop=True) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
169 if n and len(t) >= options.min_len: |
5 | 170 yield start + offset, n, t |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
171 |
5 | 172 |
173 def get_all_peptides(nuc_seq): | |
174 """Returns start, end, strand, nucleotides, protein. | |
175 | |
176 Co-ordinates are Python style zero-based. | |
177 """ | |
8 | 178 # TODO - Refactor to use a generator function (in start order) |
179 # rather than making a list and sorting? | |
5 | 180 answer = [] |
181 full_len = len(nuc_seq) | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
182 if options.strand != "reverse": |
8 | 183 for frame in range(0, 3): |
5 | 184 for offset, n, t in break_up_frame(nuc_seq[frame:]): |
8 | 185 start = frame + offset # zero based |
5 | 186 answer.append((start, start + len(n), +1, n, t)) |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
187 if options.strand != "forward": |
5 | 188 rc = reverse_complement(nuc_seq) |
8 | 189 for frame in range(0, 3): |
5 | 190 for offset, n, t in break_up_frame(rc[frame:]): |
8 | 191 start = full_len - frame - offset # zero based |
192 answer.append((start - len(n), start, -1, n, t)) | |
5 | 193 answer.sort() |
194 return answer | |
195 | |
8 | 196 |
5 | 197 def get_top_peptides(nuc_seq): |
198 """Returns all peptides of max length.""" | |
199 values = list(get_all_peptides(nuc_seq)) | |
200 if not values: | |
201 raise StopIteration | |
202 max_len = max(len(x[-1]) for x in values) | |
203 for x in values: | |
204 if len(x[-1]) == max_len: | |
205 yield x | |
206 | |
8 | 207 |
5 | 208 def get_one_peptide(nuc_seq): |
209 """Returns first (left most) peptide with max length.""" | |
210 values = list(get_top_peptides(nuc_seq)) | |
211 if not values: | |
212 raise StopIteration | |
213 yield values[0] | |
214 | |
9
a06ad07431ba
v0.2.1 Adds table 24; Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
8
diff
changeset
|
215 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
216 if options.mode == "all": |
5 | 217 get_peptides = get_all_peptides |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
218 elif options.mode == "top": |
5 | 219 get_peptides = get_top_peptides |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
220 elif options.mode == "one": |
5 | 221 get_peptides = get_one_peptide |
222 | |
223 in_count = 0 | |
224 out_count = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
225 if options.out_nuc_file == "-": |
5 | 226 out_nuc = sys.stdout |
8 | 227 elif options.out_nuc_file: |
228 out_nuc = open(options.out_nuc_file, "w") | |
5 | 229 else: |
8 | 230 out_nuc = None |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
231 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
232 if options.out_prot_file == "-": |
5 | 233 out_prot = sys.stdout |
8 | 234 elif options.out_prot_file: |
235 out_prot = open(options.out_prot_file, "w") | |
5 | 236 else: |
8 | 237 out_prot = None |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
238 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
239 if options.out_bed_file == "-": |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
240 out_bed = sys.stdout |
8 | 241 elif options.out_bed_file: |
242 out_bed = open(options.out_bed_file, "w") | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
243 else: |
8 | 244 out_bed = None |
245 | |
246 if options.out_gff3_file == "-": | |
247 out_gff3 = sys.stdout | |
248 elif options.out_gff3_file: | |
249 out_gff3 = open(options.out_gff3_file, "w") | |
250 else: | |
251 out_gff3 = None | |
252 | |
253 if out_gff3: | |
254 out_gff3.write('##gff-version 3\n') | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
255 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
256 for record in SeqIO.parse(options.input_file, seq_format): |
5 | 257 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): |
258 out_count += 1 | |
259 if f_strand == +1: | |
8 | 260 loc = "%i..%i" % (f_start + 1, f_end) |
5 | 261 else: |
8 | 262 loc = "complement(%i..%i)" % (f_start + 1, f_end) |
5 | 263 descr = "length %i aa, %i bp, from %s of %s" \ |
264 % (len(t), len(n), loc, record.description) | |
8 | 265 fid = record.id + "|%s%i" % (options.ftype, i + 1) |
266 r = SeqRecord(Seq(n), id=fid, name="", description=descr) | |
267 t = SeqRecord(Seq(t), id=fid, name="", description=descr) | |
268 if out_nuc: | |
269 SeqIO.write(r, out_nuc, "fasta") | |
270 if out_prot: | |
271 SeqIO.write(t, out_prot, "fasta") | |
272 nice_strand = '+' if f_strand == +1 else '-' | |
273 if out_bed: | |
274 out_bed.write('\t'.join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + '\n') | |
275 if out_gff3: | |
276 out_gff3.write('\t'.join(map(str, [record.id, 'getOrfsOrCds', 'CDS', f_start + 1, f_end, '.', | |
277 nice_strand, 0, 'ID=%s%s' % (options.ftype, i + 1)])) + '\n') | |
5 | 278 in_count += 1 |
8 | 279 if out_nuc and out_nuc is not sys.stdout: |
5 | 280 out_nuc.close() |
8 | 281 if out_prot and out_prot is not sys.stdout: |
5 | 282 out_prot.close() |
8 | 283 if out_bed and out_bed is not sys.stdout: |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
284 out_bed.close() |
5 | 285 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
286 print "Found %i %ss in %i sequences" % (out_count, options.ftype, in_count) |