Mercurial > repos > peterjc > get_orfs_or_cdss
annotate tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 8:09a8be9247ca draft
v0.2.0 with GFF3 output
author | peterjc |
---|---|
date | Sat, 09 Jan 2016 23:42:32 -0500 |
parents | 705a2e2df7fb |
children | a06ad07431ba |
rev | line source |
---|---|
5 | 1 #!/usr/bin/env python |
2 """Find ORFs in a nucleotide sequence file. | |
3 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
4 For more details, see the help text and argument descriptions in the |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
5 accompanying get_orfs_or_cdss.xml file which defines a Galaxy interface. |
5 | 6 |
7 This tool is a short Python script which requires Biopython. If you use | |
8 this tool in scientific work leading to a publication, please cite the | |
9 Biopython application note: | |
10 | |
11 Cock et al 2009. Biopython: freely available Python tools for computational | |
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
14 | |
15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute | |
16 (formerly SCRI), Dundee, UK. All rights reserved. | |
17 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
18 See accompanying text file for licence details (MIT licence). |
5 | 19 """ |
20 import sys | |
21 import re | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
22 from optparse import OptionParser |
5 | 23 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
24 usage = """Use as follows: |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
25 |
8 | 26 $ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed --og cds.gff3 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
27 """ |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
28 |
5 | 29 try: |
30 from Bio.Seq import Seq, reverse_complement, translate | |
31 from Bio.SeqRecord import SeqRecord | |
32 from Bio import SeqIO | |
33 from Bio.Data import CodonTable | |
34 except ImportError: | |
8 | 35 sys.exit("Missing Biopython library") |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
36 |
5 | 37 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
38 parser = OptionParser(usage=usage) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
39 parser.add_option('-i', '--input', dest='input_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
40 default=None, help='Input fasta file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
41 metavar='FILE') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
42 parser.add_option('-f', '--format', dest='seq_format', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
43 default='fasta', help='Sequence format (e.g. fasta, fastq, sff)') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
44 parser.add_option('--table', dest='table', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
45 default=1, help='NCBI Translation table', type='int') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
46 parser.add_option('-t', '--ftype', dest='ftype', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
47 choices=['CDS', 'ORF'], default='ORF', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
48 help='Find ORF or CDSs') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
49 parser.add_option('-e', '--ends', dest='ends', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
50 choices=['open', 'closed'], default='closed', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
51 help='Open or closed. Closed ensures start/stop codons are present') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
52 parser.add_option('-m', '--mode', dest='mode', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
53 choices=['all', 'top', 'one'], default='all', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
54 help='Output all ORFs/CDSs from sequence, all ORFs/CDSs ' |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
55 'with max length, or first with maximum length') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
56 parser.add_option('--min_len', dest='min_len', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
57 default=10, help='Minimum ORF/CDS length', type='int') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
58 parser.add_option('-s', '--strand', dest='strand', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
59 choices=['forward', 'reverse', 'both'], default='both', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
60 help='Strand to search for features on') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
61 parser.add_option('--on', dest='out_nuc_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
62 default=None, help='Output nucleotide sequences, or - for STDOUT', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
63 metavar='FILE') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
64 parser.add_option('--op', dest='out_prot_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
65 default=None, help='Output protein sequences, or - for STDOUT', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
66 metavar='FILE') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
67 parser.add_option('--ob', dest='out_bed_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
68 default=None, help='Output BED file, or - for STDOUT', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
69 metavar='FILE') |
8 | 70 parser.add_option('--og', dest='out_gff3_file', |
71 default=None, help='Output GFF3 file, or - for STDOUT', | |
72 metavar='FILE') | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
73 parser.add_option('-v', '--version', dest='version', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
74 default=False, action='store_true', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
75 help='Show version and quit') |
5 | 76 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
77 options, args = parser.parse_args() |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
78 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
79 if options.version: |
8 | 80 print("v0.2.0") |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
81 sys.exit(0) |
5 | 82 |
8 | 83 if not options.input_file: |
84 sys.exit("Input file is required") | |
85 | |
86 if not any((options.out_nuc_file, options.out_prot_file, options.out_bed_file, options.out_gff3_file)): | |
87 sys.exit("At least one output file is required") | |
88 | |
5 | 89 try: |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
90 table_obj = CodonTable.ambiguous_generic_by_id[options.table] |
5 | 91 except KeyError: |
8 | 92 sys.exit("Unknown codon table %i" % options.table) |
5 | 93 |
8 | 94 if options.seq_format.lower() == "sff": |
5 | 95 seq_format = "sff-trim" |
8 | 96 elif options.seq_format.lower() == "fasta": |
5 | 97 seq_format = "fasta" |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
98 elif options.seq_format.lower().startswith("fastq"): |
5 | 99 seq_format = "fastq" |
100 else: | |
8 | 101 sys.exit("Unsupported file type %r" % options.seq_format) |
5 | 102 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
103 print "Genetic code table %i" % options.table |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
104 print "Minimum length %i aa" % options.min_len |
8 | 105 # print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) |
5 | 106 |
107 starts = sorted(table_obj.start_codons) | |
108 assert "NNN" not in starts | |
109 re_starts = re.compile("|".join(starts)) | |
110 | |
111 stops = sorted(table_obj.stop_codons) | |
112 assert "NNN" not in stops | |
113 re_stops = re.compile("|".join(stops)) | |
114 | |
8 | 115 |
5 | 116 def start_chop_and_trans(s, strict=True): |
117 """Returns offset, trimmed nuc, protein.""" | |
118 if strict: | |
119 assert s[-3:] in stops, s | |
120 assert len(s) % 3 == 0 | |
121 for match in re_starts.finditer(s): | |
8 | 122 # Must check the start is in frame |
5 | 123 start = match.start() |
124 if start % 3 == 0: | |
125 n = s[start:] | |
126 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) | |
127 if strict: | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
128 t = translate(n, options.table, cds=True) |
5 | 129 else: |
8 | 130 # Use when missing stop codon, |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
131 t = "M" + translate(n[3:], options.table, to_stop=True) |
5 | 132 return start, n, t |
133 return None, None, None | |
134 | |
8 | 135 |
5 | 136 def break_up_frame(s): |
137 """Returns offset, nuc, protein.""" | |
138 start = 0 | |
139 for match in re_stops.finditer(s): | |
140 index = match.start() + 3 | |
141 if index % 3 != 0: | |
142 continue | |
143 n = s[start:index] | |
8 | 144 if options.ftype == "CDS": |
5 | 145 offset, n, t = start_chop_and_trans(n) |
146 else: | |
147 offset = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
148 t = translate(n, options.table, to_stop=True) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
149 if n and len(t) >= options.min_len: |
5 | 150 yield start + offset, n, t |
151 start = index | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
152 if options.ends == "open": |
8 | 153 # No stop codon, Biopython's strict CDS translate will fail |
5 | 154 n = s[start:] |
8 | 155 # Ensure we have whole codons |
156 # TODO - Try appending N instead? | |
157 # TODO - Do the next four lines more elegantly | |
5 | 158 if len(n) % 3: |
159 n = n[:-1] | |
160 if len(n) % 3: | |
161 n = n[:-1] | |
8 | 162 if options.ftype == "CDS": |
5 | 163 offset, n, t = start_chop_and_trans(n, strict=False) |
164 else: | |
165 offset = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
166 t = translate(n, options.table, to_stop=True) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
167 if n and len(t) >= options.min_len: |
5 | 168 yield start + offset, n, t |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
169 |
5 | 170 |
171 def get_all_peptides(nuc_seq): | |
172 """Returns start, end, strand, nucleotides, protein. | |
173 | |
174 Co-ordinates are Python style zero-based. | |
175 """ | |
8 | 176 # TODO - Refactor to use a generator function (in start order) |
177 # rather than making a list and sorting? | |
5 | 178 answer = [] |
179 full_len = len(nuc_seq) | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
180 if options.strand != "reverse": |
8 | 181 for frame in range(0, 3): |
5 | 182 for offset, n, t in break_up_frame(nuc_seq[frame:]): |
8 | 183 start = frame + offset # zero based |
5 | 184 answer.append((start, start + len(n), +1, n, t)) |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
185 if options.strand != "forward": |
5 | 186 rc = reverse_complement(nuc_seq) |
8 | 187 for frame in range(0, 3): |
5 | 188 for offset, n, t in break_up_frame(rc[frame:]): |
8 | 189 start = full_len - frame - offset # zero based |
190 answer.append((start - len(n), start, -1, n, t)) | |
5 | 191 answer.sort() |
192 return answer | |
193 | |
8 | 194 |
5 | 195 def get_top_peptides(nuc_seq): |
196 """Returns all peptides of max length.""" | |
197 values = list(get_all_peptides(nuc_seq)) | |
198 if not values: | |
199 raise StopIteration | |
200 max_len = max(len(x[-1]) for x in values) | |
201 for x in values: | |
202 if len(x[-1]) == max_len: | |
203 yield x | |
204 | |
8 | 205 |
5 | 206 def get_one_peptide(nuc_seq): |
207 """Returns first (left most) peptide with max length.""" | |
208 values = list(get_top_peptides(nuc_seq)) | |
209 if not values: | |
210 raise StopIteration | |
211 yield values[0] | |
212 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
213 if options.mode == "all": |
5 | 214 get_peptides = get_all_peptides |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
215 elif options.mode == "top": |
5 | 216 get_peptides = get_top_peptides |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
217 elif options.mode == "one": |
5 | 218 get_peptides = get_one_peptide |
219 | |
220 in_count = 0 | |
221 out_count = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
222 if options.out_nuc_file == "-": |
5 | 223 out_nuc = sys.stdout |
8 | 224 elif options.out_nuc_file: |
225 out_nuc = open(options.out_nuc_file, "w") | |
5 | 226 else: |
8 | 227 out_nuc = None |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
228 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
229 if options.out_prot_file == "-": |
5 | 230 out_prot = sys.stdout |
8 | 231 elif options.out_prot_file: |
232 out_prot = open(options.out_prot_file, "w") | |
5 | 233 else: |
8 | 234 out_prot = None |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
235 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
236 if options.out_bed_file == "-": |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
237 out_bed = sys.stdout |
8 | 238 elif options.out_bed_file: |
239 out_bed = open(options.out_bed_file, "w") | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
240 else: |
8 | 241 out_bed = None |
242 | |
243 if options.out_gff3_file == "-": | |
244 out_gff3 = sys.stdout | |
245 elif options.out_gff3_file: | |
246 out_gff3 = open(options.out_gff3_file, "w") | |
247 else: | |
248 out_gff3 = None | |
249 | |
250 if out_gff3: | |
251 out_gff3.write('##gff-version 3\n') | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
252 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
253 for record in SeqIO.parse(options.input_file, seq_format): |
5 | 254 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): |
255 out_count += 1 | |
256 if f_strand == +1: | |
8 | 257 loc = "%i..%i" % (f_start + 1, f_end) |
5 | 258 else: |
8 | 259 loc = "complement(%i..%i)" % (f_start + 1, f_end) |
5 | 260 descr = "length %i aa, %i bp, from %s of %s" \ |
261 % (len(t), len(n), loc, record.description) | |
8 | 262 fid = record.id + "|%s%i" % (options.ftype, i + 1) |
263 r = SeqRecord(Seq(n), id=fid, name="", description=descr) | |
264 t = SeqRecord(Seq(t), id=fid, name="", description=descr) | |
265 if out_nuc: | |
266 SeqIO.write(r, out_nuc, "fasta") | |
267 if out_prot: | |
268 SeqIO.write(t, out_prot, "fasta") | |
269 nice_strand = '+' if f_strand == +1 else '-' | |
270 if out_bed: | |
271 out_bed.write('\t'.join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + '\n') | |
272 if out_gff3: | |
273 out_gff3.write('\t'.join(map(str, [record.id, 'getOrfsOrCds', 'CDS', f_start + 1, f_end, '.', | |
274 nice_strand, 0, 'ID=%s%s' % (options.ftype, i + 1)])) + '\n') | |
5 | 275 in_count += 1 |
8 | 276 if out_nuc and out_nuc is not sys.stdout: |
5 | 277 out_nuc.close() |
8 | 278 if out_prot and out_prot is not sys.stdout: |
5 | 279 out_prot.close() |
8 | 280 if out_bed and out_bed is not sys.stdout: |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
281 out_bed.close() |
5 | 282 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
283 print "Found %i %ss in %i sequences" % (out_count, options.ftype, in_count) |