Mercurial > repos > peterjc > get_orfs_or_cdss
annotate tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 7:705a2e2df7fb draft
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
author | peterjc |
---|---|
date | Thu, 30 Jul 2015 12:35:31 -0400 |
parents | 5208c15805ec |
children | 09a8be9247ca |
rev | line source |
---|---|
5 | 1 #!/usr/bin/env python |
2 """Find ORFs in a nucleotide sequence file. | |
3 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
4 For more details, see the help text and argument descriptions in the |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
5 accompanying get_orfs_or_cdss.xml file which defines a Galaxy interface. |
5 | 6 |
7 This tool is a short Python script which requires Biopython. If you use | |
8 this tool in scientific work leading to a publication, please cite the | |
9 Biopython application note: | |
10 | |
11 Cock et al 2009. Biopython: freely available Python tools for computational | |
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
13 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
14 | |
15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute | |
16 (formerly SCRI), Dundee, UK. All rights reserved. | |
17 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
18 See accompanying text file for licence details (MIT licence). |
5 | 19 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
20 This is version 0.1.0 of the script. |
5 | 21 """ |
22 import sys | |
23 import re | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
24 from optparse import OptionParser |
5 | 25 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
26 def sys_exit(msg, err=1): |
5 | 27 sys.stderr.write(msg.rstrip() + "\n") |
28 sys.exit(err) | |
29 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
30 usage = """Use as follows: |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
31 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
32 $ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
33 """ |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
34 |
5 | 35 try: |
36 from Bio.Seq import Seq, reverse_complement, translate | |
37 from Bio.SeqRecord import SeqRecord | |
38 from Bio import SeqIO | |
39 from Bio.Data import CodonTable | |
40 except ImportError: | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
41 sys_exit("Missing Biopython library") |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
42 |
5 | 43 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
44 parser = OptionParser(usage=usage) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
45 parser.add_option('-i', '--input', dest='input_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
46 default=None, help='Input fasta file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
47 metavar='FILE') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
48 parser.add_option('-f', '--format', dest='seq_format', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
49 default='fasta', help='Sequence format (e.g. fasta, fastq, sff)') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
50 parser.add_option('--table', dest='table', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
51 default=1, help='NCBI Translation table', type='int') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
52 parser.add_option('-t', '--ftype', dest='ftype', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
53 choices=['CDS', 'ORF'], default='ORF', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
54 help='Find ORF or CDSs') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
55 parser.add_option('-e', '--ends', dest='ends', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
56 choices=['open', 'closed'], default='closed', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
57 help='Open or closed. Closed ensures start/stop codons are present') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
58 parser.add_option('-m', '--mode', dest='mode', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
59 choices=['all', 'top', 'one'], default='all', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
60 help='Output all ORFs/CDSs from sequence, all ORFs/CDSs ' |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
61 'with max length, or first with maximum length') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
62 parser.add_option('--min_len', dest='min_len', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
63 default=10, help='Minimum ORF/CDS length', type='int') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
64 parser.add_option('-s', '--strand', dest='strand', type='choice', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
65 choices=['forward', 'reverse', 'both'], default='both', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
66 help='Strand to search for features on') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
67 parser.add_option('--on', dest='out_nuc_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
68 default=None, help='Output nucleotide sequences, or - for STDOUT', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
69 metavar='FILE') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
70 parser.add_option('--op', dest='out_prot_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
71 default=None, help='Output protein sequences, or - for STDOUT', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
72 metavar='FILE') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
73 parser.add_option('--ob', dest='out_bed_file', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
74 default=None, help='Output BED file, or - for STDOUT', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
75 metavar='FILE') |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
76 parser.add_option('-v', '--version', dest='version', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
77 default=False, action='store_true', |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
78 help='Show version and quit') |
5 | 79 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
80 options, args = parser.parse_args() |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
81 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
82 if options.version: |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
83 print "v0.1.0" |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
84 sys.exit(0) |
5 | 85 |
86 try: | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
87 table_obj = CodonTable.ambiguous_generic_by_id[options.table] |
5 | 88 except KeyError: |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
89 sys_exit("Unknown codon table %i" % options.table) |
5 | 90 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
91 if options.seq_format.lower()=="sff": |
5 | 92 seq_format = "sff-trim" |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
93 elif options.seq_format.lower()=="fasta": |
5 | 94 seq_format = "fasta" |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
95 elif options.seq_format.lower().startswith("fastq"): |
5 | 96 seq_format = "fastq" |
97 else: | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
98 sys_exit("Unsupported file type %r" % options.seq_format) |
5 | 99 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
100 print "Genetic code table %i" % options.table |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
101 print "Minimum length %i aa" % options.min_len |
5 | 102 #print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) |
103 | |
104 starts = sorted(table_obj.start_codons) | |
105 assert "NNN" not in starts | |
106 re_starts = re.compile("|".join(starts)) | |
107 | |
108 stops = sorted(table_obj.stop_codons) | |
109 assert "NNN" not in stops | |
110 re_stops = re.compile("|".join(stops)) | |
111 | |
112 def start_chop_and_trans(s, strict=True): | |
113 """Returns offset, trimmed nuc, protein.""" | |
114 if strict: | |
115 assert s[-3:] in stops, s | |
116 assert len(s) % 3 == 0 | |
117 for match in re_starts.finditer(s): | |
118 #Must check the start is in frame | |
119 start = match.start() | |
120 if start % 3 == 0: | |
121 n = s[start:] | |
122 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) | |
123 if strict: | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
124 t = translate(n, options.table, cds=True) |
5 | 125 else: |
126 #Use when missing stop codon, | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
127 t = "M" + translate(n[3:], options.table, to_stop=True) |
5 | 128 return start, n, t |
129 return None, None, None | |
130 | |
131 def break_up_frame(s): | |
132 """Returns offset, nuc, protein.""" | |
133 start = 0 | |
134 for match in re_stops.finditer(s): | |
135 index = match.start() + 3 | |
136 if index % 3 != 0: | |
137 continue | |
138 n = s[start:index] | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
139 if options.ftype=="CDS": |
5 | 140 offset, n, t = start_chop_and_trans(n) |
141 else: | |
142 offset = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
143 t = translate(n, options.table, to_stop=True) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
144 if n and len(t) >= options.min_len: |
5 | 145 yield start + offset, n, t |
146 start = index | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
147 if options.ends == "open": |
5 | 148 #No stop codon, Biopython's strict CDS translate will fail |
149 n = s[start:] | |
150 #Ensure we have whole codons | |
151 #TODO - Try appending N instead? | |
152 #TODO - Do the next four lines more elegantly | |
153 if len(n) % 3: | |
154 n = n[:-1] | |
155 if len(n) % 3: | |
156 n = n[:-1] | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
157 if options.ftype=="CDS": |
5 | 158 offset, n, t = start_chop_and_trans(n, strict=False) |
159 else: | |
160 offset = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
161 t = translate(n, options.table, to_stop=True) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
162 if n and len(t) >= options.min_len: |
5 | 163 yield start + offset, n, t |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
164 |
5 | 165 |
166 def get_all_peptides(nuc_seq): | |
167 """Returns start, end, strand, nucleotides, protein. | |
168 | |
169 Co-ordinates are Python style zero-based. | |
170 """ | |
171 #TODO - Refactor to use a generator function (in start order) | |
172 #rather than making a list and sorting? | |
173 answer = [] | |
174 full_len = len(nuc_seq) | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
175 if options.strand != "reverse": |
5 | 176 for frame in range(0,3): |
177 for offset, n, t in break_up_frame(nuc_seq[frame:]): | |
178 start = frame + offset #zero based | |
179 answer.append((start, start + len(n), +1, n, t)) | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
180 if options.strand != "forward": |
5 | 181 rc = reverse_complement(nuc_seq) |
182 for frame in range(0,3) : | |
183 for offset, n, t in break_up_frame(rc[frame:]): | |
184 start = full_len - frame - offset #zero based | |
185 answer.append((start - len(n), start, -1, n ,t)) | |
186 answer.sort() | |
187 return answer | |
188 | |
189 def get_top_peptides(nuc_seq): | |
190 """Returns all peptides of max length.""" | |
191 values = list(get_all_peptides(nuc_seq)) | |
192 if not values: | |
193 raise StopIteration | |
194 max_len = max(len(x[-1]) for x in values) | |
195 for x in values: | |
196 if len(x[-1]) == max_len: | |
197 yield x | |
198 | |
199 def get_one_peptide(nuc_seq): | |
200 """Returns first (left most) peptide with max length.""" | |
201 values = list(get_top_peptides(nuc_seq)) | |
202 if not values: | |
203 raise StopIteration | |
204 yield values[0] | |
205 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
206 if options.mode == "all": |
5 | 207 get_peptides = get_all_peptides |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
208 elif options.mode == "top": |
5 | 209 get_peptides = get_top_peptides |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
210 elif options.mode == "one": |
5 | 211 get_peptides = get_one_peptide |
212 | |
213 in_count = 0 | |
214 out_count = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
215 if options.out_nuc_file == "-": |
5 | 216 out_nuc = sys.stdout |
217 else: | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
218 out_nuc = open(options.out_nuc_file, "w") |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
219 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
220 if options.out_prot_file == "-": |
5 | 221 out_prot = sys.stdout |
222 else: | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
223 out_prot = open(options.out_prot_file, "w") |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
224 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
225 if options.out_bed_file == "-": |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
226 out_bed = sys.stdout |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
227 else: |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
228 out_bed = open(options.out_bed_file, "w") |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
229 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
230 for record in SeqIO.parse(options.input_file, seq_format): |
5 | 231 for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): |
232 out_count += 1 | |
233 if f_strand == +1: | |
234 loc = "%i..%i" % (f_start+1, f_end) | |
235 else: | |
236 loc = "complement(%i..%i)" % (f_start+1, f_end) | |
237 descr = "length %i aa, %i bp, from %s of %s" \ | |
238 % (len(t), len(n), loc, record.description) | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
239 fid = record.id + "|%s%i" % (options.ftype, i+1) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
240 r = SeqRecord(Seq(n), id = fid, name = "", description= descr) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
241 t = SeqRecord(Seq(t), id = fid, name = "", description= descr) |
5 | 242 SeqIO.write(r, out_nuc, "fasta") |
243 SeqIO.write(t, out_prot, "fasta") | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
244 out_bed.write('\t'.join(map(str,[record.id, f_start, f_end, fid, 0, '+' if f_strand == +1 else '-'])) + '\n') |
5 | 245 in_count += 1 |
246 if out_nuc is not sys.stdout: | |
247 out_nuc.close() | |
248 if out_prot is not sys.stdout: | |
249 out_prot.close() | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
250 if out_bed is not sys.stdout: |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
251 out_bed.close() |
5 | 252 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
253 print "Found %i %ss in %i sequences" % (out_count, options.ftype, in_count) |