Mercurial > repos > peterjc > get_orfs_or_cdss
annotate tools/get_orfs_or_cdss/get_orfs_or_cdss.py @ 12:71905a6d52a7 draft default tip
"Update all the pico_galaxy tools on main Tool Shed"
author | peterjc |
---|---|
date | Fri, 16 Apr 2021 22:37:04 +0000 |
parents | d51db443aaa4 |
children |
rev | line source |
---|---|
5 | 1 #!/usr/bin/env python |
2 """Find ORFs in a nucleotide sequence file. | |
3 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
4 For more details, see the help text and argument descriptions in the |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
5 accompanying get_orfs_or_cdss.xml file which defines a Galaxy interface. |
5 | 6 |
7 This tool is a short Python script which requires Biopython. If you use | |
8 this tool in scientific work leading to a publication, please cite the | |
9 Biopython application note: | |
10 | |
11 Cock et al 2009. Biopython: freely available Python tools for computational | |
12 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
12
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
13 https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878. |
5 | 14 |
15 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute | |
16 (formerly SCRI), Dundee, UK. All rights reserved. | |
17 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
18 See accompanying text file for licence details (MIT licence). |
5 | 19 """ |
9
a06ad07431ba
v0.2.1 Adds table 24; Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
8
diff
changeset
|
20 |
11 | 21 from __future__ import print_function |
22 | |
9
a06ad07431ba
v0.2.1 Adds table 24; Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
8
diff
changeset
|
23 import re |
5 | 24 import sys |
9
a06ad07431ba
v0.2.1 Adds table 24; Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
8
diff
changeset
|
25 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
26 from optparse import OptionParser |
5 | 27 |
11 | 28 usage = r"""Use as follows: |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
29 |
11 | 30 $ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 \ |
31 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa \ | |
32 --ob cds.bed --og cds.gff3 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
33 """ |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
34 |
5 | 35 try: |
36 from Bio.Seq import Seq, reverse_complement, translate | |
37 from Bio.SeqRecord import SeqRecord | |
38 from Bio import SeqIO | |
39 from Bio.Data import CodonTable | |
40 except ImportError: | |
8 | 41 sys.exit("Missing Biopython library") |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
42 |
5 | 43 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
44 parser = OptionParser(usage=usage) |
12
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
45 parser.add_option( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
46 "-i", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
47 "--input", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
48 dest="input_file", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
49 default=None, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
50 help="Input fasta file", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
51 metavar="FILE", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
52 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
53 parser.add_option( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
54 "-f", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
55 "--format", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
56 dest="seq_format", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
57 default="fasta", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
58 help="Sequence format (e.g. fasta, fastq, sff)", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
59 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
60 parser.add_option( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
61 "--table", dest="table", default=1, help="NCBI Translation table", type="int" |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
62 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
63 parser.add_option( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
64 "-t", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
65 "--ftype", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
66 dest="ftype", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
67 type="choice", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
68 choices=["CDS", "ORF"], |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
69 default="ORF", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
70 help="Find ORF or CDSs", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
71 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
72 parser.add_option( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
73 "-e", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
74 "--ends", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
75 dest="ends", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
76 type="choice", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
77 choices=["open", "closed"], |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
78 default="closed", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
79 help="Open or closed. Closed ensures start/stop codons are present", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
80 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
81 parser.add_option( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
82 "-m", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
83 "--mode", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
84 dest="mode", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
85 type="choice", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
86 choices=["all", "top", "one"], |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
87 default="all", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
88 help="Output all ORFs/CDSs from sequence, all ORFs/CDSs " |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
89 "with max length, or first with maximum length", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
90 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
91 parser.add_option( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
92 "--min_len", dest="min_len", default=10, help="Minimum ORF/CDS length", type="int" |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
93 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
94 parser.add_option( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
95 "-s", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
96 "--strand", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
97 dest="strand", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
98 type="choice", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
99 choices=["forward", "reverse", "both"], |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
100 default="both", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
101 help="Strand to search for features on", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
102 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
103 parser.add_option( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
104 "--on", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
105 dest="out_nuc_file", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
106 default=None, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
107 help="Output nucleotide sequences, or - for STDOUT", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
108 metavar="FILE", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
109 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
110 parser.add_option( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
111 "--op", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
112 dest="out_prot_file", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
113 default=None, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
114 help="Output protein sequences, or - for STDOUT", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
115 metavar="FILE", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
116 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
117 parser.add_option( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
118 "--ob", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
119 dest="out_bed_file", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
120 default=None, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
121 help="Output BED file, or - for STDOUT", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
122 metavar="FILE", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
123 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
124 parser.add_option( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
125 "--og", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
126 dest="out_gff3_file", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
127 default=None, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
128 help="Output GFF3 file, or - for STDOUT", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
129 metavar="FILE", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
130 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
131 parser.add_option( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
132 "-v", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
133 "--version", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
134 dest="version", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
135 default=False, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
136 action="store_true", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
137 help="Show version and quit", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
138 ) |
5 | 139 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
140 options, args = parser.parse_args() |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
141 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
142 if options.version: |
11 | 143 print("v0.2.3") |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
144 sys.exit(0) |
5 | 145 |
8 | 146 if not options.input_file: |
147 sys.exit("Input file is required") | |
148 | |
12
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
149 if not any( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
150 ( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
151 options.out_nuc_file, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
152 options.out_prot_file, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
153 options.out_bed_file, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
154 options.out_gff3_file, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
155 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
156 ): |
8 | 157 sys.exit("At least one output file is required") |
158 | |
5 | 159 try: |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
160 table_obj = CodonTable.ambiguous_generic_by_id[options.table] |
5 | 161 except KeyError: |
8 | 162 sys.exit("Unknown codon table %i" % options.table) |
5 | 163 |
8 | 164 if options.seq_format.lower() == "sff": |
5 | 165 seq_format = "sff-trim" |
8 | 166 elif options.seq_format.lower() == "fasta": |
5 | 167 seq_format = "fasta" |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
168 elif options.seq_format.lower().startswith("fastq"): |
5 | 169 seq_format = "fastq" |
170 else: | |
8 | 171 sys.exit("Unsupported file type %r" % options.seq_format) |
5 | 172 |
11 | 173 print("Genetic code table %i" % options.table) |
174 print("Minimum length %i aa" % options.min_len) | |
8 | 175 # print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) |
5 | 176 |
177 starts = sorted(table_obj.start_codons) | |
178 assert "NNN" not in starts | |
179 re_starts = re.compile("|".join(starts)) | |
180 | |
181 stops = sorted(table_obj.stop_codons) | |
182 assert "NNN" not in stops | |
183 re_stops = re.compile("|".join(stops)) | |
184 | |
8 | 185 |
5 | 186 def start_chop_and_trans(s, strict=True): |
12
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
187 """Return offset, trimmed nuc, protein.""" |
5 | 188 if strict: |
189 assert s[-3:] in stops, s | |
190 assert len(s) % 3 == 0 | |
191 for match in re_starts.finditer(s): | |
8 | 192 # Must check the start is in frame |
5 | 193 start = match.start() |
194 if start % 3 == 0: | |
195 n = s[start:] | |
196 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) | |
197 if strict: | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
198 t = translate(n, options.table, cds=True) |
5 | 199 else: |
8 | 200 # Use when missing stop codon, |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
201 t = "M" + translate(n[3:], options.table, to_stop=True) |
5 | 202 return start, n, t |
203 return None, None, None | |
204 | |
8 | 205 |
5 | 206 def break_up_frame(s): |
12
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
207 """Return offset, nuc, protein.""" |
5 | 208 start = 0 |
209 for match in re_stops.finditer(s): | |
210 index = match.start() + 3 | |
211 if index % 3 != 0: | |
212 continue | |
213 n = s[start:index] | |
8 | 214 if options.ftype == "CDS": |
5 | 215 offset, n, t = start_chop_and_trans(n) |
216 else: | |
217 offset = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
218 t = translate(n, options.table, to_stop=True) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
219 if n and len(t) >= options.min_len: |
5 | 220 yield start + offset, n, t |
221 start = index | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
222 if options.ends == "open": |
8 | 223 # No stop codon, Biopython's strict CDS translate will fail |
5 | 224 n = s[start:] |
8 | 225 # Ensure we have whole codons |
226 # TODO - Try appending N instead? | |
227 # TODO - Do the next four lines more elegantly | |
5 | 228 if len(n) % 3: |
229 n = n[:-1] | |
230 if len(n) % 3: | |
231 n = n[:-1] | |
8 | 232 if options.ftype == "CDS": |
5 | 233 offset, n, t = start_chop_and_trans(n, strict=False) |
234 else: | |
235 offset = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
236 t = translate(n, options.table, to_stop=True) |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
237 if n and len(t) >= options.min_len: |
5 | 238 yield start + offset, n, t |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
239 |
5 | 240 |
241 def get_all_peptides(nuc_seq): | |
12
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
242 """Return start, end, strand, nucleotides, protein. |
5 | 243 |
244 Co-ordinates are Python style zero-based. | |
245 """ | |
8 | 246 # TODO - Refactor to use a generator function (in start order) |
247 # rather than making a list and sorting? | |
5 | 248 answer = [] |
249 full_len = len(nuc_seq) | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
250 if options.strand != "reverse": |
8 | 251 for frame in range(0, 3): |
5 | 252 for offset, n, t in break_up_frame(nuc_seq[frame:]): |
8 | 253 start = frame + offset # zero based |
5 | 254 answer.append((start, start + len(n), +1, n, t)) |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
255 if options.strand != "forward": |
5 | 256 rc = reverse_complement(nuc_seq) |
8 | 257 for frame in range(0, 3): |
5 | 258 for offset, n, t in break_up_frame(rc[frame:]): |
8 | 259 start = full_len - frame - offset # zero based |
260 answer.append((start - len(n), start, -1, n, t)) | |
5 | 261 answer.sort() |
262 return answer | |
263 | |
8 | 264 |
5 | 265 def get_top_peptides(nuc_seq): |
12
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
266 """Return all peptides of max length.""" |
5 | 267 values = list(get_all_peptides(nuc_seq)) |
268 if not values: | |
269 raise StopIteration | |
270 max_len = max(len(x[-1]) for x in values) | |
271 for x in values: | |
272 if len(x[-1]) == max_len: | |
273 yield x | |
274 | |
8 | 275 |
5 | 276 def get_one_peptide(nuc_seq): |
12
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
277 """Return first (left most) peptide with max length.""" |
5 | 278 values = list(get_top_peptides(nuc_seq)) |
279 if not values: | |
280 raise StopIteration | |
281 yield values[0] | |
282 | |
9
a06ad07431ba
v0.2.1 Adds table 24; Depends on Biopython 1.67 via Tool Shed package or bioconda.
peterjc
parents:
8
diff
changeset
|
283 |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
284 if options.mode == "all": |
5 | 285 get_peptides = get_all_peptides |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
286 elif options.mode == "top": |
5 | 287 get_peptides = get_top_peptides |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
288 elif options.mode == "one": |
5 | 289 get_peptides = get_one_peptide |
290 | |
291 in_count = 0 | |
292 out_count = 0 | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
293 if options.out_nuc_file == "-": |
5 | 294 out_nuc = sys.stdout |
8 | 295 elif options.out_nuc_file: |
296 out_nuc = open(options.out_nuc_file, "w") | |
5 | 297 else: |
8 | 298 out_nuc = None |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
299 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
300 if options.out_prot_file == "-": |
5 | 301 out_prot = sys.stdout |
8 | 302 elif options.out_prot_file: |
303 out_prot = open(options.out_prot_file, "w") | |
5 | 304 else: |
8 | 305 out_prot = None |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
306 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
307 if options.out_bed_file == "-": |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
308 out_bed = sys.stdout |
8 | 309 elif options.out_bed_file: |
310 out_bed = open(options.out_bed_file, "w") | |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
311 else: |
8 | 312 out_bed = None |
313 | |
314 if options.out_gff3_file == "-": | |
315 out_gff3 = sys.stdout | |
316 elif options.out_gff3_file: | |
317 out_gff3 = open(options.out_gff3_file, "w") | |
318 else: | |
319 out_gff3 = None | |
320 | |
321 if out_gff3: | |
12
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
322 out_gff3.write("##gff-version 3\n") |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
323 |
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
324 for record in SeqIO.parse(options.input_file, seq_format): |
12
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
325 for i, (f_start, f_end, f_strand, n, t) in enumerate( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
326 get_peptides(str(record.seq).upper()) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
327 ): |
5 | 328 out_count += 1 |
329 if f_strand == +1: | |
8 | 330 loc = "%i..%i" % (f_start + 1, f_end) |
5 | 331 else: |
8 | 332 loc = "complement(%i..%i)" % (f_start + 1, f_end) |
12
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
333 descr = "length %i aa, %i bp, from %s of %s" % ( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
334 len(t), |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
335 len(n), |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
336 loc, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
337 record.description, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
338 ) |
8 | 339 fid = record.id + "|%s%i" % (options.ftype, i + 1) |
340 r = SeqRecord(Seq(n), id=fid, name="", description=descr) | |
341 t = SeqRecord(Seq(t), id=fid, name="", description=descr) | |
342 if out_nuc: | |
343 SeqIO.write(r, out_nuc, "fasta") | |
344 if out_prot: | |
345 SeqIO.write(t, out_prot, "fasta") | |
12
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
346 nice_strand = "+" if f_strand == +1 else "-" |
8 | 347 if out_bed: |
12
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
348 out_bed.write( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
349 "\t".join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
350 + "\n" |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
351 ) |
8 | 352 if out_gff3: |
12
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
353 out_gff3.write( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
354 "\t".join( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
355 map( |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
356 str, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
357 [ |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
358 record.id, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
359 "getOrfsOrCds", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
360 "CDS", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
361 f_start + 1, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
362 f_end, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
363 ".", |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
364 nice_strand, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
365 0, |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
366 "ID=%s%s" % (options.ftype, i + 1), |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
367 ], |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
368 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
369 ) |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
370 + "\n" |
71905a6d52a7
"Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents:
11
diff
changeset
|
371 ) |
5 | 372 in_count += 1 |
8 | 373 if out_nuc and out_nuc is not sys.stdout: |
5 | 374 out_nuc.close() |
8 | 375 if out_prot and out_prot is not sys.stdout: |
5 | 376 out_prot.close() |
8 | 377 if out_bed and out_bed is not sys.stdout: |
7
705a2e2df7fb
v0.1.1 fix typo; v0.1.0 BED output (Eric Rasche), NCBI genetic code 24; v0.0.7 embeds citation
peterjc
parents:
5
diff
changeset
|
378 out_bed.close() |
5 | 379 |
11 | 380 print("Found %i %ss in %i sequences" % (out_count, options.ftype, in_count)) |