comparison generate-putative-osp.py @ 1:05b97a4dce94 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:51:44 +0000
parents
children 859e18a9814a
comparison
equal deleted inserted replaced
0:03670eba3480 1:05b97a4dce94
1 #!/usr/bin/env python
2 import argparse
3 from cpt import OrfFinder
4 from Bio import SeqIO
5 from Bio import Seq
6 from CPT_GFFParser import gffParse, gffWrite
7 from spaninFuncs import *
8 import re
9 import os
10 import sys
11
12 ### Requirement Inputs
13 #### INPUT : Genomic FASTA
14 #### OUTPUT : Putative OSP candidates in FASTA format.
15 ######### Optional OUTPUT: "Complete" potential ORFs, in BED/FASTAs/GFF3 formats
16 ### Notes:
17 ####### As of 2.13.2020 - RegEx pattern: [ACGSILMFTV][^REKD][GASNL]C for LipoRy
18 if __name__ == "__main__":
19
20 # Common parameters for both ISP / OSP portion of scripts
21
22 parser = argparse.ArgumentParser(
23 description="Get putative protein candidates for spanins"
24 )
25
26 parser.add_argument(
27 "fasta_file", type=argparse.FileType("r"), help="Fasta file"
28 ) # the "input" argument
29
30 parser.add_argument(
31 "-f",
32 "--format",
33 dest="seq_format",
34 default="fasta",
35 help="Sequence format (e.g. fasta, fastq, sff)",
36 ) # optional formats for input, currently just going to do ntFASTA
37
38 parser.add_argument(
39 "--strand",
40 dest="strand",
41 choices=("both", "forward", "reverse"),
42 default="both",
43 help="select strand",
44 ) # Selection of +, -, or both strands
45
46 parser.add_argument(
47 "--table", dest="table", default=11, help="NCBI Translation table", type=int
48 ) # Uses "default" NCBI codon table. This should always (afaik) be what we want...
49
50 parser.add_argument(
51 "-t",
52 "--ftype",
53 dest="ftype",
54 choices=("CDS", "ORF"),
55 default="ORF",
56 help="Find ORF or CDSs",
57 ) # "functional type(?)" --> Finds ORF or CDS, for this we want just the ORF
58
59 parser.add_argument(
60 "-e",
61 "--ends",
62 dest="ends",
63 choices=("open", "closed"),
64 default="closed",
65 help="Open or closed. Closed ensures start/stop codons are present",
66 ) # includes the start and stop codon
67
68 parser.add_argument(
69 "-m",
70 "--mode",
71 dest="mode",
72 choices=("all", "top", "one"),
73 default="all", # I think we want this to JUST be all...nearly always
74 help="Output all ORFs/CDSs from sequence, all ORFs/CDSs with max length, or first with maximum length",
75 )
76
77 parser.add_argument(
78 "--switch",
79 dest="switch",
80 default="all",
81 help="switch between ALL putative osps, or a range. If not all, insert a range of two integers separated by a colon (:). Eg: 1234:4321",
82 )
83
84 # osp parameters
85 parser.add_argument(
86 "--osp_min_len",
87 dest="osp_min_len",
88 default=30,
89 help="Minimum ORF length, measured in codons",
90 type=int,
91 )
92 parser.add_argument(
93 "--max_osp",
94 dest="max_osp",
95 default=200,
96 help="Maximum ORF length, measured in codons",
97 type=int,
98 )
99 parser.add_argument(
100 "--osp_on",
101 dest="out_osp_nuc",
102 type=argparse.FileType("w"),
103 default="_out_osp.fna",
104 help="Output nucleotide sequences, FASTA",
105 )
106 parser.add_argument(
107 "--osp_op",
108 dest="out_osp_prot",
109 type=argparse.FileType("w"),
110 default="_out_osp.fa",
111 help="Output protein sequences, FASTA",
112 )
113 parser.add_argument(
114 "--osp_ob",
115 dest="out_osp_bed",
116 type=argparse.FileType("w"),
117 default="_out_osp.bed",
118 help="Output BED file",
119 )
120 parser.add_argument(
121 "--osp_og",
122 dest="out_osp_gff3",
123 type=argparse.FileType("w"),
124 default="_out_osp.gff3",
125 help="Output GFF3 file",
126 )
127 parser.add_argument(
128 "--osp_min_dist",
129 dest="osp_min_dist",
130 default=10,
131 help="Minimal distance to first AA of lipobox, measured in AA",
132 type=int,
133 )
134 parser.add_argument(
135 "--osp_max_dist",
136 dest="osp_max_dist",
137 default=50,
138 help="Maximum distance to first AA of lipobox, measured in AA",
139 type=int,
140 )
141 parser.add_argument(
142 "--min_lipo_after",
143 dest="min_lipo_after",
144 default=25,
145 help="minimal amount of residues after lipobox",
146 type=int,
147 )
148 parser.add_argument(
149 "--max_lipo_after",
150 dest="max_lipo_after",
151 default=170,
152 help="minimal amount of residues after lipobox",
153 type=int,
154 )
155 parser.add_argument(
156 "--putative_osp",
157 dest="putative_osp_fa",
158 type=argparse.FileType("w"),
159 default="_putative_osp.fa",
160 help="Output of putative FASTA file",
161 )
162 parser.add_argument(
163 "--summary_osp_txt",
164 dest="summary_osp_txt",
165 type=argparse.FileType("w"),
166 default="_summary_osp.txt",
167 help="Summary statistics on putative o-spanins",
168 )
169 parser.add_argument(
170 "--putative_osp_gff",
171 dest="putative_osp_gff",
172 type=argparse.FileType("w"),
173 default="_putative_osp.gff3",
174 help="gff3 output for putative o-spanins",
175 )
176 parser.add_argument("--osp_mode", action="store_true", default=True)
177
178 # parser.add_argument('-v', action='version', version='0.3.0') # Is this manually updated?
179 args = parser.parse_args()
180
181 the_args = vars(parser.parse_args())
182
183 ### osp output, naive ORF finding:
184 osps = OrfFinder(args.table, args.ftype, args.ends, args.osp_min_len, args.strand)
185 osps.locate(
186 args.fasta_file,
187 args.out_osp_nuc,
188 args.out_osp_prot,
189 args.out_osp_bed,
190 args.out_osp_gff3,
191 )
192
193 """
194 ### For Control: Use T7 and lambda;
195 # Note the distance from start codon to lipobox region for t7
196 o-spanin
197 18,7-------------------------------------------------LIPO----------------------------------
198 >T7_EOS MSTLRELRLRRALKEQSVRYLLSIKKTLPRWKGALIGLFLICVATISGCASESKLPESPMVSVDSSLMVEPNLTTEMLNVFSQ
199 -----------------------------LIPO----------------------------------------
200 > lambda_EOS MLKLKMMLCVMMLPLVVVGCTSKQSVSQCVKPPPPPAWIMQPPPDWQTPLNGIISPSERG
201 """
202 args.fasta_file.close()
203 args.fasta_file = open(args.fasta_file.name, "r")
204 args.out_osp_prot.close()
205 args.out_osp_prot = open(args.out_osp_prot.name, "r")
206
207 pairs = tuple_fasta(fasta_file=args.out_osp_prot)
208 have_lipo = [] # empty candidates list to be passed through the user input
209
210 for each_pair in pairs:
211 if len(each_pair[1]) <= args.max_osp:
212 try:
213 have_lipo += find_lipobox(
214 pair=each_pair,
215 minimum=args.osp_min_dist,
216 maximum=args.osp_max_dist,
217 min_after=args.min_lipo_after,
218 max_after=args.max_lipo_after,
219 osp_mode=args.osp_mode,
220 )
221 except (IndexError, TypeError):
222 continue
223
224 if args.switch == "all":
225 pass
226 else:
227 # for each_pair in have_lipo:
228 range_of = args.switch
229 range_of = re.search(("[\d]+:[\d]+"), range_of).group(0)
230 start = int(range_of.split(":")[0])
231 end = int(range_of.split(":")[1])
232 have_lipo = parse_a_range(pair=have_lipo, start=start, end=end)
233 # print(have_lipo)
234 # matches
235
236 # have_lipo = []
237 # have_lipo = matches
238 total_osp = len(have_lipo)
239 # print(have_lipo)
240 # print(total_osp)
241
242 # print(type(have_lipo))
243
244 # for i in have_lipo:
245 # print(i)
246
247 # export results in fasta format
248 ORF = []
249 length = [] # grabbing length of the sequences
250 candidate_dict = {k: v for k, v in have_lipo}
251 with args.putative_osp_fa as f:
252 for desc, s in candidate_dict.items(): # description / sequence
253 f.write(">" + str(desc))
254 f.write("\n" + lineWrapper(str(s).replace("*", "")) + "\n")
255 length.append(len(s))
256 ORF.append(desc)
257 if not length:
258 raise Exception("Parameters yielded no candidates.")
259 bot_size = min(length)
260 top_size = max(length)
261 avg = (sum(length)) / total_osp
262 n = len(length)
263 if n == 0:
264 raise Exception("no median for empty data")
265 if n % 2 == 1:
266 med = length[n // 2]
267 else:
268 i = n // 2
269 med = (length[i - 1] + length[i]) / 2
270
271 args.out_osp_prot.close()
272 all_orfs = open(args.out_osp_prot.name, "r")
273 all_osps = open(args.putative_osp_fa.name, "r")
274 # record = SeqIO.read(all_orfs, "fasta")
275 # print(len(record))
276 #### Extra stats
277 n = 0
278 for line in all_orfs:
279 if line.startswith(">"):
280 n += 1
281 all_orfs_counts = n
282
283 c = 0
284 for line in all_osps:
285 if line.startswith(">"):
286 c += 1
287 all_osps_counts = c
288
289 with args.summary_osp_txt as f:
290 f.write("total potential o-spanins: " + str(total_osp) + "\n")
291 f.write("average length (AA): " + str(avg) + "\n")
292 f.write("median length (AA): " + str(med) + "\n")
293 f.write("maximum orf in size (AA): " + str(top_size) + "\n")
294 f.write("minimum orf in size (AA): " + str(bot_size) + "\n")
295 # f.write(f"ratio of osps found from naive orfs: {c}/{n}")
296 f.write("ratio of osps found from naive orfs: " + str(c) + "/" + str(n))
297 # Output the putative list in gff3 format:
298 args.putative_osp_fa = open(args.putative_osp_fa.name, "r")
299 gff_data = prep_a_gff3(
300 fa=args.putative_osp_fa, spanin_type="osp", org=args.fasta_file
301 )
302 write_gff3(data=gff_data, output=args.putative_osp_gff)