comparison generate-putative-usp.py @ 1:f7afd1480d0f draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:51:59 +0000
parents
children 76a3830e9cb1
comparison
equal deleted inserted replaced
0:13c3e2d60fa6 1:f7afd1480d0f
1 import argparse
2 from cpt import OrfFinder
3 from Bio import SeqIO
4 from Bio import Seq
5 from CPT_GFFParser import gffParse, gffWrite
6 from spaninFuncs import *
7 import re
8 import os
9 import sys
10
11 """
12 ## Note
13 NOTE : This was made after I made the i-spanin and o-spanin tools, so there might be some methods that are used differently
14 and overall some 'differently' written code...
15 """
16
17 if __name__ == "__main__":
18 parser = argparse.ArgumentParser(
19 description="Get putative protein candidates for u-spanins"
20 )
21
22 parser.add_argument(
23 "fasta_file", type=argparse.FileType("r"), help="Fasta file"
24 ) # the "input" argument
25
26 parser.add_argument(
27 "--strand",
28 dest="strand",
29 choices=("both", "forward", "reverse"),
30 default="both",
31 help="select strand",
32 ) # Selection of +, -, or both strands
33
34 parser.add_argument(
35 "--table", dest="table", default=11, help="NCBI Translation table", type=int
36 ) # Uses "default" NCBI codon table. This should always (afaik) be what we want...
37
38 parser.add_argument(
39 "-t",
40 "--ftype",
41 dest="ftype",
42 choices=("CDS", "ORF"),
43 default="ORF",
44 help="Find ORF or CDSs",
45 ) # "functional type(?)" --> Finds ORF or CDS, for this we want just the ORF
46
47 parser.add_argument(
48 "-e",
49 "--ends",
50 dest="ends",
51 choices=("open", "closed"),
52 default="closed",
53 help="Open or closed. Closed ensures start/stop codons are present",
54 ) # includes the start and stop codon
55
56 parser.add_argument(
57 "-m",
58 "--mode",
59 dest="mode",
60 choices=("all", "top", "one"),
61 default="all", # I think we want this to JUST be all...nearly always
62 help="Output all ORFs/CDSs from sequence, all ORFs/CDSs with max length, or first with maximum length",
63 )
64
65 parser.add_argument(
66 "--switch",
67 dest="switch",
68 default="all",
69 help="switch between ALL putative usps, or a range. If not all, insert a range of two integers separated by a colon (:). Eg: 1234:4321",
70 )
71
72 parser.add_argument(
73 "--usp_on",
74 dest="out_usp_nuc",
75 type=argparse.FileType("w"),
76 default="_out_usp.fna",
77 help="Output nucleotide sequences, FASTA",
78 )
79 parser.add_argument(
80 "--usp_op",
81 dest="out_usp_prot",
82 type=argparse.FileType("w"),
83 default="_out_usp.fa",
84 help="Output protein sequences, FASTA",
85 )
86 parser.add_argument(
87 "--usp_ob",
88 dest="out_usp_bed",
89 type=argparse.FileType("w"),
90 default="_out_usp.bed",
91 help="Output BED file",
92 )
93 parser.add_argument(
94 "--usp_og",
95 dest="out_usp_gff3",
96 type=argparse.FileType("w"),
97 default="_out_usp.gff3",
98 help="Output GFF3 file",
99 )
100 parser.add_argument(
101 "--putative_usp",
102 dest="putative_usp_fa",
103 type=argparse.FileType("w"),
104 default="_putative_usp.fa",
105 help="Output of putative FASTA file",
106 )
107 parser.add_argument(
108 "--summary_usp_txt",
109 dest="summary_usp_txt",
110 type=argparse.FileType("w"),
111 default="_summary_usp.txt",
112 help="Summary statistics on putative o-spanins",
113 )
114 parser.add_argument(
115 "--putative_usp_gff",
116 dest="putative_usp_gff",
117 type=argparse.FileType("w"),
118 default="_putative_usp.gff3",
119 help="gff3 output for putative o-spanins",
120 )
121
122 parser.add_argument(
123 "--min_size", type=int, default=100, help="minimum size of peptide"
124 )
125 parser.add_argument(
126 "--max_size", type=int, default=200, help="maximum size of peptide"
127 )
128 parser.add_argument(
129 "--lipo_min_start", type=int, default=10, help="minimum start site of lipobox"
130 )
131 parser.add_argument(
132 "--lipo_max_start", type=int, default=30, help="maximum end site of lipobox"
133 )
134 parser.add_argument(
135 "--min_lipo_after",
136 type=int,
137 default=60,
138 help="minumum amount of residues after lipobox",
139 )
140 parser.add_argument(
141 "--max_lipo_after",
142 type=int,
143 default=160,
144 help="maximum amount of residues after lipobox",
145 )
146 parser.add_argument(
147 "--tmd_min_start", type=int, default=75, help="minumum start site of TMD"
148 )
149 parser.add_argument(
150 "--tmd_max_start", type=int, default=200, help="maximum end site of TMD"
151 )
152 parser.add_argument(
153 "--tmd_min_size", type=int, default=15, help="minimum size of TMD"
154 )
155 parser.add_argument(
156 "--tmd_max_size", type=int, default=25, help="maximum size of TMD"
157 )
158
159 args = parser.parse_args()
160
161 the_args = vars(parser.parse_args())
162
163 ### usp output, naive ORF finding:
164 usps = OrfFinder(args.table, args.ftype, args.ends, args.min_size, args.strand)
165 usps.locate(
166 args.fasta_file,
167 args.out_usp_nuc,
168 args.out_usp_prot,
169 args.out_usp_bed,
170 args.out_usp_gff3,
171 )
172
173 args.fasta_file.close()
174 args.fasta_file = open(args.fasta_file.name, "r")
175 args.out_usp_prot.close()
176 args.out_usp_prot = open(args.out_usp_prot.name, "r")
177
178 pairs = tuple_fasta(fasta_file=args.out_usp_prot)
179 have_lipo = []
180
181 for each_pair in pairs:
182 if len(each_pair[1]) <= args.max_size:
183 try:
184 have_lipo += find_lipobox(
185 pair=each_pair,
186 minimum=args.lipo_min_start,
187 maximum=args.lipo_max_start,
188 min_after=args.min_lipo_after,
189 max_after=args.max_lipo_after,
190 )
191 except (IndexError, TypeError):
192 continue
193
194 # print(len(have_lipo))
195 # print(have_lipo)
196
197 have_tmd_and_lipo = []
198 # print(args.tmd_min_start)
199 # print(args.tmd_max_start)
200 # print(args.tmd_min_size)
201 # print(args.tmd_max_size)
202
203 for each_pair in have_lipo:
204 try:
205 have_tmd_and_lipo += find_tmd(
206 pair=each_pair,
207 minimum=args.tmd_min_start,
208 maximum=args.tmd_max_start,
209 TMDmin=args.tmd_min_size,
210 TMDmax=args.tmd_max_size,
211 )
212 except (IndexError, TypeError):
213 continue
214
215 # print(len(have_tmd_and_lipo))
216 # print(have_tmd_and_lipo)
217
218 if args.switch == "all":
219 pass
220 else:
221 range_of = args.switch
222 range_of = re.search(("[\d]+:[\d]+"), range_of).group(0)
223 start = int(range_of.split(":")[0])
224 end = int(range_of.split(":")[1])
225 have_lipo = parse_a_range(pair=have_tmd_and_lipo, start=start, end=end)
226
227 total_have_tmd_and_lipo = len(have_tmd_and_lipo)
228
229 ORF = []
230 length = []
231 candidate_dict = {k: v for k, v in have_tmd_and_lipo}
232 with args.putative_usp_fa as f:
233 for desc, s in candidate_dict.items():
234 f.write(">" + str(desc))
235 f.write("\n" + lineWrapper(str(s).replace("*", "")) + "\n")
236 length.append(len(s))
237 ORF.append(desc)
238 #### Extra statistics
239 args.out_usp_prot.close()
240 all_orfs = open(args.out_usp_prot.name, "r")
241 all_isps = open(args.putative_usp_fa.name, "r")
242 # record = SeqIO.read(all_orfs, "fasta")
243 # print(len(record))
244 n = 0
245 for line in all_orfs:
246 if line.startswith(">"):
247 n += 1
248 all_orfs_counts = n
249
250 c = 0
251 for line in all_isps:
252 if line.startswith(">"):
253 c += 1
254 all_isps_counts = c
255
256 if ORF:
257 bot_size = min(length)
258 top_size = max(length)
259 avg = (sum(length)) / total_have_tmd_and_lipo
260 n = len(length)
261 if n == 0:
262 raise Exception("no median for empty data")
263 if n % 2 == 1:
264 med = length[n // 2]
265 else:
266 i = n // 2
267 med = (length[i - 1] + length[i]) / 2
268 with args.summary_usp_txt as f:
269 f.write("total potential u-spanins: " + str(total_have_tmd_and_lipo) + "\n")
270 f.write("average length (AA): " + str(avg) + "\n")
271 f.write("median length (AA): " + str(med) + "\n")
272 f.write("maximum orf in size (AA): " + str(top_size) + "\n")
273 f.write("minimum orf in size (AA): " + str(bot_size) + "\n")
274 f.write("ratio of isps found from naive orfs: " + str(c) + "/" + str(n))
275
276 args.putative_usp_fa = open(args.putative_usp_fa.name, "r")
277 gff_data = prep_a_gff3(
278 fa=args.putative_usp_fa, spanin_type="usp", org=args.fasta_file
279 )
280 write_gff3(data=gff_data, output=args.putative_usp_gff)
281 else:
282 with args.summary_usp_txt as f:
283 f.write("No Candidate USPs found")
284 if have_lipo:
285 f.write("\nLipoboxes were found here:\n")
286 for each_lipo in have_lipo:
287 f.write(">" + str(each_lipo[0]))
288 f.write("\n" + lineWrapper(each_lipo[1].replace("*", "")) + "\n")
289 else:
290 f.write("\nNo Lipobox(es) were found within search restraints")