Mercurial > repos > cpt > cpt_putative_usp
comparison generate-putative-usp.py @ 1:f7afd1480d0f draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt |
|---|---|
| date | Mon, 05 Jun 2023 02:51:59 +0000 |
| parents | |
| children | 76a3830e9cb1 |
comparison
equal
deleted
inserted
replaced
| 0:13c3e2d60fa6 | 1:f7afd1480d0f |
|---|---|
| 1 import argparse | |
| 2 from cpt import OrfFinder | |
| 3 from Bio import SeqIO | |
| 4 from Bio import Seq | |
| 5 from CPT_GFFParser import gffParse, gffWrite | |
| 6 from spaninFuncs import * | |
| 7 import re | |
| 8 import os | |
| 9 import sys | |
| 10 | |
| 11 """ | |
| 12 ## Note | |
| 13 NOTE : This was made after I made the i-spanin and o-spanin tools, so there might be some methods that are used differently | |
| 14 and overall some 'differently' written code... | |
| 15 """ | |
| 16 | |
| 17 if __name__ == "__main__": | |
| 18 parser = argparse.ArgumentParser( | |
| 19 description="Get putative protein candidates for u-spanins" | |
| 20 ) | |
| 21 | |
| 22 parser.add_argument( | |
| 23 "fasta_file", type=argparse.FileType("r"), help="Fasta file" | |
| 24 ) # the "input" argument | |
| 25 | |
| 26 parser.add_argument( | |
| 27 "--strand", | |
| 28 dest="strand", | |
| 29 choices=("both", "forward", "reverse"), | |
| 30 default="both", | |
| 31 help="select strand", | |
| 32 ) # Selection of +, -, or both strands | |
| 33 | |
| 34 parser.add_argument( | |
| 35 "--table", dest="table", default=11, help="NCBI Translation table", type=int | |
| 36 ) # Uses "default" NCBI codon table. This should always (afaik) be what we want... | |
| 37 | |
| 38 parser.add_argument( | |
| 39 "-t", | |
| 40 "--ftype", | |
| 41 dest="ftype", | |
| 42 choices=("CDS", "ORF"), | |
| 43 default="ORF", | |
| 44 help="Find ORF or CDSs", | |
| 45 ) # "functional type(?)" --> Finds ORF or CDS, for this we want just the ORF | |
| 46 | |
| 47 parser.add_argument( | |
| 48 "-e", | |
| 49 "--ends", | |
| 50 dest="ends", | |
| 51 choices=("open", "closed"), | |
| 52 default="closed", | |
| 53 help="Open or closed. Closed ensures start/stop codons are present", | |
| 54 ) # includes the start and stop codon | |
| 55 | |
| 56 parser.add_argument( | |
| 57 "-m", | |
| 58 "--mode", | |
| 59 dest="mode", | |
| 60 choices=("all", "top", "one"), | |
| 61 default="all", # I think we want this to JUST be all...nearly always | |
| 62 help="Output all ORFs/CDSs from sequence, all ORFs/CDSs with max length, or first with maximum length", | |
| 63 ) | |
| 64 | |
| 65 parser.add_argument( | |
| 66 "--switch", | |
| 67 dest="switch", | |
| 68 default="all", | |
| 69 help="switch between ALL putative usps, or a range. If not all, insert a range of two integers separated by a colon (:). Eg: 1234:4321", | |
| 70 ) | |
| 71 | |
| 72 parser.add_argument( | |
| 73 "--usp_on", | |
| 74 dest="out_usp_nuc", | |
| 75 type=argparse.FileType("w"), | |
| 76 default="_out_usp.fna", | |
| 77 help="Output nucleotide sequences, FASTA", | |
| 78 ) | |
| 79 parser.add_argument( | |
| 80 "--usp_op", | |
| 81 dest="out_usp_prot", | |
| 82 type=argparse.FileType("w"), | |
| 83 default="_out_usp.fa", | |
| 84 help="Output protein sequences, FASTA", | |
| 85 ) | |
| 86 parser.add_argument( | |
| 87 "--usp_ob", | |
| 88 dest="out_usp_bed", | |
| 89 type=argparse.FileType("w"), | |
| 90 default="_out_usp.bed", | |
| 91 help="Output BED file", | |
| 92 ) | |
| 93 parser.add_argument( | |
| 94 "--usp_og", | |
| 95 dest="out_usp_gff3", | |
| 96 type=argparse.FileType("w"), | |
| 97 default="_out_usp.gff3", | |
| 98 help="Output GFF3 file", | |
| 99 ) | |
| 100 parser.add_argument( | |
| 101 "--putative_usp", | |
| 102 dest="putative_usp_fa", | |
| 103 type=argparse.FileType("w"), | |
| 104 default="_putative_usp.fa", | |
| 105 help="Output of putative FASTA file", | |
| 106 ) | |
| 107 parser.add_argument( | |
| 108 "--summary_usp_txt", | |
| 109 dest="summary_usp_txt", | |
| 110 type=argparse.FileType("w"), | |
| 111 default="_summary_usp.txt", | |
| 112 help="Summary statistics on putative o-spanins", | |
| 113 ) | |
| 114 parser.add_argument( | |
| 115 "--putative_usp_gff", | |
| 116 dest="putative_usp_gff", | |
| 117 type=argparse.FileType("w"), | |
| 118 default="_putative_usp.gff3", | |
| 119 help="gff3 output for putative o-spanins", | |
| 120 ) | |
| 121 | |
| 122 parser.add_argument( | |
| 123 "--min_size", type=int, default=100, help="minimum size of peptide" | |
| 124 ) | |
| 125 parser.add_argument( | |
| 126 "--max_size", type=int, default=200, help="maximum size of peptide" | |
| 127 ) | |
| 128 parser.add_argument( | |
| 129 "--lipo_min_start", type=int, default=10, help="minimum start site of lipobox" | |
| 130 ) | |
| 131 parser.add_argument( | |
| 132 "--lipo_max_start", type=int, default=30, help="maximum end site of lipobox" | |
| 133 ) | |
| 134 parser.add_argument( | |
| 135 "--min_lipo_after", | |
| 136 type=int, | |
| 137 default=60, | |
| 138 help="minumum amount of residues after lipobox", | |
| 139 ) | |
| 140 parser.add_argument( | |
| 141 "--max_lipo_after", | |
| 142 type=int, | |
| 143 default=160, | |
| 144 help="maximum amount of residues after lipobox", | |
| 145 ) | |
| 146 parser.add_argument( | |
| 147 "--tmd_min_start", type=int, default=75, help="minumum start site of TMD" | |
| 148 ) | |
| 149 parser.add_argument( | |
| 150 "--tmd_max_start", type=int, default=200, help="maximum end site of TMD" | |
| 151 ) | |
| 152 parser.add_argument( | |
| 153 "--tmd_min_size", type=int, default=15, help="minimum size of TMD" | |
| 154 ) | |
| 155 parser.add_argument( | |
| 156 "--tmd_max_size", type=int, default=25, help="maximum size of TMD" | |
| 157 ) | |
| 158 | |
| 159 args = parser.parse_args() | |
| 160 | |
| 161 the_args = vars(parser.parse_args()) | |
| 162 | |
| 163 ### usp output, naive ORF finding: | |
| 164 usps = OrfFinder(args.table, args.ftype, args.ends, args.min_size, args.strand) | |
| 165 usps.locate( | |
| 166 args.fasta_file, | |
| 167 args.out_usp_nuc, | |
| 168 args.out_usp_prot, | |
| 169 args.out_usp_bed, | |
| 170 args.out_usp_gff3, | |
| 171 ) | |
| 172 | |
| 173 args.fasta_file.close() | |
| 174 args.fasta_file = open(args.fasta_file.name, "r") | |
| 175 args.out_usp_prot.close() | |
| 176 args.out_usp_prot = open(args.out_usp_prot.name, "r") | |
| 177 | |
| 178 pairs = tuple_fasta(fasta_file=args.out_usp_prot) | |
| 179 have_lipo = [] | |
| 180 | |
| 181 for each_pair in pairs: | |
| 182 if len(each_pair[1]) <= args.max_size: | |
| 183 try: | |
| 184 have_lipo += find_lipobox( | |
| 185 pair=each_pair, | |
| 186 minimum=args.lipo_min_start, | |
| 187 maximum=args.lipo_max_start, | |
| 188 min_after=args.min_lipo_after, | |
| 189 max_after=args.max_lipo_after, | |
| 190 ) | |
| 191 except (IndexError, TypeError): | |
| 192 continue | |
| 193 | |
| 194 # print(len(have_lipo)) | |
| 195 # print(have_lipo) | |
| 196 | |
| 197 have_tmd_and_lipo = [] | |
| 198 # print(args.tmd_min_start) | |
| 199 # print(args.tmd_max_start) | |
| 200 # print(args.tmd_min_size) | |
| 201 # print(args.tmd_max_size) | |
| 202 | |
| 203 for each_pair in have_lipo: | |
| 204 try: | |
| 205 have_tmd_and_lipo += find_tmd( | |
| 206 pair=each_pair, | |
| 207 minimum=args.tmd_min_start, | |
| 208 maximum=args.tmd_max_start, | |
| 209 TMDmin=args.tmd_min_size, | |
| 210 TMDmax=args.tmd_max_size, | |
| 211 ) | |
| 212 except (IndexError, TypeError): | |
| 213 continue | |
| 214 | |
| 215 # print(len(have_tmd_and_lipo)) | |
| 216 # print(have_tmd_and_lipo) | |
| 217 | |
| 218 if args.switch == "all": | |
| 219 pass | |
| 220 else: | |
| 221 range_of = args.switch | |
| 222 range_of = re.search(("[\d]+:[\d]+"), range_of).group(0) | |
| 223 start = int(range_of.split(":")[0]) | |
| 224 end = int(range_of.split(":")[1]) | |
| 225 have_lipo = parse_a_range(pair=have_tmd_and_lipo, start=start, end=end) | |
| 226 | |
| 227 total_have_tmd_and_lipo = len(have_tmd_and_lipo) | |
| 228 | |
| 229 ORF = [] | |
| 230 length = [] | |
| 231 candidate_dict = {k: v for k, v in have_tmd_and_lipo} | |
| 232 with args.putative_usp_fa as f: | |
| 233 for desc, s in candidate_dict.items(): | |
| 234 f.write(">" + str(desc)) | |
| 235 f.write("\n" + lineWrapper(str(s).replace("*", "")) + "\n") | |
| 236 length.append(len(s)) | |
| 237 ORF.append(desc) | |
| 238 #### Extra statistics | |
| 239 args.out_usp_prot.close() | |
| 240 all_orfs = open(args.out_usp_prot.name, "r") | |
| 241 all_isps = open(args.putative_usp_fa.name, "r") | |
| 242 # record = SeqIO.read(all_orfs, "fasta") | |
| 243 # print(len(record)) | |
| 244 n = 0 | |
| 245 for line in all_orfs: | |
| 246 if line.startswith(">"): | |
| 247 n += 1 | |
| 248 all_orfs_counts = n | |
| 249 | |
| 250 c = 0 | |
| 251 for line in all_isps: | |
| 252 if line.startswith(">"): | |
| 253 c += 1 | |
| 254 all_isps_counts = c | |
| 255 | |
| 256 if ORF: | |
| 257 bot_size = min(length) | |
| 258 top_size = max(length) | |
| 259 avg = (sum(length)) / total_have_tmd_and_lipo | |
| 260 n = len(length) | |
| 261 if n == 0: | |
| 262 raise Exception("no median for empty data") | |
| 263 if n % 2 == 1: | |
| 264 med = length[n // 2] | |
| 265 else: | |
| 266 i = n // 2 | |
| 267 med = (length[i - 1] + length[i]) / 2 | |
| 268 with args.summary_usp_txt as f: | |
| 269 f.write("total potential u-spanins: " + str(total_have_tmd_and_lipo) + "\n") | |
| 270 f.write("average length (AA): " + str(avg) + "\n") | |
| 271 f.write("median length (AA): " + str(med) + "\n") | |
| 272 f.write("maximum orf in size (AA): " + str(top_size) + "\n") | |
| 273 f.write("minimum orf in size (AA): " + str(bot_size) + "\n") | |
| 274 f.write("ratio of isps found from naive orfs: " + str(c) + "/" + str(n)) | |
| 275 | |
| 276 args.putative_usp_fa = open(args.putative_usp_fa.name, "r") | |
| 277 gff_data = prep_a_gff3( | |
| 278 fa=args.putative_usp_fa, spanin_type="usp", org=args.fasta_file | |
| 279 ) | |
| 280 write_gff3(data=gff_data, output=args.putative_usp_gff) | |
| 281 else: | |
| 282 with args.summary_usp_txt as f: | |
| 283 f.write("No Candidate USPs found") | |
| 284 if have_lipo: | |
| 285 f.write("\nLipoboxes were found here:\n") | |
| 286 for each_lipo in have_lipo: | |
| 287 f.write(">" + str(each_lipo[0])) | |
| 288 f.write("\n" + lineWrapper(each_lipo[1].replace("*", "")) + "\n") | |
| 289 else: | |
| 290 f.write("\nNo Lipobox(es) were found within search restraints") |
