Mercurial > repos > cpt > cpt_putative_osp
comparison generate-putative-osp.py @ 1:05b97a4dce94 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt | 
|---|---|
| date | Mon, 05 Jun 2023 02:51:44 +0000 | 
| parents | |
| children | 859e18a9814a | 
   comparison
  equal
  deleted
  inserted
  replaced
| 0:03670eba3480 | 1:05b97a4dce94 | 
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import argparse | |
| 3 from cpt import OrfFinder | |
| 4 from Bio import SeqIO | |
| 5 from Bio import Seq | |
| 6 from CPT_GFFParser import gffParse, gffWrite | |
| 7 from spaninFuncs import * | |
| 8 import re | |
| 9 import os | |
| 10 import sys | |
| 11 | |
| 12 ### Requirement Inputs | |
| 13 #### INPUT : Genomic FASTA | |
| 14 #### OUTPUT : Putative OSP candidates in FASTA format. | |
| 15 ######### Optional OUTPUT: "Complete" potential ORFs, in BED/FASTAs/GFF3 formats | |
| 16 ### Notes: | |
| 17 ####### As of 2.13.2020 - RegEx pattern: [ACGSILMFTV][^REKD][GASNL]C for LipoRy | |
| 18 if __name__ == "__main__": | |
| 19 | |
| 20 # Common parameters for both ISP / OSP portion of scripts | |
| 21 | |
| 22 parser = argparse.ArgumentParser( | |
| 23 description="Get putative protein candidates for spanins" | |
| 24 ) | |
| 25 | |
| 26 parser.add_argument( | |
| 27 "fasta_file", type=argparse.FileType("r"), help="Fasta file" | |
| 28 ) # the "input" argument | |
| 29 | |
| 30 parser.add_argument( | |
| 31 "-f", | |
| 32 "--format", | |
| 33 dest="seq_format", | |
| 34 default="fasta", | |
| 35 help="Sequence format (e.g. fasta, fastq, sff)", | |
| 36 ) # optional formats for input, currently just going to do ntFASTA | |
| 37 | |
| 38 parser.add_argument( | |
| 39 "--strand", | |
| 40 dest="strand", | |
| 41 choices=("both", "forward", "reverse"), | |
| 42 default="both", | |
| 43 help="select strand", | |
| 44 ) # Selection of +, -, or both strands | |
| 45 | |
| 46 parser.add_argument( | |
| 47 "--table", dest="table", default=11, help="NCBI Translation table", type=int | |
| 48 ) # Uses "default" NCBI codon table. This should always (afaik) be what we want... | |
| 49 | |
| 50 parser.add_argument( | |
| 51 "-t", | |
| 52 "--ftype", | |
| 53 dest="ftype", | |
| 54 choices=("CDS", "ORF"), | |
| 55 default="ORF", | |
| 56 help="Find ORF or CDSs", | |
| 57 ) # "functional type(?)" --> Finds ORF or CDS, for this we want just the ORF | |
| 58 | |
| 59 parser.add_argument( | |
| 60 "-e", | |
| 61 "--ends", | |
| 62 dest="ends", | |
| 63 choices=("open", "closed"), | |
| 64 default="closed", | |
| 65 help="Open or closed. Closed ensures start/stop codons are present", | |
| 66 ) # includes the start and stop codon | |
| 67 | |
| 68 parser.add_argument( | |
| 69 "-m", | |
| 70 "--mode", | |
| 71 dest="mode", | |
| 72 choices=("all", "top", "one"), | |
| 73 default="all", # I think we want this to JUST be all...nearly always | |
| 74 help="Output all ORFs/CDSs from sequence, all ORFs/CDSs with max length, or first with maximum length", | |
| 75 ) | |
| 76 | |
| 77 parser.add_argument( | |
| 78 "--switch", | |
| 79 dest="switch", | |
| 80 default="all", | |
| 81 help="switch between ALL putative osps, or a range. If not all, insert a range of two integers separated by a colon (:). Eg: 1234:4321", | |
| 82 ) | |
| 83 | |
| 84 # osp parameters | |
| 85 parser.add_argument( | |
| 86 "--osp_min_len", | |
| 87 dest="osp_min_len", | |
| 88 default=30, | |
| 89 help="Minimum ORF length, measured in codons", | |
| 90 type=int, | |
| 91 ) | |
| 92 parser.add_argument( | |
| 93 "--max_osp", | |
| 94 dest="max_osp", | |
| 95 default=200, | |
| 96 help="Maximum ORF length, measured in codons", | |
| 97 type=int, | |
| 98 ) | |
| 99 parser.add_argument( | |
| 100 "--osp_on", | |
| 101 dest="out_osp_nuc", | |
| 102 type=argparse.FileType("w"), | |
| 103 default="_out_osp.fna", | |
| 104 help="Output nucleotide sequences, FASTA", | |
| 105 ) | |
| 106 parser.add_argument( | |
| 107 "--osp_op", | |
| 108 dest="out_osp_prot", | |
| 109 type=argparse.FileType("w"), | |
| 110 default="_out_osp.fa", | |
| 111 help="Output protein sequences, FASTA", | |
| 112 ) | |
| 113 parser.add_argument( | |
| 114 "--osp_ob", | |
| 115 dest="out_osp_bed", | |
| 116 type=argparse.FileType("w"), | |
| 117 default="_out_osp.bed", | |
| 118 help="Output BED file", | |
| 119 ) | |
| 120 parser.add_argument( | |
| 121 "--osp_og", | |
| 122 dest="out_osp_gff3", | |
| 123 type=argparse.FileType("w"), | |
| 124 default="_out_osp.gff3", | |
| 125 help="Output GFF3 file", | |
| 126 ) | |
| 127 parser.add_argument( | |
| 128 "--osp_min_dist", | |
| 129 dest="osp_min_dist", | |
| 130 default=10, | |
| 131 help="Minimal distance to first AA of lipobox, measured in AA", | |
| 132 type=int, | |
| 133 ) | |
| 134 parser.add_argument( | |
| 135 "--osp_max_dist", | |
| 136 dest="osp_max_dist", | |
| 137 default=50, | |
| 138 help="Maximum distance to first AA of lipobox, measured in AA", | |
| 139 type=int, | |
| 140 ) | |
| 141 parser.add_argument( | |
| 142 "--min_lipo_after", | |
| 143 dest="min_lipo_after", | |
| 144 default=25, | |
| 145 help="minimal amount of residues after lipobox", | |
| 146 type=int, | |
| 147 ) | |
| 148 parser.add_argument( | |
| 149 "--max_lipo_after", | |
| 150 dest="max_lipo_after", | |
| 151 default=170, | |
| 152 help="minimal amount of residues after lipobox", | |
| 153 type=int, | |
| 154 ) | |
| 155 parser.add_argument( | |
| 156 "--putative_osp", | |
| 157 dest="putative_osp_fa", | |
| 158 type=argparse.FileType("w"), | |
| 159 default="_putative_osp.fa", | |
| 160 help="Output of putative FASTA file", | |
| 161 ) | |
| 162 parser.add_argument( | |
| 163 "--summary_osp_txt", | |
| 164 dest="summary_osp_txt", | |
| 165 type=argparse.FileType("w"), | |
| 166 default="_summary_osp.txt", | |
| 167 help="Summary statistics on putative o-spanins", | |
| 168 ) | |
| 169 parser.add_argument( | |
| 170 "--putative_osp_gff", | |
| 171 dest="putative_osp_gff", | |
| 172 type=argparse.FileType("w"), | |
| 173 default="_putative_osp.gff3", | |
| 174 help="gff3 output for putative o-spanins", | |
| 175 ) | |
| 176 parser.add_argument("--osp_mode", action="store_true", default=True) | |
| 177 | |
| 178 # parser.add_argument('-v', action='version', version='0.3.0') # Is this manually updated? | |
| 179 args = parser.parse_args() | |
| 180 | |
| 181 the_args = vars(parser.parse_args()) | |
| 182 | |
| 183 ### osp output, naive ORF finding: | |
| 184 osps = OrfFinder(args.table, args.ftype, args.ends, args.osp_min_len, args.strand) | |
| 185 osps.locate( | |
| 186 args.fasta_file, | |
| 187 args.out_osp_nuc, | |
| 188 args.out_osp_prot, | |
| 189 args.out_osp_bed, | |
| 190 args.out_osp_gff3, | |
| 191 ) | |
| 192 | |
| 193 """ | |
| 194 ### For Control: Use T7 and lambda; | |
| 195 # Note the distance from start codon to lipobox region for t7 | |
| 196 o-spanin | |
| 197 18,7-------------------------------------------------LIPO---------------------------------- | |
| 198 >T7_EOS MSTLRELRLRRALKEQSVRYLLSIKKTLPRWKGALIGLFLICVATISGCASESKLPESPMVSVDSSLMVEPNLTTEMLNVFSQ | |
| 199 -----------------------------LIPO---------------------------------------- | |
| 200 > lambda_EOS MLKLKMMLCVMMLPLVVVGCTSKQSVSQCVKPPPPPAWIMQPPPDWQTPLNGIISPSERG | |
| 201 """ | |
| 202 args.fasta_file.close() | |
| 203 args.fasta_file = open(args.fasta_file.name, "r") | |
| 204 args.out_osp_prot.close() | |
| 205 args.out_osp_prot = open(args.out_osp_prot.name, "r") | |
| 206 | |
| 207 pairs = tuple_fasta(fasta_file=args.out_osp_prot) | |
| 208 have_lipo = [] # empty candidates list to be passed through the user input | |
| 209 | |
| 210 for each_pair in pairs: | |
| 211 if len(each_pair[1]) <= args.max_osp: | |
| 212 try: | |
| 213 have_lipo += find_lipobox( | |
| 214 pair=each_pair, | |
| 215 minimum=args.osp_min_dist, | |
| 216 maximum=args.osp_max_dist, | |
| 217 min_after=args.min_lipo_after, | |
| 218 max_after=args.max_lipo_after, | |
| 219 osp_mode=args.osp_mode, | |
| 220 ) | |
| 221 except (IndexError, TypeError): | |
| 222 continue | |
| 223 | |
| 224 if args.switch == "all": | |
| 225 pass | |
| 226 else: | |
| 227 # for each_pair in have_lipo: | |
| 228 range_of = args.switch | |
| 229 range_of = re.search(("[\d]+:[\d]+"), range_of).group(0) | |
| 230 start = int(range_of.split(":")[0]) | |
| 231 end = int(range_of.split(":")[1]) | |
| 232 have_lipo = parse_a_range(pair=have_lipo, start=start, end=end) | |
| 233 # print(have_lipo) | |
| 234 # matches | |
| 235 | |
| 236 # have_lipo = [] | |
| 237 # have_lipo = matches | |
| 238 total_osp = len(have_lipo) | |
| 239 # print(have_lipo) | |
| 240 # print(total_osp) | |
| 241 | |
| 242 # print(type(have_lipo)) | |
| 243 | |
| 244 # for i in have_lipo: | |
| 245 # print(i) | |
| 246 | |
| 247 # export results in fasta format | |
| 248 ORF = [] | |
| 249 length = [] # grabbing length of the sequences | |
| 250 candidate_dict = {k: v for k, v in have_lipo} | |
| 251 with args.putative_osp_fa as f: | |
| 252 for desc, s in candidate_dict.items(): # description / sequence | |
| 253 f.write(">" + str(desc)) | |
| 254 f.write("\n" + lineWrapper(str(s).replace("*", "")) + "\n") | |
| 255 length.append(len(s)) | |
| 256 ORF.append(desc) | |
| 257 if not length: | |
| 258 raise Exception("Parameters yielded no candidates.") | |
| 259 bot_size = min(length) | |
| 260 top_size = max(length) | |
| 261 avg = (sum(length)) / total_osp | |
| 262 n = len(length) | |
| 263 if n == 0: | |
| 264 raise Exception("no median for empty data") | |
| 265 if n % 2 == 1: | |
| 266 med = length[n // 2] | |
| 267 else: | |
| 268 i = n // 2 | |
| 269 med = (length[i - 1] + length[i]) / 2 | |
| 270 | |
| 271 args.out_osp_prot.close() | |
| 272 all_orfs = open(args.out_osp_prot.name, "r") | |
| 273 all_osps = open(args.putative_osp_fa.name, "r") | |
| 274 # record = SeqIO.read(all_orfs, "fasta") | |
| 275 # print(len(record)) | |
| 276 #### Extra stats | |
| 277 n = 0 | |
| 278 for line in all_orfs: | |
| 279 if line.startswith(">"): | |
| 280 n += 1 | |
| 281 all_orfs_counts = n | |
| 282 | |
| 283 c = 0 | |
| 284 for line in all_osps: | |
| 285 if line.startswith(">"): | |
| 286 c += 1 | |
| 287 all_osps_counts = c | |
| 288 | |
| 289 with args.summary_osp_txt as f: | |
| 290 f.write("total potential o-spanins: " + str(total_osp) + "\n") | |
| 291 f.write("average length (AA): " + str(avg) + "\n") | |
| 292 f.write("median length (AA): " + str(med) + "\n") | |
| 293 f.write("maximum orf in size (AA): " + str(top_size) + "\n") | |
| 294 f.write("minimum orf in size (AA): " + str(bot_size) + "\n") | |
| 295 # f.write(f"ratio of osps found from naive orfs: {c}/{n}") | |
| 296 f.write("ratio of osps found from naive orfs: " + str(c) + "/" + str(n)) | |
| 297 # Output the putative list in gff3 format: | |
| 298 args.putative_osp_fa = open(args.putative_osp_fa.name, "r") | |
| 299 gff_data = prep_a_gff3( | |
| 300 fa=args.putative_osp_fa, spanin_type="osp", org=args.fasta_file | |
| 301 ) | |
| 302 write_gff3(data=gff_data, output=args.putative_osp_gff) | 
