Mercurial > repos > dereeper > pangenome_explorer
comparison SkewIT/src/skewi.py @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:97e4e3e818b6 | 3:e42d30da7a74 |
---|---|
1 #! /usr/bin/env python | |
2 ########################################################################## | |
3 #skewi.py calculates a single SkewI value for a given chromosome | |
4 #Copyright (C) 2020 Jennifer Lu, jlu26@jhmi.edu | |
5 # | |
6 #This file is part of SkewIT | |
7 # | |
8 #SkewIT is free software; you can redistribute it and/or modify | |
9 #it under the terms of the GNU General Public License as published by | |
10 #the Free Software Foundation; either version 3 of the license, or | |
11 #(at your option) any later version. | |
12 | |
13 #This program is distributed in the hope that it will be useful, | |
14 #but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 #GNU General Public License for more details. | |
17 | |
18 #You should have received a copy of the GNU General Public License | |
19 #along with this program; if not, see <http://www.gnu.org/licenses/>. | |
20 | |
21 ##################################################################### | |
22 #Jennifer Lu, jennifer.lu717@gmail.com | |
23 #02/17/2020 | |
24 # | |
25 #This program calculates a Skew Index (SkewI) | |
26 #for a given genome within the range 0 to 1. Higher SkewI values | |
27 #indicate a strong GC Skew signal while lower SkewI values | |
28 #indicate a potentially low GC Skew signal | |
29 # | |
30 #Given a multi-fasta file, the program will output one SkewI per sequence | |
31 ###################################################################### | |
32 import sys, os, argparse | |
33 from time import gmtime | |
34 from time import strftime | |
35 from Bio import SeqIO | |
36 import numpy as np | |
37 ##################################################################### | |
38 def usage(): | |
39 sys.stderr.write("\n #########################################################################################\n") | |
40 sys.stderr.write(" ################################## USAGE: SKEWI.PY ######################################\n") | |
41 sys.stderr.write(" ## > python skewi.py -i SEQ.FASTA -o SKEW.TXT\t\t\t\t\t\t##\n") | |
42 sys.stderr.write(" ## \t -i SEQ.FASTA............fasta file (multi-fasta permitted)\t\t\t##\n") | |
43 sys.stderr.write(" ## Optional Parameters:\t\t\t\t\t\t\t\t##\n") | |
44 sys.stderr.write(" ## \t -o SKEW.TXT.............output file (see below)\t\t\t\t##\n") | |
45 sys.stderr.write(" ## \t -k WINDOW_SIZE..........length of subsequences for which to calculate gc skew\t##\n") | |
46 sys.stderr.write(" ## \t .....................[default:20000]\t\t\t\t\t##\n" ) | |
47 sys.stderr.write(" ## \t -f FREQUENCY............number of bases between the start of each window\t##\n") | |
48 sys.stderr.write(" ## \t .....................[default: k == f]\t\t\t\t\t##\n") | |
49 sys.stderr.write(" ## \t --min-seq-len LEN.......set a minimum sequence length\t\t\t\t##\n") | |
50 sys.stderr.write(" ## \t .....................[default: 500,000bp]\t\t\t\t\t##\n") | |
51 sys.stderr.write(" ## \t --complete/--all........only analyze sequences with 'complete' in header\t##\n") | |
52 sys.stderr.write(" ## \t .....................[default: --complete]\t\t\t\t\t##\n") | |
53 sys.stderr.write(" ## \t --plasmid/--no-plasmid..include/exclude plasmid sequences from analysis\t##\n") | |
54 sys.stderr.write(" ## \t .....................[default: --no-plasmid]\t\t\t\t##\n") | |
55 sys.stderr.write(" ## Output file: If no output file is provided, SkewI will be printed to standard out\t##\n") | |
56 sys.stderr.write(" ## \t Otherwise, a tab-delimited file will be generated:\t\t\t##\n") | |
57 sys.stderr.write(" ## \t with two columns: 1) sequence ID 2) skewI value\t\t\t\t##\n") | |
58 sys.stderr.write(" #########################################################################################\n") | |
59 sys.stderr.write(" #########################################################################################\n\n") | |
60 exit(0) | |
61 ##################################################################### | |
62 def main(): | |
63 parser = argparse.ArgumentParser() | |
64 #Required parameter: | |
65 parser.add_argument("-i","--input","-s","--seq", | |
66 dest="in_file", required=False, default="", | |
67 help="Sequence file for which to calculate gc_skew") | |
68 #Defaults =20K/Same as window size | |
69 parser.add_argument("-k","-w","--window-len", | |
70 dest="window_size", required=False, default=20000, type=int, | |
71 help="Window size for which to calculate each sign(g-c) [default: 20kb]") | |
72 parser.add_argument("-f","--freq", dest="freq", type=int, default=-1, | |
73 help="Length between the start indices of each window. \ | |
74 [default: same as window_len, must be less than window size]") | |
75 #No output provided = standard out | |
76 parser.add_argument('-o','--output', required=False, | |
77 dest="out_file",default="", | |
78 help="Output text file to save skewi (skew index values)") | |
79 #Complete Genomes/Plasmid Sequence options | |
80 parser.add_argument('--complete', required=False, dest='complete_tf', | |
81 action='store_true', default=True, help='Only analyze complete genomes') | |
82 parser.add_argument('--all', required=False, dest='complete_tf', | |
83 action='store_false', default=True, help='Analyze complete and draft genomes') | |
84 parser.add_argument('--plasmid', required=False, dest='plasmid_tf', | |
85 action='store_true', default=False, help='Analyze plasmid sequences') | |
86 parser.add_argument('--no-plasmid', required=False, dest='plasmid_tf', | |
87 action='store_false', default=False, help='Exclude plasmid sequences') | |
88 #Sequence length minimum | |
89 parser.add_argument('--min-len', required=False, dest='min_seq_len', | |
90 type=int, default=500000, | |
91 help='Minimum sequence length to analyze [default: 500kb]') | |
92 #Usage option | |
93 parser.add_argument('--usage', required=False, dest='usage', | |
94 action='store_true', default=False, | |
95 help='Prints usage information for this program') | |
96 args=parser.parse_args() | |
97 | |
98 ######################## | |
99 #Test parameters | |
100 if args.usage: | |
101 usage() | |
102 if (args.in_file == ""): | |
103 sys.stderr.write(" >> Please provide an input sequence file\n") | |
104 usage() | |
105 exit(1) | |
106 if (not os.path.isfile(args.in_file)): | |
107 sys.stderr.write(" >> ERROR: %s is not a valid file\n" % args.in_file) | |
108 exit(1) | |
109 #Lengths | |
110 if (args.freq == -1): | |
111 args.freq = args.window_size | |
112 if (args.window_size < 0): | |
113 sys.stderr.write(" >> ERROR: --window-size must be greater than 0\n") | |
114 exit(1) | |
115 if (args.freq < 0): | |
116 sys.stderr.write(" >> ERROR: --freq must be greater than 0\n") | |
117 exit(1) | |
118 if (args.freq > args.window_size): | |
119 sys.stderr.write(" >> ERROR: window size must be >= frequency\n") | |
120 exit(1) | |
121 ######################## | |
122 #Start program | |
123 time = strftime("%m-%d-%Y %H:%M:%S", gmtime()) | |
124 sys.stderr.write(" >> PROGRAM START TIME: " + time + '\n') | |
125 sys.stderr.write(" Input file: %s\n" % args.in_file) | |
126 if args.out_file != "": | |
127 sys.stderr.write(" Output: %s\n" % args.out_file) | |
128 else: | |
129 sys.stderr.write(" Output: system standard out\n") | |
130 sys.stderr.write(" Window size (bp): %i\n" % args.window_size) | |
131 sys.stderr.write(" Frequency (bp): %i\n" % args.freq) | |
132 sys.stderr.write(" Minimum sequence length (bp): %i\n" % args.min_seq_len) | |
133 sys.stderr.write(" Complete sequences only: %s\n" % args.complete_tf) | |
134 sys.stderr.write(" Exclude plasmids: %s\n" % (not args.plasmid_tf)) | |
135 sys.stderr.flush() | |
136 ######################## | |
137 count_seqs = 0 | |
138 argmax = [] | |
139 seq2skewi = {} | |
140 sys.stderr.write("\n") | |
141 sys.stderr.write(" >> Processing file: %s\n" % args.in_file) | |
142 sys.stderr.write("\r\t%i seqs evaluated: " % (count_seqs)) | |
143 for record in SeqIO.parse(args.in_file,'fasta'): | |
144 my_description = str(record.description) | |
145 if ("complete" not in my_description) and args.complete_tf: | |
146 continue | |
147 if ("plasmid" in my_description) and not args.plasmid_tf: | |
148 continue | |
149 my_seq = str(record.seq) | |
150 if len(my_seq) < args.min_seq_len: | |
151 continue | |
152 count_seqs += 1 | |
153 #Get sequence and description | |
154 #Print Update | |
155 count = 0 | |
156 tot = len(my_seq)/args.window_size | |
157 #Calculate skew | |
158 skew = [] | |
159 for i in range(0,len(my_seq),args.window_size): | |
160 g = my_seq[i:i+args.window_size].count("G") | |
161 c = my_seq[i:i+args.window_size].count("C") | |
162 if (g-c) > 0: | |
163 skew.append(1) | |
164 elif (g-c) < 0: | |
165 skew.append(-1) | |
166 else: | |
167 skew.append(0) | |
168 #Final print | |
169 curr_size = len(skew)/2 | |
170 sys.stdout.flush() | |
171 skew += skew[:curr_size] | |
172 #Calculate abs(T2-T1) where T is sum of values | |
173 curr_diffs = [] | |
174 x = sum(skew[0:curr_size]) | |
175 y = sum(skew[curr_size:2*curr_size]) | |
176 for i in range(0, curr_size-1): | |
177 curr_diffs.append(abs(x-y)) | |
178 x = x - skew[i] + skew[i+curr_size] | |
179 y = y - skew[i+curr_size] + skew[i+2*curr_size] | |
180 curr_diffs.append(abs(x-y)) | |
181 if len(curr_diffs) > 0: | |
182 max_cd = float(max(curr_diffs)) | |
183 seq2skewi[my_description] = max_cd/float(len(my_seq))*float(args.window_size) | |
184 sys.stderr.write("\r\t%i sequences evaluated" % (count_seqs)) | |
185 sys.stderr.flush() | |
186 | |
187 sys.stderr.write("\r\t%i total sequences evaluated (all finished)\n" % (count_seqs)) | |
188 sys.stderr.flush() | |
189 | |
190 if args.out_file == "": | |
191 sys.stdout.write("Sequence\tSkewI\n") | |
192 for seq in seq2skewi: | |
193 sys.stdout.write("%s\t%0.10f\n" % (seq,seq2skewi[seq])) | |
194 else: | |
195 sys.stdout.write(" >> Printing SkewI values to file: %s\n" % args.out_file) | |
196 o_file = open(args.out_file,'w') | |
197 o_file.write("Sequence\tSkewI\n") | |
198 for seq in seq2skewi: | |
199 o_file.write("%s\t%0.10f\n" % (seq,seq2skewi[seq])) | |
200 o_file.close() | |
201 | |
202 #End program | |
203 time = strftime("%m-%d-%Y %H:%M:%S", gmtime()) | |
204 sys.stderr.write(" >> PROGRAM FINISH TIME: " + time + '\n') | |
205 if __name__== "__main__": | |
206 main() |