3
|
1 #! /usr/bin/env python
|
|
2 ##########################################################################
|
|
3 #skewi.py calculates a single SkewI value for a given chromosome
|
|
4 #Copyright (C) 2020 Jennifer Lu, jlu26@jhmi.edu
|
|
5 #
|
|
6 #This file is part of SkewIT
|
|
7 #
|
|
8 #SkewIT is free software; you can redistribute it and/or modify
|
|
9 #it under the terms of the GNU General Public License as published by
|
|
10 #the Free Software Foundation; either version 3 of the license, or
|
|
11 #(at your option) any later version.
|
|
12
|
|
13 #This program is distributed in the hope that it will be useful,
|
|
14 #but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15 #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16 #GNU General Public License for more details.
|
|
17
|
|
18 #You should have received a copy of the GNU General Public License
|
|
19 #along with this program; if not, see <http://www.gnu.org/licenses/>.
|
|
20
|
|
21 #####################################################################
|
|
22 #Jennifer Lu, jennifer.lu717@gmail.com
|
|
23 #02/17/2020
|
|
24 #
|
|
25 #This program calculates a Skew Index (SkewI)
|
|
26 #for a given genome within the range 0 to 1. Higher SkewI values
|
|
27 #indicate a strong GC Skew signal while lower SkewI values
|
|
28 #indicate a potentially low GC Skew signal
|
|
29 #
|
|
30 #Given a multi-fasta file, the program will output one SkewI per sequence
|
|
31 ######################################################################
|
|
32 import sys, os, argparse
|
|
33 from time import gmtime
|
|
34 from time import strftime
|
|
35 from Bio import SeqIO
|
|
36 import numpy as np
|
|
37 #####################################################################
|
|
38 def usage():
|
|
39 sys.stderr.write("\n #########################################################################################\n")
|
|
40 sys.stderr.write(" ################################## USAGE: SKEWI.PY ######################################\n")
|
|
41 sys.stderr.write(" ## > python skewi.py -i SEQ.FASTA -o SKEW.TXT\t\t\t\t\t\t##\n")
|
|
42 sys.stderr.write(" ## \t -i SEQ.FASTA............fasta file (multi-fasta permitted)\t\t\t##\n")
|
|
43 sys.stderr.write(" ## Optional Parameters:\t\t\t\t\t\t\t\t##\n")
|
|
44 sys.stderr.write(" ## \t -o SKEW.TXT.............output file (see below)\t\t\t\t##\n")
|
|
45 sys.stderr.write(" ## \t -k WINDOW_SIZE..........length of subsequences for which to calculate gc skew\t##\n")
|
|
46 sys.stderr.write(" ## \t .....................[default:20000]\t\t\t\t\t##\n" )
|
|
47 sys.stderr.write(" ## \t -f FREQUENCY............number of bases between the start of each window\t##\n")
|
|
48 sys.stderr.write(" ## \t .....................[default: k == f]\t\t\t\t\t##\n")
|
|
49 sys.stderr.write(" ## \t --min-seq-len LEN.......set a minimum sequence length\t\t\t\t##\n")
|
|
50 sys.stderr.write(" ## \t .....................[default: 500,000bp]\t\t\t\t\t##\n")
|
|
51 sys.stderr.write(" ## \t --complete/--all........only analyze sequences with 'complete' in header\t##\n")
|
|
52 sys.stderr.write(" ## \t .....................[default: --complete]\t\t\t\t\t##\n")
|
|
53 sys.stderr.write(" ## \t --plasmid/--no-plasmid..include/exclude plasmid sequences from analysis\t##\n")
|
|
54 sys.stderr.write(" ## \t .....................[default: --no-plasmid]\t\t\t\t##\n")
|
|
55 sys.stderr.write(" ## Output file: If no output file is provided, SkewI will be printed to standard out\t##\n")
|
|
56 sys.stderr.write(" ## \t Otherwise, a tab-delimited file will be generated:\t\t\t##\n")
|
|
57 sys.stderr.write(" ## \t with two columns: 1) sequence ID 2) skewI value\t\t\t\t##\n")
|
|
58 sys.stderr.write(" #########################################################################################\n")
|
|
59 sys.stderr.write(" #########################################################################################\n\n")
|
|
60 exit(0)
|
|
61 #####################################################################
|
|
62 def main():
|
|
63 parser = argparse.ArgumentParser()
|
|
64 #Required parameter:
|
|
65 parser.add_argument("-i","--input","-s","--seq",
|
|
66 dest="in_file", required=False, default="",
|
|
67 help="Sequence file for which to calculate gc_skew")
|
|
68 #Defaults =20K/Same as window size
|
|
69 parser.add_argument("-k","-w","--window-len",
|
|
70 dest="window_size", required=False, default=20000, type=int,
|
|
71 help="Window size for which to calculate each sign(g-c) [default: 20kb]")
|
|
72 parser.add_argument("-f","--freq", dest="freq", type=int, default=-1,
|
|
73 help="Length between the start indices of each window. \
|
|
74 [default: same as window_len, must be less than window size]")
|
|
75 #No output provided = standard out
|
|
76 parser.add_argument('-o','--output', required=False,
|
|
77 dest="out_file",default="",
|
|
78 help="Output text file to save skewi (skew index values)")
|
|
79 #Complete Genomes/Plasmid Sequence options
|
|
80 parser.add_argument('--complete', required=False, dest='complete_tf',
|
|
81 action='store_true', default=True, help='Only analyze complete genomes')
|
|
82 parser.add_argument('--all', required=False, dest='complete_tf',
|
|
83 action='store_false', default=True, help='Analyze complete and draft genomes')
|
|
84 parser.add_argument('--plasmid', required=False, dest='plasmid_tf',
|
|
85 action='store_true', default=False, help='Analyze plasmid sequences')
|
|
86 parser.add_argument('--no-plasmid', required=False, dest='plasmid_tf',
|
|
87 action='store_false', default=False, help='Exclude plasmid sequences')
|
|
88 #Sequence length minimum
|
|
89 parser.add_argument('--min-len', required=False, dest='min_seq_len',
|
|
90 type=int, default=500000,
|
|
91 help='Minimum sequence length to analyze [default: 500kb]')
|
|
92 #Usage option
|
|
93 parser.add_argument('--usage', required=False, dest='usage',
|
|
94 action='store_true', default=False,
|
|
95 help='Prints usage information for this program')
|
|
96 args=parser.parse_args()
|
|
97
|
|
98 ########################
|
|
99 #Test parameters
|
|
100 if args.usage:
|
|
101 usage()
|
|
102 if (args.in_file == ""):
|
|
103 sys.stderr.write(" >> Please provide an input sequence file\n")
|
|
104 usage()
|
|
105 exit(1)
|
|
106 if (not os.path.isfile(args.in_file)):
|
|
107 sys.stderr.write(" >> ERROR: %s is not a valid file\n" % args.in_file)
|
|
108 exit(1)
|
|
109 #Lengths
|
|
110 if (args.freq == -1):
|
|
111 args.freq = args.window_size
|
|
112 if (args.window_size < 0):
|
|
113 sys.stderr.write(" >> ERROR: --window-size must be greater than 0\n")
|
|
114 exit(1)
|
|
115 if (args.freq < 0):
|
|
116 sys.stderr.write(" >> ERROR: --freq must be greater than 0\n")
|
|
117 exit(1)
|
|
118 if (args.freq > args.window_size):
|
|
119 sys.stderr.write(" >> ERROR: window size must be >= frequency\n")
|
|
120 exit(1)
|
|
121 ########################
|
|
122 #Start program
|
|
123 time = strftime("%m-%d-%Y %H:%M:%S", gmtime())
|
|
124 sys.stderr.write(" >> PROGRAM START TIME: " + time + '\n')
|
|
125 sys.stderr.write(" Input file: %s\n" % args.in_file)
|
|
126 if args.out_file != "":
|
|
127 sys.stderr.write(" Output: %s\n" % args.out_file)
|
|
128 else:
|
|
129 sys.stderr.write(" Output: system standard out\n")
|
|
130 sys.stderr.write(" Window size (bp): %i\n" % args.window_size)
|
|
131 sys.stderr.write(" Frequency (bp): %i\n" % args.freq)
|
|
132 sys.stderr.write(" Minimum sequence length (bp): %i\n" % args.min_seq_len)
|
|
133 sys.stderr.write(" Complete sequences only: %s\n" % args.complete_tf)
|
|
134 sys.stderr.write(" Exclude plasmids: %s\n" % (not args.plasmid_tf))
|
|
135 sys.stderr.flush()
|
|
136 ########################
|
|
137 count_seqs = 0
|
|
138 argmax = []
|
|
139 seq2skewi = {}
|
|
140 sys.stderr.write("\n")
|
|
141 sys.stderr.write(" >> Processing file: %s\n" % args.in_file)
|
|
142 sys.stderr.write("\r\t%i seqs evaluated: " % (count_seqs))
|
|
143 for record in SeqIO.parse(args.in_file,'fasta'):
|
|
144 my_description = str(record.description)
|
|
145 if ("complete" not in my_description) and args.complete_tf:
|
|
146 continue
|
|
147 if ("plasmid" in my_description) and not args.plasmid_tf:
|
|
148 continue
|
|
149 my_seq = str(record.seq)
|
|
150 if len(my_seq) < args.min_seq_len:
|
|
151 continue
|
|
152 count_seqs += 1
|
|
153 #Get sequence and description
|
|
154 #Print Update
|
|
155 count = 0
|
|
156 tot = len(my_seq)/args.window_size
|
|
157 #Calculate skew
|
|
158 skew = []
|
|
159 for i in range(0,len(my_seq),args.window_size):
|
|
160 g = my_seq[i:i+args.window_size].count("G")
|
|
161 c = my_seq[i:i+args.window_size].count("C")
|
|
162 if (g-c) > 0:
|
|
163 skew.append(1)
|
|
164 elif (g-c) < 0:
|
|
165 skew.append(-1)
|
|
166 else:
|
|
167 skew.append(0)
|
|
168 #Final print
|
|
169 curr_size = len(skew)/2
|
|
170 sys.stdout.flush()
|
|
171 skew += skew[:curr_size]
|
|
172 #Calculate abs(T2-T1) where T is sum of values
|
|
173 curr_diffs = []
|
|
174 x = sum(skew[0:curr_size])
|
|
175 y = sum(skew[curr_size:2*curr_size])
|
|
176 for i in range(0, curr_size-1):
|
|
177 curr_diffs.append(abs(x-y))
|
|
178 x = x - skew[i] + skew[i+curr_size]
|
|
179 y = y - skew[i+curr_size] + skew[i+2*curr_size]
|
|
180 curr_diffs.append(abs(x-y))
|
|
181 if len(curr_diffs) > 0:
|
|
182 max_cd = float(max(curr_diffs))
|
|
183 seq2skewi[my_description] = max_cd/float(len(my_seq))*float(args.window_size)
|
|
184 sys.stderr.write("\r\t%i sequences evaluated" % (count_seqs))
|
|
185 sys.stderr.flush()
|
|
186
|
|
187 sys.stderr.write("\r\t%i total sequences evaluated (all finished)\n" % (count_seqs))
|
|
188 sys.stderr.flush()
|
|
189
|
|
190 if args.out_file == "":
|
|
191 sys.stdout.write("Sequence\tSkewI\n")
|
|
192 for seq in seq2skewi:
|
|
193 sys.stdout.write("%s\t%0.10f\n" % (seq,seq2skewi[seq]))
|
|
194 else:
|
|
195 sys.stdout.write(" >> Printing SkewI values to file: %s\n" % args.out_file)
|
|
196 o_file = open(args.out_file,'w')
|
|
197 o_file.write("Sequence\tSkewI\n")
|
|
198 for seq in seq2skewi:
|
|
199 o_file.write("%s\t%0.10f\n" % (seq,seq2skewi[seq]))
|
|
200 o_file.close()
|
|
201
|
|
202 #End program
|
|
203 time = strftime("%m-%d-%Y %H:%M:%S", gmtime())
|
|
204 sys.stderr.write(" >> PROGRAM FINISH TIME: " + time + '\n')
|
|
205 if __name__== "__main__":
|
|
206 main()
|