annotate SkewIT/src/gcskew.py @ 5:4157f435fa89 draft

Uploaded
author dereeper
date Thu, 30 May 2024 12:11:25 +0000
parents e42d30da7a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/env python
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2 ##########################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 #gcskew.py calculates gcskew values for a given chromosome
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4 #Copyright (C) 2020 Jennifer Lu, jlu26@jhmi.edu
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6 #This file is part of SkewIT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7 #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8 #SkewIT is free software; you can redistribute it and/or modify
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 #it under the terms of the GNU General Public License as published by
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10 #the Free Software Foundation; either version 3 of the license, or
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 #(at your option) any later version.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 #This program is distributed in the hope that it will be useful,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14 #but WITHOUT ANY WARRANTY; without even the implied warranty of
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15 #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16 #GNU General Public License for more details.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18 #You should have received a copy of the GNU General Public License
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19 #along with this program; if not, see <http://www.gnu.org/licenses/>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20 ##############################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21 #Jennifer Lu, jennifer.lu717@gmail.com
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 #02/26/2020
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23 #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24 #This program calculate gc_skew for a genome provided to this program.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 #By default, the program will calculate gc-skew for 20kb windows every 20kb
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26 #(adjacent/non-overlapping windows of 20kb)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 #Users can specify window size and frequency of the calculation. If a specified
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 #frequency is less than the window size, windows will be overlapping
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 ######################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 import sys, os, argparse
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 from time import gmtime
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 from time import strftime
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 from Bio import SeqIO
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35 import numpy as np
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 #####################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 def usage():
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38 sys.stderr.write("\n #########################################################################################\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 sys.stderr.write(" ################################## USAGE: SKEWI.PY ######################################\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 sys.stderr.write(" ## > python gcskew.py -i SEQ.FASTA -o GCSKEW.TXT\t\t\t\t\t##\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41 sys.stderr.write(" ## \t -i SEQ.FASTA............fasta file (multi-fasta permitted)\t\t\t##\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 sys.stderr.write(" ## Optional Parameters:\t\t\t\t\t\t\t\t##\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43 sys.stderr.write(" ## \t -o SKEW.TXT.............output file (see below)\t\t\t\t##\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 sys.stderr.write(" ## \t -k WINDOW_SIZE..........length of subsequences for which to calculate gc skew\t##\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45 sys.stderr.write(" ## \t .....................[default:20000]\t\t\t\t\t##\n" )
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46 sys.stderr.write(" ## \t -f FREQUENCY............number of bases between the start of each window\t##\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47 sys.stderr.write(" ## \t .....................[default: k == f]\t\t\t\t\t##\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 sys.stderr.write(" ## Output file: If no output file is provided, SkewI will be printed to gcskew.txt\t##\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49 sys.stderr.write(" ## \t The output file is a 3 column, tab-delimited file listing:\t\t##\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 sys.stderr.write(" ## \t (1) sequence ID (2) starting index of window (3) gc skew value\t\t##\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51 sys.stderr.write(" #########################################################################################\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 sys.stderr.write(" #########################################################################################\n\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53 exit(0)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 #####################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55 def main():
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 parser = argparse.ArgumentParser()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57 parser.add_argument("-i","--input","-s","--seq",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 dest="in_file", required=False, default="",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59 help="FASTA/Multi-FASTA sequence file for which to calculate gc_skew")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 parser.add_argument("-k","-w","--window-size", default=20000,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61 dest="window_size", required=False, type=int,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62 help="Window size for which to calculate each g-c/g+c [default: 20Kbp]")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63 parser.add_argument("-f","--freq",required=False,type=int,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64 dest="freq", default=-1,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65 help="Frequency at which to calculate GC skew [default: same as window size]")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66 parser.add_argument("-o","--output", required=False, default="gcskew.txt",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67 dest="out_file", help="Output text file to save gc skew calculations [default: gcskew.txt]")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68 parser.add_argument('--usage', required=False, dest='usage',
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69 action='store_true', default=False,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 help='Prints usage information for this program')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71 args=parser.parse_args()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73 ##############################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74 #Test Parameters
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75 if args.usage:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76 usage()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77 if (args.in_file == ""):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78 sys.stderr.write(" >> Please provide an input sequence file\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79 usage()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80 exit(1)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81 if (not os.path.isfile(args.in_file)):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82 sys.stderr.write(" >> ERROR: %s is not a valid file\n" % args.in_file)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 exit(1)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84 if args.freq == -1:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85 freq = args.window_size
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86 else:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87 freq = args.freq
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88 if (args.window_size < 0):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89 sys.stderr.write(" >> ERROR: window size -w must be greater than 0\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90 exit(1)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 if (freq < 0):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92 sys.stderr.write(" >> ERROR: -f/--freq must be greater than 0\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 exit(1)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94 if (args.freq > args.window_size):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95 sys.stderr.write(" >> ERROR: window size must be >= frequency\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96 exit(1)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97 ##############################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98 #Start program
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99 time = strftime("%m-%d-%Y %H:%M:%S", gmtime())
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100 sys.stdout.write("PROGRAM START TIME: " + time + '\n')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101 sys.stdout.write(" Input file: %s\n" % args.in_file)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102 sys.stdout.write(" Output: %s\n" % args.out_file)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103 sys.stdout.write(" Window size (bp): %i\n" % args.window_size)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104 sys.stdout.write(" Frequency (bp): %i\n" % freq)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105 sys.stdout.flush()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106 ############################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 #Process sequence file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108 id2string = {}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109 id2name = {}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110 count_seqs = 0
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111 sys.stdout.write(">> Reading sequence file %s\n" % args.in_file)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112 sys.stdout.write("\t%i seqs found" % (count_seqs))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113 sys.stdout.flush()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114 for record in SeqIO.parse(args.in_file,'fasta'):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 count_seqs += 1
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116 sys.stdout.write("\r\t%i seqs found" % (count_seqs))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 sys.stdout.flush()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118 #Save string
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 id2name[count_seqs] = record.id
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120 id2string[count_seqs] = str(record.seq)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 sys.stdout.write("\r\t%i seqs found\n" % (count_seqs))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122 sys.stdout.flush()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124 #Calculate and plot gc skew
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 tot = count_seqs
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126 count_seqs = 0
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127 sys.stdout.write(">> Calculating/Saving GC Skew for %i seqs\n" % tot)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128 sys.stdout.write("\tworking on sequence %i/%i" % (count_seqs,tot))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 sys.stdout.flush()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130 #Prep output file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131 o_file = open(args.out_file,'w')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132 o_file.write("Sequence\tIndex\tGC Skew (%ikb)\n" % (args.window_size/1000))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133 #For each sequence, calculate gc skew and print
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134 for i in range(1,tot+1):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135 #Update
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136 count_seqs += 1
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 sys.stdout.write("\r\tworking on sequence %i/%i" % (count_seqs,tot))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138 sys.stdout.flush()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139 #Calculate gc skew
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140 my_seq = id2string[i].upper()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141 my_description = id2name[i]
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142 count = 0
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143 #Calculate skew
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144 g = 0.0
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145 c = 0.0
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146 curr_seq = ""
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147 for i in range(0, len(my_seq), freq):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148 if (i+args.window_size) >= len(my_seq):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149 break
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150 curr_seq = my_seq[i:i+args.window_size]
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151 g = float(curr_seq.count("G"))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152 c = float(curr_seq.count("C"))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153 if (g+c) > 0:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
154 new_calc = (g-c)/(g+c)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
155 else:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
156 new_calc = 0.0
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
157 #Print to file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
158 o_file.write("%s\t%i\t%0.8f\n" % (my_description, i, new_calc))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
159 sys.stdout.write("\r\tFinished calculating gc skew for %i sequences\n" % tot)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
160 sys.stdout.flush()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
161 o_file.close()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
162 #End program
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
163 time = strftime("%m-%d-%Y %H:%M:%S", gmtime())
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
164 sys.stdout.write("PROGRAM FINISH TIME: " + time + '\n')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
165
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
166 if __name__== "__main__":
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
167 main()