annotate SkewIT/src/plot_gcskew.py @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #! /usr/bin/env python
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2 ##########################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 #plot_gcskew.py plots gcskew values for a given chromosome
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4 #Copyright (C) 2020 Jennifer Lu, jlu26@jhmi.edu
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6 #This file is part of SkewIT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7 #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8 #SkewIT is free software; you can redistribute it and/or modify
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 #it under the terms of the GNU General Public License as published by
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10 #the Free Software Foundation; either version 3 of the license, or
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 #(at your option) any later version.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 #This program is distributed in the hope that it will be useful,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14 #but WITHOUT ANY WARRANTY; without even the implied warranty of
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15 #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16 #GNU General Public License for more details.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18 #You should have received a copy of the GNU General Public License
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19 #along with this program; if not, see <http://www.gnu.org/licenses/>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20 ##############################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21 #Jennifer Lu, jlu26@jhmi.edu
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 #2019/01/15
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23 #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24 #This program plots gc_skew for a genome provided to this program
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 #By default, the program will calculate gc-skew for 20kb windows every 20kb
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26 #(adjacent/non-overlapping windows of 20kb)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 #Users can specify window size and frequency of the calculation. If a specified
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 #frequency is less than the window size, windows will be overlapping
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 ######################################################################
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 import sys, os, argparse
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 from time import gmtime
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 from time import strftime
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 from Bio import SeqIO
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35 import numpy as np
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 import matplotlib
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38 import matplotlib.pyplot as plt
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 import matplotlib.ticker as tick
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41 def reformat_bp(tick_val, pos):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 val = round(tick_val/1000000,1)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43 new_tick = '{:} Mb'.format(val)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 new_tick = str(new_tick)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45 return new_tick
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47 def main():
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 parser = argparse.ArgumentParser()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49 parser.add_argument("-i","--input","-s","--seq",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 dest="in_file", required=True,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51 help="Sequence for which to calculate gc_skew")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 parser.add_argument("-k","-l","--window_len", default=20000,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53 dest="window_size", required=False, type=int,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 help="Window size for which to calculate each g-c/g+c [default: 20Kbp]")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55 parser.add_argument("--freq",required=False,type=int,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 dest="freq", default=1000,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57 help="Frequency at which to calculate GC skew [default: 1Kbp]")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 parser.add_argument("-o","--output", required=False, default="curr.png",
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59 dest="out_file", help="Name of GC Skew Plot to Save to (default: curr.png)")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 args=parser.parse_args()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62 #Start program
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63 time = strftime("%m-%d-%Y %H:%M:%S", gmtime())
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64 sys.stdout.write(" PROGRAM START TIME: " + time + '\n')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65 if args.freq == 0:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66 freq = args.window_size
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67 else:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68 freq = args.freq
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69 #Process sequence file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 id2string = {}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71 id2name = {}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72 count_seqs = 0
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73 sys.stdout.write("\t>> Processing sequence file\n")
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74 sys.stdout.write("\t\t%i seqs found" % (count_seqs))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75 sys.stdout.flush()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76 for record in SeqIO.parse(args.in_file,'fasta'):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77 count_seqs += 1
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78 if count_seqs % 100 == 0:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79 sys.stdout.write("\r\t\t%i seqs found" % (count_seqs))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80 sys.stdout.flush()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81 #Save string
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82 id2name[count_seqs] = record.id
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 id2string[count_seqs] = str(record.seq)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84 sys.stdout.write("\r\t\t%i seqs found\n" % (count_seqs))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85 sys.stdout.flush()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87 #Calculate and plot gc skew
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88 tot = count_seqs
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89 count_seqs = 0
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90 sys.stdout.write("\t>> Plotting GC Skew for %i seqs\n" % tot)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 sys.stdout.write("\t\t%i sequences processed" % (count_seqs))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92 sys.stdout.flush()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 plot_filename = args.out_file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94 fig,ax = plt.subplots(nrows=tot, ncols=1,figsize=(10,3*tot),squeeze=False)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95 for i in range(1,tot+1):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96 #Calculate/plot skew
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97 my_seq = id2string[i]
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98 my_description = id2name[i]
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99 count = 0
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100 #Calculate skew
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101 g = 0.0
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102 c = 0.0
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103 curr_seq = ""
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104 indices = []
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105 skews = []
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106 for j in range(0, len(my_seq), freq):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 if (j+args.window_size) >= len(my_seq):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108 break
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109 curr_seq = my_seq[j:j+args.window_size]
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110 g = float(curr_seq.count("G"))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111 c = float(curr_seq.count("C"))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112 if (g+c) > 0:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113 new_calc = (g-c)/(g+c)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114 else:
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 new_calc = 0.0
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116 #Save values
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 indices.append(j)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118 skews.append(new_calc)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 #Final print
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120 count_seqs += 1
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 sys.stdout.write("\r\t\t%i sequences processed" % (count_seqs))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122 sys.stdout.flush()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123 #Split to pos/neg
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124 s = np.array(skews)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 pos = np.ma.masked_where(s <= 0, s)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126 neg = np.ma.masked_where(s >= 0, s)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127 mid = np.ma.masked_where(s != 0, s)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128 #Print to file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 lines = ax[i-1,0].plot(indices,pos, 'deepskyblue', indices,mid, 'g', indices,neg,'k')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130 ax[i-1,0].set(xlabel='Genome Position', ylabel='GC Skew',
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131 title=my_description)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132 ax[i-1,0].xaxis.set_major_formatter(tick.FuncFormatter(reformat_bp))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133 ax[i-1,0].grid()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134 #Font Sizes
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135 plt.setp(lines,linewidth=0.3)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136 plt.xticks(fontsize=8)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 plt.yticks(fontsize=8)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138 plt.tight_layout()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139 fig.savefig(plot_filename)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140 sys.stdout.write("\r\t\t%i sequences processed (all finished)\n" % (count_seqs))
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141 sys.stdout.flush()
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143 #End program
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144 time = strftime("%m-%d-%Y %H:%M:%S", gmtime())
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145 sys.stdout.write(" PROGRAM FINISH TIME: " + time + '\n')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147 if __name__== "__main__":
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148 main()