comparison SkewIT/src/plot_gcskew.py @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
comparison
equal deleted inserted replaced
2:97e4e3e818b6 3:e42d30da7a74
1 #! /usr/bin/env python
2 ##########################################################################
3 #plot_gcskew.py plots gcskew values for a given chromosome
4 #Copyright (C) 2020 Jennifer Lu, jlu26@jhmi.edu
5 #
6 #This file is part of SkewIT
7 #
8 #SkewIT is free software; you can redistribute it and/or modify
9 #it under the terms of the GNU General Public License as published by
10 #the Free Software Foundation; either version 3 of the license, or
11 #(at your option) any later version.
12
13 #This program is distributed in the hope that it will be useful,
14 #but WITHOUT ANY WARRANTY; without even the implied warranty of
15 #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 #GNU General Public License for more details.
17
18 #You should have received a copy of the GNU General Public License
19 #along with this program; if not, see <http://www.gnu.org/licenses/>.
20 ##############################################################################
21 #Jennifer Lu, jlu26@jhmi.edu
22 #2019/01/15
23 #
24 #This program plots gc_skew for a genome provided to this program
25 #By default, the program will calculate gc-skew for 20kb windows every 20kb
26 #(adjacent/non-overlapping windows of 20kb)
27 #
28 #Users can specify window size and frequency of the calculation. If a specified
29 #frequency is less than the window size, windows will be overlapping
30 ######################################################################
31 import sys, os, argparse
32 from time import gmtime
33 from time import strftime
34 from Bio import SeqIO
35 import numpy as np
36
37 import matplotlib
38 import matplotlib.pyplot as plt
39 import matplotlib.ticker as tick
40
41 def reformat_bp(tick_val, pos):
42 val = round(tick_val/1000000,1)
43 new_tick = '{:} Mb'.format(val)
44 new_tick = str(new_tick)
45 return new_tick
46
47 def main():
48 parser = argparse.ArgumentParser()
49 parser.add_argument("-i","--input","-s","--seq",
50 dest="in_file", required=True,
51 help="Sequence for which to calculate gc_skew")
52 parser.add_argument("-k","-l","--window_len", default=20000,
53 dest="window_size", required=False, type=int,
54 help="Window size for which to calculate each g-c/g+c [default: 20Kbp]")
55 parser.add_argument("--freq",required=False,type=int,
56 dest="freq", default=1000,
57 help="Frequency at which to calculate GC skew [default: 1Kbp]")
58 parser.add_argument("-o","--output", required=False, default="curr.png",
59 dest="out_file", help="Name of GC Skew Plot to Save to (default: curr.png)")
60 args=parser.parse_args()
61
62 #Start program
63 time = strftime("%m-%d-%Y %H:%M:%S", gmtime())
64 sys.stdout.write(" PROGRAM START TIME: " + time + '\n')
65 if args.freq == 0:
66 freq = args.window_size
67 else:
68 freq = args.freq
69 #Process sequence file
70 id2string = {}
71 id2name = {}
72 count_seqs = 0
73 sys.stdout.write("\t>> Processing sequence file\n")
74 sys.stdout.write("\t\t%i seqs found" % (count_seqs))
75 sys.stdout.flush()
76 for record in SeqIO.parse(args.in_file,'fasta'):
77 count_seqs += 1
78 if count_seqs % 100 == 0:
79 sys.stdout.write("\r\t\t%i seqs found" % (count_seqs))
80 sys.stdout.flush()
81 #Save string
82 id2name[count_seqs] = record.id
83 id2string[count_seqs] = str(record.seq)
84 sys.stdout.write("\r\t\t%i seqs found\n" % (count_seqs))
85 sys.stdout.flush()
86
87 #Calculate and plot gc skew
88 tot = count_seqs
89 count_seqs = 0
90 sys.stdout.write("\t>> Plotting GC Skew for %i seqs\n" % tot)
91 sys.stdout.write("\t\t%i sequences processed" % (count_seqs))
92 sys.stdout.flush()
93 plot_filename = args.out_file
94 fig,ax = plt.subplots(nrows=tot, ncols=1,figsize=(10,3*tot),squeeze=False)
95 for i in range(1,tot+1):
96 #Calculate/plot skew
97 my_seq = id2string[i]
98 my_description = id2name[i]
99 count = 0
100 #Calculate skew
101 g = 0.0
102 c = 0.0
103 curr_seq = ""
104 indices = []
105 skews = []
106 for j in range(0, len(my_seq), freq):
107 if (j+args.window_size) >= len(my_seq):
108 break
109 curr_seq = my_seq[j:j+args.window_size]
110 g = float(curr_seq.count("G"))
111 c = float(curr_seq.count("C"))
112 if (g+c) > 0:
113 new_calc = (g-c)/(g+c)
114 else:
115 new_calc = 0.0
116 #Save values
117 indices.append(j)
118 skews.append(new_calc)
119 #Final print
120 count_seqs += 1
121 sys.stdout.write("\r\t\t%i sequences processed" % (count_seqs))
122 sys.stdout.flush()
123 #Split to pos/neg
124 s = np.array(skews)
125 pos = np.ma.masked_where(s <= 0, s)
126 neg = np.ma.masked_where(s >= 0, s)
127 mid = np.ma.masked_where(s != 0, s)
128 #Print to file
129 lines = ax[i-1,0].plot(indices,pos, 'deepskyblue', indices,mid, 'g', indices,neg,'k')
130 ax[i-1,0].set(xlabel='Genome Position', ylabel='GC Skew',
131 title=my_description)
132 ax[i-1,0].xaxis.set_major_formatter(tick.FuncFormatter(reformat_bp))
133 ax[i-1,0].grid()
134 #Font Sizes
135 plt.setp(lines,linewidth=0.3)
136 plt.xticks(fontsize=8)
137 plt.yticks(fontsize=8)
138 plt.tight_layout()
139 fig.savefig(plot_filename)
140 sys.stdout.write("\r\t\t%i sequences processed (all finished)\n" % (count_seqs))
141 sys.stdout.flush()
142
143 #End program
144 time = strftime("%m-%d-%Y %H:%M:%S", gmtime())
145 sys.stdout.write(" PROGRAM FINISH TIME: " + time + '\n')
146
147 if __name__== "__main__":
148 main()