Mercurial > repos > dereeper > pangenome_explorer
comparison SkewIT/src/plot_gcskew.py @ 3:e42d30da7a74 draft
Uploaded
| author | dereeper |
|---|---|
| date | Thu, 30 May 2024 11:52:25 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:97e4e3e818b6 | 3:e42d30da7a74 |
|---|---|
| 1 #! /usr/bin/env python | |
| 2 ########################################################################## | |
| 3 #plot_gcskew.py plots gcskew values for a given chromosome | |
| 4 #Copyright (C) 2020 Jennifer Lu, jlu26@jhmi.edu | |
| 5 # | |
| 6 #This file is part of SkewIT | |
| 7 # | |
| 8 #SkewIT is free software; you can redistribute it and/or modify | |
| 9 #it under the terms of the GNU General Public License as published by | |
| 10 #the Free Software Foundation; either version 3 of the license, or | |
| 11 #(at your option) any later version. | |
| 12 | |
| 13 #This program is distributed in the hope that it will be useful, | |
| 14 #but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 16 #GNU General Public License for more details. | |
| 17 | |
| 18 #You should have received a copy of the GNU General Public License | |
| 19 #along with this program; if not, see <http://www.gnu.org/licenses/>. | |
| 20 ############################################################################## | |
| 21 #Jennifer Lu, jlu26@jhmi.edu | |
| 22 #2019/01/15 | |
| 23 # | |
| 24 #This program plots gc_skew for a genome provided to this program | |
| 25 #By default, the program will calculate gc-skew for 20kb windows every 20kb | |
| 26 #(adjacent/non-overlapping windows of 20kb) | |
| 27 # | |
| 28 #Users can specify window size and frequency of the calculation. If a specified | |
| 29 #frequency is less than the window size, windows will be overlapping | |
| 30 ###################################################################### | |
| 31 import sys, os, argparse | |
| 32 from time import gmtime | |
| 33 from time import strftime | |
| 34 from Bio import SeqIO | |
| 35 import numpy as np | |
| 36 | |
| 37 import matplotlib | |
| 38 import matplotlib.pyplot as plt | |
| 39 import matplotlib.ticker as tick | |
| 40 | |
| 41 def reformat_bp(tick_val, pos): | |
| 42 val = round(tick_val/1000000,1) | |
| 43 new_tick = '{:} Mb'.format(val) | |
| 44 new_tick = str(new_tick) | |
| 45 return new_tick | |
| 46 | |
| 47 def main(): | |
| 48 parser = argparse.ArgumentParser() | |
| 49 parser.add_argument("-i","--input","-s","--seq", | |
| 50 dest="in_file", required=True, | |
| 51 help="Sequence for which to calculate gc_skew") | |
| 52 parser.add_argument("-k","-l","--window_len", default=20000, | |
| 53 dest="window_size", required=False, type=int, | |
| 54 help="Window size for which to calculate each g-c/g+c [default: 20Kbp]") | |
| 55 parser.add_argument("--freq",required=False,type=int, | |
| 56 dest="freq", default=1000, | |
| 57 help="Frequency at which to calculate GC skew [default: 1Kbp]") | |
| 58 parser.add_argument("-o","--output", required=False, default="curr.png", | |
| 59 dest="out_file", help="Name of GC Skew Plot to Save to (default: curr.png)") | |
| 60 args=parser.parse_args() | |
| 61 | |
| 62 #Start program | |
| 63 time = strftime("%m-%d-%Y %H:%M:%S", gmtime()) | |
| 64 sys.stdout.write(" PROGRAM START TIME: " + time + '\n') | |
| 65 if args.freq == 0: | |
| 66 freq = args.window_size | |
| 67 else: | |
| 68 freq = args.freq | |
| 69 #Process sequence file | |
| 70 id2string = {} | |
| 71 id2name = {} | |
| 72 count_seqs = 0 | |
| 73 sys.stdout.write("\t>> Processing sequence file\n") | |
| 74 sys.stdout.write("\t\t%i seqs found" % (count_seqs)) | |
| 75 sys.stdout.flush() | |
| 76 for record in SeqIO.parse(args.in_file,'fasta'): | |
| 77 count_seqs += 1 | |
| 78 if count_seqs % 100 == 0: | |
| 79 sys.stdout.write("\r\t\t%i seqs found" % (count_seqs)) | |
| 80 sys.stdout.flush() | |
| 81 #Save string | |
| 82 id2name[count_seqs] = record.id | |
| 83 id2string[count_seqs] = str(record.seq) | |
| 84 sys.stdout.write("\r\t\t%i seqs found\n" % (count_seqs)) | |
| 85 sys.stdout.flush() | |
| 86 | |
| 87 #Calculate and plot gc skew | |
| 88 tot = count_seqs | |
| 89 count_seqs = 0 | |
| 90 sys.stdout.write("\t>> Plotting GC Skew for %i seqs\n" % tot) | |
| 91 sys.stdout.write("\t\t%i sequences processed" % (count_seqs)) | |
| 92 sys.stdout.flush() | |
| 93 plot_filename = args.out_file | |
| 94 fig,ax = plt.subplots(nrows=tot, ncols=1,figsize=(10,3*tot),squeeze=False) | |
| 95 for i in range(1,tot+1): | |
| 96 #Calculate/plot skew | |
| 97 my_seq = id2string[i] | |
| 98 my_description = id2name[i] | |
| 99 count = 0 | |
| 100 #Calculate skew | |
| 101 g = 0.0 | |
| 102 c = 0.0 | |
| 103 curr_seq = "" | |
| 104 indices = [] | |
| 105 skews = [] | |
| 106 for j in range(0, len(my_seq), freq): | |
| 107 if (j+args.window_size) >= len(my_seq): | |
| 108 break | |
| 109 curr_seq = my_seq[j:j+args.window_size] | |
| 110 g = float(curr_seq.count("G")) | |
| 111 c = float(curr_seq.count("C")) | |
| 112 if (g+c) > 0: | |
| 113 new_calc = (g-c)/(g+c) | |
| 114 else: | |
| 115 new_calc = 0.0 | |
| 116 #Save values | |
| 117 indices.append(j) | |
| 118 skews.append(new_calc) | |
| 119 #Final print | |
| 120 count_seqs += 1 | |
| 121 sys.stdout.write("\r\t\t%i sequences processed" % (count_seqs)) | |
| 122 sys.stdout.flush() | |
| 123 #Split to pos/neg | |
| 124 s = np.array(skews) | |
| 125 pos = np.ma.masked_where(s <= 0, s) | |
| 126 neg = np.ma.masked_where(s >= 0, s) | |
| 127 mid = np.ma.masked_where(s != 0, s) | |
| 128 #Print to file | |
| 129 lines = ax[i-1,0].plot(indices,pos, 'deepskyblue', indices,mid, 'g', indices,neg,'k') | |
| 130 ax[i-1,0].set(xlabel='Genome Position', ylabel='GC Skew', | |
| 131 title=my_description) | |
| 132 ax[i-1,0].xaxis.set_major_formatter(tick.FuncFormatter(reformat_bp)) | |
| 133 ax[i-1,0].grid() | |
| 134 #Font Sizes | |
| 135 plt.setp(lines,linewidth=0.3) | |
| 136 plt.xticks(fontsize=8) | |
| 137 plt.yticks(fontsize=8) | |
| 138 plt.tight_layout() | |
| 139 fig.savefig(plot_filename) | |
| 140 sys.stdout.write("\r\t\t%i sequences processed (all finished)\n" % (count_seqs)) | |
| 141 sys.stdout.flush() | |
| 142 | |
| 143 #End program | |
| 144 time = strftime("%m-%d-%Y %H:%M:%S", gmtime()) | |
| 145 sys.stdout.write(" PROGRAM FINISH TIME: " + time + '\n') | |
| 146 | |
| 147 if __name__== "__main__": | |
| 148 main() |
