Mercurial > repos > dereeper > pangenome_explorer
comparison SkewIT/src/gcskew.py @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:97e4e3e818b6 | 3:e42d30da7a74 |
---|---|
1 #!/usr/bin/env python | |
2 ########################################################################## | |
3 #gcskew.py calculates gcskew values for a given chromosome | |
4 #Copyright (C) 2020 Jennifer Lu, jlu26@jhmi.edu | |
5 # | |
6 #This file is part of SkewIT | |
7 # | |
8 #SkewIT is free software; you can redistribute it and/or modify | |
9 #it under the terms of the GNU General Public License as published by | |
10 #the Free Software Foundation; either version 3 of the license, or | |
11 #(at your option) any later version. | |
12 | |
13 #This program is distributed in the hope that it will be useful, | |
14 #but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 #GNU General Public License for more details. | |
17 | |
18 #You should have received a copy of the GNU General Public License | |
19 #along with this program; if not, see <http://www.gnu.org/licenses/>. | |
20 ############################################################################## | |
21 #Jennifer Lu, jennifer.lu717@gmail.com | |
22 #02/26/2020 | |
23 # | |
24 #This program calculate gc_skew for a genome provided to this program. | |
25 #By default, the program will calculate gc-skew for 20kb windows every 20kb | |
26 #(adjacent/non-overlapping windows of 20kb) | |
27 # | |
28 #Users can specify window size and frequency of the calculation. If a specified | |
29 #frequency is less than the window size, windows will be overlapping | |
30 ###################################################################### | |
31 import sys, os, argparse | |
32 from time import gmtime | |
33 from time import strftime | |
34 from Bio import SeqIO | |
35 import numpy as np | |
36 ##################################################################### | |
37 def usage(): | |
38 sys.stderr.write("\n #########################################################################################\n") | |
39 sys.stderr.write(" ################################## USAGE: SKEWI.PY ######################################\n") | |
40 sys.stderr.write(" ## > python gcskew.py -i SEQ.FASTA -o GCSKEW.TXT\t\t\t\t\t##\n") | |
41 sys.stderr.write(" ## \t -i SEQ.FASTA............fasta file (multi-fasta permitted)\t\t\t##\n") | |
42 sys.stderr.write(" ## Optional Parameters:\t\t\t\t\t\t\t\t##\n") | |
43 sys.stderr.write(" ## \t -o SKEW.TXT.............output file (see below)\t\t\t\t##\n") | |
44 sys.stderr.write(" ## \t -k WINDOW_SIZE..........length of subsequences for which to calculate gc skew\t##\n") | |
45 sys.stderr.write(" ## \t .....................[default:20000]\t\t\t\t\t##\n" ) | |
46 sys.stderr.write(" ## \t -f FREQUENCY............number of bases between the start of each window\t##\n") | |
47 sys.stderr.write(" ## \t .....................[default: k == f]\t\t\t\t\t##\n") | |
48 sys.stderr.write(" ## Output file: If no output file is provided, SkewI will be printed to gcskew.txt\t##\n") | |
49 sys.stderr.write(" ## \t The output file is a 3 column, tab-delimited file listing:\t\t##\n") | |
50 sys.stderr.write(" ## \t (1) sequence ID (2) starting index of window (3) gc skew value\t\t##\n") | |
51 sys.stderr.write(" #########################################################################################\n") | |
52 sys.stderr.write(" #########################################################################################\n\n") | |
53 exit(0) | |
54 ##################################################################### | |
55 def main(): | |
56 parser = argparse.ArgumentParser() | |
57 parser.add_argument("-i","--input","-s","--seq", | |
58 dest="in_file", required=False, default="", | |
59 help="FASTA/Multi-FASTA sequence file for which to calculate gc_skew") | |
60 parser.add_argument("-k","-w","--window-size", default=20000, | |
61 dest="window_size", required=False, type=int, | |
62 help="Window size for which to calculate each g-c/g+c [default: 20Kbp]") | |
63 parser.add_argument("-f","--freq",required=False,type=int, | |
64 dest="freq", default=-1, | |
65 help="Frequency at which to calculate GC skew [default: same as window size]") | |
66 parser.add_argument("-o","--output", required=False, default="gcskew.txt", | |
67 dest="out_file", help="Output text file to save gc skew calculations [default: gcskew.txt]") | |
68 parser.add_argument('--usage', required=False, dest='usage', | |
69 action='store_true', default=False, | |
70 help='Prints usage information for this program') | |
71 args=parser.parse_args() | |
72 | |
73 ############################################## | |
74 #Test Parameters | |
75 if args.usage: | |
76 usage() | |
77 if (args.in_file == ""): | |
78 sys.stderr.write(" >> Please provide an input sequence file\n") | |
79 usage() | |
80 exit(1) | |
81 if (not os.path.isfile(args.in_file)): | |
82 sys.stderr.write(" >> ERROR: %s is not a valid file\n" % args.in_file) | |
83 exit(1) | |
84 if args.freq == -1: | |
85 freq = args.window_size | |
86 else: | |
87 freq = args.freq | |
88 if (args.window_size < 0): | |
89 sys.stderr.write(" >> ERROR: window size -w must be greater than 0\n") | |
90 exit(1) | |
91 if (freq < 0): | |
92 sys.stderr.write(" >> ERROR: -f/--freq must be greater than 0\n") | |
93 exit(1) | |
94 if (args.freq > args.window_size): | |
95 sys.stderr.write(" >> ERROR: window size must be >= frequency\n") | |
96 exit(1) | |
97 ############################################## | |
98 #Start program | |
99 time = strftime("%m-%d-%Y %H:%M:%S", gmtime()) | |
100 sys.stdout.write("PROGRAM START TIME: " + time + '\n') | |
101 sys.stdout.write(" Input file: %s\n" % args.in_file) | |
102 sys.stdout.write(" Output: %s\n" % args.out_file) | |
103 sys.stdout.write(" Window size (bp): %i\n" % args.window_size) | |
104 sys.stdout.write(" Frequency (bp): %i\n" % freq) | |
105 sys.stdout.flush() | |
106 ############################################################################ | |
107 #Process sequence file | |
108 id2string = {} | |
109 id2name = {} | |
110 count_seqs = 0 | |
111 sys.stdout.write(">> Reading sequence file %s\n" % args.in_file) | |
112 sys.stdout.write("\t%i seqs found" % (count_seqs)) | |
113 sys.stdout.flush() | |
114 for record in SeqIO.parse(args.in_file,'fasta'): | |
115 count_seqs += 1 | |
116 sys.stdout.write("\r\t%i seqs found" % (count_seqs)) | |
117 sys.stdout.flush() | |
118 #Save string | |
119 id2name[count_seqs] = record.id | |
120 id2string[count_seqs] = str(record.seq) | |
121 sys.stdout.write("\r\t%i seqs found\n" % (count_seqs)) | |
122 sys.stdout.flush() | |
123 | |
124 #Calculate and plot gc skew | |
125 tot = count_seqs | |
126 count_seqs = 0 | |
127 sys.stdout.write(">> Calculating/Saving GC Skew for %i seqs\n" % tot) | |
128 sys.stdout.write("\tworking on sequence %i/%i" % (count_seqs,tot)) | |
129 sys.stdout.flush() | |
130 #Prep output file | |
131 o_file = open(args.out_file,'w') | |
132 o_file.write("Sequence\tIndex\tGC Skew (%ikb)\n" % (args.window_size/1000)) | |
133 #For each sequence, calculate gc skew and print | |
134 for i in range(1,tot+1): | |
135 #Update | |
136 count_seqs += 1 | |
137 sys.stdout.write("\r\tworking on sequence %i/%i" % (count_seqs,tot)) | |
138 sys.stdout.flush() | |
139 #Calculate gc skew | |
140 my_seq = id2string[i].upper() | |
141 my_description = id2name[i] | |
142 count = 0 | |
143 #Calculate skew | |
144 g = 0.0 | |
145 c = 0.0 | |
146 curr_seq = "" | |
147 for i in range(0, len(my_seq), freq): | |
148 if (i+args.window_size) >= len(my_seq): | |
149 break | |
150 curr_seq = my_seq[i:i+args.window_size] | |
151 g = float(curr_seq.count("G")) | |
152 c = float(curr_seq.count("C")) | |
153 if (g+c) > 0: | |
154 new_calc = (g-c)/(g+c) | |
155 else: | |
156 new_calc = 0.0 | |
157 #Print to file | |
158 o_file.write("%s\t%i\t%0.8f\n" % (my_description, i, new_calc)) | |
159 sys.stdout.write("\r\tFinished calculating gc skew for %i sequences\n" % tot) | |
160 sys.stdout.flush() | |
161 o_file.close() | |
162 #End program | |
163 time = strftime("%m-%d-%Y %H:%M:%S", gmtime()) | |
164 sys.stdout.write("PROGRAM FINISH TIME: " + time + '\n') | |
165 | |
166 if __name__== "__main__": | |
167 main() |