diff SkewIT/src/gcskew.py @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SkewIT/src/gcskew.py	Thu May 30 11:52:25 2024 +0000
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+##########################################################################
+#gcskew.py calculates gcskew values for a given chromosome
+#Copyright (C) 2020 Jennifer Lu, jlu26@jhmi.edu
+#
+#This file is part of SkewIT 
+#
+#SkewIT is free software; you can redistribute it and/or modify
+#it under the terms of the GNU General Public License as published by
+#the Free Software Foundation; either version 3 of the license, or
+#(at your option) any later version.
+
+#This program is distributed in the hope that it will be useful,
+#but WITHOUT ANY WARRANTY; without even the implied warranty of
+#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+#GNU General Public License for more details.
+
+#You should have received a copy of the GNU General Public License
+#along with this program; if not, see <http://www.gnu.org/licenses/>.
+##############################################################################
+#Jennifer Lu, jennifer.lu717@gmail.com
+#02/26/2020
+#
+#This program calculate gc_skew for a genome provided to this program.
+#By default, the program will calculate gc-skew for 20kb windows every 20kb 
+#(adjacent/non-overlapping windows of 20kb)
+#
+#Users can specify window size and frequency of the calculation. If a specified
+#frequency is less than the window size, windows will be overlapping 
+######################################################################
+import sys, os, argparse 
+from time import gmtime
+from time import strftime
+from Bio import SeqIO
+import numpy as np
+#####################################################################
+def usage():
+    sys.stderr.write("\n #########################################################################################\n")
+    sys.stderr.write(" ################################## USAGE: SKEWI.PY ######################################\n")
+    sys.stderr.write(" ## > python gcskew.py -i SEQ.FASTA -o GCSKEW.TXT\t\t\t\t\t##\n")
+    sys.stderr.write(" ## \t -i SEQ.FASTA............fasta file (multi-fasta permitted)\t\t\t##\n")
+    sys.stderr.write(" ## Optional Parameters:\t\t\t\t\t\t\t\t##\n")
+    sys.stderr.write(" ## \t -o SKEW.TXT.............output file (see below)\t\t\t\t##\n")
+    sys.stderr.write(" ## \t -k WINDOW_SIZE..........length of subsequences for which to calculate gc skew\t##\n")
+    sys.stderr.write(" ## \t    .....................[default:20000]\t\t\t\t\t##\n" )
+    sys.stderr.write(" ## \t -f FREQUENCY............number of bases between the start of each window\t##\n")
+    sys.stderr.write(" ## \t    .....................[default: k == f]\t\t\t\t\t##\n")
+    sys.stderr.write(" ## Output file: If no output file is provided, SkewI will be printed to gcskew.txt\t##\n")
+    sys.stderr.write(" ## \t      The output file is a 3 column, tab-delimited file listing:\t\t##\n")
+    sys.stderr.write(" ## \t      (1) sequence ID (2) starting index of window (3) gc skew value\t\t##\n")   
+    sys.stderr.write(" #########################################################################################\n")
+    sys.stderr.write(" #########################################################################################\n\n")
+    exit(0)
+#####################################################################
+def main(): 
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i","--input","-s","--seq", 
+        dest="in_file", required=False, default="",
+        help="FASTA/Multi-FASTA sequence file for which to calculate gc_skew")
+    parser.add_argument("-k","-w","--window-size", default=20000,
+        dest="window_size", required=False, type=int,
+        help="Window size for which to calculate each g-c/g+c [default: 20Kbp]")
+    parser.add_argument("-f","--freq",required=False,type=int,
+        dest="freq", default=-1, 
+        help="Frequency at which to calculate GC skew [default: same as window size]")
+    parser.add_argument("-o","--output", required=False, default="gcskew.txt",
+        dest="out_file", help="Output text file to save gc skew calculations [default: gcskew.txt]")
+    parser.add_argument('--usage', required=False, dest='usage', 
+        action='store_true', default=False,
+        help='Prints usage information for this program')
+    args=parser.parse_args() 
+    
+    ##############################################
+    #Test Parameters
+    if args.usage:
+        usage()
+    if (args.in_file == ""):
+        sys.stderr.write(" >> Please provide an input sequence file\n")
+        usage()
+        exit(1)
+    if (not os.path.isfile(args.in_file)):
+        sys.stderr.write(" >> ERROR: %s is not a valid file\n" % args.in_file) 
+        exit(1)
+    if args.freq == -1:
+        freq = args.window_size
+    else:
+        freq = args.freq
+    if (args.window_size < 0):
+        sys.stderr.write(" >> ERROR: window size -w must be greater than 0\n")
+        exit(1)
+    if (freq < 0):
+        sys.stderr.write(" >> ERROR: -f/--freq must be greater than 0\n")
+        exit(1)
+    if (args.freq > args.window_size):
+        sys.stderr.write(" >> ERROR: window size must be >= frequency\n")
+        exit(1)
+    ##############################################
+    #Start program
+    time = strftime("%m-%d-%Y %H:%M:%S", gmtime())
+    sys.stdout.write("PROGRAM START TIME: " + time + '\n')
+    sys.stdout.write("    Input file: %s\n" % args.in_file) 
+    sys.stdout.write("    Output: %s\n" % args.out_file)
+    sys.stdout.write("    Window size (bp): %i\n" % args.window_size)
+    sys.stdout.write("    Frequency (bp): %i\n" % freq) 
+    sys.stdout.flush()
+    ############################################################################
+    #Process sequence file 
+    id2string = {}
+    id2name = {}
+    count_seqs = 0 
+    sys.stdout.write(">> Reading sequence file %s\n" % args.in_file)
+    sys.stdout.write("\t%i seqs found" % (count_seqs))
+    sys.stdout.flush()
+    for record in SeqIO.parse(args.in_file,'fasta'):
+        count_seqs += 1
+        sys.stdout.write("\r\t%i seqs found" % (count_seqs))
+        sys.stdout.flush()
+        #Save string
+        id2name[count_seqs] = record.id
+        id2string[count_seqs] = str(record.seq)
+    sys.stdout.write("\r\t%i seqs found\n" % (count_seqs))
+    sys.stdout.flush()
+
+    #Calculate and plot gc skew
+    tot = count_seqs
+    count_seqs = 0
+    sys.stdout.write(">> Calculating/Saving GC Skew for %i seqs\n" % tot)
+    sys.stdout.write("\tworking on sequence %i/%i" % (count_seqs,tot))
+    sys.stdout.flush()
+    #Prep output file
+    o_file = open(args.out_file,'w')
+    o_file.write("Sequence\tIndex\tGC Skew (%ikb)\n" % (args.window_size/1000))
+    #For each sequence, calculate gc skew and print
+    for i in range(1,tot+1): 
+        #Update 
+        count_seqs += 1
+        sys.stdout.write("\r\tworking on sequence %i/%i" % (count_seqs,tot))
+        sys.stdout.flush()
+        #Calculate gc skew
+        my_seq = id2string[i].upper()
+        my_description = id2name[i]
+        count = 0
+        #Calculate skew
+        g = 0.0
+        c = 0.0
+        curr_seq = ""
+        for i in range(0, len(my_seq), freq):
+            if (i+args.window_size) >= len(my_seq):
+                break
+            curr_seq = my_seq[i:i+args.window_size]
+            g = float(curr_seq.count("G"))
+            c = float(curr_seq.count("C"))
+            if (g+c) > 0:
+                new_calc = (g-c)/(g+c)
+            else:
+                new_calc = 0.0
+            #Print to file
+            o_file.write("%s\t%i\t%0.8f\n" % (my_description, i, new_calc))
+    sys.stdout.write("\r\tFinished calculating gc skew for %i sequences\n" % tot)
+    sys.stdout.flush()
+    o_file.close()
+    #End program
+    time = strftime("%m-%d-%Y %H:%M:%S", gmtime())
+    sys.stdout.write("PROGRAM FINISH TIME: " + time + '\n')
+
+if __name__== "__main__":
+    main()