Mercurial > repos > jamueller > codon_analysis

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GeneCodonSearch1_2.py	Thu May 23 05:16:03 2019 -0400
@@ -0,0 +1,94 @@
+#1: Modules
+import re
+import argparse
+import numpy as np
+np.set_printoptions(threshold=np.inf)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-m", "--minimum",type=float, help="cutoff value")
+parser.add_argument("-c", "--codon",type=str, help="codons of interest")
+parser.add_argument("-i", "--input_sequences", type=str, help="")
+parser.add_argument("-r", "--result", type=str)
+
+args = parser.parse_args()
+
+sequences = args.input_sequences
+cutoff = args.minimum
+codonlist=args.codon
+resultfile=args.result
+
+codonlist = codonlist.split(",")
+
+i=0
+codon_dictionary = {}
+while i < len(codonlist):
+    codon_dictionary[codonlist[i]] = 0
+    i+=1
+#2:input
+dict1 = {}
+dict2 = {}
+seq_id =""
+
+i=0
+input_sequences = open(sequences,mode = 'r')
+for line in input_sequences:
+    if line [0] == ">":
+        match = re.search(">(\S+)\s(.*)$",line)
+        seq_id = match.group(1)
+        dict1[seq_id] = ""
+    elif seq_id != "" and line.strip()!= "":
+        dict1[seq_id] += line.strip()
+
+i=0
+for name in dict1.keys():
+    dict2[i] = name
+    i+=1
+
+testsequences=int(len(dict2))
+
+#3: Codon-Scoring of the sequences
+i=0
+m=1
+testseq=""
+
+scoremat=np.zeros((len(dict2)+1,len(codon_dictionary)+2)).astype(object)
+
+for i in range(len(dict2)):
+
+    l=0
+    scorecomp=0
+    codonscores=[]
+    testseq=dict1[dict2[i]]
+    codon_dictionary=dict.fromkeys(codon_dictionary,0)
+
+    for l in range(len(testseq)):
+        try:
+            codon_dictionary.get(testseq[l*3:l*3+3])
+            codon_dictionary[testseq[l*3:l*3+3]]+=1
+        except KeyError:
+            continue
+
+    scorecomp=sum(codon_dictionary.values())/(float(len(testseq)/3)*len(codon_dictionary))
+    for x in codon_dictionary.values():
+        codonscores.append(x)
+
+    if scorecomp > cutoff :
+        scoremat[m,0]=dict2[i]
+        scoremat[m,1]=round(scorecomp,7)
+        scoremat[m,2:]=codonscores
+        m+=1
+
+scoremat_sort=scoremat[np.lexsort((scoremat[:, 1], ))]
+scoremat_sort[0,0]="Gene_ID"
+scoremat_sort[0,1]="Total Score"
+scoremat_sort[0,2:]=codonlist
+scoremat_cleaned = scoremat_sort[[i for i, x in enumerate(scoremat_sort) if x.any()]]
+
+#4: output
+with open(resultfile, 'w') as file_object:
+    file_object.write("DNA sequences with a higher total score than ")
+    file_object.write(str(cutoff))
+    file_object.write("\n")
+    file_object.write("\n")
+    file_object.write(str(scoremat_cleaned))
+    file_object.close()
\ No newline at end of file