view GeneCodonSearch1_2.py @ 1:e8c8bf56aab0 draft default tip

Uploaded
author jamueller
date Thu, 23 May 2019 05:16:03 -0400
parents
children
line wrap: on
line source

#1: Modules
import re
import argparse
import numpy as np
np.set_printoptions(threshold=np.inf)

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--minimum",type=float, help="cutoff value")
parser.add_argument("-c", "--codon",type=str, help="codons of interest")
parser.add_argument("-i", "--input_sequences", type=str, help="")
parser.add_argument("-r", "--result", type=str)

args = parser.parse_args()

sequences = args.input_sequences
cutoff = args.minimum
codonlist=args.codon
resultfile=args.result

codonlist = codonlist.split(",")

i=0
codon_dictionary = {}
while i < len(codonlist):
    codon_dictionary[codonlist[i]] = 0
    i+=1
#2:input
dict1 = {}
dict2 = {}
seq_id =""

i=0
input_sequences = open(sequences,mode = 'r')
for line in input_sequences:
    if line [0] == ">":
        match = re.search(">(\S+)\s(.*)$",line)
        seq_id = match.group(1)
        dict1[seq_id] = ""
    elif seq_id != "" and line.strip()!= "":
        dict1[seq_id] += line.strip()

i=0
for name in dict1.keys():
    dict2[i] = name
    i+=1

testsequences=int(len(dict2))

#3: Codon-Scoring of the sequences
i=0
m=1
testseq=""

scoremat=np.zeros((len(dict2)+1,len(codon_dictionary)+2)).astype(object)

for i in range(len(dict2)):

    l=0
    scorecomp=0
    codonscores=[]
    testseq=dict1[dict2[i]]
    codon_dictionary=dict.fromkeys(codon_dictionary,0)
    
    for l in range(len(testseq)):
        try:
            codon_dictionary.get(testseq[l*3:l*3+3])
            codon_dictionary[testseq[l*3:l*3+3]]+=1
        except KeyError:
            continue
    
    scorecomp=sum(codon_dictionary.values())/(float(len(testseq)/3)*len(codon_dictionary))
    for x in codon_dictionary.values():
        codonscores.append(x)
        
    if scorecomp > cutoff :
        scoremat[m,0]=dict2[i]
        scoremat[m,1]=round(scorecomp,7)
        scoremat[m,2:]=codonscores
        m+=1

scoremat_sort=scoremat[np.lexsort((scoremat[:, 1], ))]
scoremat_sort[0,0]="Gene_ID"
scoremat_sort[0,1]="Total Score"
scoremat_sort[0,2:]=codonlist        
scoremat_cleaned = scoremat_sort[[i for i, x in enumerate(scoremat_sort) if x.any()]]

#4: output
with open(resultfile, 'w') as file_object:
    file_object.write("DNA sequences with a higher total score than ")
    file_object.write(str(cutoff))
    file_object.write("\n")
    file_object.write("\n")
    file_object.write(str(scoremat_cleaned))
    file_object.close()