# HG changeset patch # User jamueller # Date 1558602963 14400 # Node ID e8c8bf56aab0646bb5943f380825e1139f4361f8 # Parent caadae802e656f7c71ceb1f838c3c4e8a2195fd7 Uploaded diff -r caadae802e65 -r e8c8bf56aab0 GeneCodonSearch1_2.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GeneCodonSearch1_2.py Thu May 23 05:16:03 2019 -0400 @@ -0,0 +1,94 @@ +#1: Modules +import re +import argparse +import numpy as np +np.set_printoptions(threshold=np.inf) + +parser = argparse.ArgumentParser() +parser.add_argument("-m", "--minimum",type=float, help="cutoff value") +parser.add_argument("-c", "--codon",type=str, help="codons of interest") +parser.add_argument("-i", "--input_sequences", type=str, help="") +parser.add_argument("-r", "--result", type=str) + +args = parser.parse_args() + +sequences = args.input_sequences +cutoff = args.minimum +codonlist=args.codon +resultfile=args.result + +codonlist = codonlist.split(",") + +i=0 +codon_dictionary = {} +while i < len(codonlist): + codon_dictionary[codonlist[i]] = 0 + i+=1 +#2:input +dict1 = {} +dict2 = {} +seq_id ="" + +i=0 +input_sequences = open(sequences,mode = 'r') +for line in input_sequences: + if line [0] == ">": + match = re.search(">(\S+)\s(.*)$",line) + seq_id = match.group(1) + dict1[seq_id] = "" + elif seq_id != "" and line.strip()!= "": + dict1[seq_id] += line.strip() + +i=0 +for name in dict1.keys(): + dict2[i] = name + i+=1 + +testsequences=int(len(dict2)) + +#3: Codon-Scoring of the sequences +i=0 +m=1 +testseq="" + +scoremat=np.zeros((len(dict2)+1,len(codon_dictionary)+2)).astype(object) + +for i in range(len(dict2)): + + l=0 + scorecomp=0 + codonscores=[] + testseq=dict1[dict2[i]] + codon_dictionary=dict.fromkeys(codon_dictionary,0) + + for l in range(len(testseq)): + try: + codon_dictionary.get(testseq[l*3:l*3+3]) + codon_dictionary[testseq[l*3:l*3+3]]+=1 + except KeyError: + continue + + scorecomp=sum(codon_dictionary.values())/(float(len(testseq)/3)*len(codon_dictionary)) + for x in codon_dictionary.values(): + codonscores.append(x) + + if scorecomp > cutoff : + scoremat[m,0]=dict2[i] + scoremat[m,1]=round(scorecomp,7) + scoremat[m,2:]=codonscores + m+=1 + +scoremat_sort=scoremat[np.lexsort((scoremat[:, 1], ))] +scoremat_sort[0,0]="Gene_ID" +scoremat_sort[0,1]="Total Score" +scoremat_sort[0,2:]=codonlist +scoremat_cleaned = scoremat_sort[[i for i, x in enumerate(scoremat_sort) if x.any()]] + +#4: output +with open(resultfile, 'w') as file_object: + file_object.write("DNA sequences with a higher total score than ") + file_object.write(str(cutoff)) + file_object.write("\n") + file_object.write("\n") + file_object.write(str(scoremat_cleaned)) + file_object.close() \ No newline at end of file