Mercurial > repos > jamueller > codon_analysis
view GeneCodonSearch1_2.py @ 1:e8c8bf56aab0 draft default tip
Uploaded
author | jamueller |
---|---|
date | Thu, 23 May 2019 05:16:03 -0400 |
parents | |
children |
line wrap: on
line source
#1: Modules import re import argparse import numpy as np np.set_printoptions(threshold=np.inf) parser = argparse.ArgumentParser() parser.add_argument("-m", "--minimum",type=float, help="cutoff value") parser.add_argument("-c", "--codon",type=str, help="codons of interest") parser.add_argument("-i", "--input_sequences", type=str, help="") parser.add_argument("-r", "--result", type=str) args = parser.parse_args() sequences = args.input_sequences cutoff = args.minimum codonlist=args.codon resultfile=args.result codonlist = codonlist.split(",") i=0 codon_dictionary = {} while i < len(codonlist): codon_dictionary[codonlist[i]] = 0 i+=1 #2:input dict1 = {} dict2 = {} seq_id ="" i=0 input_sequences = open(sequences,mode = 'r') for line in input_sequences: if line [0] == ">": match = re.search(">(\S+)\s(.*)$",line) seq_id = match.group(1) dict1[seq_id] = "" elif seq_id != "" and line.strip()!= "": dict1[seq_id] += line.strip() i=0 for name in dict1.keys(): dict2[i] = name i+=1 testsequences=int(len(dict2)) #3: Codon-Scoring of the sequences i=0 m=1 testseq="" scoremat=np.zeros((len(dict2)+1,len(codon_dictionary)+2)).astype(object) for i in range(len(dict2)): l=0 scorecomp=0 codonscores=[] testseq=dict1[dict2[i]] codon_dictionary=dict.fromkeys(codon_dictionary,0) for l in range(len(testseq)): try: codon_dictionary.get(testseq[l*3:l*3+3]) codon_dictionary[testseq[l*3:l*3+3]]+=1 except KeyError: continue scorecomp=sum(codon_dictionary.values())/(float(len(testseq)/3)*len(codon_dictionary)) for x in codon_dictionary.values(): codonscores.append(x) if scorecomp > cutoff : scoremat[m,0]=dict2[i] scoremat[m,1]=round(scorecomp,7) scoremat[m,2:]=codonscores m+=1 scoremat_sort=scoremat[np.lexsort((scoremat[:, 1], ))] scoremat_sort[0,0]="Gene_ID" scoremat_sort[0,1]="Total Score" scoremat_sort[0,2:]=codonlist scoremat_cleaned = scoremat_sort[[i for i, x in enumerate(scoremat_sort) if x.any()]] #4: output with open(resultfile, 'w') as file_object: file_object.write("DNA sequences with a higher total score than ") file_object.write(str(cutoff)) file_object.write("\n") file_object.write("\n") file_object.write(str(scoremat_cleaned)) file_object.close()