Mercurial > repos > jamueller > codon_analysis
comparison GeneCodonSearch1_2.py @ 1:e8c8bf56aab0 draft default tip
Uploaded
author | jamueller |
---|---|
date | Thu, 23 May 2019 05:16:03 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:caadae802e65 | 1:e8c8bf56aab0 |
---|---|
1 #1: Modules | |
2 import re | |
3 import argparse | |
4 import numpy as np | |
5 np.set_printoptions(threshold=np.inf) | |
6 | |
7 parser = argparse.ArgumentParser() | |
8 parser.add_argument("-m", "--minimum",type=float, help="cutoff value") | |
9 parser.add_argument("-c", "--codon",type=str, help="codons of interest") | |
10 parser.add_argument("-i", "--input_sequences", type=str, help="") | |
11 parser.add_argument("-r", "--result", type=str) | |
12 | |
13 args = parser.parse_args() | |
14 | |
15 sequences = args.input_sequences | |
16 cutoff = args.minimum | |
17 codonlist=args.codon | |
18 resultfile=args.result | |
19 | |
20 codonlist = codonlist.split(",") | |
21 | |
22 i=0 | |
23 codon_dictionary = {} | |
24 while i < len(codonlist): | |
25 codon_dictionary[codonlist[i]] = 0 | |
26 i+=1 | |
27 #2:input | |
28 dict1 = {} | |
29 dict2 = {} | |
30 seq_id ="" | |
31 | |
32 i=0 | |
33 input_sequences = open(sequences,mode = 'r') | |
34 for line in input_sequences: | |
35 if line [0] == ">": | |
36 match = re.search(">(\S+)\s(.*)$",line) | |
37 seq_id = match.group(1) | |
38 dict1[seq_id] = "" | |
39 elif seq_id != "" and line.strip()!= "": | |
40 dict1[seq_id] += line.strip() | |
41 | |
42 i=0 | |
43 for name in dict1.keys(): | |
44 dict2[i] = name | |
45 i+=1 | |
46 | |
47 testsequences=int(len(dict2)) | |
48 | |
49 #3: Codon-Scoring of the sequences | |
50 i=0 | |
51 m=1 | |
52 testseq="" | |
53 | |
54 scoremat=np.zeros((len(dict2)+1,len(codon_dictionary)+2)).astype(object) | |
55 | |
56 for i in range(len(dict2)): | |
57 | |
58 l=0 | |
59 scorecomp=0 | |
60 codonscores=[] | |
61 testseq=dict1[dict2[i]] | |
62 codon_dictionary=dict.fromkeys(codon_dictionary,0) | |
63 | |
64 for l in range(len(testseq)): | |
65 try: | |
66 codon_dictionary.get(testseq[l*3:l*3+3]) | |
67 codon_dictionary[testseq[l*3:l*3+3]]+=1 | |
68 except KeyError: | |
69 continue | |
70 | |
71 scorecomp=sum(codon_dictionary.values())/(float(len(testseq)/3)*len(codon_dictionary)) | |
72 for x in codon_dictionary.values(): | |
73 codonscores.append(x) | |
74 | |
75 if scorecomp > cutoff : | |
76 scoremat[m,0]=dict2[i] | |
77 scoremat[m,1]=round(scorecomp,7) | |
78 scoremat[m,2:]=codonscores | |
79 m+=1 | |
80 | |
81 scoremat_sort=scoremat[np.lexsort((scoremat[:, 1], ))] | |
82 scoremat_sort[0,0]="Gene_ID" | |
83 scoremat_sort[0,1]="Total Score" | |
84 scoremat_sort[0,2:]=codonlist | |
85 scoremat_cleaned = scoremat_sort[[i for i, x in enumerate(scoremat_sort) if x.any()]] | |
86 | |
87 #4: output | |
88 with open(resultfile, 'w') as file_object: | |
89 file_object.write("DNA sequences with a higher total score than ") | |
90 file_object.write(str(cutoff)) | |
91 file_object.write("\n") | |
92 file_object.write("\n") | |
93 file_object.write(str(scoremat_cleaned)) | |
94 file_object.close() |