annotate tools/metag_tools/convert_SOLiD_color2nuc.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 convert SOLiD color-space data to nucleotide sequence
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 example: T011213122200221123032111221021210131332222101
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 import sys, os
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def stop_err(msg):
    """Report a fatal error on stderr and terminate the script.

    The message is written to standard error followed by a newline, then the
    interpreter exits with status 1 so that callers inspecting the exit code
    see a failure rather than a success.

    Raises:
        SystemExit: always.
    """
    sys.stderr.write(msg + '\n')
    # A bare sys.exit() would exit with status 0, i.e. "success".
    sys.exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def color2base(color_seq):
    """Decode a SOLiD color-space read into nucleotides.

    The first character of ``color_seq`` is the leading (primer) nucleotide
    and every following character is a color digit 0-3. Each digit, combined
    with the previously decoded base, determines the next base.

    Args:
        color_seq: string such as ``'T0112'``; the leading base may be given
            in either case.

    Returns:
        A ``(leading_base, decoded_sequence)`` tuple. The decoded sequence
        does NOT include the leading base.

    An empty string, an invalid leading base, or a character other than
    0, 1, 2, 3 in the color part is reported through ``stop_err``, which
    terminates the script.
    """
    bases = ('A', 'C', 'G', 'T')
    # transitions[color][previous_base] -> next base
    transitions = {
        '0': dict(zip(bases, ('A', 'C', 'G', 'T'))),
        '1': dict(zip(bases, ('C', 'A', 'T', 'G'))),
        '2': dict(zip(bases, ('G', 'T', 'A', 'C'))),
        '3': dict(zip(bases, ('T', 'G', 'C', 'A'))),
    }

    # Slicing (rather than indexing) keeps an empty read on the regular
    # "invalid leading nucleotide" error path instead of raising IndexError.
    seq_prefix = color_seq[:1].upper()
    if seq_prefix not in bases:
        stop_err('The leading nucleotide is invalid. Must be one of the four nucleotides: A, T, C, G.\nThe file contains a %s' % seq_prefix)

    decoded = []
    previous = seq_prefix
    for code in color_seq[1:]:
        if code not in transitions:
            stop_err('Expect digits (0, 1, 2, 3) in the color-cading data. File contains numbers other than the set.\nThe file contains a %s' % code)
        previous = transitions[code][previous]
        decoded.append(previous)

    return seq_prefix, ''.join(decoded)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def _write_record(outfile, title, color_chunks, keep_prefix):
    """Decode one buffered color-space record and append it to outfile."""
    prefix, nuc_seq = color2base(''.join(color_chunks))
    if keep_prefix:
        nuc_seq = prefix + nuc_seq
    outfile.write(title + '\n')
    outfile.write(nuc_seq + '\n')


def __main__():
    """Convert a color-space FASTA-like file to nucleotide sequences.

    Command-line arguments (read from ``sys.argv``):
        1. input file name
        2. ``yes`` to keep the leading nucleotide in the output sequence
           (case-insensitive); any other value drops it
        3. output file name

    Blank lines and lines starting with ``#`` are skipped. A line starting
    with ``>`` begins a new record; the following lines are concatenated and
    decoded with ``color2base``. A title with no sequence lines produces no
    output. Errors are reported through ``stop_err``, which terminates the
    script.
    """
    if len(sys.argv) < 4:
        stop_err('Usage: %s <input_file> <keep_prefix: yes|no> <output_file>' % sys.argv[0])

    infilename = sys.argv[1]
    keep_prefix = sys.argv[2].lower() == 'yes'
    outfilename = sys.argv[3]

    title = None
    color_chunks = []

    # "with" closes both files even when stop_err() exits part-way through.
    with open(outfilename, 'w') as outfile:
        # open() rather than file(): the latter is not a builtin in Python 3.
        with open(infilename) as infile:
            for line in infile:
                line = line.rstrip('\r\n')
                if not line or line.startswith('#'):
                    continue

                if line.startswith('>'):
                    # A new title ends the previous record, if it had data.
                    if color_chunks:
                        _write_record(outfile, title, color_chunks, keep_prefix)
                    title = line
                    color_chunks = []
                elif title is None:
                    # Without this check the write would fail later on an
                    # undefined title.
                    stop_err('Sequence data found before the first title line (a line starting with ">").')
                else:
                    color_chunks.append(line)

        # Flush the last record, which has no following title line.
        if color_chunks:
            _write_record(outfile, title, color_chunks, keep_prefix)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Run the converter only when executed as a script, not when imported.
if __name__ == '__main__':
    __main__()