0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Convert SOLiD color-space data to nucleotide sequence
|
|
4 example: T011213122200221123032111221021210131332222101
|
|
5 TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT
|
|
6 """
|
|
7
|
|
8 import sys, os
|
|
9
|
|
def stop_err(msg):
    """Report a fatal error and terminate the program.

    Writes *msg* followed by a newline to standard error, then raises
    SystemExit with exit status 1 so that a calling process can detect
    the failure.
    """
    sys.stderr.write(msg)
    sys.stderr.write('\n')
    # A bare sys.exit() would report success (status 0) despite the error.
    sys.exit(1)
|
|
15
|
|
def color2base(color_seq):
    """Decode one color-space read into a nucleotide sequence.

    The read is a leading nucleotide (A, C, G or T, in either case)
    followed by color digits 0-3.  Each digit, combined with the previous
    base, determines the next base, so the read is decoded left to right
    starting from the leading nucleotide.

    Returns a (leading_nucleotide, decoded_sequence) tuple.  The leading
    nucleotide is upper-cased and is NOT included in decoded_sequence.

    Invalid input (empty read, bad leading nucleotide, or a character
    other than 0-3 in the color part) is reported through stop_err().
    """
    first_nuc = ['A', 'C', 'G', 'T']
    # code_matrix[color][i] is the base that follows first_nuc[i].
    code_matrix = {
        '0': ['A', 'C', 'G', 'T'],
        '1': ['C', 'A', 'T', 'G'],
        '2': ['G', 'T', 'A', 'C'],
        '3': ['T', 'G', 'C', 'A'],
    }

    # An empty read has no leading nucleotide; report it instead of
    # failing with an IndexError on color_seq[0].
    if not color_seq:
        stop_err('The color-space sequence is empty.')

    seq_prefix = prefix = color_seq[0].upper()
    if seq_prefix not in first_nuc:
        stop_err('The leading nucleotide is invalid. Must be one of the four nucleotides: A, T, C, G.\nThe file contains a %s' %seq_prefix )

    decoded = []
    for code in color_seq[1:]:
        if code not in code_matrix:
            stop_err('Expect digits (0, 1, 2, 3) in the color-cading data. File contains numbers other than the set.\nThe file contains a %s' %code)

        # The base just decoded becomes the context for the next color.
        prefix = code_matrix[code][first_nuc.index(prefix)]
        decoded.append(prefix)

    return seq_prefix, ''.join(decoded)
|
|
45
|
|
def _write_record(outfile, title, color_seq, keep_prefix):
    """Decode one color-space record and write its title and bases to outfile."""
    prefix, nuc_seq = color2base(color_seq)
    if keep_prefix == 'yes':
        nuc_seq = prefix + nuc_seq
    outfile.write(title + '\n')
    outfile.write(nuc_seq + '\n')


def __main__():
    """Convert a file of color-space reads into nucleotide sequences.

    Command-line arguments:
        sys.argv[1]  input file; a '>' line starts a record and is copied
                     to the output as its title, the following lines are
                     joined into one color-space read.  Blank lines and
                     lines starting with '#' are skipped.
        sys.argv[2]  'yes' (case-insensitive) to prepend the leading
                     nucleotide to each decoded sequence.
        sys.argv[3]  output file; receives one title line and one
                     sequence line per record.

    Records whose title is not followed by any sequence line are not
    written.  Errors are reported through stop_err().
    """
    if len(sys.argv) < 4:
        stop_err('Usage: %s <input_file> <keep_prefix: yes|no> <output_file>' % sys.argv[0])

    infilename = sys.argv[1]
    keep_prefix = sys.argv[2].lower()
    outfilename = sys.argv[3]

    title = None
    chunks = []  # sequence lines of the record currently being read

    # Context managers close both files even when stop_err() exits early.
    with open(outfilename, 'w') as outfile:
        with open(infilename) as infile:
            for line in infile:
                line = line.rstrip('\r\n')

                if not line or line.startswith('#'):
                    continue

                if line.startswith('>'):
                    # A new title ends the previous record, if it had data.
                    if chunks:
                        _write_record(outfile, title, ''.join(chunks), keep_prefix)
                    title = line
                    chunks = []
                else:
                    if title is None:
                        stop_err('Sequence data found before the first title line (a line starting with ">").')
                    chunks.append(line)

        # Flush the last record, which has no following title line.
        if chunks:
            _write_record(outfile, title, ''.join(chunks), keep_prefix)
|
|
88
|
|
# Run the converter only when executed as a script, not when imported.
if __name__ == '__main__':
    __main__()
|