comparison convert_SOLiD_color2nuc.py @ 0:ab28e7de2db3 draft default tip

Imported from capsule None
author devteam
date Mon, 19 May 2014 12:33:14 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ab28e7de2db3
1 #!/usr/bin/env python
2 """
3 convert SOLiD color-space data to nucleotide sequence
4 example: T011213122200221123032111221021210131332222101
5 TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT
6 """
7
8 import sys, os
9
def stop_err(msg):
    """Report a fatal error and terminate the script.

    Writes ``msg`` followed by a newline to standard error, then raises
    ``SystemExit`` with exit status 1 so the calling process can tell that
    the run failed.

    Args:
        msg: Text of the error message.

    Raises:
        SystemExit: Always, with code 1.
    """
    sys.stderr.write(msg)
    sys.stderr.write('\n')
    # A bare sys.exit() reports status 0 (success); a fatal error must not.
    sys.exit(1)
15
def color2base(color_seq):
    """Decode one color-encoded read into a nucleotide string.

    The first character of ``color_seq`` is the leading nucleotide; every
    following character is a color digit (0-3).  Each digit, combined with
    the previously decoded base, determines the next base.

    Example: ``color2base('T0112')`` returns ``('T', 'TGTC')``.

    Args:
        color_seq: Leading nucleotide (case-insensitive) followed by color
            digits.

    Returns:
        A ``(leading_nucleotide, decoded_sequence)`` tuple.  The leading
        nucleotide is upper-cased and is NOT included in the decoded
        sequence; the caller decides whether to prepend it.

    Invalid input (empty string, unknown leading nucleotide, or a character
    other than 0-3 after it) is reported through ``stop_err``.
    """
    valid_bases = ('A', 'C', 'G', 'T')

    # transitions[color][previous base] -> next base
    transitions = {
        '0': {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T'},
        '1': {'A': 'C', 'C': 'A', 'G': 'T', 'T': 'G'},
        '2': {'A': 'G', 'C': 'T', 'G': 'A', 'T': 'C'},
        '3': {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'},
    }

    # Guard the indexing below: an empty read has no leading nucleotide.
    if not color_seq:
        stop_err('The color-space sequence is empty. Expected a leading nucleotide followed by color digits.')

    seq_prefix = color_seq[0].upper()

    if seq_prefix not in valid_bases:
        stop_err('The leading nucleotide is invalid. Must be one of the four nucleotides: A, T, C, G.\nThe file contains a %s' %seq_prefix )

    previous = seq_prefix
    decoded = []
    for code in color_seq[1:]:
        step = transitions.get(code)
        if step is None:
            stop_err('Expect digits (0, 1, 2, 3) in the color-cading data. File contains numbers other than the set.\nThe file contains a %s' %code)

        # The base just decoded becomes the context for the next color.
        previous = step[previous]
        decoded.append(previous)

    # Join once at the end instead of growing a string per symbol.
    return seq_prefix, ''.join(decoded)
45
def _write_record(outfile, title, color_seq, keep_prefix):
    """Decode ``color_seq`` and write the title line and sequence line."""
    prefix, nuc_seq = color2base(color_seq)

    if keep_prefix == 'yes':
        nuc_seq = prefix + nuc_seq

    outfile.write(title + '\n')
    outfile.write(nuc_seq + '\n')


def __main__():
    """Convert a color-encoded sequence file to nucleotide sequences.

    Command-line arguments:
        sys.argv[1]: path of the input file.
        sys.argv[2]: 'yes' (case-insensitive) to prepend the leading
            nucleotide to each decoded sequence; any other value omits it.
        sys.argv[3]: path of the output file (overwritten).

    Input handling: blank lines and lines starting with '#' are skipped.  A
    line starting with '>' begins a new record and is copied to the output
    as the record title.  All other lines are concatenated to form the
    record's color sequence.  A title with no sequence lines produces no
    output.
    """
    if len(sys.argv) < 4:
        stop_err('Usage: convert_SOLiD_color2nuc.py <input file> <keep prefix: yes|no> <output file>')

    infilename = sys.argv[1]
    keep_prefix = sys.argv[2].lower()
    outfilename = sys.argv[3]

    title = None
    color_chunks = []

    # Context managers close both files even when an error aborts the run.
    with open(outfilename, 'w') as outfile, open(infilename) as infile:
        for line in infile:
            line = line.rstrip('\r\n')

            if not line or line.startswith('#'):
                continue

            if line.startswith('>'):
                # A new title ends the previous record; flush it first.
                if color_chunks:
                    _write_record(outfile, title, ''.join(color_chunks), keep_prefix)
                title = line
                color_chunks = []
            else:
                if title is None:
                    # Without a title there is nothing to label this read with.
                    stop_err('Sequence data found before the first ">" title line.')
                color_chunks.append(line)

        # The last record has no following title line to trigger the flush.
        if color_chunks:
            _write_record(outfile, title, ''.join(color_chunks), keep_prefix)
88
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    __main__()