Mercurial > repos > devteam > convert_solid_color2nuc
comparison convert_SOLiD_color2nuc.py @ 0:ab28e7de2db3 draft default tip
Imported from capsule None
author | devteam |
---|---|
date | Mon, 19 May 2014 12:33:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ab28e7de2db3 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 convert SOLiD color-space data to nucleotide sequence | |
4 example: T011213122200221123032111221021210131332222101 | |
5 TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT | |
6 """ | |
7 | |
8 import sys, os | |
9 | |
def stop_err(msg):
    """Report a fatal error on stderr and terminate the program.

    msg -- text of the error message; a newline is appended after it.

    Raises SystemExit with exit status 1. A bare ``sys.exit()`` would
    terminate with status 0, which tells the calling process that the
    run succeeded even though an error was just reported.
    """
    sys.stderr.write(msg)
    sys.stderr.write('\n')
    sys.exit(1)
15 | |
def color2base(color_seq):
    """Decode a color-coded read into nucleotides.

    color_seq -- a string whose first character is the leading nucleotide
                 (A, C, G or T, any case) and whose remaining characters
                 are color digits 0-3.

    Returns a tuple ``(leading_nucleotide, decoded_sequence)``; the decoded
    sequence does not include the leading nucleotide. Calls ``stop_err``
    when the leading character or any color digit is invalid.
    """
    bases = ['A', 'C', 'G', 'T']
    # transitions[color][i] is the base that follows bases[i] for that color.
    transitions = {
        '0': ['A', 'C', 'G', 'T'],
        '1': ['C', 'A', 'T', 'G'],
        '2': ['G', 'T', 'A', 'C'],
        '3': ['T', 'G', 'C', 'A'],
    }

    seq_prefix = color_seq[0].upper()
    colors = color_seq[1:]

    if seq_prefix not in bases:
        stop_err('The leading nucleotide is invalid. Must be one of the four nucleotides: A, T, C, G.\nThe file contains a %s' % seq_prefix)

    decoded = []
    current = seq_prefix
    for color in colors:
        if color not in transitions:
            stop_err('Expect digits (0, 1, 2, 3) in the color-cading data. File contains numbers other than the set.\nThe file contains a %s' % color)

        # Each decoded base becomes the reference for the next color.
        current = transitions[color][bases.index(current)]
        decoded.append(current)

    return seq_prefix, ''.join(decoded)
45 | |
def _write_record(outfile, title, color_seq, keep_prefix):
    """Decode one record with color2base and write its title and sequence lines."""
    if title is None:
        # Sequence data appeared before any '>' line; there is no title to emit.
        stop_err('Sequence data found before the first title line (a line starting with ">").')

    prefix, nuc_seq = color2base(color_seq)
    if keep_prefix == 'yes':
        nuc_seq = prefix + nuc_seq

    outfile.write(title + '\n')
    outfile.write(nuc_seq + '\n')


def __main__():
    """Convert the color-coded records of an input file to nucleotide records.

    Command-line arguments (read from sys.argv):
      1 -- path of the input file
      2 -- 'yes' (case-insensitive) to keep the leading nucleotide in the
           output sequence; any other value drops it
      3 -- path of the output file

    Blank lines and lines starting with '#' are skipped. A line starting
    with '>' is a record title; all following lines up to the next title
    are concatenated into one color sequence. Records with an empty
    sequence produce no output.
    """
    infilename = sys.argv[1]
    keep_prefix = sys.argv[2].lower()
    outfilename = sys.argv[3]

    title = None
    color_seq = ''

    # open() replaces the Python-2-only file() builtin; the with-blocks make
    # sure both handles are closed even when a conversion error exits early.
    with open(outfilename, 'w') as outfile:
        with open(infilename) as infile:
            for line in infile:
                line = line.rstrip('\r\n')

                if not line or line.startswith('#'):
                    continue

                if line.startswith('>'):
                    # A new title ends the previous record; flush it first.
                    if color_seq:
                        _write_record(outfile, title, color_seq, keep_prefix)
                    title = line
                    color_seq = ''
                else:
                    color_seq += line

        # Flush the final record, which has no following title line.
        if color_seq:
            _write_record(outfile, title, color_seq, keep_prefix)
88 | |
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    __main__()