annotate tools/genome_diversity/select_snps.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 import sys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 import math
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 from optparse import OptionParser
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 import genome_diversity as gd
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 def main_function(parse_arguments=None):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 if parse_arguments is None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 parse_arguments = lambda arguments: (None, arguments)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 def main_decorator(to_decorate):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 def decorated_main(arguments=None):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 if arguments is None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 arguments = sys.argv
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 options, arguments = parse_arguments(arguments)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 sys.exit(to_decorate(options, arguments))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 return decorated_main
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 return main_decorator
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 def parse_arguments(arguments):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 parser = OptionParser()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 parser.add_option('--input', dest='input')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 parser.add_option('--output', dest='output')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 parser.add_option('--chrlens_loc', dest='chrlens_loc')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 parser.add_option('--num_snps', dest='num_snps')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 parser.add_option('--ref_chrom_col', dest='ref_chrom_col')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 parser.add_option('--ref_pos_col', dest='ref_pos_col')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 parser.add_option('--species', dest='species')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 return parser.parse_args(arguments[1:])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 @main_function(parse_arguments)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 def main(options, arguments):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 ref_chrom_idx = to_int( options.ref_chrom_col ) -1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 ref_pos_idx = to_int( options.ref_pos_col ) -1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 sys.exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 chrlens = gd.ChrLens( options.chrlens_loc, options.species )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 total_len = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 for chrom in chrlens:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 total_len += chrlens.length(chrom)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 total_requested = int( options.num_snps )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 selected = select_snps( data, total_len, total_requested )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 out_data = fix_selection_and_order_like_input(data, selected, total_requested)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 write_selected_snps( options.output, out_data, lines, comments )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 def to_int( value ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 int_value = int( value )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 except ValueError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 int_value = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 return int_value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 fh = open( filename, 'r' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 if (chrom_idx >= pos_idx):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 needed = chrom_idx + 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 needed = pos_idx + 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 lines = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 data = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 comments = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 line_idx = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 line_num = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 for line in fh:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 line_num += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 line = line.rstrip('\r\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 if line:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 if line.startswith('#'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 comments.append(line)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 elems = line.split('\t')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 if len(elems) >= needed:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 chrom = elems[chrom_idx]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 pos = int(elems[pos_idx])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 except ValueError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 sys.exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 lines.append(line)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 chrom_sort = chrom.lstrip('chr')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 data.append( [chrom_sort, chrom, pos, line_num, line_idx] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 line_idx += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 fh.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 data = sorted( data, key=lambda x: (x[0], x[2]) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 return lines, data, comments
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 def select_snps( data, total_len, requested ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 old_chrom = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 next_print = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 selected = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 space = total_len / requested
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 for data_idx, datum in enumerate( data ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 chrom = datum[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 pos = datum[2]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 if chrom != old_chrom:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 old_chrom = chrom
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 next_print = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 if pos >= next_print:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 selected.append(data_idx)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 next_print += space
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 return selected
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 def fix_selection_and_order_like_input(data, selected, requested):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 total_selected = len( selected )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 a = float( total_selected ) / requested
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 b = a / 2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 idx_list = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 for i in range( requested ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 idx = int( math.ceil( i * a + b ) - 1 )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 idx_list.append( idx )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 out_data = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 for i, data_idx in enumerate(selected):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 if total_selected > requested:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 if i in idx_list:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 out_data.append(data[data_idx])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 out_data.append(data[data_idx])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 out_data = sorted( out_data, key=lambda x: x[3] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131 return out_data
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 def write_selected_snps( filename, data, lines, comments ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 fh = open( filename, 'w' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 for comment in comments:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 fh.write("%s\n" % comment )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139 for datum in data:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 line_idx = datum[4]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 fh.write("%s\n" % lines[line_idx])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 fh.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 if __name__ == "__main__":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146 main()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148