annotate select_snps.py @ 0:2c498d40ecde

Uploaded
author miller-lab
date Mon, 09 Apr 2012 12:03:06 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
1 #!/usr/bin/env python
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
2
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
3 import os
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
4 import sys
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
5 import math
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
6 from optparse import OptionParser
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
7 import genome_diversity as gd
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
8
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
9 def main_function(parse_arguments=None):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
10 if parse_arguments is None:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
11 parse_arguments = lambda arguments: (None, arguments)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
12 def main_decorator(to_decorate):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
13 def decorated_main(arguments=None):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
14 if arguments is None:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
15 arguments = sys.argv
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
16 options, arguments = parse_arguments(arguments)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
17 sys.exit(to_decorate(options, arguments))
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
18 return decorated_main
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
19 return main_decorator
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
20
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
21 def parse_arguments(arguments):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
22 parser = OptionParser()
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
23 parser.add_option('--input', dest='input')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
24 parser.add_option('--output', dest='output')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
25 parser.add_option('--index_dir', dest='index_dir')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
26 parser.add_option('--num_snps', dest='num_snps')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
27 parser.add_option('--ref_chrom_col', dest='ref_chrom_col')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
28 parser.add_option('--ref_pos_col', dest='ref_pos_col')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
29 parser.add_option('--ref_species', dest='ref_species')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
30 return parser.parse_args(arguments[1:])
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
31
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
32 @main_function(parse_arguments)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
33 def main(options, arguments):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
34
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
35 ref_chrom_idx = to_int( options.ref_chrom_col ) -1
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
36 ref_pos_idx = to_int( options.ref_pos_col ) -1
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
37
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
38 if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
39 print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset."
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
40 sys.exit(1)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
41
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
42 chrom_len_root = os.path.join( options.index_dir, 'shared/ucsc/chrom')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
43 chrom_len_file = '%s.len' % options.ref_species
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
44 chrom_len_path = os.path.join(chrom_len_root, chrom_len_file)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
45
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
46 chrlens = gd.ChrLens( chrom_len_path )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
47
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
48 total_len = 0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
49 for chrom in chrlens:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
50 total_len += chrlens.length(chrom)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
51
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
52 total_requested = int( options.num_snps )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
53 lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
54 selected = select_snps( data, total_len, total_requested )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
55 out_data = fix_selection_and_order_like_input(data, selected, total_requested)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
56 write_selected_snps( options.output, out_data, lines, comments )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
57
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
58 def to_int( value ):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
59 try:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
60 int_value = int( value )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
61 except ValueError:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
62 int_value = 0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
63 return int_value
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
64
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
65 def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
66 fh = open( filename, 'r' )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
67 if (chrom_idx >= pos_idx):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
68 needed = chrom_idx + 1
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
69 else:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
70 needed = pos_idx + 1
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
71 lines = []
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
72 data = []
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
73 comments = []
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
74 line_idx = 0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
75 line_num = 0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
76 for line in fh:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
77 line_num += 1
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
78 line = line.rstrip('\r\n')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
79 if line:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
80 if line.startswith('#'):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
81 comments.append(line)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
82 else:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
83 elems = line.split('\t')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
84 if len(elems) >= needed:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
85 chrom = elems[chrom_idx]
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
86 try:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
87 pos = int(elems[pos_idx])
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
88 except ValueError:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
89 sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
90 sys.exit(1)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
91 lines.append(line)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
92 chrom_sort = chrom.lstrip('chr')
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
93 data.append( [chrom_sort, chrom, pos, line_num, line_idx] )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
94 line_idx += 1
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
95 fh.close()
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
96 data = sorted( data, key=lambda x: (x[0], x[2]) )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
97 return lines, data, comments
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
98
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
99 def select_snps( data, total_len, requested ):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
100 old_chrom = None
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
101 next_print = 0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
102 selected = []
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
103 space = total_len / requested
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
104 for data_idx, datum in enumerate( data ):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
105 chrom = datum[1]
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
106 pos = datum[2]
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
107 if chrom != old_chrom:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
108 old_chrom = chrom
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
109 next_print = 0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
110 if pos >= next_print:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
111 selected.append(data_idx)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
112 next_print += space
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
113 return selected
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
114
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
115 def fix_selection_and_order_like_input(data, selected, requested):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
116 total_selected = len( selected )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
117 a = float( total_selected ) / requested
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
118 b = a / 2
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
119
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
120 idx_list = []
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
121 for i in range( requested ):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
122 idx = int( math.ceil( i * a + b ) - 1 )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
123 idx_list.append( idx )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
124
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
125 out_data = []
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
126
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
127 for i, data_idx in enumerate(selected):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
128 if total_selected > requested:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
129 if i in idx_list:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
130 out_data.append(data[data_idx])
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
131 else:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
132 out_data.append(data[data_idx])
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
133
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
134 out_data = sorted( out_data, key=lambda x: x[3] )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
135
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
136 return out_data
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
137
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
138 def write_selected_snps( filename, data, lines, comments ):
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
139 fh = open( filename, 'w' )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
140
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
141 for comment in comments:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
142 fh.write("%s\n" % comment )
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
143
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
144 for datum in data:
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
145 line_idx = datum[4]
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
146 fh.write("%s\n" % lines[line_idx])
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
147
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
148 fh.close()
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
149
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
150 if __name__ == "__main__":
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
151 main()
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
152
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
153