Mercurial > repos > miller-lab > genome_diversity
comparison select_snps.py @ 12:4b6590dd7250
Uploaded
author | miller-lab |
---|---|
date | Wed, 12 Sep 2012 17:10:26 -0400 |
parents | 2c498d40ecde |
children |
comparison
equal
deleted
inserted
replaced
11:d4ec09e8079f | 12:4b6590dd7250 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import os | |
4 import sys | |
5 import math | |
6 from optparse import OptionParser | |
7 import genome_diversity as gd | |
8 | |
9 def main_function(parse_arguments=None): | |
10 if parse_arguments is None: | |
11 parse_arguments = lambda arguments: (None, arguments) | |
12 def main_decorator(to_decorate): | |
13 def decorated_main(arguments=None): | |
14 if arguments is None: | |
15 arguments = sys.argv | |
16 options, arguments = parse_arguments(arguments) | |
17 sys.exit(to_decorate(options, arguments)) | |
18 return decorated_main | |
19 return main_decorator | |
20 | |
21 def parse_arguments(arguments): | |
22 parser = OptionParser() | |
23 parser.add_option('--input', dest='input') | |
24 parser.add_option('--output', dest='output') | |
25 parser.add_option('--index_dir', dest='index_dir') | |
26 parser.add_option('--num_snps', dest='num_snps') | |
27 parser.add_option('--ref_chrom_col', dest='ref_chrom_col') | |
28 parser.add_option('--ref_pos_col', dest='ref_pos_col') | |
29 parser.add_option('--ref_species', dest='ref_species') | |
30 return parser.parse_args(arguments[1:]) | |
31 | |
32 @main_function(parse_arguments) | |
33 def main(options, arguments): | |
34 | |
35 ref_chrom_idx = to_int( options.ref_chrom_col ) -1 | |
36 ref_pos_idx = to_int( options.ref_pos_col ) -1 | |
37 | |
38 if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx): | |
39 print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset." | |
40 sys.exit(1) | |
41 | |
42 chrom_len_root = os.path.join( options.index_dir, 'shared/ucsc/chrom') | |
43 chrom_len_file = '%s.len' % options.ref_species | |
44 chrom_len_path = os.path.join(chrom_len_root, chrom_len_file) | |
45 | |
46 chrlens = gd.ChrLens( chrom_len_path ) | |
47 | |
48 total_len = 0 | |
49 for chrom in chrlens: | |
50 total_len += chrlens.length(chrom) | |
51 | |
52 total_requested = int( options.num_snps ) | |
53 lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx ) | |
54 selected = select_snps( data, total_len, total_requested ) | |
55 out_data = fix_selection_and_order_like_input(data, selected, total_requested) | |
56 write_selected_snps( options.output, out_data, lines, comments ) | |
57 | |
58 def to_int( value ): | |
59 try: | |
60 int_value = int( value ) | |
61 except ValueError: | |
62 int_value = 0 | |
63 return int_value | |
64 | |
65 def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ): | |
66 fh = open( filename, 'r' ) | |
67 if (chrom_idx >= pos_idx): | |
68 needed = chrom_idx + 1 | |
69 else: | |
70 needed = pos_idx + 1 | |
71 lines = [] | |
72 data = [] | |
73 comments = [] | |
74 line_idx = 0 | |
75 line_num = 0 | |
76 for line in fh: | |
77 line_num += 1 | |
78 line = line.rstrip('\r\n') | |
79 if line: | |
80 if line.startswith('#'): | |
81 comments.append(line) | |
82 else: | |
83 elems = line.split('\t') | |
84 if len(elems) >= needed: | |
85 chrom = elems[chrom_idx] | |
86 try: | |
87 pos = int(elems[pos_idx]) | |
88 except ValueError: | |
89 sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) ) | |
90 sys.exit(1) | |
91 lines.append(line) | |
92 chrom_sort = chrom.lstrip('chr') | |
93 data.append( [chrom_sort, chrom, pos, line_num, line_idx] ) | |
94 line_idx += 1 | |
95 fh.close() | |
96 data = sorted( data, key=lambda x: (x[0], x[2]) ) | |
97 return lines, data, comments | |
98 | |
99 def select_snps( data, total_len, requested ): | |
100 old_chrom = None | |
101 next_print = 0 | |
102 selected = [] | |
103 space = total_len / requested | |
104 for data_idx, datum in enumerate( data ): | |
105 chrom = datum[1] | |
106 pos = datum[2] | |
107 if chrom != old_chrom: | |
108 old_chrom = chrom | |
109 next_print = 0 | |
110 if pos >= next_print: | |
111 selected.append(data_idx) | |
112 next_print += space | |
113 return selected | |
114 | |
115 def fix_selection_and_order_like_input(data, selected, requested): | |
116 total_selected = len( selected ) | |
117 a = float( total_selected ) / requested | |
118 b = a / 2 | |
119 | |
120 idx_list = [] | |
121 for i in range( requested ): | |
122 idx = int( math.ceil( i * a + b ) - 1 ) | |
123 idx_list.append( idx ) | |
124 | |
125 out_data = [] | |
126 | |
127 for i, data_idx in enumerate(selected): | |
128 if total_selected > requested: | |
129 if i in idx_list: | |
130 out_data.append(data[data_idx]) | |
131 else: | |
132 out_data.append(data[data_idx]) | |
133 | |
134 out_data = sorted( out_data, key=lambda x: x[3] ) | |
135 | |
136 return out_data | |
137 | |
138 def write_selected_snps( filename, data, lines, comments ): | |
139 fh = open( filename, 'w' ) | |
140 | |
141 for comment in comments: | |
142 fh.write("%s\n" % comment ) | |
143 | |
144 for datum in data: | |
145 line_idx = datum[4] | |
146 fh.write("%s\n" % lines[line_idx]) | |
147 | |
148 fh.close() | |
149 | |
150 if __name__ == "__main__": | |
151 main() | |
152 | |
153 |