comparison select_snps.py @ 12:4b6590dd7250

Uploaded
author miller-lab
date Wed, 12 Sep 2012 17:10:26 -0400
parents 2c498d40ecde
children
comparison
equal deleted inserted replaced
11:d4ec09e8079f 12:4b6590dd7250
1 #!/usr/bin/env python
2
3 import os
4 import sys
5 import math
6 from optparse import OptionParser
7 import genome_diversity as gd
8
9 def main_function(parse_arguments=None):
10 if parse_arguments is None:
11 parse_arguments = lambda arguments: (None, arguments)
12 def main_decorator(to_decorate):
13 def decorated_main(arguments=None):
14 if arguments is None:
15 arguments = sys.argv
16 options, arguments = parse_arguments(arguments)
17 sys.exit(to_decorate(options, arguments))
18 return decorated_main
19 return main_decorator
20
21 def parse_arguments(arguments):
22 parser = OptionParser()
23 parser.add_option('--input', dest='input')
24 parser.add_option('--output', dest='output')
25 parser.add_option('--index_dir', dest='index_dir')
26 parser.add_option('--num_snps', dest='num_snps')
27 parser.add_option('--ref_chrom_col', dest='ref_chrom_col')
28 parser.add_option('--ref_pos_col', dest='ref_pos_col')
29 parser.add_option('--ref_species', dest='ref_species')
30 return parser.parse_args(arguments[1:])
31
32 @main_function(parse_arguments)
33 def main(options, arguments):
34
35 ref_chrom_idx = to_int( options.ref_chrom_col ) -1
36 ref_pos_idx = to_int( options.ref_pos_col ) -1
37
38 if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx):
39 print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset."
40 sys.exit(1)
41
42 chrom_len_root = os.path.join( options.index_dir, 'shared/ucsc/chrom')
43 chrom_len_file = '%s.len' % options.ref_species
44 chrom_len_path = os.path.join(chrom_len_root, chrom_len_file)
45
46 chrlens = gd.ChrLens( chrom_len_path )
47
48 total_len = 0
49 for chrom in chrlens:
50 total_len += chrlens.length(chrom)
51
52 total_requested = int( options.num_snps )
53 lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx )
54 selected = select_snps( data, total_len, total_requested )
55 out_data = fix_selection_and_order_like_input(data, selected, total_requested)
56 write_selected_snps( options.output, out_data, lines, comments )
57
58 def to_int( value ):
59 try:
60 int_value = int( value )
61 except ValueError:
62 int_value = 0
63 return int_value
64
65 def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ):
66 fh = open( filename, 'r' )
67 if (chrom_idx >= pos_idx):
68 needed = chrom_idx + 1
69 else:
70 needed = pos_idx + 1
71 lines = []
72 data = []
73 comments = []
74 line_idx = 0
75 line_num = 0
76 for line in fh:
77 line_num += 1
78 line = line.rstrip('\r\n')
79 if line:
80 if line.startswith('#'):
81 comments.append(line)
82 else:
83 elems = line.split('\t')
84 if len(elems) >= needed:
85 chrom = elems[chrom_idx]
86 try:
87 pos = int(elems[pos_idx])
88 except ValueError:
89 sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) )
90 sys.exit(1)
91 lines.append(line)
92 chrom_sort = chrom.lstrip('chr')
93 data.append( [chrom_sort, chrom, pos, line_num, line_idx] )
94 line_idx += 1
95 fh.close()
96 data = sorted( data, key=lambda x: (x[0], x[2]) )
97 return lines, data, comments
98
99 def select_snps( data, total_len, requested ):
100 old_chrom = None
101 next_print = 0
102 selected = []
103 space = total_len / requested
104 for data_idx, datum in enumerate( data ):
105 chrom = datum[1]
106 pos = datum[2]
107 if chrom != old_chrom:
108 old_chrom = chrom
109 next_print = 0
110 if pos >= next_print:
111 selected.append(data_idx)
112 next_print += space
113 return selected
114
115 def fix_selection_and_order_like_input(data, selected, requested):
116 total_selected = len( selected )
117 a = float( total_selected ) / requested
118 b = a / 2
119
120 idx_list = []
121 for i in range( requested ):
122 idx = int( math.ceil( i * a + b ) - 1 )
123 idx_list.append( idx )
124
125 out_data = []
126
127 for i, data_idx in enumerate(selected):
128 if total_selected > requested:
129 if i in idx_list:
130 out_data.append(data[data_idx])
131 else:
132 out_data.append(data[data_idx])
133
134 out_data = sorted( out_data, key=lambda x: x[3] )
135
136 return out_data
137
138 def write_selected_snps( filename, data, lines, comments ):
139 fh = open( filename, 'w' )
140
141 for comment in comments:
142 fh.write("%s\n" % comment )
143
144 for datum in data:
145 line_idx = datum[4]
146 fh.write("%s\n" % lines[line_idx])
147
148 fh.close()
149
150 if __name__ == "__main__":
151 main()
152
153