0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import sys
|
|
4 import math
|
|
5 from optparse import OptionParser
|
|
6 import genome_diversity as gd
|
|
7
|
|
8 def main_function(parse_arguments=None):
|
|
9 if parse_arguments is None:
|
|
10 parse_arguments = lambda arguments: (None, arguments)
|
|
11 def main_decorator(to_decorate):
|
|
12 def decorated_main(arguments=None):
|
|
13 if arguments is None:
|
|
14 arguments = sys.argv
|
|
15 options, arguments = parse_arguments(arguments)
|
|
16 sys.exit(to_decorate(options, arguments))
|
|
17 return decorated_main
|
|
18 return main_decorator
|
|
19
|
|
20 def parse_arguments(arguments):
|
|
21 parser = OptionParser()
|
|
22 parser.add_option('--input', dest='input')
|
|
23 parser.add_option('--output', dest='output')
|
|
24 parser.add_option('--chrlens_loc', dest='chrlens_loc')
|
|
25 parser.add_option('--num_snps', dest='num_snps')
|
|
26 parser.add_option('--ref_chrom_col', dest='ref_chrom_col')
|
|
27 parser.add_option('--ref_pos_col', dest='ref_pos_col')
|
|
28 parser.add_option('--species', dest='species')
|
|
29 return parser.parse_args(arguments[1:])
|
|
30
|
|
31 @main_function(parse_arguments)
|
|
32 def main(options, arguments):
|
|
33
|
|
34 ref_chrom_idx = to_int( options.ref_chrom_col ) -1
|
|
35 ref_pos_idx = to_int( options.ref_pos_col ) -1
|
|
36
|
|
37 if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx):
|
|
38 print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset."
|
|
39 sys.exit(1)
|
|
40
|
|
41 chrlens = gd.ChrLens( options.chrlens_loc, options.species )
|
|
42
|
|
43 total_len = 0
|
|
44 for chrom in chrlens:
|
|
45 total_len += chrlens.length(chrom)
|
|
46
|
|
47 total_requested = int( options.num_snps )
|
|
48 lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx )
|
|
49 selected = select_snps( data, total_len, total_requested )
|
|
50 out_data = fix_selection_and_order_like_input(data, selected, total_requested)
|
|
51 write_selected_snps( options.output, out_data, lines, comments )
|
|
52
|
|
53 def to_int( value ):
|
|
54 try:
|
|
55 int_value = int( value )
|
|
56 except ValueError:
|
|
57 int_value = 0
|
|
58 return int_value
|
|
59
|
|
60 def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ):
|
|
61 fh = open( filename, 'r' )
|
|
62 if (chrom_idx >= pos_idx):
|
|
63 needed = chrom_idx + 1
|
|
64 else:
|
|
65 needed = pos_idx + 1
|
|
66 lines = []
|
|
67 data = []
|
|
68 comments = []
|
|
69 line_idx = 0
|
|
70 line_num = 0
|
|
71 for line in fh:
|
|
72 line_num += 1
|
|
73 line = line.rstrip('\r\n')
|
|
74 if line:
|
|
75 if line.startswith('#'):
|
|
76 comments.append(line)
|
|
77 else:
|
|
78 elems = line.split('\t')
|
|
79 if len(elems) >= needed:
|
|
80 chrom = elems[chrom_idx]
|
|
81 try:
|
|
82 pos = int(elems[pos_idx])
|
|
83 except ValueError:
|
|
84 sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) )
|
|
85 sys.exit(1)
|
|
86 lines.append(line)
|
|
87 chrom_sort = chrom.lstrip('chr')
|
|
88 data.append( [chrom_sort, chrom, pos, line_num, line_idx] )
|
|
89 line_idx += 1
|
|
90 fh.close()
|
|
91 data = sorted( data, key=lambda x: (x[0], x[2]) )
|
|
92 return lines, data, comments
|
|
93
|
|
94 def select_snps( data, total_len, requested ):
|
|
95 old_chrom = None
|
|
96 next_print = 0
|
|
97 selected = []
|
|
98 space = total_len / requested
|
|
99 for data_idx, datum in enumerate( data ):
|
|
100 chrom = datum[1]
|
|
101 pos = datum[2]
|
|
102 if chrom != old_chrom:
|
|
103 old_chrom = chrom
|
|
104 next_print = 0
|
|
105 if pos >= next_print:
|
|
106 selected.append(data_idx)
|
|
107 next_print += space
|
|
108 return selected
|
|
109
|
|
110 def fix_selection_and_order_like_input(data, selected, requested):
|
|
111 total_selected = len( selected )
|
|
112 a = float( total_selected ) / requested
|
|
113 b = a / 2
|
|
114
|
|
115 idx_list = []
|
|
116 for i in range( requested ):
|
|
117 idx = int( math.ceil( i * a + b ) - 1 )
|
|
118 idx_list.append( idx )
|
|
119
|
|
120 out_data = []
|
|
121
|
|
122 for i, data_idx in enumerate(selected):
|
|
123 if total_selected > requested:
|
|
124 if i in idx_list:
|
|
125 out_data.append(data[data_idx])
|
|
126 else:
|
|
127 out_data.append(data[data_idx])
|
|
128
|
|
129 out_data = sorted( out_data, key=lambda x: x[3] )
|
|
130
|
|
131 return out_data
|
|
132
|
|
133 def write_selected_snps( filename, data, lines, comments ):
|
|
134 fh = open( filename, 'w' )
|
|
135
|
|
136 for comment in comments:
|
|
137 fh.write("%s\n" % comment )
|
|
138
|
|
139 for datum in data:
|
|
140 line_idx = datum[4]
|
|
141 fh.write("%s\n" % lines[line_idx])
|
|
142
|
|
143 fh.close()
|
|
144
|
|
145 if __name__ == "__main__":
|
|
146 main()
|
|
147
|
|
148
|