0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import os
|
|
4 import sys
|
|
5 import math
|
|
6 from optparse import OptionParser
|
|
7 import genome_diversity as gd
|
|
8
|
|
9 def main_function(parse_arguments=None):
|
|
10 if parse_arguments is None:
|
|
11 parse_arguments = lambda arguments: (None, arguments)
|
|
12 def main_decorator(to_decorate):
|
|
13 def decorated_main(arguments=None):
|
|
14 if arguments is None:
|
|
15 arguments = sys.argv
|
|
16 options, arguments = parse_arguments(arguments)
|
|
17 sys.exit(to_decorate(options, arguments))
|
|
18 return decorated_main
|
|
19 return main_decorator
|
|
20
|
|
21 def parse_arguments(arguments):
|
|
22 parser = OptionParser()
|
|
23 parser.add_option('--input', dest='input')
|
|
24 parser.add_option('--output', dest='output')
|
|
25 parser.add_option('--index_dir', dest='index_dir')
|
|
26 parser.add_option('--num_snps', dest='num_snps')
|
|
27 parser.add_option('--ref_chrom_col', dest='ref_chrom_col')
|
|
28 parser.add_option('--ref_pos_col', dest='ref_pos_col')
|
|
29 parser.add_option('--ref_species', dest='ref_species')
|
|
30 return parser.parse_args(arguments[1:])
|
|
31
|
|
32 @main_function(parse_arguments)
|
|
33 def main(options, arguments):
|
|
34
|
|
35 ref_chrom_idx = to_int( options.ref_chrom_col ) -1
|
|
36 ref_pos_idx = to_int( options.ref_pos_col ) -1
|
|
37
|
|
38 if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx):
|
|
39 print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset."
|
|
40 sys.exit(1)
|
|
41
|
|
42 chrom_len_root = os.path.join( options.index_dir, 'shared/ucsc/chrom')
|
|
43 chrom_len_file = '%s.len' % options.ref_species
|
|
44 chrom_len_path = os.path.join(chrom_len_root, chrom_len_file)
|
|
45
|
|
46 chrlens = gd.ChrLens( chrom_len_path )
|
|
47
|
|
48 total_len = 0
|
|
49 for chrom in chrlens:
|
|
50 total_len += chrlens.length(chrom)
|
|
51
|
|
52 total_requested = int( options.num_snps )
|
|
53 lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx )
|
|
54 selected = select_snps( data, total_len, total_requested )
|
|
55 out_data = fix_selection_and_order_like_input(data, selected, total_requested)
|
|
56 write_selected_snps( options.output, out_data, lines, comments )
|
|
57
|
|
58 def to_int( value ):
|
|
59 try:
|
|
60 int_value = int( value )
|
|
61 except ValueError:
|
|
62 int_value = 0
|
|
63 return int_value
|
|
64
|
|
65 def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ):
|
|
66 fh = open( filename, 'r' )
|
|
67 if (chrom_idx >= pos_idx):
|
|
68 needed = chrom_idx + 1
|
|
69 else:
|
|
70 needed = pos_idx + 1
|
|
71 lines = []
|
|
72 data = []
|
|
73 comments = []
|
|
74 line_idx = 0
|
|
75 line_num = 0
|
|
76 for line in fh:
|
|
77 line_num += 1
|
|
78 line = line.rstrip('\r\n')
|
|
79 if line:
|
|
80 if line.startswith('#'):
|
|
81 comments.append(line)
|
|
82 else:
|
|
83 elems = line.split('\t')
|
|
84 if len(elems) >= needed:
|
|
85 chrom = elems[chrom_idx]
|
|
86 try:
|
|
87 pos = int(elems[pos_idx])
|
|
88 except ValueError:
|
|
89 sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) )
|
|
90 sys.exit(1)
|
|
91 lines.append(line)
|
|
92 chrom_sort = chrom.lstrip('chr')
|
|
93 data.append( [chrom_sort, chrom, pos, line_num, line_idx] )
|
|
94 line_idx += 1
|
|
95 fh.close()
|
|
96 data = sorted( data, key=lambda x: (x[0], x[2]) )
|
|
97 return lines, data, comments
|
|
98
|
|
99 def select_snps( data, total_len, requested ):
|
|
100 old_chrom = None
|
|
101 next_print = 0
|
|
102 selected = []
|
|
103 space = total_len / requested
|
|
104 for data_idx, datum in enumerate( data ):
|
|
105 chrom = datum[1]
|
|
106 pos = datum[2]
|
|
107 if chrom != old_chrom:
|
|
108 old_chrom = chrom
|
|
109 next_print = 0
|
|
110 if pos >= next_print:
|
|
111 selected.append(data_idx)
|
|
112 next_print += space
|
|
113 return selected
|
|
114
|
|
115 def fix_selection_and_order_like_input(data, selected, requested):
|
|
116 total_selected = len( selected )
|
|
117 a = float( total_selected ) / requested
|
|
118 b = a / 2
|
|
119
|
|
120 idx_list = []
|
|
121 for i in range( requested ):
|
|
122 idx = int( math.ceil( i * a + b ) - 1 )
|
|
123 idx_list.append( idx )
|
|
124
|
|
125 out_data = []
|
|
126
|
|
127 for i, data_idx in enumerate(selected):
|
|
128 if total_selected > requested:
|
|
129 if i in idx_list:
|
|
130 out_data.append(data[data_idx])
|
|
131 else:
|
|
132 out_data.append(data[data_idx])
|
|
133
|
|
134 out_data = sorted( out_data, key=lambda x: x[3] )
|
|
135
|
|
136 return out_data
|
|
137
|
|
138 def write_selected_snps( filename, data, lines, comments ):
|
|
139 fh = open( filename, 'w' )
|
|
140
|
|
141 for comment in comments:
|
|
142 fh.write("%s\n" % comment )
|
|
143
|
|
144 for datum in data:
|
|
145 line_idx = datum[4]
|
|
146 fh.write("%s\n" % lines[line_idx])
|
|
147
|
|
148 fh.close()
|
|
149
|
|
150 if __name__ == "__main__":
|
|
151 main()
|
|
152
|
|
153
|