diff select_snps.py @ 12:4b6590dd7250

Uploaded
author miller-lab
date Wed, 12 Sep 2012 17:10:26 -0400
parents 2c498d40ecde
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/select_snps.py	Wed Sep 12 17:10:26 2012 -0400
@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import math
+from optparse import OptionParser
+import genome_diversity as gd
+
+def main_function(parse_arguments=None):
+    if parse_arguments is None:
+        parse_arguments = lambda arguments: (None, arguments)
+    def main_decorator(to_decorate):
+        def decorated_main(arguments=None):
+            if arguments is None:
+                arguments = sys.argv
+            options, arguments = parse_arguments(arguments)
+            sys.exit(to_decorate(options, arguments))
+        return decorated_main
+    return main_decorator
+
+def parse_arguments(arguments):
+    parser = OptionParser()
+    parser.add_option('--input', dest='input')
+    parser.add_option('--output', dest='output')
+    parser.add_option('--index_dir', dest='index_dir')
+    parser.add_option('--num_snps', dest='num_snps')
+    parser.add_option('--ref_chrom_col', dest='ref_chrom_col')
+    parser.add_option('--ref_pos_col', dest='ref_pos_col')
+    parser.add_option('--ref_species', dest='ref_species')
+    return parser.parse_args(arguments[1:])
+
+@main_function(parse_arguments)
+def main(options, arguments):
+
+    ref_chrom_idx = to_int( options.ref_chrom_col ) -1
+    ref_pos_idx = to_int( options.ref_pos_col ) -1
+
+    if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx):
+        print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset."
+        sys.exit(1)
+
+    chrom_len_root = os.path.join( options.index_dir, 'shared/ucsc/chrom')
+    chrom_len_file = '%s.len' % options.ref_species
+    chrom_len_path = os.path.join(chrom_len_root, chrom_len_file)
+
+    chrlens = gd.ChrLens( chrom_len_path )
+
+    total_len = 0
+    for chrom in chrlens:
+        total_len += chrlens.length(chrom)
+
+    total_requested = int( options.num_snps )
+    lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx )
+    selected = select_snps( data, total_len, total_requested )
+    out_data = fix_selection_and_order_like_input(data, selected, total_requested)
+    write_selected_snps( options.output, out_data, lines, comments )
+
+def to_int( value ):
+    try:
+        int_value = int( value )
+    except ValueError:
+        int_value = 0
+    return int_value
+
+def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ):
+    fh = open( filename, 'r' )
+    if (chrom_idx >= pos_idx):
+        needed = chrom_idx + 1
+    else:
+        needed = pos_idx + 1
+    lines = []
+    data = []
+    comments = []
+    line_idx = 0
+    line_num = 0
+    for line in fh:
+        line_num += 1
+        line = line.rstrip('\r\n')
+        if line:
+            if line.startswith('#'):
+                comments.append(line)
+            else:
+                elems = line.split('\t')
+                if len(elems) >= needed:
+                    chrom = elems[chrom_idx]
+                    try:
+                        pos = int(elems[pos_idx])
+                    except ValueError:
+                        sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) )
+                        sys.exit(1)
+                    lines.append(line)
+                    chrom_sort = chrom.lstrip('chr')
+                    data.append( [chrom_sort, chrom, pos, line_num, line_idx] )
+                    line_idx += 1
+    fh.close()
+    data = sorted( data, key=lambda x: (x[0], x[2]) )
+    return lines, data, comments
+
+def select_snps( data, total_len, requested ):
+    old_chrom = None
+    next_print = 0
+    selected = []
+    space = total_len / requested
+    for data_idx, datum in enumerate( data ):
+        chrom = datum[1]
+        pos = datum[2]
+        if chrom != old_chrom:
+            old_chrom = chrom
+            next_print = 0
+        if pos >= next_print:
+            selected.append(data_idx)
+            next_print += space
+    return selected
+
+def fix_selection_and_order_like_input(data, selected, requested):
+    total_selected = len( selected )
+    a = float( total_selected ) / requested
+    b = a / 2
+
+    idx_list = []
+    for i in range( requested ):
+        idx = int( math.ceil( i * a + b ) - 1 )
+        idx_list.append( idx )
+
+    out_data = []
+
+    for i, data_idx in enumerate(selected):
+        if total_selected > requested:
+            if i in idx_list:
+                out_data.append(data[data_idx])
+        else:
+            out_data.append(data[data_idx])
+
+    out_data = sorted( out_data, key=lambda x: x[3] )
+
+    return out_data
+
+def write_selected_snps( filename, data, lines, comments ):
+    fh = open( filename, 'w' )
+
+    for comment in comments:
+        fh.write("%s\n" % comment )
+
+    for datum in data:
+        line_idx = datum[4]
+        fh.write("%s\n" % lines[line_idx])
+
+    fh.close()
+
+if __name__ == "__main__":
+    main()
+
+