3
|
1 #!/usr/bin/env python
|
|
2
|
|
# Reads a tab-separated file from stdin and replaces a target value (currently 'NA')
# with a number (currently -1) in columns that hold numerical values.
# Limitations: (a) input can only be read from stdin and (b) the target and
# replacement values cannot be specified by the caller.
|
|
5
|
|
6 import sys
|
|
7 import os
|
|
8 import tempfile
|
|
9
|
|
# Constants.
SEPARATOR = '\t'
TARGET = 'NA'
REPLACEMENT = -1
# Column labels that are always treated as numerical, whatever they contain.
NUMERICAL_COLUMNS = ['1000g2012apr_all', 'esp6500si_all']

# Spool stdin into a temporary file so the data can be read a second time
# once every column's type is known. Text mode ('w') accepts str lines under
# both Python 2 and Python 3; delete=False keeps the file on disk after
# close() so that it can be reopened by name.
temp_out = tempfile.NamedTemporaryFile(mode='w', delete=False)

# Use the first line to size the data structures and to find the columns whose
# label marks them as numerical. Only spaces and line terminators are trimmed:
# a bare strip() would also eat leading/trailing separators, dropping empty
# edge fields and shifting every column index.
first_line = sys.stdin.readline()
fields = first_line.strip(' \r\n').split(SEPARATOR)
numerical_cols = [i for i, f in enumerate(fields) if f in NUMERICAL_COLUMNS]

# One [string_count, number_count] pair per column.
col_type_counts = [[0, 0] for _ in fields]
|
|
30
|
|
31 # Set up function to process lines.
|
|
def process_line_fields(fields, type_counts=None, target=None):
    '''
    Tally, per column, whether each field of one line is a string or a number.

    A field counts as a number when float() accepts it and as a string
    otherwise. Fields equal to the target placeholder are not tallied.

    fields      -- list of field strings for a single line.
    type_counts -- list of [string_count, number_count] pairs, one per column,
                   updated in place. Defaults to the module-level
                   col_type_counts.
    target      -- placeholder value to skip. Defaults to the module-level
                   TARGET.

    Returns None; the result is the in-place update of type_counts.
    '''
    if type_counts is None:
        type_counts = col_type_counts
    if target is None:
        target = TARGET

    # A line wider than any seen so far gets fresh counters for its extra
    # columns instead of raising IndexError.
    missing = len(fields) - len(type_counts)
    if missing > 0:
        type_counts.extend([0, 0] for _ in range(missing))

    for i, f in enumerate(fields):
        # Ignore targets in calculation.
        if f == target:
            continue

        # Only a failed conversion means "not a number"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        try:
            float(f)
        except ValueError:
            type_index = 0
        else:
            type_index = 1
        type_counts[i][type_index] += 1
|
|
49
|
|
50
|
|
# Two passes over the input: the first tallies column types while spooling
# every line to the temp file, the second rewrites targets in number columns.
# The try/finally guarantees the temp file is removed even if a pass fails.
try:
    # Process first line.
    process_line_fields(fields)
    temp_out.write(first_line)

    # Process N-1 lines. Only spaces and line terminators are trimmed so that
    # empty leading/trailing fields keep their column position.
    for line in sys.stdin:
        process_line_fields(line.strip(' \r\n').split(SEPARATOR))
        temp_out.write(line)

    # Close temp file so that it can be read.
    temp_name = temp_out.name
    temp_out.close()

    # Get column type based on label or consensus: a column is a number when
    # its label is a known numerical one or when strings do not outnumber
    # numbers among its tallied values.
    col_types = []
    for i, counts in enumerate(col_type_counts):
        if i in numerical_cols or counts[0] <= counts[1]:
            col_types.append('number')
        else:
            col_types.append('string')

    # Replace target in number columns.
    replacement = str(REPLACEMENT)
    with open(temp_name, 'r') as temp_in:
        for line in temp_in:
            fields = line.strip(' \r\n').split(SEPARATOR)
            for i, f in enumerate(fields):
                # Fields beyond the known columns are passed through as-is.
                if (f == TARGET and i < len(col_types)
                        and col_types[i] == 'number'):
                    fields[i] = replacement
            # write() instead of the print statement: same output, and valid
            # under both Python 2 and Python 3.
            sys.stdout.write(SEPARATOR.join(fields) + '\n')
finally:
    # Clean up temp file. close() is a no-op if the file is already closed.
    temp_out.close()
    os.unlink(temp_out.name)
|
|
87
|
|
88
|
|
89
|