annotate replace_NA.py @ 4:0f07aec558b9 draft

Uploaded
author devteam
date Wed, 12 Feb 2014 16:05:46 -0500
parents 5cb2020a097a
children d4e292ddda05
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
5cb2020a097a Uploaded
devteam
parents:
diff changeset
1 #!/usr/bin/env python
5cb2020a097a Uploaded
devteam
parents:
diff changeset
2
5cb2020a097a Uploaded
devteam
parents:
diff changeset
3 # Reads a tabular file and replaces a target sequence (currently 'NA') with a number in columns that have numerical values.
5cb2020a097a Uploaded
devteam
parents:
diff changeset
4 # Limitations: (a) can only take input from stdin and (b) cannot specify target or replacement.
5cb2020a097a Uploaded
devteam
parents:
diff changeset
5
5cb2020a097a Uploaded
devteam
parents:
diff changeset
6 import sys
5cb2020a097a Uploaded
devteam
parents:
diff changeset
7 import os
5cb2020a097a Uploaded
devteam
parents:
diff changeset
8 import tempfile
5cb2020a097a Uploaded
devteam
parents:
diff changeset
9
5cb2020a097a Uploaded
devteam
parents:
diff changeset
# Constants.
SEPARATOR = '\t'
# Placeholder value to look for, and the number written in its place.
TARGET = 'NA'
REPLACEMENT = -1
# Header names of columns that are treated as numerical regardless of content.
NUMERICAL_COLUMNS = ['1000g2012apr_all', 'esp6500si_all']

# Use tempfile to store data, because stdin can only be read once.
# mode='w' opens it as text so str lines can be written on both Python 2 and
# Python 3 (the default mode is binary); delete=False keeps the file on disk
# after close() so it can be reopened by name for the second pass.
temp_out = tempfile.NamedTemporaryFile(mode='w', delete=False)

# Use first line to set up data structure and identify numerical columns.
first_line = sys.stdin.readline()
# Remove only the line terminator and stray edge spaces. A bare strip() would
# also remove leading/trailing tabs -- the column separator -- so an empty
# first or last header cell would disappear and shift every column index.
fields = first_line.strip(' \r\n').split(SEPARATOR)
# Indices of the columns whose header name is in NUMERICAL_COLUMNS.
numerical_cols = [i for i, f in enumerate(fields) if f in NUMERICAL_COLUMNS]

# Data structure is a 2-element list for each field; first element is # of
# string elements and second element is # of number elements.
col_type_counts = [[0, 0] for _ in fields]
5cb2020a097a Uploaded
devteam
parents:
diff changeset
30
5cb2020a097a Uploaded
devteam
parents:
diff changeset
31 # Set up function to process lines.
5cb2020a097a Uploaded
devteam
parents:
diff changeset
def process_line_fields(fields, counts=None, target=None):
    '''
    Tally, per column, whether each value in one row is a string or a number.

    fields -- list of cell strings for a single row, in column order.
    counts -- list of [string_count, number_count] pairs, one per column,
              updated in place. Defaults to the module-level col_type_counts.
    target -- placeholder value that is left out of the tally. Defaults to
              the module-level TARGET.

    A value counts as a number when float() accepts it. If the row has more
    cells than counts has entries, counts is extended with [0, 0] pairs so
    that every column of the row has a counter. Returns None.
    '''
    # Resolve defaults at call time so the module globals are only required
    # when the caller does not supply its own values.
    if counts is None:
        counts = col_type_counts
    if target is None:
        target = TARGET

    # A row wider than the header used to raise IndexError below; give the
    # extra columns their own counters instead.
    missing = len(fields) - len(counts)
    if missing > 0:
        counts.extend([0, 0] for _ in range(missing))

    for i, f in enumerate(fields):
        # Ignore targets in calculation.
        if f == target:
            continue

        try:
            float(f)
        except ValueError:
            # Not a number.
            type_index = 0
        else:
            type_index = 1
        counts[i][type_index] += 1
5cb2020a097a Uploaded
devteam
parents:
diff changeset
49
5cb2020a097a Uploaded
devteam
parents:
diff changeset
50
5cb2020a097a Uploaded
devteam
parents:
diff changeset
# Everything below runs under try/finally so the temp file is removed even
# when one of the passes fails part-way through.
try:
    # Pass 1: tally string/number values per column while copying the input
    # to the temp file.
    # Process first line.
    process_line_fields(fields)
    temp_out.write(first_line)

    # Process N-1 lines.
    for line in sys.stdin:
        # Remove only the line terminator and stray edge spaces. A bare
        # strip() would also remove leading/trailing tabs, dropping empty
        # first/last cells and shifting the remaining values into the wrong
        # columns.
        fields = line.strip(' \r\n').split(SEPARATOR)
        # Make sure every column of this row has a counter, so a row wider
        # than the header does not index past the end of col_type_counts.
        extra = len(fields) - len(col_type_counts)
        if extra > 0:
            col_type_counts.extend([0, 0] for _ in range(extra))
        process_line_fields(fields)
        temp_out.write(line)

    # Close temp file so that it can be read.
    temp_name = temp_out.name
    temp_out.close()

    # Get column type based on label or consensus: a column is a number
    # column when its header is a known numerical column, or when it holds at
    # least as many numbers as strings (ties, including columns that contain
    # only targets, count as numbers).
    col_types = [
        'number' if i in numerical_cols or counts[0] <= counts[1] else 'string'
        for i, counts in enumerate(col_type_counts)
    ]

    # Pass 2: replace target in number columns.
    with open(temp_name, 'r') as temp_in:
        for line in temp_in:
            fields = line.strip(' \r\n').split(SEPARATOR)
            for i, f in enumerate(fields):
                # The bounds check covers a header line that splits into more
                # cells here than were counted when it was first parsed.
                if f == TARGET and i < len(col_types) and col_types[i] == 'number':
                    fields[i] = str(REPLACEMENT)
            # Written explicitly (rather than with the print statement) so the
            # output is the same on Python 2 and Python 3.
            sys.stdout.write(SEPARATOR.join(fields) + '\n')
finally:
    # Clean up temp file. close() is a no-op if the file is already closed.
    temp_out.close()
    os.unlink(temp_out.name)
5cb2020a097a Uploaded
devteam
parents:
diff changeset
87
5cb2020a097a Uploaded
devteam
parents:
diff changeset
88
5cb2020a097a Uploaded
devteam
parents:
diff changeset
89