comparison replace_NA.py @ 3:5cb2020a097a draft

Uploaded
author devteam
date Wed, 12 Feb 2014 15:44:18 -0500
parents
children d4e292ddda05
comparison
equal deleted inserted replaced
2:9c75a9b5ecd2 3:5cb2020a097a
1 #!/usr/bin/env python
2
3 # Reads a tabular file and replaces a target sequence (currently 'NA') with a number in columns that have numerical values.
4 # Limitations: (a) can only take input from stdin and (b) cannot specify target or replacement.
5
6 import sys
7 import os
8 import tempfile
9
# Constants.
SEPARATOR = '\t'
TARGET = 'NA'
REPLACEMENT = -1
# Labels of columns that are always treated as numerical.
NUMERICAL_COLUMNS = ['1000g2012apr_all', 'esp6500si_all']

# Input is copied into a temp file so that it can be read a second time
# once the column types are known.
temp_out = tempfile.NamedTemporaryFile(delete=False)

# The first line defines the columns: split it into labels and remember the
# positions of the labels that are known to be numerical.
first_line = sys.stdin.readline()
fields = first_line.strip().split(SEPARATOR)
numerical_cols = [
    position
    for position, label in enumerate(fields)
    if label in NUMERICAL_COLUMNS
]

# One [string_count, number_count] pair per column of the first line.
col_type_counts = [[0, 0] for _ in fields]
30
31 # Set up function to process lines.
def process_line_fields(fields):
    '''
    Tally, per column, whether each value in a line is a string or a number.

    For every field that is not the TARGET placeholder, the matching entry of
    the module-level col_type_counts is updated in place: index 0 counts
    values that float() rejects, index 1 counts values that float() accepts.

    fields -- list of the string values of one line, in column order.

    Returns None.
    '''
    for i, f in enumerate(fields):
        # A row may be wider than the first line; give every extra column its
        # own counters instead of failing with an IndexError.
        if i >= len(col_type_counts):
            col_type_counts.append([0, 0])

        # Ignore targets in calculation.
        if f == TARGET:
            continue

        try:
            float(f)
        except ValueError:
            # Not a number. Only ValueError is caught so that unrelated
            # exceptions (e.g. KeyboardInterrupt) are not swallowed.
            type_index = 0
        else:
            type_index = 1
        col_type_counts[i][type_index] += 1
49
50
# Two passes over the input: the first pass counts value types per column
# while copying every line into the temp file; the second pass re-reads the
# temp file and writes the lines to stdout with the target replaced in
# number columns. The try/finally guarantees that the temp file is removed
# even when one of the passes fails.
temp_name = temp_out.name
try:
    # Process first line (it was already consumed to set up the columns).
    process_line_fields(fields)
    temp_out.write(first_line)

    # Process N-1 lines.
    for line in sys.stdin:
        # NOTE(review): strip() also removes leading/trailing tabs, so empty
        # fields at either edge of a line are dropped before splitting.
        fields = line.strip().split(SEPARATOR)
        process_line_fields(fields)
        temp_out.write(line)

    # Close temp file so that it can be read.
    temp_out.close()

    # Get column type based on label or consensus. A tie (including a column
    # with no counted values at all) is treated as a number column.
    col_types = [
        'number' if i in numerical_cols or counts[0] <= counts[1] else 'string'
        for i, counts in enumerate(col_type_counts)
    ]

    # Replace target in number columns.
    replacement = str(REPLACEMENT)
    with open(temp_name, 'r') as temp_in:
        for line in temp_in:
            fields = line.strip().split(SEPARATOR)
            for i, f in enumerate(fields):
                if f != TARGET:
                    continue
                # A column beyond col_types has no counts, which is the same
                # zero/zero tie that makes a known column a number column.
                if i >= len(col_types) or col_types[i] == 'number':
                    fields[i] = replacement
            # sys.stdout.write instead of the print statement so that the
            # block parses under both Python 2 and Python 3.
            sys.stdout.write(SEPARATOR.join(fields) + '\n')
finally:
    # Clean up temp file.
    temp_out.close()
    os.unlink(temp_name)
87
88
89