annotate tablemerger.py @ 7:8de0ffc2166f draft default tip

Uploaded
author rreumerman
date Mon, 10 Jun 2013 09:40:54 -0400
parents bd5692103d5b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
bd5692103d5b Uploaded
rreumerman
parents:
diff changeset
'''Takes tab-delimited SNP tables from user input and merges them into one.

Usage: tablemerger.py OUTPUT INPUT [INPUT ...]

Every input table starts with a header line (position column, reference
column, then one column per strain) followed by data rows whose first field
is an integer position.  The rows of each input are consumed in file order
and the row(s) with the lowest current position are written next, so the
inputs are expected to be sorted by position.  Cells of strains that have no
SNP at a position are left empty.
'''

import sys


def _next_row(handle):
    '''Return (position, fields) for the next non-blank row, or None at EOF.

    Trailing whitespace (including the line terminator) is stripped from the
    last field only, so empty cells inside the row are preserved.
    '''
    while True:
        line = handle.readline()
        if line == '':
            return None
        fields = line.split('\t')
        fields[-1] = fields[-1].rstrip()
        if fields != ['']:  # Skip blank lines instead of failing on int('');
            return int(fields[0]), fields


def merge_tables(output, files, filenames):
    '''Merge the opened SNP tables in files and write the result to output.

    output    -- writable text file object receiving the merged table.
    files     -- readable text file objects, positioned at their header line.
    filenames -- names matching files index by index; used in error messages.

    Exits through sys.exit() with a message when files that share a position
    disagree (case-insensitively) on the reference base.
    '''
    # Fetch headers (minus the position column) and print them to output file;
    headers = [handle.readline().rstrip('\r\n').split('\t')[1:] for handle in files]
    columns = [len(header[1:]) for header in headers]
    reference_label = next((header[0] for header in headers if header), 'Reference')
    strains = [strain for header in headers for strain in header[1:]]
    output.write('\t'.join(['Position', reference_label] + strains))

    # One pending row per file; None marks a file that has been exhausted;
    rows = [_next_row(handle) for handle in files]

    while True:
        active = [idx for idx, row in enumerate(rows) if row is not None]
        if not active:
            break

        # Determine lowest position among files that still have a row;
        lowest = min(rows[idx][0] for idx in active)
        cur_pos = [idx for idx in active if rows[idx][0] == lowest]

        # Check if all SNPs sharing a position have the same reference base, exit with message otherwise;
        first_fields = rows[cur_pos[0]][1]
        ref_base = first_fields[1].lower()
        for idx in cur_pos[1:]:
            if rows[idx][1][1].lower() != ref_base:
                error = '\nError: Reference bases not the same for position %s, present in following files:' % lowest
                for k in cur_pos:
                    error += ' ' + filenames[k]
                sys.exit(error + '.')

        # Write line to output file containing position, ref base and snps/empty cells;
        cells = [first_fields[0], ref_base]
        for idx, row in enumerate(rows):
            if idx in cur_pos:
                cells.extend(row[1][2:])
            else:
                cells.extend([''] * columns[idx])
        output.write('\n' + '\t'.join(cells))

        # Read new line in files that had snp at current position;
        for idx in cur_pos:
            rows[idx] = _next_row(files[idx])


def main(argv):
    '''Run the merger; argv[1] is the output path, argv[2:] the input tables.'''
    try:
        output = open(argv[1], "w")
    except (IndexError, IOError, OSError):
        sys.exit("No output file given or unable to open output file.")

    files = []
    filenames = []
    try:
        for name in argv[2:]:
            try:
                # Plain "r": the "U" flag was removed in Python 3.11, where
                # universal newlines are already the text-mode default;
                handle = open(name, "r")
            except (IOError, OSError):
                continue  # Unreadable inputs are skipped on purpose;
            files.append(handle)
            filenames.append(name)

        if not files:
            sys.exit("No readable input files given.")

        merge_tables(output, files, filenames)
    finally:
        for handle in files:
            handle.close()
        output.close()


if __name__ == "__main__":
    main(sys.argv)