diff tablemerger.py @ 4:bd5692103d5b draft

Uploaded
author rreumerman
date Fri, 05 Apr 2013 05:00:40 -0400
parents
children 8de0ffc2166f
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tablemerger.py	Fri Apr 05 05:00:40 2013 -0400
@@ -0,0 +1,73 @@
+'''Takes tab-delimited SNP tables from user input and merges them into one.'''
+
+import sys
+files = []
+filenames = []
+
+try:
+    output = open(sys.argv[1], "w")
+    output.write('Position\tReference')
+except:
+    exit("No output file given or unable to open output file.")
+for name in sys.argv[2:]:
+    try:
+        files.append(open(name, "rU"))
+    except:
+        continue
+
+# Fetch headers and print them to output file;
+headers = [header.readline()[:-1].split('\t')[2:] for header in files]
+columns = [len(strains) for strains in headers]
+for strain in [a for b in headers for a in b]:
+    output.write('\t'+strain)
+    output.flush()
+
+file_active = [True]*len(files)
+snps = [row.readline()[:-1].split('\t') for row in files]
+
+while True in file_active:
+    for h in range(0,len(snps)):
+        if file_active[h]:
+            cur_pos = [h]
+            lowest = int(snps[h][0])
+            break
+    i = 1
+
+    # Determine lowest position
+    while i < len(snps):
+        if int(snps[i][0]) < lowest and file_active[i]:
+            lowest = int(snps[i][0])
+            cur_pos = [i]
+        elif int(snps[i][0]) == lowest:
+            cur_pos.append(i)
+        i+=1
+
+    # Check if all SNPs sharing a position have the same reference base, exit with message otherwise;
+    if len(cur_pos) > 1:
+        ref_base = snps[cur_pos[0]][1].lower()
+        for j in cur_pos[1:]:
+            if snps[j][1].lower() != ref_base:
+                error = '\nError: Reference bases not the same for position %s, present in following files:' % lowest
+                for k in cur_pos:
+                    error += ' '+filenames[k]
+                exit(error+'.')
+
+    # Write line to output file containing position, ref base and snps/empty cells;
+    output.write('\n'+snps[cur_pos[0]][0]+'\t'+snps[cur_pos[0]][1].lower())
+    for l,row in enumerate(snps):
+        if l in cur_pos:
+            for snp in row[2:]:
+                output.write('\t'+snp)
+        else:
+            output.write('\t'*columns[l])
+
+    # Read new line in files that had snp at current position;
+    for m in cur_pos:
+        line = files[m].readline()
+        if line == '': file_active[m] = False
+        else:
+            snps[m] = line.split('\t')
+            snps[m][-1] = snps[m][-1].rstrip()# Remove newline character at end of line;
+
+for it in files: it.close()
+output.close()