diff convert_SOLiD_color2nuc.py @ 0:ab28e7de2db3 draft default tip

Imported from capsule None
author devteam
date Mon, 19 May 2014 12:33:14 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_SOLiD_color2nuc.py	Mon May 19 12:33:14 2014 -0400
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+"""
+convert SOLiD calor-base data to nucleotide sequence
+example: T011213122200221123032111221021210131332222101
+         TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT
+"""
+
+import sys, os
+
+def stop_err(msg):
+    
+    sys.stderr.write(msg)
+    sys.stderr.write('\n')
+    sys.exit()
+    
+def color2base(color_seq):
+
+    first_nuc = ['A','C','G','T']
+    code_matrix = {}
+    code_matrix['0'] = ['A','C','G','T']
+    code_matrix['1'] = ['C','A','T','G']
+    code_matrix['2'] = ['G','T','A','C']
+    code_matrix['3'] = ['T','G','C','A']
+
+    overlap_nuc = ''
+    nuc_seq = ''
+    
+    seq_prefix = prefix = color_seq[0].upper()
+    color_seq = color_seq[1:]
+                
+    if not (seq_prefix in first_nuc):
+        stop_err('The leading nucleotide is invalid. Must be one of the four nucleotides: A, T, C, G.\nThe file contains a %s' %seq_prefix )
+
+    for code in color_seq:
+        
+        if not (code in ['0','1','2','3']):
+            stop_err('Expect digits (0, 1, 2, 3) in the color-cading data. File contains numbers other than the set.\nThe file contains a %s' %code)
+        
+        second_nuc = code_matrix[code]
+        overlap_nuc = second_nuc[first_nuc.index(prefix)]
+        nuc_seq += overlap_nuc
+        prefix = overlap_nuc
+
+    return seq_prefix, nuc_seq
+
+def __main__():
+
+    infilename = sys.argv[1]
+    keep_prefix = sys.argv[2].lower()
+    outfilename = sys.argv[3]
+
+    outfile = open(outfilename,'w')
+
+    prefix = ''
+    color_seq = ''
+    for i, line in enumerate(file(infilename)):
+        line = line.rstrip('\r\n')
+
+        if not line: continue
+        if line.startswith("#"): continue
+    
+        if line.startswith(">"):
+            
+            if color_seq:
+                prefix, nuc_seq = color2base(color_seq)
+                
+                if keep_prefix == 'yes':
+                    nuc_seq = prefix + nuc_seq
+                
+                outfile.write(title+'\n')
+                outfile.write(nuc_seq+'\n')
+                
+            title = line
+            color_seq = ''
+        else:
+            color_seq += line
+            
+    if color_seq:
+        prefix, nuc_seq = color2base(color_seq)
+                
+        if keep_prefix == 'yes':
+            nuc_seq = prefix + nuc_seq
+
+        outfile.write(title+'\n')
+        outfile.write(nuc_seq+'\n')
+            
+    outfile.close()
+    
+if __name__=='__main__': __main__()