changeset 0:ab28e7de2db3 draft default tip

Imported from capsule None
author devteam
date Mon, 19 May 2014 12:33:14 -0400
parents
children
files convert_SOLiD_color2nuc.py convert_SOLiD_color2nuc.xml dualcolorcode.png test-data/convert_SOLiD_color2nuc_test1.out test-data/convert_SOLiD_color2nuc_test1.txt
diffstat 5 files changed, 205 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_SOLiD_color2nuc.py	Mon May 19 12:33:14 2014 -0400
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+"""
+convert SOLiD color-space data to nucleotide sequence
+example: T011213122200221123032111221021210131332222101
+         TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT
+"""
+
+import sys, os
+
+def stop_err(msg):
+    
+    sys.stderr.write(msg)
+    sys.stderr.write('\n')
+    sys.exit()
+    
+def color2base(color_seq):
+
+    first_nuc = ['A','C','G','T']
+    code_matrix = {}
+    code_matrix['0'] = ['A','C','G','T']
+    code_matrix['1'] = ['C','A','T','G']
+    code_matrix['2'] = ['G','T','A','C']
+    code_matrix['3'] = ['T','G','C','A']
+
+    overlap_nuc = ''
+    nuc_seq = ''
+    
+    seq_prefix = prefix = color_seq[0].upper()
+    color_seq = color_seq[1:]
+                
+    if not (seq_prefix in first_nuc):
+        stop_err('The leading nucleotide is invalid. Must be one of the four nucleotides: A, T, C, G.\nThe file contains a %s' %seq_prefix )
+
+    for code in color_seq:
+        
+        if not (code in ['0','1','2','3']):
+            stop_err('Expect digits (0, 1, 2, 3) in the color-cading data. File contains numbers other than the set.\nThe file contains a %s' %code)
+        
+        second_nuc = code_matrix[code]
+        overlap_nuc = second_nuc[first_nuc.index(prefix)]
+        nuc_seq += overlap_nuc
+        prefix = overlap_nuc
+
+    return seq_prefix, nuc_seq
+
+def __main__():
+
+    infilename = sys.argv[1]
+    keep_prefix = sys.argv[2].lower()
+    outfilename = sys.argv[3]
+
+    outfile = open(outfilename,'w')
+
+    prefix = ''
+    color_seq = ''
+    for i, line in enumerate(file(infilename)):
+        line = line.rstrip('\r\n')
+
+        if not line: continue
+        if line.startswith("#"): continue
+    
+        if line.startswith(">"):
+            
+            if color_seq:
+                prefix, nuc_seq = color2base(color_seq)
+                
+                if keep_prefix == 'yes':
+                    nuc_seq = prefix + nuc_seq
+                
+                outfile.write(title+'\n')
+                outfile.write(nuc_seq+'\n')
+                
+            title = line
+            color_seq = ''
+        else:
+            color_seq += line
+            
+    if color_seq:
+        prefix, nuc_seq = color2base(color_seq)
+                
+        if keep_prefix == 'yes':
+            nuc_seq = prefix + nuc_seq
+
+        outfile.write(title+'\n')
+        outfile.write(nuc_seq+'\n')
+            
+    outfile.close()
+    
+if __name__=='__main__': __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_SOLiD_color2nuc.xml	Mon May 19 12:33:14 2014 -0400
@@ -0,0 +1,72 @@
+<tool id="color2nuc" name="Convert Color Space" version="1.0.0">
+<description> to Nucleotides </description>
+<command interpreter="python">convert_SOLiD_color2nuc.py $input1 $input2 $output1 </command>
+
+<inputs>
+    <param name="input1" type="data" format="txt" label="SOLiD color coding file" />
+    <param name="input2" type="select" label="Keep prefix nucleotide">
+    	<option value="yes">Yes</option>
+    	<option value="no">No</option>
+    </param>
+</inputs>
+<outputs>
+  	<data name="output1" format="fasta" />
+</outputs>
+<!-- 
+<tests>
+	<test>
+		<param name="input1" value="convert_SOLiD_color2nuc_test1.txt" ftype="txt" />
+		<param name="input2" value="no" />
+		<output name="output1" file="convert_SOLiD_color2nuc_test1.out" />
+	</test>
+</tests>
+-->
+<help>
+
+.. class:: warningmark
+
+The tool was designed for color space files generated from an ABI SOLiD sequencer. The file format must be fasta-like: the title starts with a ">" character, and each color space sequence starts with a leading nucleotide.
+ 
+-----
+
+**What it does**
+
+This tool converts a color space sequence to nucleotides. The leading character must be a nucleotide: A, C, G, or T. 
+ 
+-----
+
+**Example**
+
+- If the color space file looks like this::
+
+	&gt;seq1
+	A013
+	&gt;seq2
+	T011213122200221123032111221021210131332222101
+	
+- If you would like to **keep** the leading nucleotide::
+
+	&gt;seq1
+	AACG
+	&gt;seq2
+	TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT
+	
+- If you **do not want to keep** the leading nucleotide (the nucleotide sequence will be one character shorter than the color-space sequence)::
+ 
+ 	&gt;seq1
+ 	ACG
+ 	&gt;seq2
+	TGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT 
+
+-----
+
+**ABI SOLiD Color Coding Alignment matrix**
+
+ Each di-nucleotide is represented by a single digit, 0 to 3. Because the matrix is symmetric, the leading nucleotide is required to determine the sequence (otherwise there are four possible sequences).
+
+ 
+ .. image:: dualcolorcode.png
+
+
+</help>
+</tool>
Binary file dualcolorcode.png has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/convert_SOLiD_color2nuc_test1.out	Mon May 19 12:33:14 2014 -0400
@@ -0,0 +1,22 @@
+>2_14_26_F3
+TGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT
+>2_14_192_F3
+GTTTCAGACAAATGGGCCAGGGAGGCTCTCACATCAAGACAGAGC
+>2_14_233_F3
+TGTTTGCGATGTGACTGATGAAGATGGAATACTCCACGACACTCG
+>2_14_294_F3
+CATTGACGATTTTTTTCATCGACTCGACCGCCCCGCAAGCGGCGA
+>2_14_463_F3
+GCTTAGGCCTTTCCTCCTTTAGTGCCTCTCTTCTAAAGACAAGAG
+>2_14_578_F3
+GCAACGGCTACCAGGAGATCAGTGGTTGCCACCTCATGACAAGAG
+>2_14_956_F3
+CAAGTAATTTCTGAAATCCCAACTTCATCAGAAACAGCTCCAGCA
+>2_14_988_F3
+CTGAAGGCATGGGCACCTTATTTCGGATAATTGCCAACCCTACGC
+>2_14_1028_F3
+GTCTAATTGTTGTTGTCCTTGCGTGTGCCTGTTTACAAGTGTAGT
+>2_14_1035_F3
+TTAATACCGCCGATCACCTTGTTCAAAATCTTATTGGATGTTATC
+>2_14_1157_F3
+ATTAAGTTCCTAGTCGCTCCTAAAAGCCAAGTTTTGCGTTGTCTT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/convert_SOLiD_color2nuc_test1.txt	Mon May 19 12:33:14 2014 -0400
@@ -0,0 +1,22 @@
+>2_14_26_F3
+T011213122200221123032111221021210131332222101
+>2_14_192_F3
+T110021221100310030120022032222111321022112223
+>2_14_233_F3
+T011001332311121212312022310203312201132111223
+>2_14_294_F3
+T213012132300000021323212232103300033102330332
+>2_14_463_F3
+T132032030200202202003211302222202230022110222
+>2_14_578_F3
+T131013032310120222321211010130110221312110222
+>2_14_956_F3
+T210213030022120032001012021321220011232201231
+>2_14_988_F3
+T221202031310031102033002302330301301010023133
+>2_14_1028_F3
+T112230301101101120201331111302110031102111321
+>2_14_1035_F3
+T003033103303232110201102100032203301023110332
+>2_14_1157_F3
+T330302102023212332202300023010210001331011220
\ No newline at end of file