#!/usr/bin/env python
"""
Convert SOLiD color-space data to a nucleotide sequence.

example: T011213122200221123032111221021210131332222101
         TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT

Usage: convert_SOLiD_color2nuc.py <input> <keep_prefix: yes|no> <output>
"""

import os
import sys

# _NEXT_BASE[color][previous_base] is the base encoded by `color` when it
# follows `previous_base`.  Each row string is aligned with 'ACGT'.
_NEXT_BASE = dict(
    (color, dict(zip('ACGT', row)))
    for color, row in (
        ('0', 'ACGT'),
        ('1', 'CATG'),
        ('2', 'GTAC'),
        ('3', 'TGCA'),
    )
)


def stop_err(msg):
    """Write *msg* (plus a newline) to stderr and exit with status 1."""
    sys.stderr.write(msg)
    sys.stderr.write('\n')
    # A non-zero status lets callers detect the failure from the exit code,
    # not only from the presence of stderr output.
    sys.exit(1)


def color2base(color_seq):
    """Decode one color-space read into nucleotides.

    color_seq -- a leading nucleotide (A, C, G or T, any case) followed by
                 color digits 0-3.

    Returns a tuple ``(leading_nucleotide, decoded_sequence)``; the decoded
    sequence does not include the leading nucleotide.

    Terminates the program through stop_err() when the read is empty, the
    leading character is not a nucleotide, or a color is not one of 0-3.
    """
    if not color_seq:
        stop_err('The color-space sequence is empty. It must start with one of the four nucleotides: A, T, C, G.')

    seq_prefix = color_seq[0].upper()
    if seq_prefix not in _NEXT_BASE['0']:
        stop_err('The leading nucleotide is invalid. Must be one of the four nucleotides: A, T, C, G.\nThe file contains a %s' % seq_prefix)

    bases = []
    previous = seq_prefix
    for code in color_seq[1:]:
        if code not in _NEXT_BASE:
            stop_err('Expect digits (0, 1, 2, 3) in the color-coding data. File contains numbers other than the set.\nThe file contains a %s' % code)
        # Each color is relative to the base decoded just before it.
        previous = _NEXT_BASE[code][previous]
        bases.append(previous)

    return seq_prefix, ''.join(bases)


def _iter_records(lines):
    """Yield ``(title, color_seq)`` for every fasta-like record in *lines*.

    Blank lines and lines starting with '#' are ignored.  Sequence lines
    belonging to one title are concatenated.  A title that has no sequence
    lines yields nothing.
    """
    title = None
    chunks = []
    for line in lines:
        line = line.rstrip('\r\n')
        if not line or line.startswith('#'):
            continue
        if line.startswith('>'):
            if chunks:
                yield title, ''.join(chunks)
            title = line
            chunks = []
        else:
            if title is None:
                # Without this check the record would have no title to write.
                stop_err('Sequence data found before the first title line. Each sequence must be preceded by a title line starting with ">".')
            chunks.append(line)
    if chunks:
        yield title, ''.join(chunks)


def __main__():
    """Command-line entry point.

    Reads sys.argv[1] (input file), sys.argv[2] ('yes' to keep the leading
    nucleotide in the output, compared case-insensitively) and sys.argv[3]
    (output file), then writes each title line followed by its decoded
    nucleotide sequence.
    """
    if len(sys.argv) < 4:
        stop_err('Usage: %s <input> <keep_prefix: yes|no> <output>' % os.path.basename(sys.argv[0]))

    infilename = sys.argv[1]
    keep_prefix = sys.argv[2].lower() == 'yes'
    outfilename = sys.argv[3]

    # `with` closes both files even when a read fails validation part-way.
    with open(infilename) as infile, open(outfilename, 'w') as outfile:
        for title, color_seq in _iter_records(infile):
            prefix, nuc_seq = color2base(color_seq)
            if keep_prefix:
                nuc_seq = prefix + nuc_seq
            outfile.write(title + '\n')
            outfile.write(nuc_seq + '\n')


if __name__ == '__main__':
    __main__()
+ +----- + +**Example** + +- If the color space file looks like this:: + + >seq1 + A013 + >seq2 + T011213122200221123032111221021210131332222101 + +- If you would like to **keep** the leading nucleotide:: + + >seq1 + AACG + >seq2 + TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT + +- If you **do not want to keep** the leading nucleotide (the length of nucleotide sequence will be one less than the color-space sequence):: + + >seq1 + ACG + >seq2 + TGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT + +----- + +**ABI SOLiD Color Coding Alignment matrix** + + Each di-nucleotide is represented by a single digit: 0 to 3. The matrix is symmetric, thus the leading nucleotide is necessary to determine the sequence (otherwise there are four possibilities). + + + .. image:: dualcolorcode.png + + + + diff -r 000000000000 -r ab28e7de2db3 dualcolorcode.png Binary file dualcolorcode.png has changed diff -r 000000000000 -r ab28e7de2db3 test-data/convert_SOLiD_color2nuc_test1.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/convert_SOLiD_color2nuc_test1.out Mon May 19 12:33:14 2014 -0400 @@ -0,0 +1,22 @@ +>2_14_26_F3 +TGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT +>2_14_192_F3 +GTTTCAGACAAATGGGCCAGGGAGGCTCTCACATCAAGACAGAGC +>2_14_233_F3 +TGTTTGCGATGTGACTGATGAAGATGGAATACTCCACGACACTCG +>2_14_294_F3 +CATTGACGATTTTTTTCATCGACTCGACCGCCCCGCAAGCGGCGA +>2_14_463_F3 +GCTTAGGCCTTTCCTCCTTTAGTGCCTCTCTTCTAAAGACAAGAG +>2_14_578_F3 +GCAACGGCTACCAGGAGATCAGTGGTTGCCACCTCATGACAAGAG +>2_14_956_F3 +CAAGTAATTTCTGAAATCCCAACTTCATCAGAAACAGCTCCAGCA +>2_14_988_F3 +CTGAAGGCATGGGCACCTTATTTCGGATAATTGCCAACCCTACGC +>2_14_1028_F3 +GTCTAATTGTTGTTGTCCTTGCGTGTGCCTGTTTACAAGTGTAGT +>2_14_1035_F3 +TTAATACCGCCGATCACCTTGTTCAAAATCTTATTGGATGTTATC +>2_14_1157_F3 +ATTAAGTTCCTAGTCGCTCCTAAAAGCCAAGTTTTGCGTTGTCTT diff -r 000000000000 -r ab28e7de2db3 test-data/convert_SOLiD_color2nuc_test1.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/convert_SOLiD_color2nuc_test1.txt Mon May 19 12:33:14 2014 -0400 @@ 
-0,0 +1,22 @@ +>2_14_26_F3 +T011213122200221123032111221021210131332222101 +>2_14_192_F3 +T110021221100310030120022032222111321022112223 +>2_14_233_F3 +T011001332311121212312022310203312201132111223 +>2_14_294_F3 +T213012132300000021323212232103300033102330332 +>2_14_463_F3 +T132032030200202202003211302222202230022110222 +>2_14_578_F3 +T131013032310120222321211010130110221312110222 +>2_14_956_F3 +T210213030022120032001012021321220011232201231 +>2_14_988_F3 +T221202031310031102033002302330301301010023133 +>2_14_1028_F3 +T112230301101101120201331111302110031102111321 +>2_14_1035_F3 +T003033103303232110201102100032203301023110332 +>2_14_1157_F3 +T330302102023212332202300023010210001331011220 \ No newline at end of file