Mercurial > repos > devteam > convert_solid_color2nuc
diff convert_SOLiD_color2nuc.py @ 0:ab28e7de2db3 draft default tip
Imported from capsule None
author | devteam |
---|---|
date | Mon, 19 May 2014 12:33:14 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
# Recovered from changeset diff:
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/convert_SOLiD_color2nuc.py Mon May 19 12:33:14 2014 -0400
#   @@ -0,0 +1,89 @@
"""
Convert SOLiD color-space data to nucleotide sequence.

example: T011213122200221123032111221021210131332222101
         TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT

Command line: <input file> <keep prefix: yes|no> <output file>

The input is read as FASTA-like text: lines starting with '>' are titles,
lines starting with '#' and blank lines are skipped, and every other line is
color-space data belonging to the most recent title.
"""

import os
import sys

# Column of each nucleotide inside the rows of _CODE_MATRIX.
_BASE_INDEX = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

# _CODE_MATRIX[color][_BASE_INDEX[previous_base]] is the next base.
_CODE_MATRIX = {
    '0': 'ACGT',
    '1': 'CATG',
    '2': 'GTAC',
    '3': 'TGCA',
}


def stop_err(msg):
    """Write *msg* and a newline to stderr, then terminate the process.

    Exits with status 1 so that callers inspecting the exit code see the
    failure (a bare ``sys.exit()`` would report success).

    Raises:
        SystemExit: always.
    """
    sys.stderr.write(msg)
    sys.stderr.write('\n')
    sys.exit(1)


def color2base(color_seq):
    """Decode one color-space read into nucleotides.

    Args:
        color_seq: a string whose first character is the leading nucleotide
            (A, C, G or T, case-insensitive) and whose remaining characters
            are the color digits 0-3.

    Returns:
        A ``(seq_prefix, nuc_seq)`` tuple: the upper-cased leading nucleotide
        and the decoded sequence (which does not include that nucleotide).

    Raises:
        SystemExit: via ``stop_err`` when the read is empty, the leading
            character is not a nucleotide, or a color is not one of 0-3.
    """
    if not color_seq:
        # Without this guard an empty read fails with an opaque IndexError.
        stop_err('The color-space sequence is empty.')

    seq_prefix = color_seq[0].upper()
    if seq_prefix not in _BASE_INDEX:
        stop_err('The leading nucleotide is invalid. Must be one of the four nucleotides: A, T, C, G.\nThe file contains a %s' % seq_prefix)

    previous = seq_prefix
    decoded = []  # collected and joined once instead of repeated string +=
    for code in color_seq[1:]:
        row = _CODE_MATRIX.get(code)
        if row is None:
            stop_err('Expect digits (0, 1, 2, 3) in the color-coding data. File contains numbers other than the set.\nThe file contains a %s' % code)
        # Each color, combined with the previous base, selects the next base.
        previous = row[_BASE_INDEX[previous]]
        decoded.append(previous)

    return seq_prefix, ''.join(decoded)


def _write_record(outfile, title, color_seq, keep_prefix):
    """Decode *color_seq* and write the title line and sequence line."""
    prefix, nuc_seq = color2base(color_seq)
    if keep_prefix == 'yes':
        nuc_seq = prefix + nuc_seq
    outfile.write(title + '\n')
    outfile.write(nuc_seq + '\n')


def __main__():
    """Convert the file named in ``sys.argv[1]`` and write ``sys.argv[3]``.

    ``sys.argv[2]`` (case-insensitive) must be ``yes`` for the leading
    nucleotide to be prepended to every decoded sequence; any other value
    leaves it out. A title with no sequence data before the next title (or
    before the end of the file) produces no output record.

    Raises:
        SystemExit: via ``stop_err`` on missing arguments, on sequence data
            that precedes the first title line, or on invalid read content.
    """
    if len(sys.argv) < 4:
        stop_err('Usage: %s <input file> <keep prefix: yes|no> <output file>'
                 % os.path.basename(sys.argv[0]))

    infilename = sys.argv[1]
    keep_prefix = sys.argv[2].lower()
    outfilename = sys.argv[3]

    title = None
    chunks = []  # pieces of the current record's (possibly multi-line) read

    # The output is opened first, as before, so it is created/truncated even
    # when the input turns out to be unreadable.
    with open(outfilename, 'w') as outfile:
        with open(infilename) as infile:
            for line in infile:
                line = line.rstrip('\r\n')

                if not line or line.startswith('#'):
                    continue

                if line.startswith('>'):
                    if chunks:
                        _write_record(outfile, title, ''.join(chunks), keep_prefix)
                    title = line
                    chunks = []
                else:
                    if title is None:
                        stop_err('Sequence data was found before the first title line (a line starting with ">").')
                    chunks.append(line)

            # Flush the final record, which has no following title line.
            if chunks:
                _write_record(outfile, title, ''.join(chunks), keep_prefix)


if __name__ == '__main__':
    __main__()