Mercurial > repos > rnateam > graphclust_preprocessing
view splitStockholm.py @ 13:2a5defc09381 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/GraphClust commit 4406735e44aba20859c252be39f4e99df28c7a92
author | rnateam |
---|---|
date | Sat, 27 Oct 2018 13:27:57 -0400 |
parents | c0c9d19bc7b2 |
children |
line wrap: on
line source
#!/usr/bin/env python ######## # This script reads multiple alignments merged in single stockholm file # and splits the alignment blocks according to data.names table # The first sequence of each alignment file assumed to match to names table entries # Author: M. Miladi ######## import os import re import sys from Bio import AlignIO, SeqIO try: from StringIO import StringIO except ImportError: from io import StringIO stk_file = sys.argv[1] print ("Parsing and splitting stk file:{}".format(stk_file)) target_f = "alignment_data_split.stk" pattern = re.compile("^>.*$") toWriteID = "" count_for_id = 1 seq_counter = 0 new_id = "" seq_id = [] seq_string = [] orig_id = [] name_file = "FASTA/data.names" array_all_chunks = [] with open(name_file, 'r') as f: for line in f: if len(line.strip()) == 0: continue seq_id.append(int(line.split()[0])) seq_string.append(line.split()[1]) orig_id_srt = line.split()[3] orig_id_srt = orig_id_srt.rsplit('_',1)[0] orig_id.append(orig_id_srt) with open(stk_file) as stk_in: alignments = AlignIO.parse(stk_in, "stockholm")#, alphabet=IUPAC.ambiguous_rna) alignments_dic = {(a[0].id):a for a in alignments} regx_gaps = '[-.~_]' # valid gap symbols all be converted to "-" str_gaps = '-.~_' # valid gap symbols all be converted to "-" chunks = [] with open(target_f, 'w') as out_stk_handle: for i in range(len(orig_id)): #---------------------- # We need to map ungapped positions of the chunks to gapped positions of first sequence gap_count = 0 ungap_ind = 0 dic_gap_counts = dict() cur_alignment = alignments_dic[orig_id[i]] for c in cur_alignment[0].seq: #print ungap_ind if c in str_gaps: gap_count += 1 else: dic_gap_counts[ungap_ind] = gap_count ungap_ind += 1 ID = str(seq_id[i]) + " " + seq_string[i] chunks = re.findall(r'\d+', seq_string[i]) print (ID,chunks) index_start, index_end =int(chunks[1])-1, int(chunks[2])-1 subalign = cur_alignment[:, index_start + dic_gap_counts[index_start]: index_end+dic_gap_counts[index_end]+1] #---------------------- # BioPython does not handel the GF ID entry for alignment # So we add entry in the second line manually siotmp = StringIO() AlignIO.write(subalign, siotmp, format="stockholm") stk_lines = siotmp.getvalue().split('\n') out_stk_handle.write('{}\n'.format(stk_lines[0])) out_stk_handle.write('#=GF ID {}\n'.format(ID)) out_stk_handle.writelines('\n'.join(stk_lines[1:])) #print out_stk_handle.getvalue()