Mercurial > repos > bebatut > format_cd_hit_output
diff format_cd_hit_output.py @ 0:4015e9d6d277 draft
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
author | bebatut |
---|---|
date | Tue, 26 Apr 2016 08:55:33 -0400 |
parents | |
children | 64da677bcee2 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/format_cd_hit_output.py Tue Apr 26 08:55:33 2016 -0400 @@ -0,0 +1,150 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import os +import argparse +import copy +import operator +from sets import Set + +def extract_mapping_info(input_mapping_filepath): + mapping_info = {} + categories = Set([]) + + with open(input_mapping_filepath,'r') as mapping_file: + for line in mapping_file.readlines(): + split_line = line[:-1].split('\t') + mapping_info.setdefault(split_line[0],split_line[1]) + categories.add(split_line[1]) + + return mapping_info, categories + +def init_category_distribution(categories = None): + cluster_category_distribution = {} + if categories != None: + for category in categories: + cluster_category_distribution[category] = 0 + return cluster_category_distribution + +def flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, + cluster_category_distribution, categories, output_category_distribution_file, + cluster_seq_number): + if cluster_name != '': + if categories != None: + output_category_distribution_file.write(cluster_name) + output_category_distribution_file.write('\t' + str(cluster_seq_number)) + for category in categories: + output_category_distribution_file.write('\t') + output_category_distribution_file.write(str(cluster_category_distribution[category])) + output_category_distribution_file.write('\n') + + if cluster_ref_seq == '': + string = "No reference sequence found for " + string += cluster_name + raise ValueError(string) + + ref_seq_cluster.setdefault(cluster_ref_seq, cluster_name) + +def extract_cluster_info(args, mapping_info = None, categories = None): + ref_seq_cluster = {} + + if args.output_category_distribution != None: + if mapping_info == None or categories == None: + string = "A file with category distribution is expected but " + string += "no mapping information are available" + raise ValueError(string) + output_cat_distri_file = open(args.output_category_distribution, 'w') + output_cat_distri_file.write('Cluster\tSequence_number') + for category in categories: + output_cat_distri_file.write('\t' + category) + + output_cat_distri_file.write('\n') + else: + output_cat_distri_file = None + + with open(args.input_cluster_info,'r') as cluster_info_file: + cluster_name = '' + cluster_category_distribution = init_category_distribution(categories) + cluster_ref_seq = '' + cluster_seq_number = 0 + for line in cluster_info_file.readlines(): + if line[0] == '>': + flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, + cluster_category_distribution, categories, + output_cat_distri_file, cluster_seq_number) + cluster_name = line[1:-1] + cluster_name = cluster_name.replace(' ','_') + cluster_category_distribution = init_category_distribution(categories) + cluster_ref_seq = '' + cluster_seq_number = 0 + else: + seq_info = line[:-1].split('\t')[1].split(' ') + seq_name = seq_info[1][1:-3] + cluster_seq_number += 1 + + if categories != None: + seq_count = 1 + if args.number_sum != None: + if seq_name.find('size') != -1: + substring = seq_name[seq_name.find('size'):-1] + seq_count = int(substring.split('=')[1]) + if not mapping_info.has_key(seq_name): + string = seq_name + " not found in mapping" + raise ValueError(string) + category = mapping_info[seq_name] + cluster_category_distribution[category] += seq_count + + if seq_info[-1] == '*': + if cluster_ref_seq != '': + string = "A reference sequence (" + cluster_ref_seq + string += ") already found for cluster " + cluster_name + string += " (" + seq_name + ")" + raise ValueError(string) + cluster_ref_seq = seq_name + + flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, + cluster_category_distribution, categories, output_cat_distri_file, + cluster_seq_number) + + if args.output_category_distribution != None: + output_cat_distri_file.close() + + return ref_seq_cluster + +def rename_representative_sequences(args, ref_seq_cluster): + with open(args.input_representative_sequences,'r') as input_sequences: + with open(args.output_representative_sequences,'w') as output_sequences: + for line in input_sequences.readlines(): + if line[0] == '>': + seq_name = line[1:-1] + if not ref_seq_cluster.has_key(seq_name): + string = seq_name + " not found as reference sequence" + raise ValueError(string) + output_sequences.write('>' + ref_seq_cluster[seq_name] + '\n') + else: + output_sequences.write(line) + +def format_cd_hit_outputs(args): + if args.input_mapping != None: + mapping_info, categories = extract_mapping_info(args.input_mapping) + else: + mapping_info = None + categories = None + + ref_seq_cluster = extract_cluster_info(args, mapping_info, categories) + + if args.input_representative_sequences != None: + rename_representative_sequences(args, ref_seq_cluster) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--input_cluster_info', required=True) + parser.add_argument('--input_representative_sequences') + parser.add_argument('--output_representative_sequences') + parser.add_argument('--input_mapping') + parser.add_argument('--output_category_distribution') + parser.add_argument('--number_sum') + args = parser.parse_args() + + format_cd_hit_outputs(args) \ No newline at end of file