Mercurial > repos > bebatut > format_cd_hit_output
comparison format_cd_hit_output.py @ 0:4015e9d6d277 draft
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
| author | bebatut |
|---|---|
| date | Tue, 26 Apr 2016 08:55:33 -0400 |
| parents | |
| children | 64da677bcee2 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4015e9d6d277 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # -*- coding: utf-8 -*- | |
| 3 | |
| 4 import sys | |
| 5 import os | |
| 6 import argparse | |
| 7 import copy | |
| 8 import operator | |
| 9 from sets import Set | |
| 10 | |
| 11 def extract_mapping_info(input_mapping_filepath): | |
| 12 mapping_info = {} | |
| 13 categories = Set([]) | |
| 14 | |
| 15 with open(input_mapping_filepath,'r') as mapping_file: | |
| 16 for line in mapping_file.readlines(): | |
| 17 split_line = line[:-1].split('\t') | |
| 18 mapping_info.setdefault(split_line[0],split_line[1]) | |
| 19 categories.add(split_line[1]) | |
| 20 | |
| 21 return mapping_info, categories | |
| 22 | |
| 23 def init_category_distribution(categories = None): | |
| 24 cluster_category_distribution = {} | |
| 25 if categories != None: | |
| 26 for category in categories: | |
| 27 cluster_category_distribution[category] = 0 | |
| 28 return cluster_category_distribution | |
| 29 | |
| 30 def flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, | |
| 31 cluster_category_distribution, categories, output_category_distribution_file, | |
| 32 cluster_seq_number): | |
| 33 if cluster_name != '': | |
| 34 if categories != None: | |
| 35 output_category_distribution_file.write(cluster_name) | |
| 36 output_category_distribution_file.write('\t' + str(cluster_seq_number)) | |
| 37 for category in categories: | |
| 38 output_category_distribution_file.write('\t') | |
| 39 output_category_distribution_file.write(str(cluster_category_distribution[category])) | |
| 40 output_category_distribution_file.write('\n') | |
| 41 | |
| 42 if cluster_ref_seq == '': | |
| 43 string = "No reference sequence found for " | |
| 44 string += cluster_name | |
| 45 raise ValueError(string) | |
| 46 | |
| 47 ref_seq_cluster.setdefault(cluster_ref_seq, cluster_name) | |
| 48 | |
| 49 def extract_cluster_info(args, mapping_info = None, categories = None): | |
| 50 ref_seq_cluster = {} | |
| 51 | |
| 52 if args.output_category_distribution != None: | |
| 53 if mapping_info == None or categories == None: | |
| 54 string = "A file with category distribution is expected but " | |
| 55 string += "no mapping information are available" | |
| 56 raise ValueError(string) | |
| 57 output_cat_distri_file = open(args.output_category_distribution, 'w') | |
| 58 output_cat_distri_file.write('Cluster\tSequence_number') | |
| 59 for category in categories: | |
| 60 output_cat_distri_file.write('\t' + category) | |
| 61 | |
| 62 output_cat_distri_file.write('\n') | |
| 63 else: | |
| 64 output_cat_distri_file = None | |
| 65 | |
| 66 with open(args.input_cluster_info,'r') as cluster_info_file: | |
| 67 cluster_name = '' | |
| 68 cluster_category_distribution = init_category_distribution(categories) | |
| 69 cluster_ref_seq = '' | |
| 70 cluster_seq_number = 0 | |
| 71 for line in cluster_info_file.readlines(): | |
| 72 if line[0] == '>': | |
| 73 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, | |
| 74 cluster_category_distribution, categories, | |
| 75 output_cat_distri_file, cluster_seq_number) | |
| 76 cluster_name = line[1:-1] | |
| 77 cluster_name = cluster_name.replace(' ','_') | |
| 78 cluster_category_distribution = init_category_distribution(categories) | |
| 79 cluster_ref_seq = '' | |
| 80 cluster_seq_number = 0 | |
| 81 else: | |
| 82 seq_info = line[:-1].split('\t')[1].split(' ') | |
| 83 seq_name = seq_info[1][1:-3] | |
| 84 cluster_seq_number += 1 | |
| 85 | |
| 86 if categories != None: | |
| 87 seq_count = 1 | |
| 88 if args.number_sum != None: | |
| 89 if seq_name.find('size') != -1: | |
| 90 substring = seq_name[seq_name.find('size'):-1] | |
| 91 seq_count = int(substring.split('=')[1]) | |
| 92 if not mapping_info.has_key(seq_name): | |
| 93 string = seq_name + " not found in mapping" | |
| 94 raise ValueError(string) | |
| 95 category = mapping_info[seq_name] | |
| 96 cluster_category_distribution[category] += seq_count | |
| 97 | |
| 98 if seq_info[-1] == '*': | |
| 99 if cluster_ref_seq != '': | |
| 100 string = "A reference sequence (" + cluster_ref_seq | |
| 101 string += ") already found for cluster " + cluster_name | |
| 102 string += " (" + seq_name + ")" | |
| 103 raise ValueError(string) | |
| 104 cluster_ref_seq = seq_name | |
| 105 | |
| 106 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, | |
| 107 cluster_category_distribution, categories, output_cat_distri_file, | |
| 108 cluster_seq_number) | |
| 109 | |
| 110 if args.output_category_distribution != None: | |
| 111 output_cat_distri_file.close() | |
| 112 | |
| 113 return ref_seq_cluster | |
| 114 | |
| 115 def rename_representative_sequences(args, ref_seq_cluster): | |
| 116 with open(args.input_representative_sequences,'r') as input_sequences: | |
| 117 with open(args.output_representative_sequences,'w') as output_sequences: | |
| 118 for line in input_sequences.readlines(): | |
| 119 if line[0] == '>': | |
| 120 seq_name = line[1:-1] | |
| 121 if not ref_seq_cluster.has_key(seq_name): | |
| 122 string = seq_name + " not found as reference sequence" | |
| 123 raise ValueError(string) | |
| 124 output_sequences.write('>' + ref_seq_cluster[seq_name] + '\n') | |
| 125 else: | |
| 126 output_sequences.write(line) | |
| 127 | |
| 128 def format_cd_hit_outputs(args): | |
| 129 if args.input_mapping != None: | |
| 130 mapping_info, categories = extract_mapping_info(args.input_mapping) | |
| 131 else: | |
| 132 mapping_info = None | |
| 133 categories = None | |
| 134 | |
| 135 ref_seq_cluster = extract_cluster_info(args, mapping_info, categories) | |
| 136 | |
| 137 if args.input_representative_sequences != None: | |
| 138 rename_representative_sequences(args, ref_seq_cluster) | |
| 139 | |
| 140 if __name__ == "__main__": | |
| 141 parser = argparse.ArgumentParser() | |
| 142 parser.add_argument('--input_cluster_info', required=True) | |
| 143 parser.add_argument('--input_representative_sequences') | |
| 144 parser.add_argument('--output_representative_sequences') | |
| 145 parser.add_argument('--input_mapping') | |
| 146 parser.add_argument('--output_category_distribution') | |
| 147 parser.add_argument('--number_sum') | |
| 148 args = parser.parse_args() | |
| 149 | |
| 150 format_cd_hit_outputs(args) |
