Mercurial > repos > bebatut > format_cd_hit_output
comparison format_cd_hit_output.py @ 1:64da677bcee2 draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/format_cd_hit_output/ commit eea46077010e699403ce6995d7d4aac77b2e0b43"
author | bgruening |
---|---|
date | Wed, 19 Oct 2022 14:42:33 +0000 |
parents | 4015e9d6d277 |
children |
comparison
equal
deleted
inserted
replaced
0:4015e9d6d277 | 1:64da677bcee2 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # -*- coding: utf-8 -*- | 2 # -*- coding: utf-8 -*- |
3 | 3 |
4 import sys | |
5 import os | |
6 import argparse | 4 import argparse |
7 import copy | 5 |
8 import operator | |
9 from sets import Set | |
10 | 6 |
11 def extract_mapping_info(input_mapping_filepath): | 7 def extract_mapping_info(input_mapping_filepath): |
12 mapping_info = {} | 8 mapping_info = {} |
13 categories = Set([]) | 9 categories = set([]) |
14 | 10 |
15 with open(input_mapping_filepath,'r') as mapping_file: | 11 with open(input_mapping_filepath, 'r') as mapping_file: |
16 for line in mapping_file.readlines(): | 12 for line in mapping_file.readlines(): |
17 split_line = line[:-1].split('\t') | 13 split_line = line[:-1].split('\t') |
18 mapping_info.setdefault(split_line[0],split_line[1]) | 14 mapping_info.setdefault(split_line[0], split_line[1]) |
19 categories.add(split_line[1]) | 15 categories.add(split_line[1]) |
20 | 16 |
21 return mapping_info, categories | 17 return mapping_info, categories |
22 | 18 |
23 def init_category_distribution(categories = None): | 19 |
24 cluster_category_distribution = {} | 20 def init_category_distribution(categories=None): |
25 if categories != None: | 21 cluster_categ_distri = {} |
22 if categories is not None: | |
26 for category in categories: | 23 for category in categories: |
27 cluster_category_distribution[category] = 0 | 24 cluster_categ_distri[category] = 0 |
28 return cluster_category_distribution | 25 return cluster_categ_distri |
29 | 26 |
30 def flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, | 27 |
31 cluster_category_distribution, categories, output_category_distribution_file, | 28 def flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, |
32 cluster_seq_number): | 29 cluster_categ_distri, categories, |
30 output_category_distribution_file, cluster_seq_number): | |
33 if cluster_name != '': | 31 if cluster_name != '': |
34 if categories != None: | 32 if categories is not None: |
35 output_category_distribution_file.write(cluster_name) | 33 string = cluster_name |
36 output_category_distribution_file.write('\t' + str(cluster_seq_number)) | 34 string += '\t' + str(cluster_seq_number) |
37 for category in categories: | 35 for category in categories: |
38 output_category_distribution_file.write('\t') | 36 string += '\t' |
39 output_category_distribution_file.write(str(cluster_category_distribution[category])) | 37 string += str(cluster_categ_distri[category]) |
40 output_category_distribution_file.write('\n') | 38 string += '\n' |
39 output_category_distribution_file.write(string) | |
41 | 40 |
42 if cluster_ref_seq == '': | 41 if cluster_ref_seq == '': |
43 string = "No reference sequence found for " | 42 string = "No reference sequence found for " |
44 string += cluster_name | 43 string += cluster_name |
45 raise ValueError(string) | 44 raise ValueError(string) |
46 | 45 |
47 ref_seq_cluster.setdefault(cluster_ref_seq, cluster_name) | 46 ref_seq_cluster.setdefault(cluster_ref_seq, cluster_name) |
48 | 47 |
49 def extract_cluster_info(args, mapping_info = None, categories = None): | 48 |
49 def extract_cluster_info(args, mapping_info=None, categories=None): | |
50 ref_seq_cluster = {} | 50 ref_seq_cluster = {} |
51 | 51 |
52 if args.output_category_distribution != None: | 52 if args.output_category_distribution is not None: |
53 if mapping_info == None or categories == None: | 53 if mapping_info is None or categories is None: |
54 string = "A file with category distribution is expected but " | 54 string = "A file with category distribution is expected but " |
55 string += "no mapping information are available" | 55 string += "no mapping information are available" |
56 raise ValueError(string) | 56 raise ValueError(string) |
57 output_cat_distri_file = open(args.output_category_distribution, 'w') | 57 output_cat_distri_file = open(args.output_category_distribution, 'w') |
58 output_cat_distri_file.write('Cluster\tSequence_number') | 58 output_cat_distri_file.write('Cluster\tSequence_number') |
61 | 61 |
62 output_cat_distri_file.write('\n') | 62 output_cat_distri_file.write('\n') |
63 else: | 63 else: |
64 output_cat_distri_file = None | 64 output_cat_distri_file = None |
65 | 65 |
66 with open(args.input_cluster_info,'r') as cluster_info_file: | 66 with open(args.input_cluster_info, 'r') as cluster_info_file: |
67 cluster_name = '' | 67 cluster_name = '' |
68 cluster_category_distribution = init_category_distribution(categories) | 68 cluster_categ_distri = init_category_distribution(categories) |
69 cluster_ref_seq = '' | 69 cluster_ref_seq = '' |
70 cluster_seq_number = 0 | 70 cluster_seq_number = 0 |
71 for line in cluster_info_file.readlines(): | 71 for line in cluster_info_file.readlines(): |
72 if line[0] == '>': | 72 if line[0] == '>': |
73 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, | 73 flush_cluster_info( |
74 cluster_category_distribution, categories, | 74 cluster_name, |
75 output_cat_distri_file, cluster_seq_number) | 75 cluster_ref_seq, |
76 ref_seq_cluster, | |
77 cluster_categ_distri, | |
78 categories, | |
79 output_cat_distri_file, | |
80 cluster_seq_number) | |
76 cluster_name = line[1:-1] | 81 cluster_name = line[1:-1] |
77 cluster_name = cluster_name.replace(' ','_') | 82 cluster_name = cluster_name.replace(' ', '_') |
78 cluster_category_distribution = init_category_distribution(categories) | 83 cluster_categ_distri = init_category_distribution(categories) |
79 cluster_ref_seq = '' | 84 cluster_ref_seq = '' |
80 cluster_seq_number = 0 | 85 cluster_seq_number = 0 |
81 else: | 86 else: |
82 seq_info = line[:-1].split('\t')[1].split(' ') | 87 seq_info = line[:-1].split('\t')[1].split(' ') |
83 seq_name = seq_info[1][1:-3] | 88 seq_name = seq_info[1][1:-3] |
84 cluster_seq_number += 1 | 89 cluster_seq_number += 1 |
85 | 90 |
86 if categories != None: | 91 if categories is not None: |
87 seq_count = 1 | 92 seq_count = 1 |
88 if args.number_sum != None: | 93 if args.number_sum is not None: |
89 if seq_name.find('size') != -1: | 94 if seq_name.find('size') != -1: |
90 substring = seq_name[seq_name.find('size'):-1] | 95 substring = seq_name[seq_name.find('size'):-1] |
91 seq_count = int(substring.split('=')[1]) | 96 seq_count = int(substring.split('=')[1]) |
92 if not mapping_info.has_key(seq_name): | 97 if seq_name not in mapping_info: |
93 string = seq_name + " not found in mapping" | 98 string = seq_name + " not found in mapping" |
94 raise ValueError(string) | 99 raise ValueError(string) |
95 category = mapping_info[seq_name] | 100 category = mapping_info[seq_name] |
96 cluster_category_distribution[category] += seq_count | 101 cluster_categ_distri[category] += seq_count |
97 | 102 |
98 if seq_info[-1] == '*': | 103 if seq_info[-1] == '*': |
99 if cluster_ref_seq != '': | 104 if cluster_ref_seq != '': |
100 string = "A reference sequence (" + cluster_ref_seq | 105 string = "A reference sequence (" + cluster_ref_seq |
101 string += ") already found for cluster " + cluster_name | 106 string += ") already found for cluster " + cluster_name |
102 string += " (" + seq_name + ")" | 107 string += " (" + seq_name + ")" |
103 raise ValueError(string) | 108 raise ValueError(string) |
104 cluster_ref_seq = seq_name | 109 cluster_ref_seq = seq_name |
105 | 110 |
106 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, | 111 flush_cluster_info( |
107 cluster_category_distribution, categories, output_cat_distri_file, | 112 cluster_name, |
113 cluster_ref_seq, | |
114 ref_seq_cluster, | |
115 cluster_categ_distri, | |
116 categories, | |
117 output_cat_distri_file, | |
108 cluster_seq_number) | 118 cluster_seq_number) |
109 | 119 |
110 if args.output_category_distribution != None: | 120 if args.output_category_distribution is not None: |
111 output_cat_distri_file.close() | 121 output_cat_distri_file.close() |
112 | 122 |
113 return ref_seq_cluster | 123 return ref_seq_cluster |
114 | 124 |
125 | |
115 def rename_representative_sequences(args, ref_seq_cluster): | 126 def rename_representative_sequences(args, ref_seq_cluster): |
116 with open(args.input_representative_sequences,'r') as input_sequences: | 127 with open(args.input_representative_sequences, 'r') as input_sequences: |
117 with open(args.output_representative_sequences,'w') as output_sequences: | 128 with open(args.output_representative_sequences, 'w') as output_seq: |
118 for line in input_sequences.readlines(): | 129 for line in input_sequences.readlines(): |
119 if line[0] == '>': | 130 if line[0] == '>': |
120 seq_name = line[1:-1] | 131 seq_name = line[1:-1] |
121 if not ref_seq_cluster.has_key(seq_name): | 132 if seq_name not in ref_seq_cluster: |
122 string = seq_name + " not found as reference sequence" | 133 string = seq_name + " not found as reference sequence" |
123 raise ValueError(string) | 134 raise ValueError(string) |
124 output_sequences.write('>' + ref_seq_cluster[seq_name] + '\n') | 135 string = '>' + ref_seq_cluster[seq_name] + '\n' |
136 output_seq.write(string) | |
125 else: | 137 else: |
126 output_sequences.write(line) | 138 output_seq.write(line) |
139 | |
127 | 140 |
128 def format_cd_hit_outputs(args): | 141 def format_cd_hit_outputs(args): |
129 if args.input_mapping != None: | 142 if args.input_mapping is not None: |
130 mapping_info, categories = extract_mapping_info(args.input_mapping) | 143 mapping_info, categories = extract_mapping_info(args.input_mapping) |
131 else: | 144 else: |
132 mapping_info = None | 145 mapping_info = None |
133 categories = None | 146 categories = None |
134 | 147 |
135 ref_seq_cluster = extract_cluster_info(args, mapping_info, categories) | 148 ref_seq_cluster = extract_cluster_info(args, mapping_info, categories) |
136 | 149 |
137 if args.input_representative_sequences != None: | 150 if args.input_representative_sequences is not None: |
138 rename_representative_sequences(args, ref_seq_cluster) | 151 rename_representative_sequences(args, ref_seq_cluster) |
152 | |
139 | 153 |
140 if __name__ == "__main__": | 154 if __name__ == "__main__": |
141 parser = argparse.ArgumentParser() | 155 parser = argparse.ArgumentParser() |
142 parser.add_argument('--input_cluster_info', required=True) | 156 parser.add_argument('--input_cluster_info', required=True) |
143 parser.add_argument('--input_representative_sequences') | 157 parser.add_argument('--input_representative_sequences') |