comparison compare_humann2_output.py @ 2:05766022dfc4 draft

"planemo upload for repository https://github.com/asaim/galaxytools/tree/master/tools/compare_humann2_output commit dc55dc3b5275d1d6aac390698c0c6e0ab8fbf2f7"
author bebatut
date Mon, 14 Sep 2020 13:50:30 +0000
parents 9959fa526f1a
children eaa95ea1195c
comparison
equal deleted inserted replaced
1:c1aca37cb1fc 2:05766022dfc4
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*- 2 # -*- coding: utf-8 -*-
3 3
4 import sys
5 import os
6 import argparse 4 import argparse
7 import re
8 5
def extract_abundances(fp, nb_charact_to_extract):
    """Read one abundance table and pick its most abundant characteristics.

    The file is tab-separated. Its first line is treated as a header and
    skipped; every following non-empty line holds a characteristic
    identifier in the first column and a numeric abundance in the second.

    Args:
        fp: path of the abundance file to read.
        nb_charact_to_extract: maximum number of most abundant
            characteristics to report (values below 1 yield an empty list).

    Returns:
        A tuple ``(abundances, more_abund_charact)``. ``abundances`` maps
        each identifier to 100 times the value read from the file.
        ``more_abund_charact`` lists at most ``nb_charact_to_extract``
        identifiers by decreasing abundance; equal abundances keep the
        iteration order of ``abundances``.

    Raises:
        IndexError: if a non-empty data line has fewer than two columns.
        ValueError: if the second column cannot be parsed as a float.
    """
    abundances = {}
    with open(fp, 'r') as abundance_f:
        next(abundance_f, None)  # skip the header line (file may be empty)
        for line in abundance_f:
            # Strip only the line terminator: slicing off the last character
            # would truncate the value when the final line has no newline.
            line = line.rstrip('\r\n')
            if not line:
                continue
            split_line = line.split('\t')
            abundances[split_line[0]] = 100 * float(split_line[1])

    # Rank on the stored (scaled) values so every comparison uses the same
    # scale; sorted() is stable, so ties keep their original order.
    ranked = sorted(abundances, key=abundances.get, reverse=True)
    return abundances, ranked[:max(nb_charact_to_extract, 0)]
36 34
35
def format_characteristic_name(all_name):
    """Split a full characteristic label into an identifier and a clean name.

    The label is cut at its first ``:``. The part before it is the
    identifier; the part after it (minus one leading space, if present) is
    the name. When there is no ``:``, the whole label is the identifier and
    the name is empty.

    The name is then cleaned: ``/`` and ``-`` become spaces, apostrophes are
    removed, and the first ``(...)`` group is dropped (surrounding spaces are
    left as they are).

    Args:
        all_name: full label, e.g. ``"ID: descriptive name"``.

    Returns:
        A tuple ``(charact_id, char_name)`` of strings.
    """
    # partition() keeps everything after the first colon, so names that
    # themselves contain ':' are not truncated.
    charact_id, separator, char_name = all_name.partition(':')
    if separator and char_name.startswith(' '):
        char_name = char_name[1:]

    for old, new in (('/', ' '), ('-', ' '), ("'", '')):
        char_name = char_name.replace(old, new)

    open_bracket = char_name.find('(')
    if open_bracket != -1:
        # Look for the closing bracket after the opening one; a stray ')'
        # earlier in the name must not be paired with it.
        close_bracket = char_name.find(')', open_bracket)
        if close_bracket != -1:
            char_name = (char_name[:open_bracket]
                         + char_name[close_bracket + 1:])
    return charact_id, char_name
53 52
def write_more_abundant_charat(abundances, more_abund_charact, output_fp):
    """Write a tab-separated table of selected characteristics per sample.

    The first line is ``id``, ``name`` and then one column per sample. Each
    following line describes one entry of ``more_abund_charact``: its
    identifier and cleaned name (from ``format_characteristic_name``)
    followed by its abundance in every sample, using ``0`` where a sample
    does not contain it.

    Args:
        abundances: mapping of sample name to a mapping of characteristic
            label to abundance.
        more_abund_charact: iterable of characteristic labels, written in
            the order given.
        output_fp: path of the file to create or overwrite.
    """
    # Take the sample list once so the header and every row are guaranteed
    # to use the same column order.
    samples = list(abundances)
    with open(output_fp, 'w') as output_f:
        output_f.write('id\tname\t%s\n' % '\t'.join(samples))

        for mac in more_abund_charact:
            charact_id, charact_name = format_characteristic_name(mac)
            row = [charact_id, charact_name]
            row.extend(
                str(abundances[sample].get(mac, 0)) for sample in samples)
            output_f.write('\t'.join(row) + '\n')
66 65
def extract_similar_characteristics(abund, sim_output_fp, output_files):
    """Write shared and sample-specific characteristics and report counts.

    A characteristic is "similar" when it is present in every sample. Those
    are written to ``sim_output_fp`` with one abundance column per sample.
    For each sample, the characteristics that are not shared by all samples
    are written to the matching entry of ``output_files`` as
    ``id``, ``name``, ``abundances``. Summary figures are printed to
    standard output.

    Args:
        abund: mapping of sample name to a mapping of characteristic label
            to abundance.
        sim_output_fp: path of the file receiving the shared
            characteristics.
        output_files: sequence of paths, one per sample, in the iteration
            order of ``abund``.

    Returns:
        The set of characteristic labels present in every sample (empty
        when ``abund`` is empty).

    Raises:
        IndexError: if ``output_files`` has fewer entries than ``abund``.
    """
    abund_keys = list(abund)
    if abund_keys:
        sim_characteristics = set(abund[abund_keys[0]])
        for sample in abund_keys[1:]:
            sim_characteristics.intersection_update(abund[sample])
    else:
        sim_characteristics = set()
    print('Similar between all samples: %s' % len(sim_characteristics))

    with open(sim_output_fp, 'w') as sim_output_f:
        sim_output_f.write('id\tname\t%s\n' % '\t'.join(abund_keys))
        # Sorted so the row order does not depend on set iteration order.
        for charact in sorted(sim_characteristics):
            charact_id, charact_name = format_characteristic_name(charact)
            sim_output_f.write('%s\t%s' % (charact_id, charact_name))
            for sample in abund_keys:
                sim_output_f.write('\t%s' % abund[sample][charact])
            sim_output_f.write('\n')

    print('Specific to samples:')
    for i, sample in enumerate(abund_keys):
        sample_abund = abund[sample]
        specific = set(sample_abund)
        specific.difference_update(sim_characteristics)
        nb_all = len(sample_abund)
        # A sample without any characteristic has nothing specific; avoid
        # dividing by zero.
        perc = 100. * len(specific) / nb_all if nb_all else 0.
        print(' %s' % sample)
        print(' All: %s' % nb_all)
        print(' Number of specific characteristics: %s' % len(specific))
        print(' Percentage of specific characteristics: %s' % perc)

        relative_abundance = 0
        with open(output_files[i], 'w') as output_f:
            output_f.write('id\tname\tabundances\n')
            for charact in sorted(specific):
                charact_id, charact_name = format_characteristic_name(charact)
                # Three tab-separated fields, matching the header above.
                output_f.write('%s\t%s\t%s\n' % (
                    charact_id, charact_name, sample_abund[charact]))
                relative_abundance += sample_abund[charact]
        print(' Relative abundance of specific characteristics: %s'
              % relative_abundance)

    return sim_characteristics
107 106
107
def compare_humann2_output(args):
    """Load every sample and write the comparison tables.

    Reads one abundance file per sample with ``extract_abundances``, then
    writes the table of most abundant characteristics
    (``write_more_abundant_charat``) and the shared / sample-specific tables
    (``extract_similar_characteristics``).

    Args:
        args: object exposing ``sample_name`` and ``charact_input_fp``
            (parallel lists), ``most_abundant_characteristics_to_extract``,
            ``more_abundant_output_fp``, ``similar_output_fp`` and
            ``specific_output_fp`` (list, one path per sample).
    """
    abund = {}
    more_abund_charact = []
    seen = set()

    for i, sample_name in enumerate(args.sample_name):
        abund[sample_name], mac = extract_abundances(
            args.charact_input_fp[i],
            args.most_abundant_characteristics_to_extract)
        # De-duplicate while keeping first-seen order so the output row
        # order is the same on every run.
        for charact in mac:
            if charact not in seen:
                seen.add(charact)
                more_abund_charact.append(charact)

    write_more_abundant_charat(
        abund,
        more_abund_charact,
        args.more_abundant_output_fp)
    extract_similar_characteristics(
        abund,
        args.similar_output_fp,
        args.specific_output_fp)
126
121 127
if __name__ == '__main__':
    # Command-line entry point: --sample_name, --charact_input_fp and
    # --specific_output_fp are repeatable and must be given the same number
    # of times, in the same order.
    parser = argparse.ArgumentParser()
    parser.add_argument('--sample_name', required=True, action='append')
    parser.add_argument('--charact_input_fp', required=True, action='append')
    parser.add_argument(
        '--most_abundant_characteristics_to_extract',
        required=True,
        type=int)
    parser.add_argument('--more_abundant_output_fp', required=True)
    parser.add_argument('--similar_output_fp', required=True)
    parser.add_argument(
        '--specific_output_fp',
        required=True,
        action='append')
    args = parser.parse_args()

    if len(args.sample_name) != len(args.charact_input_fp):
        string = "Same number of values (in same order) are expected for "
        string += "--sample_name and --charact_input_fp"
        raise ValueError(string)
    if len(args.sample_name) != len(args.specific_output_fp):
        string = "Same number of values (in same order) are expected for "
        string += "--sample_name and --specific_output_fp"
        raise ValueError(string)

    compare_humann2_output(args)