comparison combine_metaphlan2_humann2.py @ 1:e25efca0a49c draft

"planemo upload for repository https://github.com/asaim/galaxytools/tree/master/tools/combine_metaphlan2_humann2 commit 4a40a73a6eb981310a3c1c54dfa98b5460a843f9"
author bebatut
date Mon, 14 Sep 2020 12:19:49 +0000
parents 31394a0c0242
children fdfb35745104
comparison
equal deleted inserted replaced
0:31394a0c0242 1:e25efca0a49c
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*- 2 # -*- coding: utf-8 -*-
3 3
4 import sys
5 import os
6 import argparse 4 import argparse
7 import re
8 5
9 def extract_clade_abundance(metaphlan2_filepath): 6
10 clade_abundance = {} 7 def extract_clade_abundance(metaphlan2_fp):
11 with open(metaphlan2_filepath, 'r') as metaphlan2_file: 8 clade_abund = {}
12 for line in metaphlan2_file.readlines(): 9 with open(metaphlan2_fp, 'r') as metaphlan2_f:
10 for line in metaphlan2_f.readlines():
13 if line.find('g__') == -1: 11 if line.find('g__') == -1:
14 continue 12 continue
15 13
16 split_line = line[:-1].split('\t') 14 split_line = line[:-1].split('\t')
17 taxo = split_line[0] 15 taxo = split_line[0]
18 abundance = split_line[1] 16 abundance = split_line[1]
19 17
20 genus = taxo[(taxo.find('g__')+3):] 18 genus = taxo[(taxo.find('g__')+3):]
21 if genus.find('|') != -1: 19 if genus.find('|') != -1:
22 genus = genus[:(genus.find('|'))] 20 genus = genus[:(genus.find('|'))]
23 clade_abundance.setdefault(genus, {'abundance':0, 'species':{}}) 21 clade_abund.setdefault(genus, {'abundance': 0, 'species': {}})
24 if taxo.find('t__') != -1: 22 if taxo.find('t__') != -1:
25 continue 23 continue
26 elif taxo.find('s__') != -1: 24 elif taxo.find('s__') != -1:
27 species = taxo[(taxo.find('s__')+3):] 25 species = taxo[(taxo.find('s__')+3):]
28 clade_abundance[genus]['species'].setdefault(species, abundance) 26 clade_abund[genus]['species'].setdefault(
27 species,
28 abundance)
29 else: 29 else:
30 clade_abundance[genus]['abundance'] = abundance 30 clade_abund[genus]['abundance'] = abundance
31 return clade_abundance 31 return clade_abund
32 32
33 def compute_overall_abundance(humann2_file): 33
34 def compute_overall_abundance(humann2_fp):
34 overall_abundance = 0 35 overall_abundance = 0
35 with open(args.humann2_file, 'r') as humann2_file: 36 with open(humann2_fp, 'r') as humann2_f:
36 for line in humann2_file.readlines(): 37 for line in humann2_f.readlines():
37 if line.find('|') != -1 or line.startswith('#'): 38 if line.find('|') != -1 or line.startswith('#'):
38 continue 39 continue
39 split_line = line[:-1].split('\t') 40 split_line = line[:-1].split('\t')
40 overall_abundance += float(split_line[1]) 41 overall_abundance += float(split_line[1])
41 return overall_abundance 42 return overall_abundance
42 43
44
43 def format_characteristic_name(name): 45 def format_characteristic_name(name):
44 formatted_name = name 46 formatted_n = name
45 formatted_name = formatted_name.replace('/',' ') 47 formatted_n = formatted_n.replace('/', ' ')
46 formatted_name = formatted_name.replace('-',' ') 48 formatted_n = formatted_n.replace('-', ' ')
47 formatted_name = formatted_name.replace("'",'') 49 formatted_n = formatted_n.replace("'", '')
48 if formatted_name.find('(') != -1 and formatted_name.find(')') != -1: 50 if formatted_n.find('(') != -1 and formatted_n.find(')') != -1:
49 open_bracket = formatted_name.find('(') 51 open_bracket = formatted_n.find('(')
50 close_bracket = formatted_name.find(')')+1 52 close_bracket = formatted_n.find(')')+1
51 formatted_name = formatted_name[:open_bracket] + formatted_name[close_bracket:] 53 formatted_n = formatted_n[:open_bracket] + formatted_n[close_bracket:]
52 return formatted_name 54 return formatted_n
55
53 56
54 def combine_metaphlan2_humann2(args): 57 def combine_metaphlan2_humann2(args):
55 clade_abundance = extract_clade_abundance(args.metaphlan2_file) 58 clade_abund = extract_clade_abundance(args.metaphlan2_fp)
56 overall_abundance = compute_overall_abundance(args.humann2_file) 59 overall_abund = compute_overall_abundance(args.humann2_fp)
57 60
58 with open(args.output_file, 'w') as output_file: 61 with open(args.output_fp, 'w') as output_f:
59 output_file.write('genus\t') 62 s = 'genus\tgenus_abundance\tspecies\tspecies_abundance\t'
60 output_file.write('genus_abundance\t') 63 s = '%s\t%s_id\t%s_name\t%s_abundance\n' % (s, args.type, args.type, args.type)
61 output_file.write('species\t') 64 output_f.write(s)
62 output_file.write('species_abundance\t') 65 with open(args.humann2_fp, 'r') as humann2_f:
63 output_file.write(args.type + '_id\t') 66 for line in humann2_f.readlines():
64 output_file.write(args.type + '_name\t')
65 output_file.write(args.type + '_abundance\n')
66 with open(args.humann2_file, 'r') as humann2_file:
67 for line in humann2_file.readlines():
68 if line.find('|') == -1: 67 if line.find('|') == -1:
69 continue 68 continue
70 69
71 split_line = line[:-1].split('\t') 70 split_line = line[:-1].split('\t')
72 abundance = 100*float(split_line[1])/overall_abundance 71 abundance = 100*float(split_line[1])/overall_abund
73 annotation = split_line[0].split('|') 72 annotation = split_line[0].split('|')
74 characteristic = annotation[0].split(':') 73 charact = annotation[0].split(':')
75 characteristic_id = characteristic[0] 74 charact_id = charact[0]
76 characteristic_name = '' 75 char_name = ''
77 if len(characteristic) > 1: 76 if len(charact) > 1:
78 characteristic_name = format_characteristic_name(characteristic[-1]) 77 char_name = format_characteristic_name(charact[-1])
79 taxo = annotation[1].split('.') 78 taxo = annotation[1].split('.')
80 79
81 if taxo[0] == 'unclassified': 80 if taxo[0] == 'unclassified':
82 continue 81 continue
83 genus = taxo[0][3:] 82 genus = taxo[0][3:]
84 species = taxo[1][3:] 83 species = taxo[1][3:]
85 84
86 if not clade_abundance.has_key(genus): 85 if genus not in clade_abund:
87 print "no", genus, "found in", args.metaphlan2_file 86 print("no %s found in %s" % (genus, args.metaphlan2_fp))
88 continue 87 continue
89 if not clade_abundance[genus]['species'].has_key(species): 88 if species not in clade_abund[genus]['species']:
90 print "no", species, "found in", args.metaphlan2_file, 89 print("no %s found in %s for % s" % (species, args.metaphlan2_fp, genus))
91 print "for", genus
92 continue 90 continue
93 output_file.write(genus + '\t') 91
94 output_file.write(clade_abundance[genus]['abundance'] + '\t') 92 s = "%s\t%s\t" % (genus, clade_abund[genus]['abundance'])
95 output_file.write(species + '\t') 93 s += "%s\t%s\t" % (species, clade_abund[genus]['species'][species])
96 output_file.write(clade_abundance[genus]['species'][species] + '\t') 94 s += "%s\t%s\t%s\n" % (charact_id, char_name, abundance)
97 output_file.write(characteristic_id + '\t') 95 output_f.write(s)
98 output_file.write(characteristic_name + '\t') 96
99 output_file.write(str(abundance) + '\n')
100 97
101 if __name__ == '__main__': 98 if __name__ == '__main__':
102 parser = argparse.ArgumentParser() 99 parser = argparse.ArgumentParser()
103 parser.add_argument('--humann2_file', required=True) 100 parser.add_argument('--humann2_fp', required=True)
104 parser.add_argument('--metaphlan2_file', required=True) 101 parser.add_argument('--metaphlan2_fp', required=True)
105 parser.add_argument('--output_file', required=True) 102 parser.add_argument('--output_fp', required=True)
106 parser.add_argument('--type', required=True, 103 parser.add_argument(
107 choices = ['gene_families','pathways']) 104 '--type',
105 required=True,
106 choices=['gene_families', 'pathways'])
108 args = parser.parse_args() 107 args = parser.parse_args()
109 108
110 combine_metaphlan2_humann2(args) 109 combine_metaphlan2_humann2(args)