Mercurial > repos > bebatut > combine_metaphlan2_humann2
diff combine_metaphlan2_humann2.py @ 1:e25efca0a49c draft
"planemo upload for repository https://github.com/asaim/galaxytools/tree/master/tools/combine_metaphlan2_humann2 commit 4a40a73a6eb981310a3c1c54dfa98b5460a843f9"
author | bebatut |
---|---|
date | Mon, 14 Sep 2020 12:19:49 +0000 |
parents | 31394a0c0242 |
children | fdfb35745104 |
line wrap: on
line diff
--- a/combine_metaphlan2_humann2.py Fri Apr 15 09:15:21 2016 -0400 +++ b/combine_metaphlan2_humann2.py Mon Sep 14 12:19:49 2020 +0000 @@ -1,15 +1,13 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import sys -import os import argparse -import re + -def extract_clade_abundance(metaphlan2_filepath): - clade_abundance = {} - with open(metaphlan2_filepath, 'r') as metaphlan2_file: - for line in metaphlan2_file.readlines(): +def extract_clade_abundance(metaphlan2_fp): + clade_abund = {} + with open(metaphlan2_fp, 'r') as metaphlan2_f: + for line in metaphlan2_f.readlines(): if line.find('g__') == -1: continue @@ -20,91 +18,92 @@ genus = taxo[(taxo.find('g__')+3):] if genus.find('|') != -1: genus = genus[:(genus.find('|'))] - clade_abundance.setdefault(genus, {'abundance':0, 'species':{}}) + clade_abund.setdefault(genus, {'abundance': 0, 'species': {}}) if taxo.find('t__') != -1: continue elif taxo.find('s__') != -1: species = taxo[(taxo.find('s__')+3):] - clade_abundance[genus]['species'].setdefault(species, abundance) + clade_abund[genus]['species'].setdefault( + species, + abundance) else: - clade_abundance[genus]['abundance'] = abundance - return clade_abundance + clade_abund[genus]['abundance'] = abundance + return clade_abund -def compute_overall_abundance(humann2_file): + +def compute_overall_abundance(humann2_fp): overall_abundance = 0 - with open(args.humann2_file, 'r') as humann2_file: - for line in humann2_file.readlines(): + with open(humann2_fp, 'r') as humann2_f: + for line in humann2_f.readlines(): if line.find('|') != -1 or line.startswith('#'): continue split_line = line[:-1].split('\t') overall_abundance += float(split_line[1]) return overall_abundance + def format_characteristic_name(name): - formatted_name = name - formatted_name = formatted_name.replace('/',' ') - formatted_name = formatted_name.replace('-',' ') - formatted_name = formatted_name.replace("'",'') - if formatted_name.find('(') != -1 and formatted_name.find(')') != -1: - open_bracket = formatted_name.find('(') - close_bracket = formatted_name.find(')')+1 - formatted_name = formatted_name[:open_bracket] + formatted_name[close_bracket:] - return formatted_name + formatted_n = name + formatted_n = formatted_n.replace('/', ' ') + formatted_n = formatted_n.replace('-', ' ') + formatted_n = formatted_n.replace("'", '') + if formatted_n.find('(') != -1 and formatted_n.find(')') != -1: + open_bracket = formatted_n.find('(') + close_bracket = formatted_n.find(')')+1 + formatted_n = formatted_n[:open_bracket] + formatted_n[close_bracket:] + return formatted_n + def combine_metaphlan2_humann2(args): - clade_abundance = extract_clade_abundance(args.metaphlan2_file) - overall_abundance = compute_overall_abundance(args.humann2_file) + clade_abund = extract_clade_abundance(args.metaphlan2_fp) + overall_abund = compute_overall_abundance(args.humann2_fp) - with open(args.output_file, 'w') as output_file: - output_file.write('genus\t') - output_file.write('genus_abundance\t') - output_file.write('species\t') - output_file.write('species_abundance\t') - output_file.write(args.type + '_id\t') - output_file.write(args.type + '_name\t') - output_file.write(args.type + '_abundance\n') - with open(args.humann2_file, 'r') as humann2_file: - for line in humann2_file.readlines(): + with open(args.output_fp, 'w') as output_f: + s = 'genus\tgenus_abundance\tspecies\tspecies_abundance\t' + s = '%s\t%s_id\t%s_name\t%s_abundance\n' % (s, args.type, args.type, args.type) + output_f.write(s) + with open(args.humann2_fp, 'r') as humann2_f: + for line in humann2_f.readlines(): if line.find('|') == -1: continue split_line = line[:-1].split('\t') - abundance = 100*float(split_line[1])/overall_abundance + abundance = 100*float(split_line[1])/overall_abund annotation = split_line[0].split('|') - characteristic = annotation[0].split(':') - characteristic_id = characteristic[0] - characteristic_name = '' - if len(characteristic) > 1: - characteristic_name = format_characteristic_name(characteristic[-1]) + charact = annotation[0].split(':') + charact_id = charact[0] + char_name = '' + if len(charact) > 1: + char_name = format_characteristic_name(charact[-1]) taxo = annotation[1].split('.') - + if taxo[0] == 'unclassified': continue genus = taxo[0][3:] species = taxo[1][3:] - if not clade_abundance.has_key(genus): - print "no", genus, "found in", args.metaphlan2_file + if genus not in clade_abund: + print("no %s found in %s" % (genus, args.metaphlan2_fp)) continue - if not clade_abundance[genus]['species'].has_key(species): - print "no", species, "found in", args.metaphlan2_file, - print "for", genus + if species not in clade_abund[genus]['species']: + print("no %s found in %s for % s" % (species, args.metaphlan2_fp, genus)) continue - output_file.write(genus + '\t') - output_file.write(clade_abundance[genus]['abundance'] + '\t') - output_file.write(species + '\t') - output_file.write(clade_abundance[genus]['species'][species] + '\t') - output_file.write(characteristic_id + '\t') - output_file.write(characteristic_name + '\t') - output_file.write(str(abundance) + '\n') + + s = "%s\t%s\t" % (genus, clade_abund[genus]['abundance']) + s += "%s\t%s\t" % (species, clade_abund[genus]['species'][species]) + s += "%s\t%s\t%s\n" % (charact_id, char_name, abundance) + output_f.write(s) + if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--humann2_file', required=True) - parser.add_argument('--metaphlan2_file', required=True) - parser.add_argument('--output_file', required=True) - parser.add_argument('--type', required=True, - choices = ['gene_families','pathways']) + parser.add_argument('--humann2_fp', required=True) + parser.add_argument('--metaphlan2_fp', required=True) + parser.add_argument('--output_fp', required=True) + parser.add_argument( + '--type', + required=True, + choices=['gene_families', 'pathways']) args = parser.parse_args() - combine_metaphlan2_humann2(args) \ No newline at end of file + combine_metaphlan2_humann2(args)