Mercurial > repos > bebatut > combine_metaphlan2_humann2
diff combine_metaphlan2_humann2.py @ 0:31394a0c0242 draft
planemo upload for repository https://github.com/asaim/galaxytools/tree/master/tools/combine_metaphlan2_humann2 commit e6bee6545960c2a1ae3ca3031ec74d7c26d0b0ce-dirty
author | bebatut |
---|---|
date | Fri, 15 Apr 2016 09:15:21 -0400 |
parents | |
children | e25efca0a49c |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/combine_metaphlan2_humann2.py Fri Apr 15 09:15:21 2016 -0400 @@ -0,0 +1,110 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import os +import argparse +import re + +def extract_clade_abundance(metaphlan2_filepath): + clade_abundance = {} + with open(metaphlan2_filepath, 'r') as metaphlan2_file: + for line in metaphlan2_file.readlines(): + if line.find('g__') == -1: + continue + + split_line = line[:-1].split('\t') + taxo = split_line[0] + abundance = split_line[1] + + genus = taxo[(taxo.find('g__')+3):] + if genus.find('|') != -1: + genus = genus[:(genus.find('|'))] + clade_abundance.setdefault(genus, {'abundance':0, 'species':{}}) + if taxo.find('t__') != -1: + continue + elif taxo.find('s__') != -1: + species = taxo[(taxo.find('s__')+3):] + clade_abundance[genus]['species'].setdefault(species, abundance) + else: + clade_abundance[genus]['abundance'] = abundance + return clade_abundance + +def compute_overall_abundance(humann2_file): + overall_abundance = 0 + with open(args.humann2_file, 'r') as humann2_file: + for line in humann2_file.readlines(): + if line.find('|') != -1 or line.startswith('#'): + continue + split_line = line[:-1].split('\t') + overall_abundance += float(split_line[1]) + return overall_abundance + +def format_characteristic_name(name): + formatted_name = name + formatted_name = formatted_name.replace('/',' ') + formatted_name = formatted_name.replace('-',' ') + formatted_name = formatted_name.replace("'",'') + if formatted_name.find('(') != -1 and formatted_name.find(')') != -1: + open_bracket = formatted_name.find('(') + close_bracket = formatted_name.find(')')+1 + formatted_name = formatted_name[:open_bracket] + formatted_name[close_bracket:] + return formatted_name + +def combine_metaphlan2_humann2(args): + clade_abundance = extract_clade_abundance(args.metaphlan2_file) + overall_abundance = compute_overall_abundance(args.humann2_file) + + with open(args.output_file, 'w') as output_file: + output_file.write('genus\t') + output_file.write('genus_abundance\t') + output_file.write('species\t') + output_file.write('species_abundance\t') + output_file.write(args.type + '_id\t') + output_file.write(args.type + '_name\t') + output_file.write(args.type + '_abundance\n') + with open(args.humann2_file, 'r') as humann2_file: + for line in humann2_file.readlines(): + if line.find('|') == -1: + continue + + split_line = line[:-1].split('\t') + abundance = 100*float(split_line[1])/overall_abundance + annotation = split_line[0].split('|') + characteristic = annotation[0].split(':') + characteristic_id = characteristic[0] + characteristic_name = '' + if len(characteristic) > 1: + characteristic_name = format_characteristic_name(characteristic[-1]) + taxo = annotation[1].split('.') + + if taxo[0] == 'unclassified': + continue + genus = taxo[0][3:] + species = taxo[1][3:] + + if not clade_abundance.has_key(genus): + print "no", genus, "found in", args.metaphlan2_file + continue + if not clade_abundance[genus]['species'].has_key(species): + print "no", species, "found in", args.metaphlan2_file, + print "for", genus + continue + output_file.write(genus + '\t') + output_file.write(clade_abundance[genus]['abundance'] + '\t') + output_file.write(species + '\t') + output_file.write(clade_abundance[genus]['species'][species] + '\t') + output_file.write(characteristic_id + '\t') + output_file.write(characteristic_name + '\t') + output_file.write(str(abundance) + '\n') + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--humann2_file', required=True) + parser.add_argument('--metaphlan2_file', required=True) + parser.add_argument('--output_file', required=True) + parser.add_argument('--type', required=True, + choices = ['gene_families','pathways']) + args = parser.parse_args() + + combine_metaphlan2_humann2(args) \ No newline at end of file