Mercurial > repos > bebatut > combine_metaphlan2_humann2
view combine_metaphlan2_humann2.py @ 0:31394a0c0242 draft
planemo upload for repository https://github.com/asaim/galaxytools/tree/master/tools/combine_metaphlan2_humann2 commit e6bee6545960c2a1ae3ca3031ec74d7c26d0b0ce-dirty
author | bebatut |
---|---|
date | Fri, 15 Apr 2016 09:15:21 -0400 |
parents | |
children | e25efca0a49c |
line wrap: on
line source
#!/usr/bin/env python # -*- coding: utf-8 -*- import sys import os import argparse import re def extract_clade_abundance(metaphlan2_filepath): clade_abundance = {} with open(metaphlan2_filepath, 'r') as metaphlan2_file: for line in metaphlan2_file.readlines(): if line.find('g__') == -1: continue split_line = line[:-1].split('\t') taxo = split_line[0] abundance = split_line[1] genus = taxo[(taxo.find('g__')+3):] if genus.find('|') != -1: genus = genus[:(genus.find('|'))] clade_abundance.setdefault(genus, {'abundance':0, 'species':{}}) if taxo.find('t__') != -1: continue elif taxo.find('s__') != -1: species = taxo[(taxo.find('s__')+3):] clade_abundance[genus]['species'].setdefault(species, abundance) else: clade_abundance[genus]['abundance'] = abundance return clade_abundance def compute_overall_abundance(humann2_file): overall_abundance = 0 with open(args.humann2_file, 'r') as humann2_file: for line in humann2_file.readlines(): if line.find('|') != -1 or line.startswith('#'): continue split_line = line[:-1].split('\t') overall_abundance += float(split_line[1]) return overall_abundance def format_characteristic_name(name): formatted_name = name formatted_name = formatted_name.replace('/',' ') formatted_name = formatted_name.replace('-',' ') formatted_name = formatted_name.replace("'",'') if formatted_name.find('(') != -1 and formatted_name.find(')') != -1: open_bracket = formatted_name.find('(') close_bracket = formatted_name.find(')')+1 formatted_name = formatted_name[:open_bracket] + formatted_name[close_bracket:] return formatted_name def combine_metaphlan2_humann2(args): clade_abundance = extract_clade_abundance(args.metaphlan2_file) overall_abundance = compute_overall_abundance(args.humann2_file) with open(args.output_file, 'w') as output_file: output_file.write('genus\t') output_file.write('genus_abundance\t') output_file.write('species\t') output_file.write('species_abundance\t') output_file.write(args.type + '_id\t') output_file.write(args.type + '_name\t') output_file.write(args.type + '_abundance\n') with open(args.humann2_file, 'r') as humann2_file: for line in humann2_file.readlines(): if line.find('|') == -1: continue split_line = line[:-1].split('\t') abundance = 100*float(split_line[1])/overall_abundance annotation = split_line[0].split('|') characteristic = annotation[0].split(':') characteristic_id = characteristic[0] characteristic_name = '' if len(characteristic) > 1: characteristic_name = format_characteristic_name(characteristic[-1]) taxo = annotation[1].split('.') if taxo[0] == 'unclassified': continue genus = taxo[0][3:] species = taxo[1][3:] if not clade_abundance.has_key(genus): print "no", genus, "found in", args.metaphlan2_file continue if not clade_abundance[genus]['species'].has_key(species): print "no", species, "found in", args.metaphlan2_file, print "for", genus continue output_file.write(genus + '\t') output_file.write(clade_abundance[genus]['abundance'] + '\t') output_file.write(species + '\t') output_file.write(clade_abundance[genus]['species'][species] + '\t') output_file.write(characteristic_id + '\t') output_file.write(characteristic_name + '\t') output_file.write(str(abundance) + '\n') if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--humann2_file', required=True) parser.add_argument('--metaphlan2_file', required=True) parser.add_argument('--output_file', required=True) parser.add_argument('--type', required=True, choices = ['gene_families','pathways']) args = parser.parse_args() combine_metaphlan2_humann2(args)