diff combine_metaphlan2_humann2.py @ 1:e25efca0a49c draft

"planemo upload for repository https://github.com/asaim/galaxytools/tree/master/tools/combine_metaphlan2_humann2 commit 4a40a73a6eb981310a3c1c54dfa98b5460a843f9"
author bebatut
date Mon, 14 Sep 2020 12:19:49 +0000
parents 31394a0c0242
children fdfb35745104
line wrap: on
line diff
--- a/combine_metaphlan2_humann2.py	Fri Apr 15 09:15:21 2016 -0400
+++ b/combine_metaphlan2_humann2.py	Mon Sep 14 12:19:49 2020 +0000
@@ -1,15 +1,13 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-import sys
-import os
 import argparse
-import re
+
 
-def extract_clade_abundance(metaphlan2_filepath):
-    clade_abundance = {}
-    with open(metaphlan2_filepath, 'r') as metaphlan2_file:
-        for line in metaphlan2_file.readlines():
+def extract_clade_abundance(metaphlan2_fp):
+    clade_abund = {}
+    with open(metaphlan2_fp, 'r') as metaphlan2_f:
+        for line in metaphlan2_f.readlines():
             if line.find('g__') == -1:
                 continue
 
@@ -20,91 +18,92 @@
             genus = taxo[(taxo.find('g__')+3):]
             if genus.find('|') != -1:
                 genus = genus[:(genus.find('|'))]
-            clade_abundance.setdefault(genus, {'abundance':0, 'species':{}})
+            clade_abund.setdefault(genus, {'abundance': 0, 'species': {}})
             if taxo.find('t__') != -1:
                 continue
             elif taxo.find('s__') != -1:
                 species = taxo[(taxo.find('s__')+3):]
-                clade_abundance[genus]['species'].setdefault(species, abundance)
+                clade_abund[genus]['species'].setdefault(
+                    species,
+                    abundance)
             else:
-                clade_abundance[genus]['abundance'] = abundance
-    return clade_abundance
+                clade_abund[genus]['abundance'] = abundance
+    return clade_abund
 
-def compute_overall_abundance(humann2_file):
+
+def compute_overall_abundance(humann2_fp):
     overall_abundance = 0
-    with open(args.humann2_file, 'r') as humann2_file:
-        for line in humann2_file.readlines():
+    with open(humann2_fp, 'r') as humann2_f:
+        for line in humann2_f.readlines():
             if line.find('|') != -1 or line.startswith('#'):
                 continue
             split_line = line[:-1].split('\t')
             overall_abundance += float(split_line[1])
     return overall_abundance
 
+
 def format_characteristic_name(name):
-    formatted_name = name
-    formatted_name = formatted_name.replace('/',' ')
-    formatted_name = formatted_name.replace('-',' ')
-    formatted_name = formatted_name.replace("'",'')
-    if formatted_name.find('(') != -1 and formatted_name.find(')') != -1:
-        open_bracket = formatted_name.find('(')
-        close_bracket = formatted_name.find(')')+1
-        formatted_name = formatted_name[:open_bracket] + formatted_name[close_bracket:]
-    return formatted_name
+    formatted_n = name
+    formatted_n = formatted_n.replace('/', ' ')
+    formatted_n = formatted_n.replace('-', ' ')
+    formatted_n = formatted_n.replace("'", '')
+    if formatted_n.find('(') != -1 and formatted_n.find(')') != -1:
+        open_bracket = formatted_n.find('(')
+        close_bracket = formatted_n.find(')')+1
+        formatted_n = formatted_n[:open_bracket] + formatted_n[close_bracket:]
+    return formatted_n
+
 
 def combine_metaphlan2_humann2(args):
-    clade_abundance = extract_clade_abundance(args.metaphlan2_file)
-    overall_abundance = compute_overall_abundance(args.humann2_file)
+    clade_abund = extract_clade_abundance(args.metaphlan2_fp)
+    overall_abund = compute_overall_abundance(args.humann2_fp)
 
-    with open(args.output_file, 'w') as output_file:
-        output_file.write('genus\t')
-        output_file.write('genus_abundance\t')
-        output_file.write('species\t')
-        output_file.write('species_abundance\t')
-        output_file.write(args.type + '_id\t')
-        output_file.write(args.type + '_name\t')
-        output_file.write(args.type + '_abundance\n')
-        with open(args.humann2_file, 'r') as humann2_file:
-            for line in humann2_file.readlines():
+    with open(args.output_fp, 'w') as output_f:
+        s = 'genus\tgenus_abundance\tspecies\tspecies_abundance\t'
+        s = '%s\t%s_id\t%s_name\t%s_abundance\n' % (s, args.type, args.type, args.type)
+        output_f.write(s)
+        with open(args.humann2_fp, 'r') as humann2_f:
+            for line in humann2_f.readlines():
                 if line.find('|') == -1:
                     continue
 
                 split_line = line[:-1].split('\t')
-                abundance = 100*float(split_line[1])/overall_abundance
+                abundance = 100*float(split_line[1])/overall_abund
                 annotation = split_line[0].split('|')
-                characteristic = annotation[0].split(':')
-                characteristic_id = characteristic[0]
-                characteristic_name = ''
-                if len(characteristic) > 1:
-                    characteristic_name = format_characteristic_name(characteristic[-1])
+                charact = annotation[0].split(':')
+                charact_id = charact[0]
+                char_name = ''
+                if len(charact) > 1:
+                    char_name = format_characteristic_name(charact[-1])
                 taxo = annotation[1].split('.')
-                
+
                 if taxo[0] == 'unclassified':
                     continue
                 genus = taxo[0][3:]
                 species = taxo[1][3:]
 
-                if not clade_abundance.has_key(genus):
-                    print "no", genus, "found in", args.metaphlan2_file
+                if genus not in clade_abund:
+                    print("no %s found in %s" % (genus, args.metaphlan2_fp))
                     continue
-                if not clade_abundance[genus]['species'].has_key(species):
-                    print "no", species, "found in", args.metaphlan2_file,
-                    print "for", genus
+                if species not in clade_abund[genus]['species']:
+                    print("no %s found in %s for % s" % (species, args.metaphlan2_fp, genus))
                     continue
-                output_file.write(genus + '\t')
-                output_file.write(clade_abundance[genus]['abundance'] + '\t')
-                output_file.write(species + '\t')
-                output_file.write(clade_abundance[genus]['species'][species] + '\t')
-                output_file.write(characteristic_id + '\t')
-                output_file.write(characteristic_name + '\t')
-                output_file.write(str(abundance) + '\n')
+
+                s = "%s\t%s\t" % (genus, clade_abund[genus]['abundance'])
+                s += "%s\t%s\t" % (species, clade_abund[genus]['species'][species])
+                s += "%s\t%s\t%s\n" % (charact_id, char_name, abundance)
+                output_f.write(s)
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--humann2_file', required=True)
-    parser.add_argument('--metaphlan2_file', required=True)
-    parser.add_argument('--output_file', required=True)
-    parser.add_argument('--type', required=True, 
-        choices = ['gene_families','pathways'])
+    parser.add_argument('--humann2_fp', required=True)
+    parser.add_argument('--metaphlan2_fp', required=True)
+    parser.add_argument('--output_fp', required=True)
+    parser.add_argument(
+        '--type',
+        required=True,
+        choices=['gene_families', 'pathways'])
     args = parser.parse_args()
 
-    combine_metaphlan2_humann2(args)
\ No newline at end of file
+    combine_metaphlan2_humann2(args)