diff compare_humann2_output.py @ 2:05766022dfc4 draft

"planemo upload for repository https://github.com/asaim/galaxytools/tree/master/tools/compare_humann2_output commit dc55dc3b5275d1d6aac390698c0c6e0ab8fbf2f7"
author bebatut
date Mon, 14 Sep 2020 13:50:30 +0000
parents 9959fa526f1a
children eaa95ea1195c
line wrap: on
line diff
--- a/compare_humann2_output.py	Wed Apr 20 09:15:27 2016 -0400
+++ b/compare_humann2_output.py	Mon Sep 14 13:50:30 2020 +0000
@@ -1,17 +1,15 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-import sys
-import os
 import argparse
-import re
+
 
-def extract_abundances(filepath, nb_charact_to_extract):
+def extract_abundances(fp, nb_charact_to_extract):
     abundances = {}
     more_abund_charact = []
     abund_sum = 0
-    with open(filepath, 'r') as abundance_file:
-        for line in abundance_file.readlines()[1:]:
+    with open(fp, 'r') as abundance_f:
+        for line in abundance_f.readlines()[1:]:
             split_line = line[:-1].split('\t')
             charact_id = split_line[0]
             abund = float(split_line[1])
@@ -22,117 +20,134 @@
                 more_abund_charact.append(charact_id)
             else:
                 best_pos = None
-                for i in range(len(more_abund_charact)-1,-1,-1):
+                for i in range(len(more_abund_charact)-1, -1, -1):
                     if abundances[more_abund_charact[i]] < abund:
                         best_pos = i
                     else:
                         break
-                if best_pos != None:
+                if best_pos is not None:
                     tmp_more_abund_charact = more_abund_charact
                     more_abund_charact = tmp_more_abund_charact[:best_pos]
                     more_abund_charact += [charact_id]
                     more_abund_charact += tmp_more_abund_charact[best_pos:-1]
     return abundances, more_abund_charact
 
+
 def format_characteristic_name(all_name):
     if all_name.find(':') != -1:
         charact_id = all_name.split(':')[0]
-        charact_name = all_name.split(':')[1][1:]
+        char_name = all_name.split(':')[1][1:]
     else:
         charact_id = all_name
-        charact_name = ''
+        char_name = ''
 
-    charact_name = charact_name.replace('/',' ')
-    charact_name = charact_name.replace('-',' ')
-    charact_name = charact_name.replace("'",'')
-    if charact_name.find('(') != -1 and charact_name.find(')') != -1:
-        open_bracket = charact_name.find('(')
-        close_bracket = charact_name.find(')')+1
-        charact_name = charact_name[:open_bracket] + charact_name[close_bracket:]
-    return charact_id,charact_name
+    char_name = char_name.replace('/', ' ')
+    char_name = char_name.replace('-', ' ')
+    char_name = char_name.replace("'", '')
+    if char_name.find('(') != -1 and char_name.find(')') != -1:
+        open_bracket = char_name.find('(')
+        close_bracket = char_name.find(')')+1
+        char_name = char_name[:open_bracket] + char_name[close_bracket:]
+    return charact_id, char_name
 
-def write_more_abundant_charat(abundances,more_abund_charact, output_filepath):
-    with open(output_filepath,'w') as output_file:
-        output_file.write('id\tname\t')
-        output_file.write('\t'.join(abundances.keys()) + '\n')
+
+def write_more_abundant_charat(abundances, more_abund_charact, output_fp):
+    with open(output_fp, 'w') as output_f:
+        output_f.write('id\tname\t%s\n' % '\t'.join(abundances.keys()))
 
         for mac in more_abund_charact:
-            charact_id,charact_name = format_characteristic_name(mac)
-            output_file.write(charact_id + '\t' + charact_name)
+            charact_id, charact_name = format_characteristic_name(mac)
+            output_f.write('%s\t%s' % (charact_id, charact_name))
             for sample in abundances:
                 abund = abundances[sample].get(mac, 0)
-                output_file.write('\t' + str(abund))
-            output_file.write('\n')
+                output_f.write('\t%s' % (abund))
+            output_f.write('\n')
+
 
-def extract_similar_characteristics(abundances, sim_output_filepath,
-    specific_output_files):
-    sim_characteristics = set(abundances[abundances.keys()[0]].keys())
-    for sample in abundances.keys()[1:]:
-        sim_characteristics.intersection_update(abundances[sample].keys())
-    print 'Similar between all samples:', len(sim_characteristics)
+def extract_similar_characteristics(abund, sim_output_fp, output_files):
+    abund_keys = list(abund)
+    sim_characteristics = set(abund[abund_keys[0]].keys())
+    for sample in abund_keys[1:]:
+        sim_characteristics.intersection_update(abund[sample].keys())
+    print('Similar between all samples: %s' % len(sim_characteristics))
 
-    with open(sim_output_filepath, 'w') as sim_output_file:
-        sim_output_file.write('id\tname\t' + '\t'.join(abundances.keys()) + '\n')
+    with open(sim_output_fp, 'w') as sim_output_f:
+        sim_output_f.write('id\tname\t%s\n' % '\t'.join(abund_keys))
         for charact in list(sim_characteristics):
-            charact_id,charact_name = format_characteristic_name(charact)
-            sim_output_file.write(charact_id + '\t' + charact_name)
-            for sample in abundances.keys():
-                sim_output_file.write('\t' + str(abundances[sample][charact]))
-            sim_output_file.write('\n')
+            charact_id, charact_name = format_characteristic_name(charact)
+            sim_output_f.write('%s\t%s' % (charact_id, charact_name))
+            for sample in abund_keys:
+                sim_output_f.write('\t%s' % abund[sample][charact])
+            sim_output_f.write('\n')
 
-    print 'Specific to samples:'
-    diff_characteristics = {}
-    for i in range(len(abundances.keys())):
-        sample = abundances.keys()[i]
-        print ' ', sample, ""
-        print '    All:', len(abundances[sample].keys())
-        diff_characteristics[sample] = set(abundances[sample].keys())
-        diff_characteristics[sample].difference_update(sim_characteristics)
-        print '    Number of specific characteristics:', 
-        print len(diff_characteristics[sample])
-        print '    Percentage of specific characteristics:',
-        print 100*len(diff_characteristics[sample])/(1.*len(abundances[sample].keys()))
+    print('Specific to samples:')
+    diff_char = {}
+    for i in range(len(abund_keys)):
+        sample = abund_keys[i]
+        print(' %s' % sample )
+        print('    All: %s' % len(abund[sample].keys()))
+        diff_char[sample] = set(abund[sample].keys())
+        diff_char[sample].difference_update(sim_characteristics)
+        perc = 100*len(diff_char[sample])/(1.*len(abund[sample].keys()))
+        print('    Number of specific characteristics: %s' % len(diff_char[sample]))
+        print('    Percentage of specific characteristics: %s' % perc)
 
         relative_abundance = 0
-        with open(specific_output_files[i], 'w') as output_file:
-            output_file.write('id\tname\tabundances\n')
-            for charact in list(diff_characteristics[sample]):
-                charact_id,charact_name = format_characteristic_name(charact)
-                output_file.write(charact_id + '\t' + charact_name + '\t')
-                output_file.write(str(abundances[sample][charact]) + '\n')
-                relative_abundance += abundances[sample][charact]
-        print '    Relative abundance of specific characteristics(%):', relative_abundance
+        with open(output_files[i], 'w') as output_f:
+            output_f.write('id\tname\tabundances\n')
+            for charact in list(diff_char[sample]):
+                charact_id, charact_name = format_characteristic_name(charact)
+                output_f.write('%s\t%s' % (charact_id, charact_name))
+                output_f.write('%s\n' % abund[sample][charact])
+                relative_abundance += abund[sample][charact]
+        print('    Relative abundance of specific characteristics: %s' % relative_abundance)
 
     return sim_characteristics
 
+
 def compare_humann2_output(args):
-    abundances = {}
+    abund = {}
     more_abund_charact = []
 
     for i in range(len(args.sample_name)):
-        abundances[args.sample_name[i]], mac = extract_abundances(args.charact_input_file[i],
+        abund[args.sample_name[i]], mac = extract_abundances(
+            args.charact_input_fp[i],
             args.most_abundant_characteristics_to_extract)
         more_abund_charact += mac
 
-    write_more_abundant_charat(abundances, list(set(more_abund_charact)), 
-        args.more_abundant_output_file)
-    sim_characteristics = extract_similar_characteristics(abundances, 
-        args.similar_output_file, args.specific_output_file)
+    write_more_abundant_charat(
+        abund,
+        list(set(more_abund_charact)),
+        args.more_abundant_output_fp)
+    extract_similar_characteristics(
+        abund,
+        args.similar_output_fp,
+        args.specific_output_fp)
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--sample_name', required=True, action='append')
-    parser.add_argument('--charact_input_file', required=True, action='append')
-    parser.add_argument('--most_abundant_characteristics_to_extract', required=True,
-        type = int)
-    parser.add_argument('--more_abundant_output_file', required=True)
-    parser.add_argument('--similar_output_file', required=True)
-    parser.add_argument('--specific_output_file', required=True,action='append')
+    parser.add_argument('--charact_input_fp', required=True, action='append')
+    parser.add_argument(
+        '--most_abundant_characteristics_to_extract',
+        required=True,
+        type=int)
+    parser.add_argument('--more_abundant_output_fp', required=True)
+    parser.add_argument('--similar_output_fp', required=True)
+    parser.add_argument(
+        '--specific_output_fp',
+        required=True,
+        action='append')
     args = parser.parse_args()
 
-    if len(args.sample_name) != len(args.charact_input_file):
-        raise ValueError("Same number of values (in same order) are expected for --sample_name and --charact_input_file")
-    if len(args.sample_name) != len(args.specific_output_file):
-        raise ValueError("Same number of values (in same order) are expected for --sample_name and --specific_output_file")
+    if len(args.sample_name) != len(args.charact_input_fp):
+        string = "Same number of values (in same order) are expected for "
+        string += "--sample_name and --charact_input_fp"
+        raise ValueError(string)
+    if len(args.sample_name) != len(args.specific_output_fp):
+        string = "Same number of values (in same order) are expected for "
+        string += "--sample_name and --specific_output_fp"
+        raise ValueError(string)
 
-    compare_humann2_output(args)
\ No newline at end of file
+    compare_humann2_output(args)