Repository 'lefse'
hg clone https://toolshed.g2.bx.psu.edu/repos/george-weingart/lefse

Changeset 2:a31c10fe09c8 (2015-07-07)
Previous changeset 1:db64b6287cd6 (2014-08-20)
Commit message:
Fixed bug due to numerical approximation after normalization affecting root-level clades (e.g. "Bacteria" or "Archaea")
added:
format_input.py
b
diff -r db64b6287cd6 -r a31c10fe09c8 format_input.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/format_input.py Tue Jul 07 13:52:29 2015 -0400
[
b'@@ -0,0 +1,453 @@\n+#!/usr/bin/env python\n+\n+import sys,os,argparse,pickle,re,numpy\n+\n+\n+\n+\n+#*************************************************************************************************************** \n+#*   Log of change                                                                                             *\n+#*   January 16, 2014  - George Weingart - george.weingart@gmail.com                                           *\n+#*                                                                                                             *\n+#*   biom Support                                                                                              *\n+#*   Modified the program to enable it to accept biom files as input                                           *\n+#*                                                                                                             *\n+#*   Added two optional input parameters:\t                                                                   *\n+#*   1. biom_c is the name of the biom metadata to be used as class                                            *\n+#*   2. biom_s is the name of the biom metadata to be used as subclass                                         *\n+#*   class and subclass are used in the same context as the original                                           *\n+#*   parameters class and subclass                                                                             *\n+#*   These parameters are totally optional, the default is the program                                         *\n+#*   chooses as class the first metadata received from the conversion                                          *\n+#*   of the biom file into a sequential (pcl) file as generated by                                             *\n+#*   breadcrumbs, and similarly, the second metadata is selected as                                            *\n+#*   subclass.                                                                                                 *\n+#*   The syntax or logic for the original non-biom case was NOT changed.                                       *\n+#*                                                                                                             *\n+#*   <*******************  IMPORTANT NOTE   *************************>                                         *\n+#*   The biom case requires breadcrumbs and therefore there is a                                               *\n+#*      a conditional import of the breadcrumbs modules                                                        *\n+#*   If the User uses a biom input and breadcrumbs is not detected,                                            *\n+#*       the run is abnormally ended                                                                           *\n+#*   breadcrumbs itself needs a biom environment, so if the immport                                            *\n+#*       of biom in breadcrumbs fails,  the run is also abnormally \n+#*       ended (Only if the input file was biom)                                                               *\n+#*                                                                                                             *\n+#*   USAGE EXAMPLES                                                                                            *\n+#*   --------------                                                                                            *\n+#*   Case #1: Using a sequential file as input (Old version - did not change                                   *\n+#*  ./format_input.py hmp_aerobiosis_small.txt hmp_aerobiosis_small.in -c 1 -s 2 -u 3 -o 1000000               * \n+#*   Case #2: Using a biom file as input                                                                       *\n+#*  ./format_input.py hmp_aerobiosis_small.biom hmp_aerobiosis_small.in  -o 1000000                            *\n+#*   Case #3: Using a biom file as input and override the class and subclass'..b' =  CommonArea[\'MetadataNames\'].index(params[\'biom_subclass\']) +1 #* Set up the index for that metadata\n+\t\t\telse:\n+\t\t\t\tFlagError = True\n+\t\tif FlagError == True:\t\t#* If the User passed an invalid class\n+\t\t\tprint "**Invalid biom class or subclass passed - Using defaults: First metadata=class, Second Metadata=subclass\\n"\n+\t\t\tparams[\'class\'] =  2\n+\t\t\tparams[\'subclass\'] =  3\n+\treturn params\n+ \t\n+\t\n+\n+if  __name__ == \'__main__\':\n+\tCommonArea = dict()\t\t\t#Build a Common Area to pass variables in the biom case\n+\tparams = read_params(sys.argv)\n+\n+\t#*************************************************************\n+\t#* Conditionally import breadcrumbs if file is a biom file   *\n+\t#* If it is and no breadcrumbs found - abnormally exit       *\n+\t#*************************************************************\n+\tif  params[\'input_file\'].endswith(\'.biom\'):\n+\t\ttry:\n+\t\t\tfrom lefsebiom.ConstantsBreadCrumbs import *\t \n+\t\t\tfrom lefsebiom.AbundanceTable import *\n+\t\texcept ImportError:\n+\t\t\tsys.stderr.write("************************************************************************************************************ \\n")\n+\t\t\tsys.stderr.write("* Error:   Breadcrumbs libraries not detected - required to process biom files - run abnormally terminated * \\n")\n+\t\t\tsys.stderr.write("************************************************************************************************************ \\n")\n+\t\t\texit(1)\n+\n+\t\n+\tif type(params[\'subclass\']) is int and int(params[\'subclass\']) < 1:\n+\t\tparams[\'subclass\'] = None\n+\tif type(params[\'subject\']) is int and int(params[\'subject\']) < 1:\n+\t\tparams[\'subject\'] = None\n+\n+\n+\tCommonArea = read_input_file(sys.argv[1], CommonArea)\t\t#Pass The CommonArea to the Read\n+\tdata = CommonArea[\'ReturnedData\']\t\t\t\t\t#Select the data\n+\n+\tif sys.argv[1].endswith(\'biom\'):\t#*\tCheck if biom:\n+\t\tparams = check_params_for_biom_case(params, CommonArea)\t#Check the params for the biom case\n+\n+\tif params[\'feats_dir\'] == "c":\n+\t\tdata = transpose(data)\n+\n+\tncl = 1\n+\tif not params[\'subclass\'] is None: ncl += 1\t\n+\tif not params[\'subject\'] is None: ncl += 1\t\n+\n+\tfirst_line = zip(*data)[0]\n+\t\n+\tfirst_line = modify_feature_names(list(first_line))\n+\n+\tdata = zip(\tfirst_line,\n+\t\t\t*sort_by_cl(zip(*data)[1:],\n+\t\t\t  ncl,\n+\t\t\t  params[\'class\']-1,\n+\t\t\t  params[\'subclass\']-1 if not params[\'subclass\'] is None else None,\n+\t\t\t  params[\'subject\']-1 if not params[\'subject\'] is None else None))\n+#\tdata.insert(0,first_line)\n+#\tdata = remove_missing(data,params[\'missing_p\'])\n+\tcls = {}\n+\n+\tcls_i = [(\'class\',params[\'class\']-1)]\n+\tif params[\'subclass\'] > 0: cls_i.append((\'subclass\',params[\'subclass\']-1))\n+\tif params[\'subject\'] > 0: cls_i.append((\'subject\',params[\'subject\']-1))\n+\tcls_i.sort(lambda x, y: -cmp(x[1],y[1]))\n+\tfor v in cls_i: cls[v[0]] = data.pop(v[1])[1:]\n+\tif not params[\'subclass\'] > 0: cls[\'subclass\'] = [str(cl)+"_subcl" for cl in cls[\'class\']]\n+\t\n+\tcls[\'subclass\'] = rename_same_subcl(cls[\'class\'],cls[\'subclass\'])\n+#\tif \'subclass\' in cls.keys(): cls = group_small_subclasses(cls,params[\'subcl_min_card\'])\n+\tclass_sl,subclass_sl,class_hierarchy = get_class_slices(zip(*cls.values()))\n+    \n+\tfeats = dict([(d[0],d[1:]) for d in data])\n+    \n+\tfeats = add_missing_levels(feats)\n+    \n+\tfeats = numerical_values(feats,params[\'norm_v\'])\n+\tout = {}\n+\tout[\'feats\'] = feats\n+\tout[\'norm\'] = params[\'norm_v\'] \n+\tout[\'cls\'] = cls\n+\tout[\'class_sl\'] = class_sl\n+\tout[\'subclass_sl\'] = subclass_sl\n+\tout[\'class_hierarchy\'] = class_hierarchy\n+\n+\tif params[\'output_table\']:\n+\t\twith open( params[\'output_table\'], "w") as outf: \n+\t\t\tif \'class\' in cls: outf.write( "\\t".join(list(["class"])+list(cls[\'class\'])) + "\\n" )\n+\t\t\tif \'subclass\' in cls: outf.write( "\\t".join(list(["subclass"])+list(cls[\'subclass\'])) + "\\n" )\n+\t\t\tif \'subject\' in cls: outf.write( "\\t".join(list(["subject"])+list(cls[\'subject\']))  + "\\n" )\n+\t\t\tfor k,v in out[\'feats\'].items(): outf.write( "\\t".join([k]+[str(vv) for vv in v]) + "\\n" )\n+\n+\twith open(params[\'output_file\'], \'wb\') as back_file:\n+\t\tpickle.dump(out,back_file)    \t\n+\n'