Previous changeset 1:db64b6287cd6 (2014-08-20) |
Commit message:
Fixed bug due to numerical approximation after normalization affecting root-level clades (e.g. "Bacteria" or "Archaea") |
added:
format_input.py |
b |
diff -r db64b6287cd6 -r a31c10fe09c8 format_input.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/format_input.py Tue Jul 07 13:52:29 2015 -0400 |
[ |
b'@@ -0,0 +1,453 @@\n+#!/usr/bin/env python\n+\n+import sys,os,argparse,pickle,re,numpy\n+\n+\n+\n+\n+#*************************************************************************************************************** \n+#* Log of change *\n+#* January 16, 2014 - George Weingart - george.weingart@gmail.com *\n+#* *\n+#* biom Support *\n+#* Modified the program to enable it to accept biom files as input *\n+#* *\n+#* Added two optional input parameters:\t *\n+#* 1. biom_c is the name of the biom metadata to be used as class *\n+#* 2. biom_s is the name of the biom metadata to be used as subclass *\n+#* class and subclass are used in the same context as the original *\n+#* parameters class and subclass *\n+#* These parameters are totally optional, the default is the program *\n+#* chooses as class the first metadata received from the conversion *\n+#* of the biom file into a sequential (pcl) file as generated by *\n+#* breadcrumbs, and similarly, the second metadata is selected as *\n+#* subclass. *\n+#* The syntax or logic for the original non-biom case was NOT changed. 
*\n+#* *\n+#* <******************* IMPORTANT NOTE *************************> *\n+#* The biom case requires breadcrumbs and therefore there is *\n+#* a conditional import of the breadcrumbs modules *\n+#* If the User uses a biom input and breadcrumbs is not detected, *\n+#* the run is abnormally ended *\n+#* breadcrumbs itself needs a biom environment, so if the import *\n+#* of biom in breadcrumbs fails, the run is also abnormally \n+#* ended (Only if the input file was biom) *\n+#* *\n+#* USAGE EXAMPLES *\n+#* -------------- *\n+#* Case #1: Using a sequential file as input (Old version - did not change) *\n+#* ./format_input.py hmp_aerobiosis_small.txt hmp_aerobiosis_small.in -c 1 -s 2 -u 3 -o 1000000 * \n+#* Case #2: Using a biom file as input *\n+#* ./format_input.py hmp_aerobiosis_small.biom hmp_aerobiosis_small.in -o 1000000 *\n+#* Case #3: Using a biom file as input and override the class and subclass'..b' = CommonArea[\'MetadataNames\'].index(params[\'biom_subclass\']) +1 #* Set up the index for that metadata\n+\t\t\telse:\n+\t\t\t\tFlagError = True\n+\t\tif FlagError == True:\t\t#* If the User passed an invalid class\n+\t\t\tprint "**Invalid biom class or subclass passed - Using defaults: First metadata=class, Second Metadata=subclass\\n"\n+\t\t\tparams[\'class\'] = 2\n+\t\t\tparams[\'subclass\'] = 3\n+\treturn params\n+ \t\n+\t\n+\n+if __name__ == \'__main__\':\n+\tCommonArea = dict()\t\t\t#Build a Common Area to pass variables in the biom case\n+\tparams = read_params(sys.argv)\n+\n+\t#*************************************************************\n+\t#* Conditionally import breadcrumbs if file is a biom file *\n+\t#* If it is and no breadcrumbs found - abnormally exit *\n+\t#*************************************************************\n+\tif params[\'input_file\'].endswith(\'.biom\'):\n+\t\ttry:\n+\t\t\tfrom lefsebiom.ConstantsBreadCrumbs import *\t \n+\t\t\tfrom lefsebiom.AbundanceTable import *\n+\t\texcept 
ImportError:\n+\t\t\tsys.stderr.write("************************************************************************************************************ \\n")\n+\t\t\tsys.stderr.write("* Error: Breadcrumbs libraries not detected - required to process biom files - run abnormally terminated * \\n")\n+\t\t\tsys.stderr.write("************************************************************************************************************ \\n")\n+\t\t\texit(1)\n+\n+\t\n+\tif type(params[\'subclass\']) is int and int(params[\'subclass\']) < 1:\n+\t\tparams[\'subclass\'] = None\n+\tif type(params[\'subject\']) is int and int(params[\'subject\']) < 1:\n+\t\tparams[\'subject\'] = None\n+\n+\n+\tCommonArea = read_input_file(sys.argv[1], CommonArea)\t\t#Pass The CommonArea to the Read\n+\tdata = CommonArea[\'ReturnedData\']\t\t\t\t\t#Select the data\n+\n+\tif sys.argv[1].endswith(\'biom\'):\t#*\tCheck if biom:\n+\t\tparams = check_params_for_biom_case(params, CommonArea)\t#Check the params for the biom case\n+\n+\tif params[\'feats_dir\'] == "c":\n+\t\tdata = transpose(data)\n+\n+\tncl = 1\n+\tif not params[\'subclass\'] is None: ncl += 1\t\n+\tif not params[\'subject\'] is None: ncl += 1\t\n+\n+\tfirst_line = zip(*data)[0]\n+\t\n+\tfirst_line = modify_feature_names(list(first_line))\n+\n+\tdata = zip(\tfirst_line,\n+\t\t\t*sort_by_cl(zip(*data)[1:],\n+\t\t\t ncl,\n+\t\t\t params[\'class\']-1,\n+\t\t\t params[\'subclass\']-1 if not params[\'subclass\'] is None else None,\n+\t\t\t params[\'subject\']-1 if not params[\'subject\'] is None else None))\n+#\tdata.insert(0,first_line)\n+#\tdata = remove_missing(data,params[\'missing_p\'])\n+\tcls = {}\n+\n+\tcls_i = [(\'class\',params[\'class\']-1)]\n+\tif params[\'subclass\'] > 0: cls_i.append((\'subclass\',params[\'subclass\']-1))\n+\tif params[\'subject\'] > 0: cls_i.append((\'subject\',params[\'subject\']-1))\n+\tcls_i.sort(lambda x, y: -cmp(x[1],y[1]))\n+\tfor v in cls_i: cls[v[0]] = data.pop(v[1])[1:]\n+\tif not params[\'subclass\'] > 
0: cls[\'subclass\'] = [str(cl)+"_subcl" for cl in cls[\'class\']]\n+\t\n+\tcls[\'subclass\'] = rename_same_subcl(cls[\'class\'],cls[\'subclass\'])\n+#\tif \'subclass\' in cls.keys(): cls = group_small_subclasses(cls,params[\'subcl_min_card\'])\n+\tclass_sl,subclass_sl,class_hierarchy = get_class_slices(zip(*cls.values()))\n+ \n+\tfeats = dict([(d[0],d[1:]) for d in data])\n+ \n+\tfeats = add_missing_levels(feats)\n+ \n+\tfeats = numerical_values(feats,params[\'norm_v\'])\n+\tout = {}\n+\tout[\'feats\'] = feats\n+\tout[\'norm\'] = params[\'norm_v\'] \n+\tout[\'cls\'] = cls\n+\tout[\'class_sl\'] = class_sl\n+\tout[\'subclass_sl\'] = subclass_sl\n+\tout[\'class_hierarchy\'] = class_hierarchy\n+\n+\tif params[\'output_table\']:\n+\t\twith open( params[\'output_table\'], "w") as outf: \n+\t\t\tif \'class\' in cls: outf.write( "\\t".join(list(["class"])+list(cls[\'class\'])) + "\\n" )\n+\t\t\tif \'subclass\' in cls: outf.write( "\\t".join(list(["subclass"])+list(cls[\'subclass\'])) + "\\n" )\n+\t\t\tif \'subject\' in cls: outf.write( "\\t".join(list(["subject"])+list(cls[\'subject\'])) + "\\n" )\n+\t\t\tfor k,v in out[\'feats\'].items(): outf.write( "\\t".join([k]+[str(vv) for vv in v]) + "\\n" )\n+\n+\twith open(params[\'output_file\'], \'wb\') as back_file:\n+\t\tpickle.dump(out,back_file) \t\n+\n' |