Mercurial > repos > george-weingart > maaslin
view src/PCLToGraphlanCoreGene.py @ 0:e0b5980139d9
maaslin
author | george-weingart |
---|---|
date | Tue, 13 May 2014 22:00:40 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python ##################################################################################### #Copyright (C) <2012> # #Permission is hereby granted, free of charge, to any person obtaining a copy of #this software and associated documentation files (the "Software"), to deal in the #Software without restriction, including without limitation the rights to use, copy, #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, #and to permit persons to whom the Software is furnished to do so, subject to #the following conditions: # #The above copyright notice and this permission notice shall be included in all copies #or substantial portions of the Software. # #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), # authored by the Huttenhower lab at the Harvard School of Public Health # (contact Timothy Tickle, ttickle@hsph.harvard.edu). ##################################################################################### __author__ = "Timothy Tickle" __copyright__ = "Copyright 2012" __credits__ = ["Timothy Tickle"] __license__ = "" __version__ = "" __maintainer__ = "Timothy Tickle" __email__ = "ttickle@sph.harvard.edu" __status__ = "Development" import argparse import csv from operator import itemgetter import re import sys #Helper function which returns a boolean indicator of an input string being parsable as an int def funcIsInt(strInt): try: int(strInt) return True except: return False #Helper function that gets the index of the name and gives the last value of the list for - or the first value depending on the position # This supports the ranging in the read.config files #If no range is given then the result is just one index of the given name def funcGetIndices(lsFeature, lsFunctionNames): if(len(lsFeature)) == 1: if(funcIsInt(lsFeature[0])): return int(lsFeature[0])-1 return [lsFeatureNames.index(lsFeature[0])] if(len(lsFeature)) == 2: iIndices = [] iPosition = 1 for sFeature in lsFeature: if(sFeature==""): if(iPosition==1): iIndices.append(2) elif(iPosition==2): iIndices.append(len(lsFunctionNames)-1) elif(funcIsInt(sFeature)): iIndices.append(int(sFeature)-1) else: iIndices.append(lsFeatureNames.index(sFeature)) iPosition = iPosition + 1 return iIndices #Constants #The line indicating the rows to read c_MatrixName = "Matrix:" c_DataMatrix = "Abundance" c_strRows = "Read_PCL_Rows:" #Set up arguments reader argp = argparse.ArgumentParser( prog = "PCLToGraphlanCoreGene.py", description = """Converts PCL files to Graphlan core gene files.""" ) #Arguments argp.add_argument("strInputPCL", metavar = "PCLFile", type = argparse.FileType("r"), help ="Input PCl file used in maaslin") argp.add_argument("strInputRC", metavar = "RCFile", type = argparse.FileType("r"), help ="Input read config file used in maaslin") argp.add_argument("strOutputCoreGene", metavar = "CoreGeneFile", type = argparse.FileType("w"), help ="Output core gene file for graphlan") args = argp.parse_args( ) #Read in read config table and get the rows/columns to use #Indicates if we are reading a data matrix fIsData = False #Holds the indices ranges #List of lists,each internal list hold 1 or 2 indices, if two it indicates a range from the first to the second llsIndices = [] csvRC = open(args.strInputRC,'r') if isinstance(args.strInputRC, str) else args.strInputRC fRC = csv.reader(csvRC, delimiter=" ") for sLine in fRC: #Get the row indices or names if len(sLine): if sLine[0] == c_MatrixName: fIsData = sLine[1] == c_DataMatrix if sLine[0] == c_strRows: if fIsData: llsIndices = [sIndexRange.split("-") for sIndexRange in sLine[1].split(",")] break csvRC.close() # Check to make sure RC file is read if len(llsIndices)==0: print("PCLToGraphlanCoreGene:: Could Not find indices in RC file "+args.strInputRC+".") #Read in the PCL file and parse the file names to core genes format csvPCL = open(args.strInputPCL,'r') if isinstance(args.strInputPCL, str) else args.strInputPCL fPCL = csv.reader(csvPCL,delimiter="\t") #The first column of the csv file lsFeatureNames = [sLine[0] for sLine in fPCL] csvPCL.close() # Check to make sure PCL file is read if len(lsFeatureNames)==0: print("PCLToGraphlanCoreGene:: Could Not find features in PCL file "+args.strInputPCL+".") #If the indices are names switch with numbers otherwise subtract 1 because they are ment for R liConvertedRangedIndices = [funcGetIndices(sIndex,lsFeatureNames) for sIndex in llsIndices] if len(llsIndices)>0 else [] llsIndices = None #If there are any ranges, reduce to lists of indices liConvertedIndices = [] for lsIndices in liConvertedRangedIndices: lsIndices.sort() iLenIndices = len(lsIndices) if iLenIndices > 2: print "Error, received more than 2 indices in a range. Stopped." exit() liConvertedIndices.extend(lsIndices if iLenIndices == 1 else range(lsIndices[0],lsIndices[1]+1)) liConvertedRangedIndices = None #Collapse all indices to a set which is then sorted liConvertedIndices = sorted(list(set(liConvertedIndices))) #Reduce name of features to just bugs indicated by indices lsFeatureNames = itemgetter(*liConvertedIndices)(lsFeatureNames) liConvertedIndices = None #Change the bug names to the correct formatting (clades seperated by .) lsFeatureNames = sorted(lsFeatureNames) lsFeatureNames = [re.sub("^[A-Za-z]__","",sBug) for sBug in lsFeatureNames] lsFeatureNames = [[re.sub("\|*[A-Za-z]__|\|",".",sBug)] for sBug in lsFeatureNames] #If this is an OTU, append the number and the genus level together for a more descriptive termal name lsFeatureNamesModForOTU = [] for sBug in lsFeatureNames: lsBug = sBug[0].split(".") if(len(lsBug))> 1: if(lsBug[-1].isdigit()): lsBug[-2]=lsBug[-2]+"_"+lsBug[-1] lsBug = lsBug[0:-1] lsFeatureNamesModForOTU.append([".".join(lsBug)]) else: lsFeatureNamesModForOTU.append([lsBug[0]]) #Output core gene file csvCG = open(args.strOutputCoreGene,'w') if isinstance(args.strOutputCoreGene, str) else args.strOutputCoreGene fCG = csv.writer(csvCG) fCG.writerows(lsFeatureNamesModForOTU) csvCG.close()