maaslin: src/PCLToGraphlanCoreGene.py comparison

comparison src/PCLToGraphlanCoreGene.py @ 0:e0b5980139d9

maaslin

author	george-weingart
date	Tue, 13 May 2014 22:00:40 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:e0b5980139d9
+#!/usr/bin/env python
+#####################################################################################
+#Copyright (C) <2012>
+#
+#Permission is hereby granted, free of charge, to any person obtaining a copy of
+#this software and associated documentation files (the "Software"), to deal in the
+#Software without restriction, including without limitation the rights to use, copy,
+#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+#and to permit persons to whom the Software is furnished to do so, subject to
+#the following conditions:
+#
+#The above copyright notice and this permission notice shall be included in all copies
+#or substantial portions of the Software.
+#
+#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# This file is a component of the MaAsLin (Multivariate Associations Using Linear Models),
+# authored by the Huttenhower lab at the Harvard School of Public Health
+# (contact Timothy Tickle, ttickle@hsph.harvard.edu).
+#####################################################################################
+__author__ = "Timothy Tickle"
+__copyright__ = "Copyright 2012"
+__credits__ = ["Timothy Tickle"]
+__license__ = ""
+__version__ = ""
+__maintainer__ = "Timothy Tickle"
+__email__ = "ttickle@sph.harvard.edu"
+__status__ = "Development"
+import argparse
+import csv
+from operator import itemgetter
+import re
+import sys
+#Helper function which returns a boolean indicator of an input string being parsable as an int
+def funcIsInt(strInt):
+try:
+int(strInt)
+return True
+except:
+return False
+#Helper function that gets the index of the name and gives the last value of the list for - or the first value depending on the position
+# This supports the ranging in the read.config files
+#If no range is given then the result is just one index of the given name
+def funcGetIndices(lsFeature, lsFunctionNames):
+if(len(lsFeature)) == 1:
+if(funcIsInt(lsFeature[0])):
+return int(lsFeature[0])-1
+return [lsFeatureNames.index(lsFeature[0])]
+if(len(lsFeature)) == 2:
+iIndices = []
+iPosition = 1
+for sFeature in lsFeature:
+if(sFeature==""):
+if(iPosition==1):
+iIndices.append(2)
+elif(iPosition==2):
+iIndices.append(len(lsFunctionNames)-1)
+elif(funcIsInt(sFeature)):
+iIndices.append(int(sFeature)-1)
+else:
+iIndices.append(lsFeatureNames.index(sFeature))
+iPosition = iPosition + 1
+return iIndices
+#Constants
+#The line indicating the rows to read
+c_MatrixName = "Matrix:"
+c_DataMatrix = "Abundance"
+c_strRows = "Read_PCL_Rows:"
+#Set up arguments reader
+argp = argparse.ArgumentParser( prog = "PCLToGraphlanCoreGene.py",
+description = """Converts PCL files to Graphlan core gene files.""" )
+#Arguments
+argp.add_argument("strInputPCL", metavar = "PCLFile", type = argparse.FileType("r"), help ="Input PCl file used in maaslin")
+argp.add_argument("strInputRC", metavar = "RCFile", type = argparse.FileType("r"), help ="Input read config file used in maaslin")
+argp.add_argument("strOutputCoreGene", metavar = "CoreGeneFile", type = argparse.FileType("w"), help ="Output core gene file for graphlan")
+args = argp.parse_args( )
+#Read in read config table and get the rows/columns to use
+#Indicates if we are reading a data matrix
+fIsData = False
+#Holds the indices ranges
+#List of lists,each internal list hold 1 or 2 indices, if two it indicates a range from the first to the second
+llsIndices = []
+csvRC = open(args.strInputRC,'r') if isinstance(args.strInputRC, str) else args.strInputRC
+fRC = csv.reader(csvRC, delimiter=" ")
+for sLine in fRC:
+#Get the row indices or names
+if len(sLine):
+if sLine[0] == c_MatrixName:
+fIsData = sLine[1] == c_DataMatrix
+if sLine[0] == c_strRows:
+if fIsData:
+llsIndices = [sIndexRange.split("-") for sIndexRange in sLine[1].split(",")]
+break
+csvRC.close()
+# Check to make sure RC file is read
+if len(llsIndices)==0:
+print("PCLToGraphlanCoreGene:: Could Not find indices in RC file "+args.strInputRC+".")
+#Read in the PCL file and parse the file names to core genes format
+csvPCL = open(args.strInputPCL,'r') if isinstance(args.strInputPCL, str) else args.strInputPCL
+fPCL = csv.reader(csvPCL,delimiter="\t")
+#The first column of the csv file
+lsFeatureNames = [sLine[0] for sLine in fPCL]
+csvPCL.close()
+# Check to make sure PCL file is read
+if len(lsFeatureNames)==0:
+print("PCLToGraphlanCoreGene:: Could Not find features in PCL file "+args.strInputPCL+".")
+#If the indices are names switch with numbers otherwise subtract 1 because they are ment for R
+liConvertedRangedIndices = [funcGetIndices(sIndex,lsFeatureNames) for sIndex in llsIndices] if len(llsIndices)>0 else []
+llsIndices = None
+#If there are any ranges, reduce to lists of indices
+liConvertedIndices = []
+for lsIndices in liConvertedRangedIndices:
+lsIndices.sort()
+iLenIndices = len(lsIndices)
+if iLenIndices > 2:
+print "Error, received more than 2 indices in a range. Stopped."
+exit()
+liConvertedIndices.extend(lsIndices if iLenIndices == 1 else range(lsIndices[0],lsIndices[1]+1))
+liConvertedRangedIndices = None
+#Collapse all indices to a set which is then sorted
+liConvertedIndices = sorted(list(set(liConvertedIndices)))
+#Reduce name of features to just bugs indicated by indices
+lsFeatureNames = itemgetter(*liConvertedIndices)(lsFeatureNames)
+liConvertedIndices = None
+#Change the bug names to the correct formatting (clades seperated by .)
+lsFeatureNames = sorted(lsFeatureNames)
+lsFeatureNames = [re.sub("^[A-Za-z]__","",sBug) for sBug in lsFeatureNames]
+lsFeatureNames = [[re.sub("\|*[A-Za-z]__|\|",".",sBug)] for sBug in lsFeatureNames]
+#If this is an OTU, append the number and the genus level together for a more descriptive termal name
+lsFeatureNamesModForOTU = []
+for sBug in lsFeatureNames:
+lsBug = sBug[0].split(".")
+if(len(lsBug))> 1:
+if(lsBug[-1].isdigit()):
+lsBug[-2]=lsBug[-2]+"_"+lsBug[-1]
+lsBug = lsBug[0:-1]
+lsFeatureNamesModForOTU.append([".".join(lsBug)])
+else:
+lsFeatureNamesModForOTU.append([lsBug[0]])
+#Output core gene file
+csvCG = open(args.strOutputCoreGene,'w') if isinstance(args.strOutputCoreGene, str) else args.strOutputCoreGene
+fCG = csv.writer(csvCG)
+fCG.writerows(lsFeatureNamesModForOTU)
+csvCG.close()

Mercurial > repos > george-weingart > maaslin

comparison src/PCLToGraphlanCoreGene.py @ 0:e0b5980139d9