Mercurial > repos > george-weingart > maaslin
comparison src/PCLToGraphlanCoreGene.py @ 0:e0b5980139d9
maaslin
| author | george-weingart |
|---|---|
| date | Tue, 13 May 2014 22:00:40 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e0b5980139d9 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 ##################################################################################### | |
| 3 #Copyright (C) <2012> | |
| 4 # | |
| 5 #Permission is hereby granted, free of charge, to any person obtaining a copy of | |
| 6 #this software and associated documentation files (the "Software"), to deal in the | |
| 7 #Software without restriction, including without limitation the rights to use, copy, | |
| 8 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
| 9 #and to permit persons to whom the Software is furnished to do so, subject to | |
| 10 #the following conditions: | |
| 11 # | |
| 12 #The above copyright notice and this permission notice shall be included in all copies | |
| 13 #or substantial portions of the Software. | |
| 14 # | |
| 15 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | |
| 16 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | |
| 17 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
| 18 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |
| 19 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |
| 20 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
| 21 # | |
| 22 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), | |
| 23 # authored by the Huttenhower lab at the Harvard School of Public Health | |
| 24 # (contact Timothy Tickle, ttickle@hsph.harvard.edu). | |
| 25 ##################################################################################### | |
| 26 | |
| 27 __author__ = "Timothy Tickle" | |
| 28 __copyright__ = "Copyright 2012" | |
| 29 __credits__ = ["Timothy Tickle"] | |
| 30 __license__ = "" | |
| 31 __version__ = "" | |
| 32 __maintainer__ = "Timothy Tickle" | |
| 33 __email__ = "ttickle@sph.harvard.edu" | |
| 34 __status__ = "Development" | |
| 35 | |
| 36 import argparse | |
| 37 import csv | |
| 38 from operator import itemgetter | |
| 39 import re | |
| 40 import sys | |
| 41 | |
| 42 #Helper function which returns a boolean indicator of an input string being parsable as an int | |
| 43 def funcIsInt(strInt): | |
| 44 try: | |
| 45 int(strInt) | |
| 46 return True | |
| 47 except: | |
| 48 return False | |
| 49 | |
| 50 #Helper function that gets the index of the name and gives the last value of the list for - or the first value depending on the position | |
| 51 # This supports the ranging in the read.config files | |
| 52 #If no range is given then the result is just one index of the given name | |
| 53 def funcGetIndices(lsFeature, lsFunctionNames): | |
| 54 if(len(lsFeature)) == 1: | |
| 55 if(funcIsInt(lsFeature[0])): | |
| 56 return int(lsFeature[0])-1 | |
| 57 return [lsFeatureNames.index(lsFeature[0])] | |
| 58 if(len(lsFeature)) == 2: | |
| 59 iIndices = [] | |
| 60 iPosition = 1 | |
| 61 for sFeature in lsFeature: | |
| 62 if(sFeature==""): | |
| 63 if(iPosition==1): | |
| 64 iIndices.append(2) | |
| 65 elif(iPosition==2): | |
| 66 iIndices.append(len(lsFunctionNames)-1) | |
| 67 elif(funcIsInt(sFeature)): | |
| 68 iIndices.append(int(sFeature)-1) | |
| 69 else: | |
| 70 iIndices.append(lsFeatureNames.index(sFeature)) | |
| 71 iPosition = iPosition + 1 | |
| 72 return iIndices | |
| 73 | |
| 74 #Constants | |
| 75 #The line indicating the rows to read | |
| 76 c_MatrixName = "Matrix:" | |
| 77 c_DataMatrix = "Abundance" | |
| 78 c_strRows = "Read_PCL_Rows:" | |
| 79 | |
| 80 #Set up arguments reader | |
| 81 argp = argparse.ArgumentParser( prog = "PCLToGraphlanCoreGene.py", | |
| 82 description = """Converts PCL files to Graphlan core gene files.""" ) | |
| 83 | |
| 84 #Arguments | |
| 85 argp.add_argument("strInputPCL", metavar = "PCLFile", type = argparse.FileType("r"), help ="Input PCl file used in maaslin") | |
| 86 argp.add_argument("strInputRC", metavar = "RCFile", type = argparse.FileType("r"), help ="Input read config file used in maaslin") | |
| 87 argp.add_argument("strOutputCoreGene", metavar = "CoreGeneFile", type = argparse.FileType("w"), help ="Output core gene file for graphlan") | |
| 88 | |
| 89 args = argp.parse_args( ) | |
| 90 | |
| 91 #Read in read config table and get the rows/columns to use | |
| 92 #Indicates if we are reading a data matrix | |
| 93 fIsData = False | |
| 94 #Holds the indices ranges | |
| 95 #List of lists,each internal list hold 1 or 2 indices, if two it indicates a range from the first to the second | |
| 96 llsIndices = [] | |
| 97 csvRC = open(args.strInputRC,'r') if isinstance(args.strInputRC, str) else args.strInputRC | |
| 98 fRC = csv.reader(csvRC, delimiter=" ") | |
| 99 for sLine in fRC: | |
| 100 #Get the row indices or names | |
| 101 if len(sLine): | |
| 102 if sLine[0] == c_MatrixName: | |
| 103 fIsData = sLine[1] == c_DataMatrix | |
| 104 if sLine[0] == c_strRows: | |
| 105 if fIsData: | |
| 106 llsIndices = [sIndexRange.split("-") for sIndexRange in sLine[1].split(",")] | |
| 107 break | |
| 108 csvRC.close() | |
| 109 | |
| 110 # Check to make sure RC file is read | |
| 111 if len(llsIndices)==0: | |
| 112 print("PCLToGraphlanCoreGene:: Could Not find indices in RC file "+args.strInputRC+".") | |
| 113 | |
| 114 #Read in the PCL file and parse the file names to core genes format | |
| 115 csvPCL = open(args.strInputPCL,'r') if isinstance(args.strInputPCL, str) else args.strInputPCL | |
| 116 fPCL = csv.reader(csvPCL,delimiter="\t") | |
| 117 #The first column of the csv file | |
| 118 lsFeatureNames = [sLine[0] for sLine in fPCL] | |
| 119 csvPCL.close() | |
| 120 | |
| 121 # Check to make sure PCL file is read | |
| 122 if len(lsFeatureNames)==0: | |
| 123 print("PCLToGraphlanCoreGene:: Could Not find features in PCL file "+args.strInputPCL+".") | |
| 124 | |
| 125 #If the indices are names switch with numbers otherwise subtract 1 because they are ment for R | |
| 126 liConvertedRangedIndices = [funcGetIndices(sIndex,lsFeatureNames) for sIndex in llsIndices] if len(llsIndices)>0 else [] | |
| 127 llsIndices = None | |
| 128 | |
| 129 #If there are any ranges, reduce to lists of indices | |
| 130 liConvertedIndices = [] | |
| 131 for lsIndices in liConvertedRangedIndices: | |
| 132 lsIndices.sort() | |
| 133 iLenIndices = len(lsIndices) | |
| 134 if iLenIndices > 2: | |
| 135 print "Error, received more than 2 indices in a range. Stopped." | |
| 136 exit() | |
| 137 liConvertedIndices.extend(lsIndices if iLenIndices == 1 else range(lsIndices[0],lsIndices[1]+1)) | |
| 138 liConvertedRangedIndices = None | |
| 139 | |
| 140 #Collapse all indices to a set which is then sorted | |
| 141 liConvertedIndices = sorted(list(set(liConvertedIndices))) | |
| 142 | |
| 143 #Reduce name of features to just bugs indicated by indices | |
| 144 lsFeatureNames = itemgetter(*liConvertedIndices)(lsFeatureNames) | |
| 145 liConvertedIndices = None | |
| 146 | |
| 147 #Change the bug names to the correct formatting (clades seperated by .) | |
| 148 lsFeatureNames = sorted(lsFeatureNames) | |
| 149 lsFeatureNames = [re.sub("^[A-Za-z]__","",sBug) for sBug in lsFeatureNames] | |
| 150 lsFeatureNames = [[re.sub("\|*[A-Za-z]__|\|",".",sBug)] for sBug in lsFeatureNames] | |
| 151 | |
| 152 #If this is an OTU, append the number and the genus level together for a more descriptive termal name | |
| 153 lsFeatureNamesModForOTU = [] | |
| 154 for sBug in lsFeatureNames: | |
| 155 lsBug = sBug[0].split(".") | |
| 156 if(len(lsBug))> 1: | |
| 157 if(lsBug[-1].isdigit()): | |
| 158 lsBug[-2]=lsBug[-2]+"_"+lsBug[-1] | |
| 159 lsBug = lsBug[0:-1] | |
| 160 lsFeatureNamesModForOTU.append([".".join(lsBug)]) | |
| 161 else: | |
| 162 lsFeatureNamesModForOTU.append([lsBug[0]]) | |
| 163 | |
| 164 #Output core gene file | |
| 165 csvCG = open(args.strOutputCoreGene,'w') if isinstance(args.strOutputCoreGene, str) else args.strOutputCoreGene | |
| 166 fCG = csv.writer(csvCG) | |
| 167 fCG.writerows(lsFeatureNamesModForOTU) | |
| 168 csvCG.close() |
