| 0 | 1 #!/usr/bin/env python | 
|  | 2 ##################################################################################### | 
|  | 3 #Copyright (C) <2012> | 
|  | 4 # | 
|  | 5 #Permission is hereby granted, free of charge, to any person obtaining a copy of | 
|  | 6 #this software and associated documentation files (the "Software"), to deal in the | 
|  | 7 #Software without restriction, including without limitation the rights to use, copy, | 
|  | 8 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | 
|  | 9 #and to permit persons to whom the Software is furnished to do so, subject to | 
|  | 10 #the following conditions: | 
|  | 11 # | 
|  | 12 #The above copyright notice and this permission notice shall be included in all copies | 
|  | 13 #or substantial portions of the Software. | 
|  | 14 # | 
|  | 15 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | 
|  | 16 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | 
|  | 17 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | 
|  | 18 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | 
|  | 19 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | 
|  | 20 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | 
|  | 21 # | 
|  | 22 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models), | 
|  | 23 # authored by the Huttenhower lab at the Harvard School of Public Health | 
|  | 24 # (contact Timothy Tickle, ttickle@hsph.harvard.edu). | 
|  | 25 ##################################################################################### | 
|  | 26 | 
|  | 27 __author__ = "Timothy Tickle" | 
|  | 28 __copyright__ = "Copyright 2012" | 
|  | 29 __credits__ = ["Timothy Tickle"] | 
|  | 30 __license__ = "" | 
|  | 31 __version__ = "" | 
|  | 32 __maintainer__ = "Timothy Tickle" | 
|  | 33 __email__ = "ttickle@sph.harvard.edu" | 
|  | 34 __status__ = "Development" | 
|  | 35 | 
|  | 36 import argparse | 
|  | 37 import csv | 
|  | 38 from operator import itemgetter | 
|  | 39 import re | 
|  | 40 import sys | 
|  | 41 | 
|  | 42 #Helper function which returns a boolean indicator of an input string being parsable as an int | 
|  | 43 def funcIsInt(strInt): | 
|  | 44   try: | 
|  | 45     int(strInt) | 
|  | 46     return True | 
|  | 47   except: | 
|  | 48     return False | 
|  | 49 | 
|  | 50 #Helper function that gets the index of the name and gives the last value of the list for - or the first value depending on the position | 
|  | 51 # This supports the ranging in the read.config files | 
|  | 52 #If no range is given then the result is just one index of the given name | 
|  | 53 def funcGetIndices(lsFeature, lsFunctionNames): | 
|  | 54   if(len(lsFeature)) == 1: | 
|  | 55       if(funcIsInt(lsFeature[0])): | 
|  | 56         return int(lsFeature[0])-1 | 
|  | 57       return [lsFeatureNames.index(lsFeature[0])] | 
|  | 58   if(len(lsFeature)) == 2: | 
|  | 59     iIndices = [] | 
|  | 60     iPosition = 1 | 
|  | 61     for sFeature in lsFeature: | 
|  | 62       if(sFeature==""): | 
|  | 63         if(iPosition==1): | 
|  | 64           iIndices.append(2) | 
|  | 65         elif(iPosition==2): | 
|  | 66           iIndices.append(len(lsFunctionNames)-1) | 
|  | 67       elif(funcIsInt(sFeature)): | 
|  | 68         iIndices.append(int(sFeature)-1) | 
|  | 69       else: | 
|  | 70         iIndices.append(lsFeatureNames.index(sFeature)) | 
|  | 71       iPosition = iPosition + 1 | 
|  | 72     return iIndices | 
|  | 73 | 
|  | 74 #Constants | 
|  | 75 #The line indicating the rows to read | 
|  | 76 c_MatrixName = "Matrix:" | 
|  | 77 c_DataMatrix = "Abundance" | 
|  | 78 c_strRows = "Read_PCL_Rows:" | 
|  | 79 | 
|  | 80 #Set up arguments reader | 
|  | 81 argp = argparse.ArgumentParser( prog = "PCLToGraphlanCoreGene.py", | 
|  | 82     description = """Converts PCL files to Graphlan core gene files.""" ) | 
|  | 83 | 
|  | 84 #Arguments | 
|  | 85 argp.add_argument("strInputPCL", metavar = "PCLFile", type = argparse.FileType("r"), help ="Input PCl file used in maaslin") | 
|  | 86 argp.add_argument("strInputRC", metavar = "RCFile", type = argparse.FileType("r"), help ="Input read config file used in maaslin") | 
|  | 87 argp.add_argument("strOutputCoreGene", metavar = "CoreGeneFile", type = argparse.FileType("w"), help ="Output core gene file for graphlan") | 
|  | 88 | 
|  | 89 args = argp.parse_args( ) | 
|  | 90 | 
|  | 91 #Read in read config table and get the rows/columns to use | 
|  | 92 #Indicates if we are reading a data matrix | 
|  | 93 fIsData = False | 
|  | 94 #Holds the indices ranges | 
|  | 95 #List of lists,each internal list hold 1 or 2 indices, if two it indicates a range from the first to the second | 
|  | 96 llsIndices = [] | 
|  | 97 csvRC = open(args.strInputRC,'r') if isinstance(args.strInputRC, str) else args.strInputRC | 
|  | 98 fRC = csv.reader(csvRC, delimiter=" ") | 
|  | 99 for sLine in fRC: | 
|  | 100   #Get the row indices or names | 
|  | 101   if len(sLine): | 
|  | 102     if sLine[0] == c_MatrixName: | 
|  | 103       fIsData = sLine[1] == c_DataMatrix | 
|  | 104     if sLine[0] == c_strRows: | 
|  | 105       if fIsData: | 
|  | 106         llsIndices = [sIndexRange.split("-") for sIndexRange in sLine[1].split(",")] | 
|  | 107         break | 
|  | 108 csvRC.close() | 
|  | 109 | 
|  | 110 # Check to make sure RC file is read | 
|  | 111 if len(llsIndices)==0: | 
|  | 112   print("PCLToGraphlanCoreGene:: Could Not find indices in RC file "+args.strInputRC+".") | 
|  | 113 | 
|  | 114 #Read in the PCL file and parse the file names to core genes format | 
|  | 115 csvPCL = open(args.strInputPCL,'r') if isinstance(args.strInputPCL, str) else args.strInputPCL | 
|  | 116 fPCL = csv.reader(csvPCL,delimiter="\t") | 
|  | 117 #The first column of the csv file | 
|  | 118 lsFeatureNames = [sLine[0] for sLine in fPCL] | 
|  | 119 csvPCL.close() | 
|  | 120 | 
|  | 121 # Check to make sure PCL file is read | 
|  | 122 if len(lsFeatureNames)==0: | 
|  | 123   print("PCLToGraphlanCoreGene:: Could Not find features in PCL file "+args.strInputPCL+".") | 
|  | 124 | 
|  | 125 #If the indices are names switch with numbers otherwise subtract 1 because they are ment for R | 
|  | 126 liConvertedRangedIndices = [funcGetIndices(sIndex,lsFeatureNames) for sIndex in llsIndices] if len(llsIndices)>0 else [] | 
|  | 127 llsIndices = None | 
|  | 128 | 
|  | 129 #If there are any ranges, reduce to lists of indices | 
|  | 130 liConvertedIndices = [] | 
|  | 131 for lsIndices in liConvertedRangedIndices: | 
|  | 132   lsIndices.sort() | 
|  | 133   iLenIndices = len(lsIndices) | 
|  | 134   if iLenIndices > 2: | 
|  | 135     print "Error, received more than 2 indices in a range. Stopped." | 
|  | 136     exit() | 
|  | 137   liConvertedIndices.extend(lsIndices if iLenIndices == 1 else range(lsIndices[0],lsIndices[1]+1)) | 
|  | 138 liConvertedRangedIndices = None | 
|  | 139 | 
|  | 140 #Collapse all indices to a set which is then sorted | 
|  | 141 liConvertedIndices = sorted(list(set(liConvertedIndices))) | 
|  | 142 | 
|  | 143 #Reduce name of features to just bugs indicated by indices | 
|  | 144 lsFeatureNames = itemgetter(*liConvertedIndices)(lsFeatureNames) | 
|  | 145 liConvertedIndices = None | 
|  | 146 | 
|  | 147 #Change the bug names to the correct formatting (clades seperated by .) | 
|  | 148 lsFeatureNames = sorted(lsFeatureNames) | 
|  | 149 lsFeatureNames = [re.sub("^[A-Za-z]__","",sBug) for sBug in lsFeatureNames] | 
|  | 150 lsFeatureNames = [[re.sub("\|*[A-Za-z]__|\|",".",sBug)] for sBug in lsFeatureNames] | 
|  | 151 | 
|  | 152 #If this is an OTU, append the number and the genus level together for a more descriptive termal name | 
|  | 153 lsFeatureNamesModForOTU = [] | 
|  | 154 for sBug in lsFeatureNames: | 
|  | 155   lsBug = sBug[0].split(".") | 
|  | 156   if(len(lsBug))> 1: | 
|  | 157     if(lsBug[-1].isdigit()): | 
|  | 158       lsBug[-2]=lsBug[-2]+"_"+lsBug[-1] | 
|  | 159       lsBug = lsBug[0:-1] | 
|  | 160     lsFeatureNamesModForOTU.append([".".join(lsBug)]) | 
|  | 161   else: | 
|  | 162     lsFeatureNamesModForOTU.append([lsBug[0]]) | 
|  | 163 | 
|  | 164 #Output core gene file | 
|  | 165 csvCG = open(args.strOutputCoreGene,'w') if isinstance(args.strOutputCoreGene, str) else args.strOutputCoreGene | 
|  | 166 fCG = csv.writer(csvCG) | 
|  | 167 fCG.writerows(lsFeatureNamesModForOTU) | 
|  | 168 csvCG.close() |