comparison SMART/Java/Python/cleanGff.py @ 40:cd852f3e04ab

Uploaded
author m-zytnicki
date Thu, 30 May 2013 03:03:34 -0400
parents 769e306b7933
children 169d364ddd91
comparison
equal deleted inserted replaced
39:1236e5a49595 40:cd852f3e04ab
41 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress 41 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
42 42
# Counters used to build IDs for lines that have a Parent but no ID:
# count[parentId][featureType] -> number of such lines seen so far.
count = {}


class ParsedLine(object):
    """One feature line of a GFF file, split into its columns and attributes.

    After construction the following attributes are available:
      - line: the stripped input line
      - cpt: the counter given by the caller (used to build "smart<cpt>" IDs)
      - splittedLine: the 9 whitespace-separated columns
      - type: the content of the third column
      - parsedOptions: dict of the attributes found in the ninth column
      - id: the ID of the line (read from the attributes, or generated)
      - parents: list of parent IDs, or None if the line has no parent
    """

    def __init__(self, line, cpt):
        """Store the raw line and the counter, then parse the line.

        Raises Exception if the line has fewer than 9 fields.
        """
        self.line = line
        self.cpt = cpt
        self.parse()

    def parse(self):
        """Split the line into columns and extract type, attributes, ID and parents."""
        self.line = self.line.strip()
        # Split on any whitespace, at most 8 times: the ninth field keeps
        # its internal spaces.
        self.splittedLine = self.line.split(None, 8)
        if len(self.splittedLine) < 9:
            raise Exception("Line '%s' has less than 9 fields. Exiting..." % (self.line))
        self.type = self.splittedLine[2]
        self.parseOptions()
        self.getId()
        self.getParents()

    def parseOptions(self):
        """Fill parsedOptions from the ';'-separated ninth column.

        Each item is read as 'key=value' when an '=' comes before the first
        space (or there is no space), as 'key value' when it contains a space,
        and as the value of "ID" otherwise.  Surrounding spaces and double
        quotes are removed from the values.
        """
        self.parsedOptions = {}
        for option in self.splittedLine[8].split(";"):
            option = option.strip()
            if option == "":
                continue
            posSpace = option.find(" ")
            posEqual = option.find("=")
            if posEqual != -1 and (posEqual < posSpace or posSpace == -1):
                key, value = option.split("=", 1)
            elif posSpace != -1:
                key, value = option.split(None, 1)
            else:
                key = "ID"
                value = option
            self.parsedOptions[key.strip()] = value.strip(" \"")

    def getId(self):
        """Set self.id, generating one when the attributes hold no ID.

        An attribute whose key is "id" (case-insensitive) is used as is.
        Otherwise the ID is "<first parent>-<type>-<n>" when a "Parent"
        attribute exists (n comes from the module-level 'count'), or
        "smart<cpt>" when it does not.  A generated ID is also stored in
        parsedOptions["ID"].
        """
        for key in self.parsedOptions:
            if key.lower() == "id":
                self.id = self.parsedOptions[key]
                return
        if "Parent" in self.parsedOptions:
            parent = self.parsedOptions["Parent"].split(",")[0]
            perType = count.setdefault(parent, {})
            perType[self.type] = perType.get(self.type, 0) + 1
            self.id = "%s-%s-%d" % (parent, self.type, perType[self.type])
        else:
            self.id = "smart%d" % (self.cpt)
        self.parsedOptions["ID"] = self.id

    def getParents(self):
        """Set self.parents from the first "parent"/"derives_from" attribute.

        The key is matched case-insensitively and the value is split on
        commas; self.parents is None when no such attribute exists.
        """
        for key in self.parsedOptions:
            if key.lower() in ("parent", "derives_from"):
                self.parents = self.parsedOptions[key].split(",")
                return
        self.parents = None

    def removeParent(self):
        """Remove every "parent"/"derives_from" attribute from parsedOptions."""
        # Iterate over a snapshot of the keys: deleting entries while
        # iterating the dictionary itself raises RuntimeError on Python 3.
        for key in list(self.parsedOptions):
            if key.lower() in ("parent", "derives_from"):
                del self.parsedOptions[key]

    def export(self):
        """Return the line as tab-separated text, terminated by a newline.

        The ninth column is rebuilt from parsedOptions as ';'-joined
        'key=value' pairs (and stored back into splittedLine).
        """
        # items() works on both Python 2 and Python 3 (iteritems() does not).
        self.splittedLine[8] = ";".join(["%s=%s" % (key, value) for key, value in self.parsedOptions.items()])
        return "%s\n" % ("\t".join(self.splittedLine))
109 109
110 110
class CleanGff(object):
    """Read a GFF file, keep the accepted feature types and write them grouped by parent.

    Typical use: setInputFileName(), setOutputFileName(), setAcceptedTypes(),
    then run().
    """

    def __init__(self, verbosity = 1):
        """Create an empty cleaner; 'verbosity' is passed to the progress reporters."""
        self.verbosity = verbosity
        # ID -> ParsedLine, for every kept line
        self.lines = {}
        # feature types (third column) that are kept
        self.acceptedTypes = []
        # lines with no known parent: written first, followed by their children
        self.parents = []
        # parent ID -> list of lines that name it as a parent
        self.children = {}

    def setInputFileName(self, name):
        """Open the input file for reading."""
        self.inputFile = open(name)

    def setOutputFileName(self, name):
        """Open (and truncate) the output file for writing."""
        self.outputFile = open(name, "w")

    def setAcceptedTypes(self, types):
        """Set the collection of feature types to keep."""
        self.acceptedTypes = types

    def _uniqueId(self, baseId):
        """Return 'baseId' if it is not used yet, else the first free "<baseId>-<n>" (n >= 1)."""
        if baseId not in self.lines:
            return baseId
        suffix = 1
        while "%s-%d" % (baseId, suffix) in self.lines:
            suffix += 1
        return "%s-%d" % (baseId, suffix)

    @staticmethod
    def _setLineId(parsedLine, newId):
        """Give 'newId' to a line, both as attribute and in its exported options."""
        parsedLine.id = newId
        # The ID may be stored under any capitalisation of "id"; update the
        # existing entry so that the exported line carries the new ID too.
        for key in parsedLine.parsedOptions:
            if key.lower() == "id":
                parsedLine.parsedOptions[key] = newId
                return
        parsedLine.parsedOptions["ID"] = newId

    def parse(self):
        """Read the input file and store the lines of accepted types, by ID.

        Blank lines and lines starting with '#' are skipped; reading stops at
        the first line starting with '>'.  A line whose ID is already used
        gets a "-<n>" suffix.  The input file is closed when reading ends.
        """
        progress = UnlimitedProgress(100000, "Reading input file", self.verbosity)
        try:
            for cpt, line in enumerate(self.inputFile):
                # Lines read from a file still carry their newline, so test
                # the stripped text to really skip blank lines.
                stripped = line.strip()
                if not stripped or stripped[0] == "#":
                    continue
                if stripped[0] == ">":
                    break
                parsedLine = ParsedLine(line, cpt)
                if parsedLine.type in self.acceptedTypes:
                    uniqueId = self._uniqueId(parsedLine.id)
                    if uniqueId != parsedLine.id:
                        self._setLineId(parsedLine, uniqueId)
                    self.lines[uniqueId] = parsedLine
                progress.inc()
        finally:
            self.inputFile.close()
        progress.done()

    def sort(self):
        """Attach each stored line to its known parents, or make it a top-level line.

        A line is top-level when none of its parents was stored; its parent
        attributes are then removed.
        """
        progress = Progress(len(self.lines), "Sorting file", self.verbosity)
        for line in self.lines.values():
            parentFound = False
            for parent in line.parents or ():
                if parent in self.lines:
                    parentFound = True
                    self.children.setdefault(parent, []).append(line)
            if not parentFound:
                line.removeParent()
                self.parents.append(line)
            progress.inc()
        progress.done()

    def write(self):
        """Write every top-level line followed by its descendants, then close the output file."""
        progress = Progress(len(self.parents), "Writing output file", self.verbosity)
        try:
            for line in self.parents:
                self.writeLine(line)
                progress.inc()
        finally:
            # Close the file even if writing fails.
            self.outputFile.close()
        progress.done()

    def writeLine(self, line):
        """Write one line, then (recursively) the lines attached to its ID."""
        self.outputFile.write(line.export())
        for child in self.children.get(line.id, ()):
            self.writeLine(child)

    def run(self):
        """Parse the input, group the lines by parent and write the output."""
        self.parse()
        self.sort()
        self.write()
176 181
177 182
if __name__ == "__main__":

    # parse command line
    description = "Clean GFF v1.0.3: Clean a GFF file (as given by NCBI) and outputs a GFF3 file. [Category: Other]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file name [compulsory] [format: file in GFF format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-t", "--types", dest="types", action="store", default="mRNA,exon", type="string", help="list of comma-separated types that you want to keep [format: string] [default: mRNA,exon]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # Both file names are compulsory: report a missing one as a usage error
    # instead of failing later when trying to open None.
    if not options.inputFileName:
        parser.error("the input file name is compulsory (option -i)")
    if not options.outputFileName:
        parser.error("the output file name is compulsory (option -o)")

    # Tolerate spaces and empty items in the list of types ("mRNA, exon").
    acceptedTypes = [name.strip() for name in options.types.split(",") if name.strip()]

    cleanGff = CleanGff(options.verbosity)
    cleanGff.setInputFileName(options.inputFileName)
    cleanGff.setOutputFileName(options.outputFileName)
    cleanGff.setAcceptedTypes(acceptedTypes)
    cleanGff.run()
195 200