comparison uclust2otutable.py @ 0:e85e7ba38aff draft

"planemo upload for repository https://github.com/QFAB-Bioinformatics/metaDEGalaxy/tree/master/uc2otutable commit 0db3cb4e9a87400bb2f8402ffc23334e24ad4b4e-dirty"
author qfabrepo
date Mon, 14 Sep 2020 04:52:36 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e85e7ba38aff
1 import sys
2 import progress
3 import subprocess
4 import tempfile
5 import traceback
6 import argparse
7
8 parser = argparse.ArgumentParser(
9 description="This script converts uclust format from vsearch to tabular format"
10 )
11 parser.add_argument("-v","--version",action="version",version="%(prog)s 1.0")
12 parser.add_argument("-i","--input",dest="uclust",default=False,help="input filename in uclust format")
13 parser.add_argument("-o","--output",dest="otutable",default=False,help="output filename")
14
15
16 if(len(sys.argv) == 1):
17 parser.print_help(sys.stderr)
18 sys.exit()
19
20 args = parser.parse_args()
21
22 ucFileName = args.uclust
23 outFileName = args.otutable
24
25
26 # Tab-separated fields:
27 # 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=Label
28 # Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NotMatched
29 # For C and D types, PctId is average id with seed.
30 # QueryStart and SeedStart are zero-based relative to start of sequence.
31 # If minus strand, SeedStart is relative to reverse-complemented seed.
32
33 MaxError = -1
34
35 Type = '?'
36 ClusterNr = -1
37 Size = -1
38 PctId = -1.0
39 LocalScore = -1.0
40 Evalue = -1.0
41 Strand = '.'
42 QueryStart = -1
43 SeedStart = -1
44 Alignment = ""
45 QueryLabel = ""
46 TargetLabel = ""
47 FileName = "?"
48 Line = ""
49
50 TRUNC_LABELS=0
51
52 def GetSampleId(Label):
53 sep=";"
54 SampleID_temp = Label.split(sep,1)[0]
55 SampleID = SampleID_temp.split('_',1)[-1]
56 return SampleID
57
58 def OnRec():
59 global OTUs, Samples, OTUTable
60 if Type != 'H':
61 return
62
63 OTUId = TargetLabel
64 if OTUId not in OTUIds:
65 OTUIds.append(OTUId)
66 OTUTable[OTUId] = {}
67
68 SampleId = GetSampleId(QueryLabel)
69 if SampleId not in SampleIds:
70 SampleIds.append(SampleId)
71
72 N = GetSizeFromLabel(QueryLabel, 1)
73 try:
74 OTUTable[OTUId][SampleId] += N
75 except:
76 OTUTable[OTUId][SampleId] = N
77
78 def Die(Msg):
79 print >> sys.stderr
80 print >> sys.stderr
81
82 traceback.print_stack()
83 s = ""
84 for i in range(0, len(sys.argv)):
85 if i > 0:
86 s += " "
87 s += sys.argv[i]
88 print >> sys.stderr, s
89 print >> sys.stderr, "**ERROR**", Msg
90 print >> sys.stderr
91 print >> sys.stderr
92 sys.exit(1)
93 print("NOTHERE!!")
94
95 def Warning(Msg):
96 print >> sys.stderr
97 print >> sys.stderr, sys.argv
98 print >> sys.stderr, "**WARNING**", Msg
99
100 def isgap(c):
101 return c == '-' or c == '.'
102
103 def GetSeqCount(FileName):
104 Tmp = tempfile.TemporaryFile()
105 try:
106 TmpFile = Tmp.file
107 except:
108 TmpFile = Tmp
109 s = subprocess.call([ "grep", "-c", "^>", FileName ], stdout=TmpFile)
110 TmpFile.seek(0)
111 s = TmpFile.read()
112 return int(s)
113
114 def GetSeqsDict(FileName):
115 return ReadSeqsFast(FileName, False)
116
117 def ReadSeqsDict(FileName, Progress = False):
118 return ReadSeqsFast(FileName, Progress)
119
120 def ReadSeqsOnSeq(FileName, OnSeq, Progress = False):
121 ReadSeqs3(FileName, OnSeq, Progress)
122
123 def ReadSeqsFastFile(File, Progress = False):
124 Seqs = {}
125 Id = ""
126 N = 0
127 while 1:
128 if N%10000 == 0 and Progress:
129 sys.stderr.write("%u seqs\r" % (N))
130 Line = File.readline()
131 if len(Line) == 0:
132 if Progress:
133 sys.stderr.write("%u seqs\n" % (N))
134 return Seqs
135 if len(Line) == 0:
136 continue
137 Line = Line.strip()
138 if Line[0] == ">":
139 N += 1
140 Id = Line[1:]
141 if TRUNC_LABELS:
142 Id = Id.split()[0]
143 Seqs[Id] = ""
144 else:
145 if Id == "":
146 Die("FASTA file does not start with '>'")
147 Seqs[Id] = Seqs[Id] + Line
148
149 def ReadSeqsFast(FileName, Progress = True):
150 File = open(FileName)
151 return ReadSeqsFastFile(File, Progress)
152
153 def ReadSeqs(FileName, toupper=False, stripgaps=False, Progress=False):
154 if not toupper and not stripgaps:
155 return ReadSeqsFast(FileName, False)
156
157 Seqs = {}
158 Id = ""
159 File = open(FileName)
160 while 1:
161 Line = File.readline()
162 if len(Line) == 0:
163 return Seqs
164 Line = Line.strip()
165 if len(Line) == 0:
166 continue
167 if Line[0] == ">":
168 Id = Line[1:]
169 if TRUNC_LABELS:
170 Id = Id.split()[0]
171 if Id in Seqs.keys():
172 Die("Duplicate id '%s' in '%s'" % (Id, FileName))
173 Seqs[Id] = ""
174 else:
175 if Id == "":
176 Die("FASTA file '%s' does not start with '>'" % FileName)
177 if toupper:
178 Line = Line.upper()
179 if stripgaps:
180 Line = Line.replace("-", "")
181 Line = Line.replace(".", "")
182 Seqs[Id] = Seqs[Id] + Line
183
184 def ReadSeqs2(FileName, ShowProgress = True):
185 Seqs = []
186 Labels = []
187 File = open(FileName)
188 if ShowProgress:
189 progress.InitFile(File, FileName)
190 while 1:
191 progress.File()
192 Line = File.readline()
193 if len(Line) == 0:
194 if ShowProgress:
195 print >> sys.stderr, "\n"
196 return Labels, Seqs
197 Line = Line.strip()
198 if len(Line) == 0:
199 continue
200 if Line[0] == ">":
201 Id = Line[1:]
202 if TRUNC_LABELS:
203 Id = Id.split()[0]
204 Labels.append(Id)
205 Seqs.append("")
206 else:
207 i = len(Seqs)-1
208 Seqs[i] = Seqs[i] + Line
209
210 def ReadSeqs3(FileName, OnSeq, ShowProgress = True):
211 File = open(FileName)
212 if ShowProgress:
213 progress.InitFile(File, FileName)
214 Label = ""
215 Seq = ""
216 while 1:
217 Line = File.readline()
218 if len(Line) == 0:
219 if Seq != "":
220 OnSeq(Label, Seq)
221 if ShowProgress:
222 print >> sys.stderr, "\n"
223 return
224 Line = Line.strip()
225 if len(Line) == 0:
226 continue
227 if Line[0] == ">":
228 if Seq != "":
229 if ShowProgress:
230 progress.File()
231 if TRUNC_LABELS:
232 Label = Label.split()[0]
233 OnSeq(Label, Seq)
234 Label = Line[1:]
235 Seq = ""
236 else:
237 Seq += Line
238
239 def WriteSeq(File, Seq, Label = ""):
240 if Label != "":
241 print >> File, ">" + Label
242 BLOCKLENGTH = 80
243 SeqLength = len(Seq)
244 BlockCount = int((SeqLength + (BLOCKLENGTH-1))/BLOCKLENGTH)
245 for BlockIndex in range(0, BlockCount):
246 Block = Seq[BlockIndex*BLOCKLENGTH:]
247 Block = Block[:BLOCKLENGTH]
248 print >> File, Block
249
250 def GetSizeFromLabel(Label, Default = -1):
251 Fields = Label.split(";")
252 for Field in Fields:
253 if Field.startswith("size="):
254 return int(Field[5:])
255 if Default == -1:
256 Die("Missing size >" + Label)
257 return Default
258
259 def StripSizeFromLabel(Label):
260 Fields = Label.split(";")
261 NewLabel = ""
262 for Field in Fields:
263 if Field.startswith("size="):
264 continue
265 if NewLabel != "":
266 NewLabel += ";"
267 NewLabel += Field
268 return NewLabel
269
270 def GetQualFromLabel(Label):
271 n = Label.find("qual=")
272 assert n >= 0
273 return Label[n+5:-1]
274
275 def StripQualFromLabel(Label):
276 n = Label.find("qual=")
277 assert n >= 0
278 return Label[:n]
279
280 def GetField(Label, Name, Default):
281 Fields = Label.split(';')
282 for Field in Fields:
283 if Field.startswith(Name + "="):
284 n = len(Name) + 1
285 return Field[n:]
286 if Default == "":
287 Die("Field %s= not found in >%s" % (Name, Label))
288 return Default
289
290 def GetIntFieldFromLabel(Label, Name, Default):
291 return int(GetField(Label, Name, Default))
292
293 def GetFieldFromLabel(Label, Name, Default):
294 return GetField(Label, Name, Default)
295
296 def DeleteFieldFromLabel(Label, Name):
297 NewLabel = ""
298 Fields = Label.split(';')
299 for Field in Fields:
300 if len(Field) > 0 and not Field.startswith(Name + "="):
301 NewLabel += Field + ';'
302 return NewLabel
303
304 def ReplaceSize(Label, Size):
305 Fields = Label.split(";")
306 NewLabel = ""
307 Done = False
308 for Field in Fields:
309 if Field.startswith("size="):
310 NewLabel += "size=%u;" % Size
311 Done = True
312 else:
313 if Field != "":
314 NewLabel += Field + ";"
315 if not Done:
316 die.Die("size= not found in >" + Label)
317 return NewLabel
318
319 def Error(s):
320 print >> sys.stderr, "*** ERROR ***", s, sys.argv
321 sys.exit(1)
322
323 def ProgressFile(File, FileSize):
324 # if not sys.stderr.isatty():
325 # return
326 Pos = File.tell()
327 Pct = (100.0*Pos)/FileSize
328 Str = "%s %5.1f%%\r" % (FileName, Pct)
329 sys.stderr.write(Str)
330
331 def Progress(i, N):
332 # if not sys.stderr.isatty():
333 return
334 Pct = (100.0*i)/N
335 Str = "%5.1f%%\r" % Pct
336 sys.stderr.write(Str)
337
338 def PrintLine():
339 print(Line)
340
341 def ParseRec(Line):
342 global Type
343 global ClusterNr
344 global Size
345 global PctId
346 global Strand
347 global QueryStart
348 global SeedStart
349 global Alignment
350 global QueryLabel
351 global TargetLabel
352 global LocalScore
353 global Evalue
354
355 Fields = Line.split("\t")
356 N = len(Fields)
357 if N != 9 and N != 10:
358 Error("Expected 9 or 10 fields in .uc record, got: " + Line)
359 Type = Fields[0]
360
361 try:
362 ClusterNr = int(Fields[1])
363 except:
364 ClusterNr = -1
365
366 try:
367 Size = int(Fields[2])
368 except:
369 Size = -1
370
371 Fields2 = Fields[3].split('/')
372 LocalScore = -1.0
373 Evalue = -1.0
374 if len(Fields2) == 3:
375 try:
376 PctId = float(Fields2[0])
377 LocalScore = float(Fields2[1])
378 Evalue = float(Fields2[2])
379 except:
380 PctId = -1.0
381 else:
382 try:
383 PctId = float(Fields[3])
384 except:
385 PctId = -1.0
386
387 Strand = Fields[4]
388
389 try:
390 QueryStart = int(Fields[5])
391 except:
392 QueryStart = -1
393
394 try:
395 SeedStart = int(Fields[6])
396 except:
397 SeedStart = -1
398
399 Alignment = Fields[7]
400 QueryLabel = Fields[8]
401
402 if len(Fields) > 9:
403 TargetLabel = Fields[9]
404
405 def GetRec(File, OnRecord):
406 global Line
407 while 1:
408 Line = File.readline()
409 if len(Line) == 0:
410 return 0
411 if Line[0] == '#':
412 continue
413 Line = Line.strip()
414 if len(Line) == 0:
415 return 1
416 ParseRec(Line)
417 Ok = OnRecord()
418 if Ok != None and Ok == 0:
419 return 0
420 return 1
421
422 def ReadRecs(argFileName, OnRecord, ShowProgress = False):
423 return ReadFile(argFileName, OnRecord, ShowProgress)
424
425 def ReadRecsOnRec(argFileName, OnRecord, ShowProgress = True):
426 return ReadFile(argFileName, OnRecord, ShowProgress)
427
428 def GetRecs(argFileName, OnRecord, ShowProgress = True):
429 return ReadFile(argFileName, OnRecord, ShowProgress)
430
431 def ReadFile(argFileName, OnRecord, ShowProgress = True):
432 global FileName
433 FileName = argFileName
434 File = open(FileName)
435
436 if ShowProgress:
437 progress.InitFile(File, FileName)
438 while GetRec(File, OnRecord):
439 if ShowProgress:
440 progress.File()
441 if ShowProgress:
442 progress.FileDone()
443
444 OTUIds = []
445 SampleIds = []
446 OTUTable = {}
447
448 ReadRecs(ucFileName, OnRec)
449
450 fout=open(outFileName,'w')
451
452 s = "OTUId"
453 for SampleId in SampleIds:
454 s += "\t" + SampleId
455
456 fout.write("%s\n" % s)
457
458 for OTUId in OTUIds:
459 s = OTUId
460 for SampleId in SampleIds:
461 try:
462 n = OTUTable[OTUId][SampleId]
463 except:
464 n = 0
465 s += "\t" + str(n)
466 fout.write("%s\n" % s)
467
468 fout.close()