Mercurial > repos > qfabrepo > metadegalaxy_uc2otutable
comparison uclust2otutable.py @ 0:e85e7ba38aff draft
"planemo upload for repository https://github.com/QFAB-Bioinformatics/metaDEGalaxy/tree/master/uc2otutable commit 0db3cb4e9a87400bb2f8402ffc23334e24ad4b4e-dirty"
author | qfabrepo |
---|---|
date | Mon, 14 Sep 2020 04:52:36 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e85e7ba38aff |
---|---|
1 import sys | |
2 import progress | |
3 import subprocess | |
4 import tempfile | |
5 import traceback | |
6 import argparse | |
7 | |
8 parser = argparse.ArgumentParser( | |
9 description="This script converts uclust format from vsearch to tabular format" | |
10 ) | |
11 parser.add_argument("-v","--version",action="version",version="%(prog)s 1.0") | |
12 parser.add_argument("-i","--input",dest="uclust",default=False,help="input filename in uclust format") | |
13 parser.add_argument("-o","--output",dest="otutable",default=False,help="output filename") | |
14 | |
15 | |
16 if(len(sys.argv) == 1): | |
17 parser.print_help(sys.stderr) | |
18 sys.exit() | |
19 | |
20 args = parser.parse_args() | |
21 | |
22 ucFileName = args.uclust | |
23 outFileName = args.otutable | |
24 | |
25 | |
26 # Tab-separated fields: | |
27 # 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=Label | |
28 # Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NotMatched | |
29 # For C and D types, PctId is average id with seed. | |
30 # QueryStart and SeedStart are zero-based relative to start of sequence. | |
31 # If minus strand, SeedStart is relative to reverse-complemented seed. | |
32 | |
33 MaxError = -1 | |
34 | |
35 Type = '?' | |
36 ClusterNr = -1 | |
37 Size = -1 | |
38 PctId = -1.0 | |
39 LocalScore = -1.0 | |
40 Evalue = -1.0 | |
41 Strand = '.' | |
42 QueryStart = -1 | |
43 SeedStart = -1 | |
44 Alignment = "" | |
45 QueryLabel = "" | |
46 TargetLabel = "" | |
47 FileName = "?" | |
48 Line = "" | |
49 | |
50 TRUNC_LABELS=0 | |
51 | |
52 def GetSampleId(Label): | |
53 sep=";" | |
54 SampleID_temp = Label.split(sep,1)[0] | |
55 SampleID = SampleID_temp.split('_',1)[-1] | |
56 return SampleID | |
57 | |
58 def OnRec(): | |
59 global OTUs, Samples, OTUTable | |
60 if Type != 'H': | |
61 return | |
62 | |
63 OTUId = TargetLabel | |
64 if OTUId not in OTUIds: | |
65 OTUIds.append(OTUId) | |
66 OTUTable[OTUId] = {} | |
67 | |
68 SampleId = GetSampleId(QueryLabel) | |
69 if SampleId not in SampleIds: | |
70 SampleIds.append(SampleId) | |
71 | |
72 N = GetSizeFromLabel(QueryLabel, 1) | |
73 try: | |
74 OTUTable[OTUId][SampleId] += N | |
75 except: | |
76 OTUTable[OTUId][SampleId] = N | |
77 | |
78 def Die(Msg): | |
79 print >> sys.stderr | |
80 print >> sys.stderr | |
81 | |
82 traceback.print_stack() | |
83 s = "" | |
84 for i in range(0, len(sys.argv)): | |
85 if i > 0: | |
86 s += " " | |
87 s += sys.argv[i] | |
88 print >> sys.stderr, s | |
89 print >> sys.stderr, "**ERROR**", Msg | |
90 print >> sys.stderr | |
91 print >> sys.stderr | |
92 sys.exit(1) | |
93 print("NOTHERE!!") | |
94 | |
95 def Warning(Msg): | |
96 print >> sys.stderr | |
97 print >> sys.stderr, sys.argv | |
98 print >> sys.stderr, "**WARNING**", Msg | |
99 | |
100 def isgap(c): | |
101 return c == '-' or c == '.' | |
102 | |
103 def GetSeqCount(FileName): | |
104 Tmp = tempfile.TemporaryFile() | |
105 try: | |
106 TmpFile = Tmp.file | |
107 except: | |
108 TmpFile = Tmp | |
109 s = subprocess.call([ "grep", "-c", "^>", FileName ], stdout=TmpFile) | |
110 TmpFile.seek(0) | |
111 s = TmpFile.read() | |
112 return int(s) | |
113 | |
114 def GetSeqsDict(FileName): | |
115 return ReadSeqsFast(FileName, False) | |
116 | |
117 def ReadSeqsDict(FileName, Progress = False): | |
118 return ReadSeqsFast(FileName, Progress) | |
119 | |
120 def ReadSeqsOnSeq(FileName, OnSeq, Progress = False): | |
121 ReadSeqs3(FileName, OnSeq, Progress) | |
122 | |
123 def ReadSeqsFastFile(File, Progress = False): | |
124 Seqs = {} | |
125 Id = "" | |
126 N = 0 | |
127 while 1: | |
128 if N%10000 == 0 and Progress: | |
129 sys.stderr.write("%u seqs\r" % (N)) | |
130 Line = File.readline() | |
131 if len(Line) == 0: | |
132 if Progress: | |
133 sys.stderr.write("%u seqs\n" % (N)) | |
134 return Seqs | |
135 if len(Line) == 0: | |
136 continue | |
137 Line = Line.strip() | |
138 if Line[0] == ">": | |
139 N += 1 | |
140 Id = Line[1:] | |
141 if TRUNC_LABELS: | |
142 Id = Id.split()[0] | |
143 Seqs[Id] = "" | |
144 else: | |
145 if Id == "": | |
146 Die("FASTA file does not start with '>'") | |
147 Seqs[Id] = Seqs[Id] + Line | |
148 | |
149 def ReadSeqsFast(FileName, Progress = True): | |
150 File = open(FileName) | |
151 return ReadSeqsFastFile(File, Progress) | |
152 | |
153 def ReadSeqs(FileName, toupper=False, stripgaps=False, Progress=False): | |
154 if not toupper and not stripgaps: | |
155 return ReadSeqsFast(FileName, False) | |
156 | |
157 Seqs = {} | |
158 Id = "" | |
159 File = open(FileName) | |
160 while 1: | |
161 Line = File.readline() | |
162 if len(Line) == 0: | |
163 return Seqs | |
164 Line = Line.strip() | |
165 if len(Line) == 0: | |
166 continue | |
167 if Line[0] == ">": | |
168 Id = Line[1:] | |
169 if TRUNC_LABELS: | |
170 Id = Id.split()[0] | |
171 if Id in Seqs.keys(): | |
172 Die("Duplicate id '%s' in '%s'" % (Id, FileName)) | |
173 Seqs[Id] = "" | |
174 else: | |
175 if Id == "": | |
176 Die("FASTA file '%s' does not start with '>'" % FileName) | |
177 if toupper: | |
178 Line = Line.upper() | |
179 if stripgaps: | |
180 Line = Line.replace("-", "") | |
181 Line = Line.replace(".", "") | |
182 Seqs[Id] = Seqs[Id] + Line | |
183 | |
184 def ReadSeqs2(FileName, ShowProgress = True): | |
185 Seqs = [] | |
186 Labels = [] | |
187 File = open(FileName) | |
188 if ShowProgress: | |
189 progress.InitFile(File, FileName) | |
190 while 1: | |
191 progress.File() | |
192 Line = File.readline() | |
193 if len(Line) == 0: | |
194 if ShowProgress: | |
195 print >> sys.stderr, "\n" | |
196 return Labels, Seqs | |
197 Line = Line.strip() | |
198 if len(Line) == 0: | |
199 continue | |
200 if Line[0] == ">": | |
201 Id = Line[1:] | |
202 if TRUNC_LABELS: | |
203 Id = Id.split()[0] | |
204 Labels.append(Id) | |
205 Seqs.append("") | |
206 else: | |
207 i = len(Seqs)-1 | |
208 Seqs[i] = Seqs[i] + Line | |
209 | |
210 def ReadSeqs3(FileName, OnSeq, ShowProgress = True): | |
211 File = open(FileName) | |
212 if ShowProgress: | |
213 progress.InitFile(File, FileName) | |
214 Label = "" | |
215 Seq = "" | |
216 while 1: | |
217 Line = File.readline() | |
218 if len(Line) == 0: | |
219 if Seq != "": | |
220 OnSeq(Label, Seq) | |
221 if ShowProgress: | |
222 print >> sys.stderr, "\n" | |
223 return | |
224 Line = Line.strip() | |
225 if len(Line) == 0: | |
226 continue | |
227 if Line[0] == ">": | |
228 if Seq != "": | |
229 if ShowProgress: | |
230 progress.File() | |
231 if TRUNC_LABELS: | |
232 Label = Label.split()[0] | |
233 OnSeq(Label, Seq) | |
234 Label = Line[1:] | |
235 Seq = "" | |
236 else: | |
237 Seq += Line | |
238 | |
239 def WriteSeq(File, Seq, Label = ""): | |
240 if Label != "": | |
241 print >> File, ">" + Label | |
242 BLOCKLENGTH = 80 | |
243 SeqLength = len(Seq) | |
244 BlockCount = int((SeqLength + (BLOCKLENGTH-1))/BLOCKLENGTH) | |
245 for BlockIndex in range(0, BlockCount): | |
246 Block = Seq[BlockIndex*BLOCKLENGTH:] | |
247 Block = Block[:BLOCKLENGTH] | |
248 print >> File, Block | |
249 | |
250 def GetSizeFromLabel(Label, Default = -1): | |
251 Fields = Label.split(";") | |
252 for Field in Fields: | |
253 if Field.startswith("size="): | |
254 return int(Field[5:]) | |
255 if Default == -1: | |
256 Die("Missing size >" + Label) | |
257 return Default | |
258 | |
259 def StripSizeFromLabel(Label): | |
260 Fields = Label.split(";") | |
261 NewLabel = "" | |
262 for Field in Fields: | |
263 if Field.startswith("size="): | |
264 continue | |
265 if NewLabel != "": | |
266 NewLabel += ";" | |
267 NewLabel += Field | |
268 return NewLabel | |
269 | |
270 def GetQualFromLabel(Label): | |
271 n = Label.find("qual=") | |
272 assert n >= 0 | |
273 return Label[n+5:-1] | |
274 | |
275 def StripQualFromLabel(Label): | |
276 n = Label.find("qual=") | |
277 assert n >= 0 | |
278 return Label[:n] | |
279 | |
280 def GetField(Label, Name, Default): | |
281 Fields = Label.split(';') | |
282 for Field in Fields: | |
283 if Field.startswith(Name + "="): | |
284 n = len(Name) + 1 | |
285 return Field[n:] | |
286 if Default == "": | |
287 Die("Field %s= not found in >%s" % (Name, Label)) | |
288 return Default | |
289 | |
290 def GetIntFieldFromLabel(Label, Name, Default): | |
291 return int(GetField(Label, Name, Default)) | |
292 | |
293 def GetFieldFromLabel(Label, Name, Default): | |
294 return GetField(Label, Name, Default) | |
295 | |
296 def DeleteFieldFromLabel(Label, Name): | |
297 NewLabel = "" | |
298 Fields = Label.split(';') | |
299 for Field in Fields: | |
300 if len(Field) > 0 and not Field.startswith(Name + "="): | |
301 NewLabel += Field + ';' | |
302 return NewLabel | |
303 | |
304 def ReplaceSize(Label, Size): | |
305 Fields = Label.split(";") | |
306 NewLabel = "" | |
307 Done = False | |
308 for Field in Fields: | |
309 if Field.startswith("size="): | |
310 NewLabel += "size=%u;" % Size | |
311 Done = True | |
312 else: | |
313 if Field != "": | |
314 NewLabel += Field + ";" | |
315 if not Done: | |
316 die.Die("size= not found in >" + Label) | |
317 return NewLabel | |
318 | |
319 def Error(s): | |
320 print >> sys.stderr, "*** ERROR ***", s, sys.argv | |
321 sys.exit(1) | |
322 | |
323 def ProgressFile(File, FileSize): | |
324 # if not sys.stderr.isatty(): | |
325 # return | |
326 Pos = File.tell() | |
327 Pct = (100.0*Pos)/FileSize | |
328 Str = "%s %5.1f%%\r" % (FileName, Pct) | |
329 sys.stderr.write(Str) | |
330 | |
331 def Progress(i, N): | |
332 # if not sys.stderr.isatty(): | |
333 return | |
334 Pct = (100.0*i)/N | |
335 Str = "%5.1f%%\r" % Pct | |
336 sys.stderr.write(Str) | |
337 | |
338 def PrintLine(): | |
339 print(Line) | |
340 | |
341 def ParseRec(Line): | |
342 global Type | |
343 global ClusterNr | |
344 global Size | |
345 global PctId | |
346 global Strand | |
347 global QueryStart | |
348 global SeedStart | |
349 global Alignment | |
350 global QueryLabel | |
351 global TargetLabel | |
352 global LocalScore | |
353 global Evalue | |
354 | |
355 Fields = Line.split("\t") | |
356 N = len(Fields) | |
357 if N != 9 and N != 10: | |
358 Error("Expected 9 or 10 fields in .uc record, got: " + Line) | |
359 Type = Fields[0] | |
360 | |
361 try: | |
362 ClusterNr = int(Fields[1]) | |
363 except: | |
364 ClusterNr = -1 | |
365 | |
366 try: | |
367 Size = int(Fields[2]) | |
368 except: | |
369 Size = -1 | |
370 | |
371 Fields2 = Fields[3].split('/') | |
372 LocalScore = -1.0 | |
373 Evalue = -1.0 | |
374 if len(Fields2) == 3: | |
375 try: | |
376 PctId = float(Fields2[0]) | |
377 LocalScore = float(Fields2[1]) | |
378 Evalue = float(Fields2[2]) | |
379 except: | |
380 PctId = -1.0 | |
381 else: | |
382 try: | |
383 PctId = float(Fields[3]) | |
384 except: | |
385 PctId = -1.0 | |
386 | |
387 Strand = Fields[4] | |
388 | |
389 try: | |
390 QueryStart = int(Fields[5]) | |
391 except: | |
392 QueryStart = -1 | |
393 | |
394 try: | |
395 SeedStart = int(Fields[6]) | |
396 except: | |
397 SeedStart = -1 | |
398 | |
399 Alignment = Fields[7] | |
400 QueryLabel = Fields[8] | |
401 | |
402 if len(Fields) > 9: | |
403 TargetLabel = Fields[9] | |
404 | |
405 def GetRec(File, OnRecord): | |
406 global Line | |
407 while 1: | |
408 Line = File.readline() | |
409 if len(Line) == 0: | |
410 return 0 | |
411 if Line[0] == '#': | |
412 continue | |
413 Line = Line.strip() | |
414 if len(Line) == 0: | |
415 return 1 | |
416 ParseRec(Line) | |
417 Ok = OnRecord() | |
418 if Ok != None and Ok == 0: | |
419 return 0 | |
420 return 1 | |
421 | |
422 def ReadRecs(argFileName, OnRecord, ShowProgress = False): | |
423 return ReadFile(argFileName, OnRecord, ShowProgress) | |
424 | |
425 def ReadRecsOnRec(argFileName, OnRecord, ShowProgress = True): | |
426 return ReadFile(argFileName, OnRecord, ShowProgress) | |
427 | |
428 def GetRecs(argFileName, OnRecord, ShowProgress = True): | |
429 return ReadFile(argFileName, OnRecord, ShowProgress) | |
430 | |
431 def ReadFile(argFileName, OnRecord, ShowProgress = True): | |
432 global FileName | |
433 FileName = argFileName | |
434 File = open(FileName) | |
435 | |
436 if ShowProgress: | |
437 progress.InitFile(File, FileName) | |
438 while GetRec(File, OnRecord): | |
439 if ShowProgress: | |
440 progress.File() | |
441 if ShowProgress: | |
442 progress.FileDone() | |
443 | |
444 OTUIds = [] | |
445 SampleIds = [] | |
446 OTUTable = {} | |
447 | |
448 ReadRecs(ucFileName, OnRec) | |
449 | |
450 fout=open(outFileName,'w') | |
451 | |
452 s = "OTUId" | |
453 for SampleId in SampleIds: | |
454 s += "\t" + SampleId | |
455 | |
456 fout.write("%s\n" % s) | |
457 | |
458 for OTUId in OTUIds: | |
459 s = OTUId | |
460 for SampleId in SampleIds: | |
461 try: | |
462 n = OTUTable[OTUId][SampleId] | |
463 except: | |
464 n = 0 | |
465 s += "\t" + str(n) | |
466 fout.write("%s\n" % s) | |
467 | |
468 fout.close() |