Mercurial > repos > qfabrepo > metadegalaxy_uc2otutable
view uclust2otutable.py @ 0:e85e7ba38aff draft
"planemo upload for repository https://github.com/QFAB-Bioinformatics/metaDEGalaxy/tree/master/uc2otutable commit 0db3cb4e9a87400bb2f8402ffc23334e24ad4b4e-dirty"
author | qfabrepo |
---|---|
date | Mon, 14 Sep 2020 04:52:36 +0000 |
parents | |
children |
line wrap: on
line source
import sys import progress import subprocess import tempfile import traceback import argparse parser = argparse.ArgumentParser( description="This script converts uclust format from vsearch to tabular format" ) parser.add_argument("-v","--version",action="version",version="%(prog)s 1.0") parser.add_argument("-i","--input",dest="uclust",default=False,help="input filename in uclust format") parser.add_argument("-o","--output",dest="otutable",default=False,help="output filename") if(len(sys.argv) == 1): parser.print_help(sys.stderr) sys.exit() args = parser.parse_args() ucFileName = args.uclust outFileName = args.otutable # Tab-separated fields: # 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=Label # Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NotMatched # For C and D types, PctId is average id with seed. # QueryStart and SeedStart are zero-based relative to start of sequence. # If minus strand, SeedStart is relative to reverse-complemented seed. MaxError = -1 Type = '?' ClusterNr = -1 Size = -1 PctId = -1.0 LocalScore = -1.0 Evalue = -1.0 Strand = '.' QueryStart = -1 SeedStart = -1 Alignment = "" QueryLabel = "" TargetLabel = "" FileName = "?" Line = "" TRUNC_LABELS=0 def GetSampleId(Label): sep=";" SampleID_temp = Label.split(sep,1)[0] SampleID = SampleID_temp.split('_',1)[-1] return SampleID def OnRec(): global OTUs, Samples, OTUTable if Type != 'H': return OTUId = TargetLabel if OTUId not in OTUIds: OTUIds.append(OTUId) OTUTable[OTUId] = {} SampleId = GetSampleId(QueryLabel) if SampleId not in SampleIds: SampleIds.append(SampleId) N = GetSizeFromLabel(QueryLabel, 1) try: OTUTable[OTUId][SampleId] += N except: OTUTable[OTUId][SampleId] = N def Die(Msg): print >> sys.stderr print >> sys.stderr traceback.print_stack() s = "" for i in range(0, len(sys.argv)): if i > 0: s += " " s += sys.argv[i] print >> sys.stderr, s print >> sys.stderr, "**ERROR**", Msg print >> sys.stderr print >> sys.stderr sys.exit(1) print("NOTHERE!!") def Warning(Msg): print >> sys.stderr print >> sys.stderr, sys.argv print >> sys.stderr, "**WARNING**", Msg def isgap(c): return c == '-' or c == '.' def GetSeqCount(FileName): Tmp = tempfile.TemporaryFile() try: TmpFile = Tmp.file except: TmpFile = Tmp s = subprocess.call([ "grep", "-c", "^>", FileName ], stdout=TmpFile) TmpFile.seek(0) s = TmpFile.read() return int(s) def GetSeqsDict(FileName): return ReadSeqsFast(FileName, False) def ReadSeqsDict(FileName, Progress = False): return ReadSeqsFast(FileName, Progress) def ReadSeqsOnSeq(FileName, OnSeq, Progress = False): ReadSeqs3(FileName, OnSeq, Progress) def ReadSeqsFastFile(File, Progress = False): Seqs = {} Id = "" N = 0 while 1: if N%10000 == 0 and Progress: sys.stderr.write("%u seqs\r" % (N)) Line = File.readline() if len(Line) == 0: if Progress: sys.stderr.write("%u seqs\n" % (N)) return Seqs if len(Line) == 0: continue Line = Line.strip() if Line[0] == ">": N += 1 Id = Line[1:] if TRUNC_LABELS: Id = Id.split()[0] Seqs[Id] = "" else: if Id == "": Die("FASTA file does not start with '>'") Seqs[Id] = Seqs[Id] + Line def ReadSeqsFast(FileName, Progress = True): File = open(FileName) return ReadSeqsFastFile(File, Progress) def ReadSeqs(FileName, toupper=False, stripgaps=False, Progress=False): if not toupper and not stripgaps: return ReadSeqsFast(FileName, False) Seqs = {} Id = "" File = open(FileName) while 1: Line = File.readline() if len(Line) == 0: return Seqs Line = Line.strip() if len(Line) == 0: continue if Line[0] == ">": Id = Line[1:] if TRUNC_LABELS: Id = Id.split()[0] if Id in Seqs.keys(): Die("Duplicate id '%s' in '%s'" % (Id, FileName)) Seqs[Id] = "" else: if Id == "": Die("FASTA file '%s' does not start with '>'" % FileName) if toupper: Line = Line.upper() if stripgaps: Line = Line.replace("-", "") Line = Line.replace(".", "") Seqs[Id] = Seqs[Id] + Line def ReadSeqs2(FileName, ShowProgress = True): Seqs = [] Labels = [] File = open(FileName) if ShowProgress: progress.InitFile(File, FileName) while 1: progress.File() Line = File.readline() if len(Line) == 0: if ShowProgress: print >> sys.stderr, "\n" return Labels, Seqs Line = Line.strip() if len(Line) == 0: continue if Line[0] == ">": Id = Line[1:] if TRUNC_LABELS: Id = Id.split()[0] Labels.append(Id) Seqs.append("") else: i = len(Seqs)-1 Seqs[i] = Seqs[i] + Line def ReadSeqs3(FileName, OnSeq, ShowProgress = True): File = open(FileName) if ShowProgress: progress.InitFile(File, FileName) Label = "" Seq = "" while 1: Line = File.readline() if len(Line) == 0: if Seq != "": OnSeq(Label, Seq) if ShowProgress: print >> sys.stderr, "\n" return Line = Line.strip() if len(Line) == 0: continue if Line[0] == ">": if Seq != "": if ShowProgress: progress.File() if TRUNC_LABELS: Label = Label.split()[0] OnSeq(Label, Seq) Label = Line[1:] Seq = "" else: Seq += Line def WriteSeq(File, Seq, Label = ""): if Label != "": print >> File, ">" + Label BLOCKLENGTH = 80 SeqLength = len(Seq) BlockCount = int((SeqLength + (BLOCKLENGTH-1))/BLOCKLENGTH) for BlockIndex in range(0, BlockCount): Block = Seq[BlockIndex*BLOCKLENGTH:] Block = Block[:BLOCKLENGTH] print >> File, Block def GetSizeFromLabel(Label, Default = -1): Fields = Label.split(";") for Field in Fields: if Field.startswith("size="): return int(Field[5:]) if Default == -1: Die("Missing size >" + Label) return Default def StripSizeFromLabel(Label): Fields = Label.split(";") NewLabel = "" for Field in Fields: if Field.startswith("size="): continue if NewLabel != "": NewLabel += ";" NewLabel += Field return NewLabel def GetQualFromLabel(Label): n = Label.find("qual=") assert n >= 0 return Label[n+5:-1] def StripQualFromLabel(Label): n = Label.find("qual=") assert n >= 0 return Label[:n] def GetField(Label, Name, Default): Fields = Label.split(';') for Field in Fields: if Field.startswith(Name + "="): n = len(Name) + 1 return Field[n:] if Default == "": Die("Field %s= not found in >%s" % (Name, Label)) return Default def GetIntFieldFromLabel(Label, Name, Default): return int(GetField(Label, Name, Default)) def GetFieldFromLabel(Label, Name, Default): return GetField(Label, Name, Default) def DeleteFieldFromLabel(Label, Name): NewLabel = "" Fields = Label.split(';') for Field in Fields: if len(Field) > 0 and not Field.startswith(Name + "="): NewLabel += Field + ';' return NewLabel def ReplaceSize(Label, Size): Fields = Label.split(";") NewLabel = "" Done = False for Field in Fields: if Field.startswith("size="): NewLabel += "size=%u;" % Size Done = True else: if Field != "": NewLabel += Field + ";" if not Done: die.Die("size= not found in >" + Label) return NewLabel def Error(s): print >> sys.stderr, "*** ERROR ***", s, sys.argv sys.exit(1) def ProgressFile(File, FileSize): # if not sys.stderr.isatty(): # return Pos = File.tell() Pct = (100.0*Pos)/FileSize Str = "%s %5.1f%%\r" % (FileName, Pct) sys.stderr.write(Str) def Progress(i, N): # if not sys.stderr.isatty(): return Pct = (100.0*i)/N Str = "%5.1f%%\r" % Pct sys.stderr.write(Str) def PrintLine(): print(Line) def ParseRec(Line): global Type global ClusterNr global Size global PctId global Strand global QueryStart global SeedStart global Alignment global QueryLabel global TargetLabel global LocalScore global Evalue Fields = Line.split("\t") N = len(Fields) if N != 9 and N != 10: Error("Expected 9 or 10 fields in .uc record, got: " + Line) Type = Fields[0] try: ClusterNr = int(Fields[1]) except: ClusterNr = -1 try: Size = int(Fields[2]) except: Size = -1 Fields2 = Fields[3].split('/') LocalScore = -1.0 Evalue = -1.0 if len(Fields2) == 3: try: PctId = float(Fields2[0]) LocalScore = float(Fields2[1]) Evalue = float(Fields2[2]) except: PctId = -1.0 else: try: PctId = float(Fields[3]) except: PctId = -1.0 Strand = Fields[4] try: QueryStart = int(Fields[5]) except: QueryStart = -1 try: SeedStart = int(Fields[6]) except: SeedStart = -1 Alignment = Fields[7] QueryLabel = Fields[8] if len(Fields) > 9: TargetLabel = Fields[9] def GetRec(File, OnRecord): global Line while 1: Line = File.readline() if len(Line) == 0: return 0 if Line[0] == '#': continue Line = Line.strip() if len(Line) == 0: return 1 ParseRec(Line) Ok = OnRecord() if Ok != None and Ok == 0: return 0 return 1 def ReadRecs(argFileName, OnRecord, ShowProgress = False): return ReadFile(argFileName, OnRecord, ShowProgress) def ReadRecsOnRec(argFileName, OnRecord, ShowProgress = True): return ReadFile(argFileName, OnRecord, ShowProgress) def GetRecs(argFileName, OnRecord, ShowProgress = True): return ReadFile(argFileName, OnRecord, ShowProgress) def ReadFile(argFileName, OnRecord, ShowProgress = True): global FileName FileName = argFileName File = open(FileName) if ShowProgress: progress.InitFile(File, FileName) while GetRec(File, OnRecord): if ShowProgress: progress.File() if ShowProgress: progress.FileDone() OTUIds = [] SampleIds = [] OTUTable = {} ReadRecs(ucFileName, OnRec) fout=open(outFileName,'w') s = "OTUId" for SampleId in SampleIds: s += "\t" + SampleId fout.write("%s\n" % s) for OTUId in OTUIds: s = OTUId for SampleId in SampleIds: try: n = OTUTable[OTUId][SampleId] except: n = 0 s += "\t" + str(n) fout.write("%s\n" % s) fout.close()