Mercurial > repos > qfabrepo > metadegalaxy_uc2otutable
diff uclust2otutable.py @ 0:e85e7ba38aff draft
"planemo upload for repository https://github.com/QFAB-Bioinformatics/metaDEGalaxy/tree/master/uc2otutable commit 0db3cb4e9a87400bb2f8402ffc23334e24ad4b4e-dirty"
author | qfabrepo |
---|---|
date | Mon, 14 Sep 2020 04:52:36 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/uclust2otutable.py Mon Sep 14 04:52:36 2020 +0000 @@ -0,0 +1,468 @@ +import sys +import progress +import subprocess +import tempfile +import traceback +import argparse + +parser = argparse.ArgumentParser( + description="This script converts uclust format from vsearch to tabular format" + ) +parser.add_argument("-v","--version",action="version",version="%(prog)s 1.0") +parser.add_argument("-i","--input",dest="uclust",default=False,help="input filename in uclust format") +parser.add_argument("-o","--output",dest="otutable",default=False,help="output filename") + + +if(len(sys.argv) == 1): + parser.print_help(sys.stderr) + sys.exit() + +args = parser.parse_args() + +ucFileName = args.uclust +outFileName = args.otutable + + +# Tab-separated fields: +# 1=Type, 2=ClusterNr, 3=SeqLength or ClusterSize, 4=PctId, 5=Strand, 6=QueryStart, 7=SeedStart, 8=Alignment, 9=Label +# Record types (field 1): L=LibSeed, S=NewSeed, H=Hit, R=Reject, D=LibCluster, C=NewCluster, N=NotMatched +# For C and D types, PctId is average id with seed. +# QueryStart and SeedStart are zero-based relative to start of sequence. +# If minus strand, SeedStart is relative to reverse-complemented seed. + +MaxError = -1 + +Type = '?' +ClusterNr = -1 +Size = -1 +PctId = -1.0 +LocalScore = -1.0 +Evalue = -1.0 +Strand = '.' +QueryStart = -1 +SeedStart = -1 +Alignment = "" +QueryLabel = "" +TargetLabel = "" +FileName = "?" +Line = "" + +TRUNC_LABELS=0 + +def GetSampleId(Label): + sep=";" + SampleID_temp = Label.split(sep,1)[0] + SampleID = SampleID_temp.split('_',1)[-1] + return SampleID + +def OnRec(): + global OTUs, Samples, OTUTable + if Type != 'H': + return + + OTUId = TargetLabel + if OTUId not in OTUIds: + OTUIds.append(OTUId) + OTUTable[OTUId] = {} + + SampleId = GetSampleId(QueryLabel) + if SampleId not in SampleIds: + SampleIds.append(SampleId) + + N = GetSizeFromLabel(QueryLabel, 1) + try: + OTUTable[OTUId][SampleId] += N + except: + OTUTable[OTUId][SampleId] = N + +def Die(Msg): + print >> sys.stderr + print >> sys.stderr + + traceback.print_stack() + s = "" + for i in range(0, len(sys.argv)): + if i > 0: + s += " " + s += sys.argv[i] + print >> sys.stderr, s + print >> sys.stderr, "**ERROR**", Msg + print >> sys.stderr + print >> sys.stderr + sys.exit(1) + print("NOTHERE!!") + +def Warning(Msg): + print >> sys.stderr + print >> sys.stderr, sys.argv + print >> sys.stderr, "**WARNING**", Msg + +def isgap(c): + return c == '-' or c == '.' + +def GetSeqCount(FileName): + Tmp = tempfile.TemporaryFile() + try: + TmpFile = Tmp.file + except: + TmpFile = Tmp + s = subprocess.call([ "grep", "-c", "^>", FileName ], stdout=TmpFile) + TmpFile.seek(0) + s = TmpFile.read() + return int(s) + +def GetSeqsDict(FileName): + return ReadSeqsFast(FileName, False) + +def ReadSeqsDict(FileName, Progress = False): + return ReadSeqsFast(FileName, Progress) + +def ReadSeqsOnSeq(FileName, OnSeq, Progress = False): + ReadSeqs3(FileName, OnSeq, Progress) + +def ReadSeqsFastFile(File, Progress = False): + Seqs = {} + Id = "" + N = 0 + while 1: + if N%10000 == 0 and Progress: + sys.stderr.write("%u seqs\r" % (N)) + Line = File.readline() + if len(Line) == 0: + if Progress: + sys.stderr.write("%u seqs\n" % (N)) + return Seqs + if len(Line) == 0: + continue + Line = Line.strip() + if Line[0] == ">": + N += 1 + Id = Line[1:] + if TRUNC_LABELS: + Id = Id.split()[0] + Seqs[Id] = "" + else: + if Id == "": + Die("FASTA file does not start with '>'") + Seqs[Id] = Seqs[Id] + Line + +def ReadSeqsFast(FileName, Progress = True): + File = open(FileName) + return ReadSeqsFastFile(File, Progress) + +def ReadSeqs(FileName, toupper=False, stripgaps=False, Progress=False): + if not toupper and not stripgaps: + return ReadSeqsFast(FileName, False) + + Seqs = {} + Id = "" + File = open(FileName) + while 1: + Line = File.readline() + if len(Line) == 0: + return Seqs + Line = Line.strip() + if len(Line) == 0: + continue + if Line[0] == ">": + Id = Line[1:] + if TRUNC_LABELS: + Id = Id.split()[0] + if Id in Seqs.keys(): + Die("Duplicate id '%s' in '%s'" % (Id, FileName)) + Seqs[Id] = "" + else: + if Id == "": + Die("FASTA file '%s' does not start with '>'" % FileName) + if toupper: + Line = Line.upper() + if stripgaps: + Line = Line.replace("-", "") + Line = Line.replace(".", "") + Seqs[Id] = Seqs[Id] + Line + +def ReadSeqs2(FileName, ShowProgress = True): + Seqs = [] + Labels = [] + File = open(FileName) + if ShowProgress: + progress.InitFile(File, FileName) + while 1: + progress.File() + Line = File.readline() + if len(Line) == 0: + if ShowProgress: + print >> sys.stderr, "\n" + return Labels, Seqs + Line = Line.strip() + if len(Line) == 0: + continue + if Line[0] == ">": + Id = Line[1:] + if TRUNC_LABELS: + Id = Id.split()[0] + Labels.append(Id) + Seqs.append("") + else: + i = len(Seqs)-1 + Seqs[i] = Seqs[i] + Line + +def ReadSeqs3(FileName, OnSeq, ShowProgress = True): + File = open(FileName) + if ShowProgress: + progress.InitFile(File, FileName) + Label = "" + Seq = "" + while 1: + Line = File.readline() + if len(Line) == 0: + if Seq != "": + OnSeq(Label, Seq) + if ShowProgress: + print >> sys.stderr, "\n" + return + Line = Line.strip() + if len(Line) == 0: + continue + if Line[0] == ">": + if Seq != "": + if ShowProgress: + progress.File() + if TRUNC_LABELS: + Label = Label.split()[0] + OnSeq(Label, Seq) + Label = Line[1:] + Seq = "" + else: + Seq += Line + +def WriteSeq(File, Seq, Label = ""): + if Label != "": + print >> File, ">" + Label + BLOCKLENGTH = 80 + SeqLength = len(Seq) + BlockCount = int((SeqLength + (BLOCKLENGTH-1))/BLOCKLENGTH) + for BlockIndex in range(0, BlockCount): + Block = Seq[BlockIndex*BLOCKLENGTH:] + Block = Block[:BLOCKLENGTH] + print >> File, Block + +def GetSizeFromLabel(Label, Default = -1): + Fields = Label.split(";") + for Field in Fields: + if Field.startswith("size="): + return int(Field[5:]) + if Default == -1: + Die("Missing size >" + Label) + return Default + +def StripSizeFromLabel(Label): + Fields = Label.split(";") + NewLabel = "" + for Field in Fields: + if Field.startswith("size="): + continue + if NewLabel != "": + NewLabel += ";" + NewLabel += Field + return NewLabel + +def GetQualFromLabel(Label): + n = Label.find("qual=") + assert n >= 0 + return Label[n+5:-1] + +def StripQualFromLabel(Label): + n = Label.find("qual=") + assert n >= 0 + return Label[:n] + +def GetField(Label, Name, Default): + Fields = Label.split(';') + for Field in Fields: + if Field.startswith(Name + "="): + n = len(Name) + 1 + return Field[n:] + if Default == "": + Die("Field %s= not found in >%s" % (Name, Label)) + return Default + +def GetIntFieldFromLabel(Label, Name, Default): + return int(GetField(Label, Name, Default)) + +def GetFieldFromLabel(Label, Name, Default): + return GetField(Label, Name, Default) + +def DeleteFieldFromLabel(Label, Name): + NewLabel = "" + Fields = Label.split(';') + for Field in Fields: + if len(Field) > 0 and not Field.startswith(Name + "="): + NewLabel += Field + ';' + return NewLabel + +def ReplaceSize(Label, Size): + Fields = Label.split(";") + NewLabel = "" + Done = False + for Field in Fields: + if Field.startswith("size="): + NewLabel += "size=%u;" % Size + Done = True + else: + if Field != "": + NewLabel += Field + ";" + if not Done: + die.Die("size= not found in >" + Label) + return NewLabel + +def Error(s): + print >> sys.stderr, "*** ERROR ***", s, sys.argv + sys.exit(1) + +def ProgressFile(File, FileSize): +# if not sys.stderr.isatty(): +# return + Pos = File.tell() + Pct = (100.0*Pos)/FileSize + Str = "%s %5.1f%%\r" % (FileName, Pct) + sys.stderr.write(Str) + +def Progress(i, N): +# if not sys.stderr.isatty(): + return + Pct = (100.0*i)/N + Str = "%5.1f%%\r" % Pct + sys.stderr.write(Str) + +def PrintLine(): + print(Line) + +def ParseRec(Line): + global Type + global ClusterNr + global Size + global PctId + global Strand + global QueryStart + global SeedStart + global Alignment + global QueryLabel + global TargetLabel + global LocalScore + global Evalue + + Fields = Line.split("\t") + N = len(Fields) + if N != 9 and N != 10: + Error("Expected 9 or 10 fields in .uc record, got: " + Line) + Type = Fields[0] + + try: + ClusterNr = int(Fields[1]) + except: + ClusterNr = -1 + + try: + Size = int(Fields[2]) + except: + Size = -1 + + Fields2 = Fields[3].split('/') + LocalScore = -1.0 + Evalue = -1.0 + if len(Fields2) == 3: + try: + PctId = float(Fields2[0]) + LocalScore = float(Fields2[1]) + Evalue = float(Fields2[2]) + except: + PctId = -1.0 + else: + try: + PctId = float(Fields[3]) + except: + PctId = -1.0 + + Strand = Fields[4] + + try: + QueryStart = int(Fields[5]) + except: + QueryStart = -1 + + try: + SeedStart = int(Fields[6]) + except: + SeedStart = -1 + + Alignment = Fields[7] + QueryLabel = Fields[8] + + if len(Fields) > 9: + TargetLabel = Fields[9] + +def GetRec(File, OnRecord): + global Line + while 1: + Line = File.readline() + if len(Line) == 0: + return 0 + if Line[0] == '#': + continue + Line = Line.strip() + if len(Line) == 0: + return 1 + ParseRec(Line) + Ok = OnRecord() + if Ok != None and Ok == 0: + return 0 + return 1 + +def ReadRecs(argFileName, OnRecord, ShowProgress = False): + return ReadFile(argFileName, OnRecord, ShowProgress) + +def ReadRecsOnRec(argFileName, OnRecord, ShowProgress = True): + return ReadFile(argFileName, OnRecord, ShowProgress) + +def GetRecs(argFileName, OnRecord, ShowProgress = True): + return ReadFile(argFileName, OnRecord, ShowProgress) + +def ReadFile(argFileName, OnRecord, ShowProgress = True): + global FileName + FileName = argFileName + File = open(FileName) + + if ShowProgress: + progress.InitFile(File, FileName) + while GetRec(File, OnRecord): + if ShowProgress: + progress.File() + if ShowProgress: + progress.FileDone() + +OTUIds = [] +SampleIds = [] +OTUTable = {} + +ReadRecs(ucFileName, OnRec) + +fout=open(outFileName,'w') + +s = "OTUId" +for SampleId in SampleIds: + s += "\t" + SampleId + +fout.write("%s\n" % s) + +for OTUId in OTUIds: + s = OTUId + for SampleId in SampleIds: + try: + n = OTUTable[OTUId][SampleId] + except: + n = 0 + s += "\t" + str(n) + fout.write("%s\n" % s) + +fout.close()