Mercurial > repos > mbernt > longorf
diff getLongestORF.py @ 1:1c4b24e9bb16 draft
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 5be33ea99532ab3abb000564af4c63c81c4ccd87
author | mbernt |
---|---|
date | Mon, 16 Jul 2018 11:01:52 -0400 |
parents | ec898924d8c7 |
children |
line wrap: on
line diff
--- a/getLongestORF.py Wed Jun 20 11:02:06 2018 -0400 +++ b/getLongestORF.py Mon Jul 16 11:01:52 2018 -0400 @@ -1,114 +1,108 @@ #!/usr/bin/env python -""" -usage: getLongestORF.py input output.fas output.tab - - -input.fas: a amino acid fasta file of all open reading frames (ORF) listed by transcript (output of GalaxyTool "getorf") -output.fas: fasta file with all longest ORFs per transcript -output.tab: table with information about seqID, start, end, length, orientation, longest for all ORFs - -example: +#example: +#>STRG.1.1(-)_1 [10 - 69] +#GGNHHTLGGKKTFSYTHPPC +#>STRG.1.1(-)_2 [3 - 80] +#FLRGEPPHIGGKKDIFLHPPTLLKGR ->253936-254394(+)_1 [28 - 63] -LTNYCQMVHNIL ->253936-254394(+)_2 [18 - 77] -HKLIDKLLPNGAQYFVKSTQ ->253936-254394(+)_3 [32 - 148] -QTTAKWCTIFCKKYPVAPFHTMYLNYAVTWHHRSLLVAV ->253936-254394(+)_4 [117 - 152] -LGIIVPSLLLCN ->248351-252461(+)_1 [14 - 85] -VLARKYPRCLSPSKKSPCQLRQRS ->248351-252461(+)_2 [21 - 161] -PGNTHDASAHRKSLRVNSDKEVKCLFTKNAASEHPDHKRRRVSEHVP ->248351-252461(+)_3 [89 - 202] -VPLHQECCIGAPRPQTTACVRACAMTNTPRSSMTSKTG ->248351-252461(+)_4 [206 - 259] -SRTTSGRQSVLSEKLWRR ->248351-252461(+)_5 [263 - 313] -CLSPLWVPCCSRHSCHG -""" +#output1: fasta file with all longest ORFs per transcript +#output2: table with information about seqID, transcript, start, end, strand, length, sense, longest? for all ORFs -import sys,re +import sys,re; def findlongestOrf(transcriptDict,old_seqID): - #write for previous seqID - prevTranscript = transcriptDict[old_seqID] - i_max = 0 - #find longest orf in transcript - for i in range(0,len(prevTranscript)): - if(prevTranscript[i][2] >= prevTranscript[i_max][2]): - i_max = i - for i in range(0,len(prevTranscript)): - prevStart = prevTranscript[i][0] - prevEnd = prevTranscript[i][1] - prevLength = prevTranscript[i][2] - output = str(old_seqID) + "\t" + str(prevStart) + "\t" + str(prevEnd) + "\t" + str(prevLength) - if (end - start > 0): - output+="\tForward" - else: - output+="\tReverse" - if(i == i_max): - output += "\ty\n" - else: - output += "\tn\n" - OUTPUT_ORF_SUMMARY.write(output) - transcriptDict.pop(old_seqID, None) - return None + #write for previous seqID + prevTranscript = transcriptDict[old_seqID]; + i_max = 0; + transcript = old_seqID.split("(")[0] + + #find longest orf in transcript + for i in range(0,len(prevTranscript)): + if(prevTranscript[i][2] >= prevTranscript[i_max][2]): + i_max = i; -INPUT = open(sys.argv[1],"r") -OUTPUT_FASTA = open(sys.argv[2],"w") -OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w") + for i in range(0,len(prevTranscript)): + prevORFstart = prevTranscript[i][0]; + prevORFend = prevTranscript[i][1]; + prevORFlength = prevTranscript[i][2]; + header = prevTranscript[i][3]; + strand = re.search('\(([+-]+)\)',header).group(1); + + output = str(header) + "\t" + str(transcript) + "\t" + str(prevORFstart) + "\t" + str(prevORFend) + "\t" + str(prevORFlength) + "\t" + str(strand); + if (prevORFend - prevORFstart > 0): + output+="\tnormal"; + else: + output+="\treverse_sense"; + if(i == i_max): + output += "\ty\n"; + else: + output += "\tn\n"; + + OUTPUT_ORF_SUMMARY.write(output); -seqID = "" -old_seqID = "" -lengthDict = {} -seqDict = {} -headerDict = {} -transcriptDict = {} -skip = False + transcriptDict.pop(old_seqID, None); + return None; + +#----------------------------------------------------------------------------------------------------- + +INPUT = open(sys.argv[1],"r"); +OUTPUT_FASTA = open(sys.argv[2],"w"); +OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w"); -OUTPUT_ORF_SUMMARY.write("seqID\tstart\tend\tlength\torientation\tlongest\n") +seqID = ""; +old_seqID = ""; +lengthDict = {}; +seqDict = {}; +headerDict = {}; +transcriptDict = {}; + +skip = False; + +OUTPUT_ORF_SUMMARY.write("seqID\ttranscript\torf_start\torf_end\tlength\tstrand\tsense\tlongest\n"); for line in INPUT: - line = line.strip() -# print line - if(re.match(">",line)): #header - seqID = "_".join(line.split(">")[1].split("_")[:-1]) - #seqID = line.split(">")[1].split("_")[0] - start = int (re.search('\ \[(\d+)\ -', line).group(1)) - end = int (re.search('-\ (\d+)\]',line).group(1)) - length = abs(end - start) - if(seqID not in transcriptDict and old_seqID != ""): #new transcript - findlongestOrf(transcriptDict,old_seqID) - if seqID not in transcriptDict: - transcriptDict[seqID] = [] - transcriptDict[seqID].append([start,end,length]) - if(seqID not in lengthDict and old_seqID != ""): #new transcript - #write FASTA - OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n") - #delete old dict entry - headerDict.pop(old_seqID, None) - seqDict.pop(old_seqID, None) - lengthDict.pop(old_seqID, None) - #if several longest sequences exist with the same length, the dictionary saves the last occuring. - if(seqID not in lengthDict or length >= lengthDict[seqID]): - headerDict[seqID] = line - lengthDict[seqID] = length - seqDict[seqID] = "" - skip = False - else: - skip = True - next - old_seqID = seqID - elif(skip): - next - else: - seqDict[seqID] += line + line = line.strip(); + if(re.match(">",line)): #header + header = line.split(">")[1].split(" ")[0] + seqID = "_".join(line.split(">")[1].split("_")[:-1]) + ORFstart = int (re.search('\ \[(\d+)\ -', line).group(1)); + ORFend = int (re.search('-\ (\d+)\]',line).group(1)); + length = abs(ORFend - ORFstart); + + if(seqID not in transcriptDict and old_seqID != ""): #new transcript + findlongestOrf(transcriptDict,old_seqID); + + if seqID not in transcriptDict: + transcriptDict[seqID] = []; + + transcriptDict[seqID].append([ORFstart,ORFend,length,header]); -OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]) -findlongestOrf(transcriptDict,old_seqID) -INPUT.close() -OUTPUT_FASTA.close() -OUTPUT_ORF_SUMMARY.close() + if(seqID not in lengthDict and old_seqID != ""): #new transcript + #write FASTA + OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n"); + #delete old dict entry + headerDict.pop(old_seqID, None); + seqDict.pop(old_seqID, None); + lengthDict.pop(old_seqID, None); + #if several longest sequences exist with the same length, the dictionary saves the last occuring. + if(seqID not in lengthDict or length >= lengthDict[seqID]): + headerDict[seqID] = line; + lengthDict[seqID] = length; + seqDict[seqID] = ""; + skip = False; + else: + skip = True; + next; + old_seqID = seqID; + elif(skip): + next; + else: + seqDict[seqID] += line; + +OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]); +findlongestOrf(transcriptDict,old_seqID); + +INPUT.close(); +OUTPUT_FASTA.close(); +OUTPUT_ORF_SUMMARY.close(); \ No newline at end of file