longorf: getLongestORF.py comparison

comparison getLongestORF.py @ 0:ec898924d8c7 draft

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 8e118a4d24047e2c62912b962e854f789d6ff559

author	mbernt
date	Wed, 20 Jun 2018 11:02:06 -0400
parents
children	1c4b24e9bb16

comparison

equal deleted inserted replaced

--1:000000000000
+:ec898924d8c7
+#!/usr/bin/env python
+"""
+usage: getLongestORF.py input output.fas output.tab
+input.fas: a amino acid fasta file of all open reading frames (ORF) listed by transcript (output of GalaxyTool "getorf")
+output.fas: fasta file with all longest ORFs per transcript
+output.tab: table with information about seqID, start, end, length, orientation, longest for all ORFs
+example:
+>253936-254394(+)_1 [28 - 63]
+LTNYCQMVHNIL
+>253936-254394(+)_2 [18 - 77]
+HKLIDKLLPNGAQYFVKSTQ
+>253936-254394(+)_3 [32 - 148]
+QTTAKWCTIFCKKYPVAPFHTMYLNYAVTWHHRSLLVAV
+>253936-254394(+)_4 [117 - 152]
+LGIIVPSLLLCN
+>248351-252461(+)_1 [14 - 85]
+VLARKYPRCLSPSKKSPCQLRQRS
+>248351-252461(+)_2 [21 - 161]
+PGNTHDASAHRKSLRVNSDKEVKCLFTKNAASEHPDHKRRRVSEHVP
+>248351-252461(+)_3 [89 - 202]
+VPLHQECCIGAPRPQTTACVRACAMTNTPRSSMTSKTG
+>248351-252461(+)_4 [206 - 259]
+SRTTSGRQSVLSEKLWRR
+>248351-252461(+)_5 [263 - 313]
+CLSPLWVPCCSRHSCHG
+"""
+import sys,re
+def findlongestOrf(transcriptDict,old_seqID):
+#write for previous seqID
+prevTranscript = transcriptDict[old_seqID]
+i_max = 0
+#find longest orf in transcript
+for i in range(0,len(prevTranscript)):
+if(prevTranscript[i][2] >= prevTranscript[i_max][2]):
+i_max = i
+for i in range(0,len(prevTranscript)):
+prevStart = prevTranscript[i][0]
+prevEnd = prevTranscript[i][1]
+prevLength = prevTranscript[i][2]
+output = str(old_seqID) + "\t" + str(prevStart) + "\t" + str(prevEnd) + "\t" + str(prevLength)
+if (end - start > 0):
+output+="\tForward"
+else:
+output+="\tReverse"
+if(i == i_max):
+output += "\ty\n"
+else:
+output += "\tn\n"
+OUTPUT_ORF_SUMMARY.write(output)
+transcriptDict.pop(old_seqID, None)
+return None
+INPUT = open(sys.argv[1],"r")
+OUTPUT_FASTA = open(sys.argv[2],"w")
+OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w")
+seqID = ""
+old_seqID = ""
+lengthDict = {}
+seqDict = {}
+headerDict = {}
+transcriptDict = {}
+skip = False
+OUTPUT_ORF_SUMMARY.write("seqID\tstart\tend\tlength\torientation\tlongest\n")
+for line in INPUT:
+line = line.strip()
+#    print line
+if(re.match(">",line)): #header
+seqID = "_".join(line.split(">")[1].split("_")[:-1])
+#seqID = line.split(">")[1].split("_")[0]
+start = int (re.search('\ \[(\d+)\ -', line).group(1))
+end = int (re.search('-\ (\d+)\]',line).group(1))
+length = abs(end - start)
+if(seqID not in transcriptDict and old_seqID != ""): #new transcript
+findlongestOrf(transcriptDict,old_seqID)
+if seqID not in transcriptDict:
+transcriptDict[seqID] = []
+transcriptDict[seqID].append([start,end,length])
+if(seqID not in lengthDict and old_seqID != ""): #new transcript
+#write FASTA
+OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n")
+#delete old dict entry
+headerDict.pop(old_seqID, None)
+seqDict.pop(old_seqID, None)
+lengthDict.pop(old_seqID, None)
+#if several longest sequences exist with the same length, the dictionary saves the last occuring.
+if(seqID not in lengthDict or length >= lengthDict[seqID]):
+headerDict[seqID] = line
+lengthDict[seqID] = length
+seqDict[seqID] = ""
+skip = False
+else:
+skip = True
+next
+old_seqID = seqID
+elif(skip):
+next
+else:
+seqDict[seqID] += line
+OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID])
+findlongestOrf(transcriptDict,old_seqID)
+INPUT.close()
+OUTPUT_FASTA.close()
+OUTPUT_ORF_SUMMARY.close()

Mercurial > repos > mbernt > longorf

comparison getLongestORF.py @ 0:ec898924d8c7 draft