diff getLongestORF.py @ 1:1c4b24e9bb16 draft

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 5be33ea99532ab3abb000564af4c63c81c4ccd87
author mbernt
date Mon, 16 Jul 2018 11:01:52 -0400
parents ec898924d8c7
children
line wrap: on
line diff
--- a/getLongestORF.py	Wed Jun 20 11:02:06 2018 -0400
+++ b/getLongestORF.py	Mon Jul 16 11:01:52 2018 -0400
@@ -1,114 +1,108 @@
 #!/usr/bin/env python
 
-"""
-usage: getLongestORF.py input output.fas output.tab
-
-
-input.fas: a amino acid fasta file of all open reading frames (ORF) listed by transcript (output of GalaxyTool "getorf")
-output.fas: fasta file with all longest ORFs per transcript
-output.tab: table with information about seqID, start, end, length, orientation, longest for all ORFs
-
-example:
+#example:
+#>STRG.1.1(-)_1 [10 - 69]
+#GGNHHTLGGKKTFSYTHPPC
+#>STRG.1.1(-)_2 [3 - 80]
+#FLRGEPPHIGGKKDIFLHPPTLLKGR
 
->253936-254394(+)_1 [28 - 63] 
-LTNYCQMVHNIL
->253936-254394(+)_2 [18 - 77] 
-HKLIDKLLPNGAQYFVKSTQ
->253936-254394(+)_3 [32 - 148] 
-QTTAKWCTIFCKKYPVAPFHTMYLNYAVTWHHRSLLVAV
->253936-254394(+)_4 [117 - 152] 
-LGIIVPSLLLCN
->248351-252461(+)_1 [14 - 85] 
-VLARKYPRCLSPSKKSPCQLRQRS
->248351-252461(+)_2 [21 - 161] 
-PGNTHDASAHRKSLRVNSDKEVKCLFTKNAASEHPDHKRRRVSEHVP
->248351-252461(+)_3 [89 - 202] 
-VPLHQECCIGAPRPQTTACVRACAMTNTPRSSMTSKTG
->248351-252461(+)_4 [206 - 259] 
-SRTTSGRQSVLSEKLWRR
->248351-252461(+)_5 [263 - 313] 
-CLSPLWVPCCSRHSCHG
-"""
+#output1: fasta file with all longest ORFs per transcript
+#output2: table with information about seqID, transcript, start, end, strand, length, sense, longest? for all ORFs
 
-import sys,re
+import sys,re;
 
 def findlongestOrf(transcriptDict,old_seqID):
-    #write for previous seqID
-    prevTranscript = transcriptDict[old_seqID]
-    i_max = 0
-    #find longest orf in transcript
-    for i in range(0,len(prevTranscript)):
-        if(prevTranscript[i][2] >= prevTranscript[i_max][2]):
-            i_max = i
-    for i in range(0,len(prevTranscript)):
-        prevStart = prevTranscript[i][0]
-        prevEnd = prevTranscript[i][1]
-        prevLength = prevTranscript[i][2]
-        output = str(old_seqID) + "\t" + str(prevStart) + "\t" + str(prevEnd) + "\t" + str(prevLength)
-        if (end - start > 0):
-            output+="\tForward"
-        else:
-            output+="\tReverse"
-        if(i == i_max):
-            output += "\ty\n"
-        else:
-            output += "\tn\n"
-        OUTPUT_ORF_SUMMARY.write(output)
-    transcriptDict.pop(old_seqID, None)
-    return None
+	#write for previous seqID
+	prevTranscript = transcriptDict[old_seqID];
+	i_max = 0;
+	transcript = old_seqID.split("(")[0]
+
+	#find longest orf in transcript
+	for i in range(0,len(prevTranscript)):
+		if(prevTranscript[i][2] >= prevTranscript[i_max][2]):
+			i_max = i;
 
-INPUT = open(sys.argv[1],"r")
-OUTPUT_FASTA = open(sys.argv[2],"w")
-OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w")
+	for i in range(0,len(prevTranscript)):
+		prevORFstart = prevTranscript[i][0];
+		prevORFend = prevTranscript[i][1];
+		prevORFlength = prevTranscript[i][2];
+		header = prevTranscript[i][3];
+		strand = re.search('\(([+-]+)\)',header).group(1);
+		
+		output = str(header) + "\t" + str(transcript) + "\t" + str(prevORFstart) + "\t" + str(prevORFend) + "\t" + str(prevORFlength) + "\t" + str(strand);
+		if (prevORFend - prevORFstart > 0):
+			output+="\tnormal";
+		else:
+			output+="\treverse_sense";
+		if(i == i_max):
+			output += "\ty\n";
+		else:
+			output += "\tn\n";
+
+		OUTPUT_ORF_SUMMARY.write(output);
 
-seqID = ""
-old_seqID = ""
-lengthDict = {}
-seqDict = {}
-headerDict = {}
-transcriptDict = {}
-skip = False
+	transcriptDict.pop(old_seqID, None);
+	return None;
+
+#-----------------------------------------------------------------------------------------------------
+
+INPUT = open(sys.argv[1],"r");
+OUTPUT_FASTA = open(sys.argv[2],"w");
+OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w");
 
-OUTPUT_ORF_SUMMARY.write("seqID\tstart\tend\tlength\torientation\tlongest\n")
+seqID = "";
+old_seqID = "";
+lengthDict = {};
+seqDict = {};
+headerDict = {};
+transcriptDict = {};
+
+skip = False;
+
+OUTPUT_ORF_SUMMARY.write("seqID\ttranscript\torf_start\torf_end\tlength\tstrand\tsense\tlongest\n");
 
 for line in INPUT:
-    line = line.strip()
-#    print line
-    if(re.match(">",line)): #header
-        seqID = "_".join(line.split(">")[1].split("_")[:-1])
-        #seqID = line.split(">")[1].split("_")[0]
-        start = int (re.search('\ \[(\d+)\ -', line).group(1))
-        end = int (re.search('-\ (\d+)\]',line).group(1))
-        length = abs(end - start)
-        if(seqID not in transcriptDict and old_seqID != ""): #new transcript
-            findlongestOrf(transcriptDict,old_seqID)
-        if seqID not in transcriptDict:
-            transcriptDict[seqID] = []
-        transcriptDict[seqID].append([start,end,length])
-        if(seqID not in lengthDict and old_seqID != ""): #new transcript
-            #write FASTA
-            OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n")
-            #delete old dict entry
-            headerDict.pop(old_seqID, None)
-            seqDict.pop(old_seqID, None)
-            lengthDict.pop(old_seqID, None)
-        #if several longest sequences exist with the same length, the dictionary saves the last occuring.
-        if(seqID not in lengthDict or length >= lengthDict[seqID]):
-            headerDict[seqID] = line
-            lengthDict[seqID] = length
-            seqDict[seqID] = ""
-            skip = False
-        else:
-            skip = True
-            next
-        old_seqID = seqID
-    elif(skip):
-        next
-    else:
-        seqDict[seqID] += line
+	line = line.strip();
+	if(re.match(">",line)): #header
+		header = line.split(">")[1].split(" ")[0]
+		seqID = "_".join(line.split(">")[1].split("_")[:-1])
+		ORFstart = int (re.search('\ \[(\d+)\ -', line).group(1));
+		ORFend = int (re.search('-\ (\d+)\]',line).group(1));
+		length = abs(ORFend - ORFstart);
+
+		if(seqID not in transcriptDict and old_seqID != ""): #new transcript
+			findlongestOrf(transcriptDict,old_seqID);
+			
+		if seqID not in transcriptDict:
+			transcriptDict[seqID] = [];
+
+		transcriptDict[seqID].append([ORFstart,ORFend,length,header]);
 
-OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID])
-findlongestOrf(transcriptDict,old_seqID)
-INPUT.close()
-OUTPUT_FASTA.close()
-OUTPUT_ORF_SUMMARY.close()
+		if(seqID not in lengthDict and old_seqID != ""): #new transcript
+			#write FASTA
+			OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n");
+			#delete old dict entry
+			headerDict.pop(old_seqID, None);
+			seqDict.pop(old_seqID, None);
+			lengthDict.pop(old_seqID, None);
+		#if several longest sequences exist with the same length, the dictionary saves the last occuring.
+		if(seqID not in lengthDict or length >= lengthDict[seqID]):
+			headerDict[seqID] = line;
+			lengthDict[seqID] = length;
+			seqDict[seqID] = "";
+			skip = False;
+		else:
+			skip = True;
+			next;
+		old_seqID = seqID;
+	elif(skip):
+		next;
+	else:
+		seqDict[seqID] += line;
+
+OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]);
+findlongestOrf(transcriptDict,old_seqID);
+
+INPUT.close();
+OUTPUT_FASTA.close();
+OUTPUT_ORF_SUMMARY.close();
\ No newline at end of file