longorf: getLongestORF.py comparison

comparison getLongestORF.py @ 1:1c4b24e9bb16 draft

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 5be33ea99532ab3abb000564af4c63c81c4ccd87

author	mbernt
date	Mon, 16 Jul 2018 11:01:52 -0400
parents	ec898924d8c7
children

comparison

equal deleted inserted replaced

-:ec898924d8c7
+:1c4b24e9bb16
 #!/usr/bin/env python
-"""
+#example:
-usage: getLongestORF.py input output.fas output.tab
+#>STRG.1.1(-)_1 [10 - 69]
+#GGNHHTLGGKKTFSYTHPPC
+#>STRG.1.1(-)_2 [3 - 80]
+#FLRGEPPHIGGKKDIFLHPPTLLKGR
+#output1: fasta file with all longest ORFs per transcript
+#output2: table with information about seqID, transcript, start, end, strand, length, sense, longest? for all ORFs
-input.fas: a amino acid fasta file of all open reading frames (ORF) listed by transcript (output of GalaxyTool "getorf")
+import sys,re;
-output.fas: fasta file with all longest ORFs per transcript
-output.tab: table with information about seqID, start, end, length, orientation, longest for all ORFs
-example:
->253936-254394(+)_1 [28 - 63]
-LTNYCQMVHNIL
->253936-254394(+)_2 [18 - 77]
-HKLIDKLLPNGAQYFVKSTQ
->253936-254394(+)_3 [32 - 148]
-QTTAKWCTIFCKKYPVAPFHTMYLNYAVTWHHRSLLVAV
->253936-254394(+)_4 [117 - 152]
-LGIIVPSLLLCN
->248351-252461(+)_1 [14 - 85]
-VLARKYPRCLSPSKKSPCQLRQRS
->248351-252461(+)_2 [21 - 161]
-PGNTHDASAHRKSLRVNSDKEVKCLFTKNAASEHPDHKRRRVSEHVP
->248351-252461(+)_3 [89 - 202]
-VPLHQECCIGAPRPQTTACVRACAMTNTPRSSMTSKTG
->248351-252461(+)_4 [206 - 259]
-SRTTSGRQSVLSEKLWRR
->248351-252461(+)_5 [263 - 313]
-CLSPLWVPCCSRHSCHG
-"""
-import sys,re
 def findlongestOrf(transcriptDict,old_seqID):
-#write for previous seqID
+	#write for previous seqID
-prevTranscript = transcriptDict[old_seqID]
+	prevTranscript = transcriptDict[old_seqID];
-i_max = 0
+	i_max = 0;
-#find longest orf in transcript
+	transcript = old_seqID.split("(")[0]
-for i in range(0,len(prevTranscript)):
-if(prevTranscript[i][2] >= prevTranscript[i_max][2]):
-i_max = i
-for i in range(0,len(prevTranscript)):
-prevStart = prevTranscript[i][0]
-prevEnd = prevTranscript[i][1]
-prevLength = prevTranscript[i][2]
-output = str(old_seqID) + "\t" + str(prevStart) + "\t" + str(prevEnd) + "\t" + str(prevLength)
-if (end - start > 0):
-output+="\tForward"
-else:
-output+="\tReverse"
-if(i == i_max):
-output += "\ty\n"
-else:
-output += "\tn\n"
-OUTPUT_ORF_SUMMARY.write(output)
-transcriptDict.pop(old_seqID, None)
-return None
-INPUT = open(sys.argv[1],"r")
+	#find longest orf in transcript
-OUTPUT_FASTA = open(sys.argv[2],"w")
+	for i in range(0,len(prevTranscript)):
-OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w")
+		if(prevTranscript[i][2] >= prevTranscript[i_max][2]):
+			i_max = i;
-seqID = ""
+	for i in range(0,len(prevTranscript)):
-old_seqID = ""
+		prevORFstart = prevTranscript[i][0];
-lengthDict = {}
+		prevORFend = prevTranscript[i][1];
-seqDict = {}
+		prevORFlength = prevTranscript[i][2];
-headerDict = {}
+		header = prevTranscript[i][3];
-transcriptDict = {}
+		strand = re.search('\(([+-]+)\)',header).group(1);
-skip = False
+		output = str(header) + "\t" + str(transcript) + "\t" + str(prevORFstart) + "\t" + str(prevORFend) + "\t" + str(prevORFlength) + "\t" + str(strand);
+		if (prevORFend - prevORFstart > 0):
+			output+="\tnormal";
+		else:
+			output+="\treverse_sense";
+		if(i == i_max):
+			output += "\ty\n";
+		else:
+			output += "\tn\n";
-OUTPUT_ORF_SUMMARY.write("seqID\tstart\tend\tlength\torientation\tlongest\n")
+		OUTPUT_ORF_SUMMARY.write(output);
+	transcriptDict.pop(old_seqID, None);
+	return None;
+#-----------------------------------------------------------------------------------------------------
+INPUT = open(sys.argv[1],"r");
+OUTPUT_FASTA = open(sys.argv[2],"w");
+OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w");
+seqID = "";
+old_seqID = "";
+lengthDict = {};
+seqDict = {};
+headerDict = {};
+transcriptDict = {};
+skip = False;
+OUTPUT_ORF_SUMMARY.write("seqID\ttranscript\torf_start\torf_end\tlength\tstrand\tsense\tlongest\n");
 for line in INPUT:
-line = line.strip()
+	line = line.strip();
-#    print line
+	if(re.match(">",line)): #header
-if(re.match(">",line)): #header
+		header = line.split(">")[1].split(" ")[0]
-seqID = "_".join(line.split(">")[1].split("_")[:-1])
+		seqID = "_".join(line.split(">")[1].split("_")[:-1])
-#seqID = line.split(">")[1].split("_")[0]
+		ORFstart = int (re.search('\ \[(\d+)\ -', line).group(1));
-start = int (re.search('\ \[(\d+)\ -', line).group(1))
+		ORFend = int (re.search('-\ (\d+)\]',line).group(1));
-end = int (re.search('-\ (\d+)\]',line).group(1))
+		length = abs(ORFend - ORFstart);
-length = abs(end - start)
-if(seqID not in transcriptDict and old_seqID != ""): #new transcript
-findlongestOrf(transcriptDict,old_seqID)
-if seqID not in transcriptDict:
-transcriptDict[seqID] = []
-transcriptDict[seqID].append([start,end,length])
-if(seqID not in lengthDict and old_seqID != ""): #new transcript
-#write FASTA
-OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n")
-#delete old dict entry
-headerDict.pop(old_seqID, None)
-seqDict.pop(old_seqID, None)
-lengthDict.pop(old_seqID, None)
-#if several longest sequences exist with the same length, the dictionary saves the last occuring.
-if(seqID not in lengthDict or length >= lengthDict[seqID]):
-headerDict[seqID] = line
-lengthDict[seqID] = length
-seqDict[seqID] = ""
-skip = False
-else:
-skip = True
-next
-old_seqID = seqID
-elif(skip):
-next
-else:
-seqDict[seqID] += line
-OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID])
+		if(seqID not in transcriptDict and old_seqID != ""): #new transcript
-findlongestOrf(transcriptDict,old_seqID)
+			findlongestOrf(transcriptDict,old_seqID);
-INPUT.close()
-OUTPUT_FASTA.close()
+		if seqID not in transcriptDict:
-OUTPUT_ORF_SUMMARY.close()
+			transcriptDict[seqID] = [];
+		transcriptDict[seqID].append([ORFstart,ORFend,length,header]);
+		if(seqID not in lengthDict and old_seqID != ""): #new transcript
+			#write FASTA
+			OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n");
+			#delete old dict entry
+			headerDict.pop(old_seqID, None);
+			seqDict.pop(old_seqID, None);
+			lengthDict.pop(old_seqID, None);
+		#if several longest sequences exist with the same length, the dictionary saves the last occuring.
+		if(seqID not in lengthDict or length >= lengthDict[seqID]):
+			headerDict[seqID] = line;
+			lengthDict[seqID] = length;
+			seqDict[seqID] = "";
+			skip = False;
+		else:
+			skip = True;
+			next;
+		old_seqID = seqID;
+	elif(skip):
+		next;
+	else:
+		seqDict[seqID] += line;
+OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]);
+findlongestOrf(transcriptDict,old_seqID);
+INPUT.close();
+OUTPUT_FASTA.close();
+OUTPUT_ORF_SUMMARY.close();

Mercurial > repos > mbernt > longorf

comparison getLongestORF.py @ 1:1c4b24e9bb16 draft